In [13]:
import os
import csv
import struct
import chardet
import numpy as np
import collections
from typing import *
from time import time
import seaborn as sns
from sklearn.svm import SVC
from skimage.feature import hog
from numpy import random as rnd
import matplotlib.pyplot as plt
import matplotlib.pyplot as pyplot
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from IPython.display import clear_output
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

In [14]:
!ls

Logistic Regression.ipynb
NHIS_OPEN_GJ_2017_100.csv
NHIS_OPEN_GJ_2018_100.csv
README.md


# Data Reading

In [125]:
class ReadData():
    def __init__(self, year=2017, required_info_index=None):
        self.year = year
        self.infos = []
        self.data = []
        self.labels = []
        self.no_data = []
        self.data_infos = []
        self.required_info_index = required_info_index
#         self.required_info_index = [2, 3, 5, 6, 7, 12, 13, 15, 16, 17, 18, 
#                                19, 20, 21, 22, 23, 24, 25, 26]
        self.no_data_count = collections.defaultdict(int)
        
    def read(self, no_npz = True, DATASIZE=None, FBS_PIVOT=120, calculate_BMI=True):
        year = self.year
        print(f'Reading year {self.year}')
        if not DATASIZE:
            DATASIZE = float('inf')
        if year == 2017:
            FILENAME = 'NHIS_OPEN_GJ_2017_100.csv'
        elif year == 2018:
            FILENAME = 'NHIS_OPEN_GJ_2018_100.csv'
        else:
            raise Exception('year should be 2017 or 2018. default=2017')
            
        
        with open(FILENAME, 'rb') as f:
            ENCODING = chardet.detect(f.readline())['encoding']
            
        f = open(FILENAME, 'r', encoding=ENCODING)
        rdr = csv.reader(f)
        
        required_info_index = self.required_info_index

        for n, col in enumerate(rdr):
            required_data = [info for index, info in enumerate(col) if index in required_info_index]
            if n == 0:
                self.infos.append(col)
                self.infos = self.infos[0]
                self.data_infos.append(required_data)
                self.data_infos = self.data_infos[0]
                FBS_INDEX = self.infos.index('식전혈당(공복혈당)')
                
                if calculate_BMI:
                    HEIGHT_INDEX, WEIGHT_INDEX = self.data_infos.index('신장(5Cm단위)'), self.data_infos.index('체중(5Kg단위)')
                    if WEIGHT_INDEX < HEIGHT_INDEX:
                        raise Exception('selected_features에서 신장을 체중보다 먼저 오도록 설정해주세요.')
                    self.data_infos.append('BMI')
                    
            else:
                FBS = col[FBS_INDEX]
                if '' in required_data or FBS == '':
#                     print(required_data)
                    l = [self.data_infos[i] for i, d in enumerate(required_data) if d == '']
                    for s in l: self.no_data_count[s] += 1
                    self.no_data.append(n)
                else:
                    if calculate_BMI:
                        height, weight = int(required_data.pop(HEIGHT_INDEX)), int(required_data.pop(WEIGHT_INDEX-1))
                        BMI = round(weight / (height * height / 10000), 2)
                        required_data.append(BMI)
                        
                    self.data.append(required_data)
                    self.labels.append(1) if int(FBS) >= FBS_PIVOT else self.labels.append(0)
                    
            if n >= DATASIZE:
                break

        f.close()
        if calculate_BMI:
            self.data_infos.pop(HEIGHT_INDEX)
            self.data_infos.pop(WEIGHT_INDEX-1)
        X = np.array(self.data)
        X = X.astype(np.float64)
        y = np.array(self.labels)
        
        print(f"Reading year {year}'s data done")
        print(f'Size of X = {len(X)}, y = {len(y)}')
        print(f'당뇨병 판별 공복혈당 기준 수치 : {FBS_PIVOT}')
        counts = np.unique(y, return_counts = True)[1]
        print(f'당뇨병 판별 환자 수 : {counts[1]}, 비당뇨병 판별수 : {counts[0]}\n')
        print(f'X has {X.shape[1]} features')
        print(f'Size of data with empty value = {len(self.no_data)}')
        print(f'Selected Features : {self.data_infos}\n')
        print(f'Sample Data : {X[0]}')
        return [X, y]

# Input Features

In [123]:
default_features = ['기준년도', '가입자일련번호', '성별코드', '연령대코드(5세단위)', '시도코드',
                    '신장(5Cm단위)', '체중(5Kg단위)', '허리둘레', '시력(좌)', '시력(우)',
                    '청력(좌)', '청력(우)', '수축기혈압', '이완기혈압', '식전혈당(공복혈당)',
                    '총콜레스테롤', '트리글리세라이드', 'HDL콜레스테롤', 'LDL콜레스테롤', '혈색소', 
                    '요단백', '혈청크레아티닌', '(혈청지오티)AST', '(혈청지오티)ALT', '감마지티피',
                    '흡연상태', '음주여부', '구강검진수검여부', '치아우식증유무', '결손치유무',
                    '치아마모증유무', '제3대구치(사랑니)이상', '치석', '데이터공개일자']

selected_features = ['연령대코드(5세단위)', '신장(5Cm단위)', '체중(5Kg단위)', '트리글리세라이드', 
                     '수축기혈압', '이완기혈압', 'LDL콜레스테롤']
features_index = []

for feature in selected_features:
    if feature not in default_features:
        raise Exception(f"'{feature}'는 데이터에 없습니다.")
    features_index.append(default_features.index(feature))
features_index.sort()

# Read Data

In [127]:
# obj = ReadData(year=2017 or 2018, required_info_index = features_index)
data2017 = ReadData(year=2017, required_info_index = features_index)
# X, y = obj.read(calculate_BMI = 체중, 키를 BMI지수로 환산, FBS_PIVOT =  당노병 판단 기준 혈당)
X, y = data2017.read(calculate_BMI=True, FBS_PIVOT=140)

Reading year 2017
Reading year 2017's data done
Size of X = 997068, y = 997068
당뇨병 판별 공복혈당 기준 수치 : 140
당뇨병 판별 환자 수 : 46063, 비당뇨병 판별수 : 951005

X has 6 features
Size of data with empty value = 2932
Selected Features are ['연령대코드(5세단위)', '수축기혈압', '이완기혈압', '트리글리세라이드', 'LDL콜레스테롤', 'BMI']

Sample Data : [  8.   120.    80.    92.   126.    25.95]


In [128]:
dict(data2017.no_data_count)

{'LDL콜레스테롤': 2912, '트리글리세라이드': 45, '수축기혈압': 19, '이완기혈압': 18}

In [6]:
# obj = ReadData(year=2017 or 2018, required_info_index = features_index)
data2018 = ReadData(year=2018, required_info_index = features_index)
# X, y = obj.read(calculate_BMI = 체중, 키를 BMI지수로 환산, FBS_PIVOT =  당노병 판단 기준 혈당)
X, y = data2018.read(calculate_BMI=True, FBS_PIVOT=140)
clear_output()
dict(data2018.no_data_count)

{'총콜레스테롤': 667244,
 '트리글리세라이드': 667252,
 'HDL콜레스테롤': 667254,
 'LDL콜레스테롤': 674122,
 '음주여부': 354943,
 '요단백': 10570,
 '수축기혈압': 5730,
 '이완기혈압': 5730,
 '혈색소': 5914,
 '혈청크레아티닌': 5906,
 '(혈청지오티)AST': 5906,
 '(혈청지오티)ALT': 5906,
 '감마지티피': 5909,
 '허리둘레': 414,
 '흡연상태': 234}

# Split Training, Test Set
(If required)

In [43]:
def split_set(X, y):
    # X_train, X_test, y_train, y_test
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Reset Labels with new FBS Pivot

In [44]:
def reset_fbs(fbs:int, year):
    read_new = ReadData(year=year)
    X, y = read_new.read(FBS_PIVOT=fbs)
    return [X,y]

# Logistic Regression

In [67]:
class DiabetesModelingLR():
    def __init__(self, X=None, y=None):
        self.X = X
        self.y = y
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        self.clf = None
        self.training_size = None
        self.testing_size = None
        self.CROSSVAL=False
        self.SPLIT_DATA = True
        self.RESET_FBS = False
    def LR(self):
        X, y = self.X, self.y
        if self.RESET_FBS:
            FBS = 120
            year = 2017
            X, y = reset_fbs(fbs=FBS, year=year)
        if self.SPLIT_DATA:
            X_train, X_test, y_train, y_test = split_set(X, y)
            self.X_train, self.X_test, self.y_train, self.y_test = [
                X_train, X_test, y_train, y_test]
        if X_train.shape[1] != X_test.shape[1]:
            raise Exception('Training Set과 Test Set의 Feature가 다릅니다.')

        self.training_size = len(X_train)
        self.testing_size = len(X_test)
        self.clf = LogisticRegression(C=1e-2, multi_class='ovr',
                                 penalty='l2', solver='liblinear', tol=0.1)
        if self.CROSSVAL:
            self.clf = LogisticRegressionCV(C=1, multi_class='ovr', cv=5,
                                 penalty='l2', solver='saga', tol=0.1)

        self.clf.fit(X_train, y_train)
        clear_output()
        y_predict = self.clf.predict(X_test)
        
    def show_result(self):
        clf = self.clf
        print('Training size : %d Testing Size : %d' %(self.training_size, self.testing_size))
        counts = np.unique(self.y, return_counts = True)[1]
        print(f'당뇨병 판별 환자 수 : {counts[1]}, 비당뇨병 판별수 : {counts[0]}\n')
        print(f'Accuracy score: {round(accuracy_score(self.y_test, clf.predict(self.X_test)), 5)}\n')

        y_predict = clf.predict(self.X_test)
        print(classification_report(self.y_test, y_predict, target_names=['건강','당뇨']))
        cm = confusion_matrix(self.y_test, y_predict)
        print('Confusion Matrix\n', cm)

In [68]:
model2017LR = DiabetesModelingLR(X,y)
model2017LR.LR()

In [69]:
model2017LR.show_result()

Training size : 79768 Testing Size : 19942
당뇨병 판별 환자 수 : 10274, 비당뇨병 판별수 : 89436

Accuracy score: 0.8960485407682278

              precision    recall  f1-score   support

          건강       0.90      1.00      0.95     17887
          당뇨       0.33      0.01      0.02      2055

    accuracy                           0.90     19942
   macro avg       0.62      0.50      0.48     19942
weighted avg       0.84      0.90      0.85     19942

Confusion Matrix
 [[17851    36]
 [ 2037    18]]


# Support Vector Machine

In [92]:
class DiabetesModelingSVM():
    def __init__(self, X=None, y=None):
        self.X = X
        self.y = y
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        self.clf = None
        self.training_size = None
        self.testing_size = None
        self.CROSSVAL=False
        self.SPLIT_DATA = True
        self.RESET_FBS = False
    def SVM(self):
        X, y = self.X, self.y
        if self.RESET_FBS:
            FBS = 120
            year = 2017
            X, y = reset_fbs(fbs=FBS, year=year)
        if self.SPLIT_DATA:
            X_train, X_test, y_train, y_test = split_set(X, y)
            self.X_train, self.X_test, self.y_train, self.y_test = [
                X_train, X_test, y_train, y_test]
        if X_train.shape[1] != X_test.shape[1]:
            raise Exception('Training Set과 Test Set의 Feature가 다릅니다.')

        self.training_size = len(X_train)
        self.testing_size = len(X_test)
        self.clf = SVC(random_state=42, decision_function_shape='ovo', kernel='rbf',
                      gamma=0.1, C=0.0001)
#         self.clf = SVC(C=1e-2, multi_class='ovr',
#                                  penalty='l2', solver='liblinear', tol=0.1)
#         if self.CROSSVAL:
#             self.clf = LogisticRegressionCV(C=1, multi_class='ovr', cv=5,
#                                  penalty='l2', solver='saga', tol=0.1)

        self.clf.fit(X_train, y_train)
        clear_output()
        y_predict = self.clf.predict(X_test)
        
    def show_result(self):
        clf = self.clf
        print('Training size : %d Testing Size : %d' %(self.training_size, self.testing_size))
        counts = np.unique(self.y, return_counts = True)[1]
        print(f'당뇨병 판별 환자 수 : {counts[1]}, 비당뇨병 판별수 : {counts[0]}\n')
        print(f'Accuracy score: {round(accuracy_score(self.y_test, clf.predict(self.X_test)), 5)}\n')

        y_predict = clf.predict(self.X_test)
        print(classification_report(self.y_test, y_predict, target_names=['건강','당뇨']))
        cm = confusion_matrix(self.y_test, y_predict)
        print('Confusion Matrix\n', cm)

## Data Resize Required

In [93]:
model2017SVM = DiabetesModelingSVM(X,y)
model2017SVM.SVM()
# 1h 22m

In [94]:
model2017SVM.show_result()
# 52m

Training size : 797654 Testing Size : 199414
당뇨병 판별 환자 수 : 46063, 비당뇨병 판별수 : 951005

Accuracy score: 0.9537996329244687



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          건강       0.95      1.00      0.98    190201
          당뇨       0.00      0.00      0.00      9213

    accuracy                           0.95    199414
   macro avg       0.48      0.50      0.49    199414
weighted avg       0.91      0.95      0.93    199414

Confusion Matrix
 [[190201      0]
 [  9213      0]]


# Grid Search

In [114]:
class GridSearch():
    def __init__(self, grid_parameters, model=LogisticRegression, X=X, y=y):
        self.model = model
        self.grid_parameters = grid_parameters
        self.X = X
        self.y = y
        self.clf = None
        self.best_parameters = None
        self.best_estimator = None
        
    def make_estimator(self, model, X=None, y=None, default_parameters=None,
                       grid_parameters=None, cross_validation:int=5):
        clf = model(**default_parameters)
        estimator = GridSearchCV(clf, grid_parameters, cv=cross_validation, n_jobs=-1)
        estimator.fit(X,y)
        
        return estimator
    
    def grid_search(self):
        X_train, X_test, y_train, y_test = split_set(self.X, self.y)
        self.clf = self.make_estimator(X=X_train, y=y_train, model=self.model,
                                       default_parameters=dict(solver='lbfgs'),
                                       grid_parameters=self.grid_parameters)
        self.best_parameters = clf.best_params_
        self.best_estimator = clf.best_estimator_

        # get best accuracy on training set
        print('Input Grid Parameters')
        print(self.grid_parameters)
        
        print('Accuracy on training set')
        print(round(self.clf.best_score_, 5))

        print('Accuracy on test set')
        print(round(self.best_estimator.score(X_test, y_test), 5))

        print(f'Best Parameters : {self.best_parameters}')

In [115]:
grid_parameters = {
    'C': [1e-1, 1, 1000]
#     'penalty': ['l1', 'l2'],
#     'batch_size':[1, 10, 60, 100, 600]
#     'epochs':[5, 10, 25, 50, 200]
#     'C': [1e-4, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1000], # 1000 means no regularization
#     'solver': ['saga']
}
grid_search_2017_LR = GridSearch(grid_parameters=grid_parameters, model=LogisticRegression)
grid_search_2017_LR.grid_search()

Input Grid Parameters
{'C': [0.1, 1, 1000]}
Accuracy on training set
0.95346
Accuracy on test set
0.9538
Best Parameters : {'C': 0.0001, 'gamma': 0.1}
