In [6]:
import os
import csv
import struct
import chardet
import numpy as np
import collections
from typing import *
from time import time
import seaborn as sns
from sklearn.svm import SVC
from skimage.feature import hog
from numpy import random as rnd
import matplotlib.pyplot as plt
import matplotlib.pyplot as pyplot
from sklearn.utils import shuffle
from sklearn.externals import joblib
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from IPython.display import clear_output
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV



In [2]:
!ls

Logistic Regression.ipynb [31mNHIS_OPEN_GJ_2018_100.csv[m[m [31mSVM.ipynb[m[m
[31mNHIS_OPEN_GJ_2017_100.csv[m[m README.md


# Data Reading

In [27]:
class ReadData():
    """Read one year of the NHIS_OPEN_GJ checkup CSV into features and binary labels.

    A row is kept only when every selected column and the fasting-blood-sugar
    column are non-empty. The label is 1 when the fasting blood sugar is at
    least ``FBS_PIVOT`` and 0 otherwise.

    Attributes populated by ``read()``:
        infos         -- full header row of the CSV.
        data_infos    -- header names of the selected columns, in CSV order.
        data, labels  -- accepted rows (as strings) and their 0/1 labels.
        no_data       -- row numbers (header is row 0) rejected as incomplete.
        no_data_count -- per selected column, how many rejected rows had it empty.
    """

    # CSV file expected in the working directory for each supported year.
    FILENAMES = {2017: 'NHIS_OPEN_GJ_2017_100.csv',
                 2018: 'NHIS_OPEN_GJ_2018_100.csv'}
    # Header name of the column the labels are derived from.
    FBS_COLUMN = '식전혈당(공복혈당)'
    # Column positions used when the caller does not pass required_info_index.
    # This is the selection that used to be hard-coded in __init__.
    DEFAULT_INFO_INDEX = (2, 3, 5, 6, 7, 12, 13, 15, 16, 17, 18,
                          19, 20, 21, 22, 23, 24, 25, 26)

    def __init__(self, year=2017, required_info_index=None):
        """Remember which year to load and which column positions to keep.

        Args:
            year: 2017 or 2018; validated when ``read()`` is called.
            required_info_index: iterable of 0-based column positions to use as
                features. ``None`` selects ``DEFAULT_INFO_INDEX``.
        """
        self.year = year
        self.infos = []
        self.data = []
        self.labels = []
        self.no_data = []
        self.data_infos = []
        self.required_info_index = required_info_index
        self.no_data_count = collections.defaultdict(int)

    def read(self, no_npz = True, DATASIZE=None, FBS_PIVOT=120):
        """Parse the CSV for ``self.year`` and return ``[X, y]``.

        Args:
            no_npz: unused; kept so existing calls keep working.
            DATASIZE: stop after this many data rows; a falsy value reads everything.
            FBS_PIVOT: fasting-blood-sugar threshold for the positive label.

        Returns:
            ``[X, y]`` where X is a float64 array of shape
            (accepted rows, selected columns) and y holds the 0/1 labels.

        Raises:
            Exception: if ``self.year`` is not 2017 or 2018.
            ValueError: if the header lacks the fasting-blood-sugar column or a
                selected value is not numeric.
        """
        year = self.year
        print(f'Reading year {self.year}')
        if not DATASIZE:
            DATASIZE = float('inf')
        if year not in self.FILENAMES:
            raise Exception('year should be 2017 or 2018. default=2017')
        filename = self.FILENAMES[year]

        # Start from a clean slate so that calling read() twice does not append
        # the same rows again or corrupt the stored header.
        self.infos = []
        self.data = []
        self.labels = []
        self.no_data = []
        self.data_infos = []
        self.no_data_count = collections.defaultdict(int)

        # Guess the text encoding from the header line only.
        with open(filename, 'rb') as raw:
            encoding = chardet.detect(raw.readline())['encoding']

        required = self.required_info_index
        if required is None:
            required = self.DEFAULT_INFO_INDEX
        # Set membership keeps the per-cell check cheap on large files.
        wanted = frozenset(required)

        fbs_index = None
        # newline='' is what the csv module asks for; `with` closes the file
        # even when a row fails to parse.
        with open(filename, 'r', encoding=encoding, newline='') as f:
            for n, col in enumerate(csv.reader(f)):
                required_data = [info for index, info in enumerate(col) if index in wanted]
                if n == 0:
                    self.infos = col
                    self.data_infos = required_data
                    fbs_index = col.index(self.FBS_COLUMN)
                else:
                    # A short or blank row is treated as missing data rather
                    # than raising IndexError.
                    fbs = col[fbs_index] if fbs_index < len(col) else ''
                    incomplete = (len(required_data) != len(self.data_infos)
                                  or '' in required_data or fbs == '')
                    if incomplete:
                        for name, value in zip(self.data_infos, required_data):
                            if value == '':
                                self.no_data_count[name] += 1
                        self.no_data.append(n)
                    else:
                        self.data.append(required_data)
                        # float() accepts the integer strings int() did and
                        # also tolerates decimal notation.
                        self.labels.append(1 if float(fbs) >= FBS_PIVOT else 0)

                if n >= DATASIZE:
                    break

        # The explicit reshape keeps X two-dimensional when no row was accepted.
        X = np.array(self.data).astype(np.float64)
        X = X.reshape(len(self.data), len(self.data_infos))
        y = np.array(self.labels, dtype=int)

        print(f"Reading year {year}'s data done")
        print(f'Size of X = {len(X)}, y = {len(y)}')
        print(f'X has {X.shape[1]} features')
        print(f'Size of data with empty value = {len(self.no_data)}')
        print(f'Selected Features are {self.data_infos}\n')
        return [X, y]

# Input Features

In [52]:
# Header names of the CSV, listed in column order (position == column index).
default_features = [
    '기준년도', '가입자일련번호', '성별코드', '연령대코드(5세단위)', '시도코드',
    '신장(5Cm단위)', '체중(5Kg단위)', '허리둘레', '시력(좌)', '시력(우)',
    '청력(좌)', '청력(우)', '수축기혈압', '이완기혈압', '식전혈당(공복혈당)',
    '총콜레스테롤', '트리글리세라이드', 'HDL콜레스테롤', 'LDL콜레스테롤', '혈색소',
    '요단백', '혈청크레아티닌', '(혈청지오티)AST', '(혈청지오티)ALT', '감마지티피',
    '흡연상태', '음주여부', '구강검진수검여부', '치아우식증유무', '결손치유무',
    '치아마모증유무', '제3대구치(사랑니)이상', '치석', '데이터공개일자',
]

# Columns fed to the models.
selected_features = [
    '연령대코드(5세단위)', '체중(5Kg단위)', '트리글리세라이드', '허리둘레',
    '수축기혈압', '이완기혈압', '총콜레스테롤', 'LDL콜레스테롤',
]
# Wider 16-column selection that was tried before:
# selected_features = ['성별코드', '연령대코드(5세단위)', '신장(5Cm단위)', '체중(5Kg단위)', '허리둘레',
#                      '수축기혈압', '이완기혈압', '총콜레스테롤', '트리글리세라이드', 'LDL콜레스테롤',
#                      '요단백', '혈청크레아티닌', '(혈청지오티)ALT', '감마지티피', '흡연상태', '음주여부']

# Reject unknown names first, then translate names into ascending column positions.
features_index = []
for feature in selected_features:
    if feature not in default_features:
        raise Exception(f"'{feature}'는 데이터에 없습니다.")
features_index = sorted(default_features.index(name) for name in selected_features)

# Read Data

In [53]:
# Load the 2017 file, keeping only the columns listed in features_index.
data2017 = ReadData(required_info_index=features_index, year=2017)
X, y = data2017.read()
# Alternative kept for reference: train on 2017 and test on 2018.
# data2018 = ReadData(year=2018)
# X_train, y_train = data2017.read()
# X_test, y_test = data2018.read()

Reading year 2017
Reading year 2017's data done
Size of X = 996807, y = 996807
X has 8 features
Size of data with empty value = 3193
Selected Features are ['연령대코드(5세단위)', '체중(5Kg단위)', '허리둘레', '수축기혈압', '이완기혈압', '총콜레스테롤', '트리글리세라이드', 'LDL콜레스테롤']



In [37]:
# Header names of the columns the reader selected, in CSV column order.
data2017.data_infos

['연령대코드(5세단위)', '체중(5Kg단위)', '허리둘레', '수축기혈압', '이완기혈압', '총콜레스테롤', 'LDL콜레스테롤']

In [13]:
data2017.data_infos
# Drop HDL, AST and hemoglobin

['성별코드',
 '연령대코드(5세단위)',
 '신장(5Cm단위)',
 '체중(5Kg단위)',
 '허리둘레',
 '수축기혈압',
 '이완기혈압',
 '총콜레스테롤',
 '트리글리세라이드',
 'LDL콜레스테롤',
 '요단백',
 '혈청크레아티닌',
 '(혈청지오티)ALT',
 '감마지티피',
 '흡연상태',
 '음주여부']

In [5]:
# For each selected column, the number of rejected rows in which that column was empty.
dict(data2017.no_data_count)

{'요단백': 4592,
 'LDL콜레스테롤': 2912,
 '음주여부': 536,
 '허리둘레': 266,
 '흡연상태': 144,
 '총콜레스테롤': 43,
 '트리글리세라이드': 45,
 'HDL콜레스테롤': 44,
 '혈색소': 52,
 '혈청크레아티닌': 43,
 '(혈청지오티)AST': 41,
 '(혈청지오티)ALT': 42,
 '감마지티피': 42,
 '수축기혈압': 19,
 '이완기혈압': 18}

In [6]:
# Count missing values per column in the 2018 file.
# ReadData needs an explicit column selection: with required_info_index left
# at None, read() fails on the first row. The positions below are the
# 19-column candidate set (columns 2-26 of default_features without 4, 8-11
# and the label column 14).
data2018 = ReadData(
    year=2018,
    required_info_index=[2, 3, 5, 6, 7, 12, 13, 15, 16, 17, 18,
                         19, 20, 21, 22, 23, 24, 25, 26],
)
data2018.read()
clear_output()
dict(data2018.no_data_count)

{'총콜레스테롤': 667244,
 '트리글리세라이드': 667252,
 'HDL콜레스테롤': 667254,
 'LDL콜레스테롤': 674122,
 '음주여부': 354943,
 '요단백': 10570,
 '수축기혈압': 5730,
 '이완기혈압': 5730,
 '혈색소': 5914,
 '혈청크레아티닌': 5906,
 '(혈청지오티)AST': 5906,
 '(혈청지오티)ALT': 5906,
 '감마지티피': 5909,
 '허리둘레': 414,
 '흡연상태': 234}

# Split Training, Test Set
(If required)

In [30]:
def split_set(X, y):
    """Return ``X_train, X_test, y_train, y_test`` from a seeded 80/20 stratified split.

    The split is stratified on ``y`` and uses ``random_state=42``.
    """
    split_options = dict(test_size=0.2, random_state=42, stratify=y)
    return train_test_split(X, y, **split_options)

# Reset Labels with new FBS Pivot

In [31]:
def reset_fbs(fbs:int, year, required_info_index=None):
    """Re-read one year of data and label it with a different blood-sugar threshold.

    Args:
        fbs: fasting-blood-sugar pivot passed to ``ReadData.read`` as ``FBS_PIVOT``.
        year: survey year passed to ``ReadData``.
        required_info_index: column positions to keep. When omitted, the
            notebook-level ``features_index`` is used so the reloaded data has
            the same columns as the currently selected features.

    Returns:
        ``[X, y]`` as produced by ``ReadData.read``.
    """
    if required_info_index is None:
        # ReadData cannot select columns without an index list.
        required_info_index = features_index
    read_new = ReadData(year=year, required_info_index=required_info_index)
    X, y = read_new.read(FBS_PIVOT=fbs)
    return [X,y]

# Logistic Regression

In [49]:
class DiabetesModeling():
    """Fit a logistic-regression classifier on (X, y) and print its test metrics.

    Behaviour is controlled by flags set on the instance after construction:
        CROSSVAL   -- fit ``LogisticRegressionCV`` (5-fold) instead of
                      ``LogisticRegression``.
        SPLIT_DATA -- split X/y with ``split_set``; when False the caller must
                      assign ``X_train``, ``X_test``, ``y_train`` and ``y_test``
                      before calling ``LR``.
        RESET_FBS  -- reload X/y through ``reset_fbs`` before splitting.
    """
    def __init__(self, X=None, y=None):
        self.X = X
        self.y = y
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        self.clf = None
        self.training_size = None
        self.testing_size = None
        self.CROSSVAL=False
        self.SPLIT_DATA = True
        self.RESET_FBS = False

    def _prepare_split(self):
        """Resolve the train/test arrays according to the flags and record their sizes."""
        X, y = self.X, self.y
        if self.RESET_FBS:
            FBS = 120
            year = 2017
            X, y = reset_fbs(fbs=FBS, year=year)
            # Keep the stored data in step so show_result counts the reloaded labels.
            self.X, self.y = X, y
        if self.SPLIT_DATA:
            self.X_train, self.X_test, self.y_train, self.y_test = split_set(X, y)

        splits = (self.X_train, self.X_test, self.y_train, self.y_test)
        if any(part is None for part in splits):
            raise ValueError('SPLIT_DATA is False but X_train/X_test/y_train/y_test '
                             'have not all been assigned.')
        X_train, X_test, y_train, y_test = splits
        if X_train.shape[1] != X_test.shape[1]:
            raise Exception('Training Set과 Test Set의 Feature가 다릅니다.')

        self.training_size = len(X_train)
        self.testing_size = len(X_test)
        return X_train, X_test, y_train, y_test

    def LR(self):
        """Train the classifier and store it in ``self.clf``.

        Raises:
            ValueError: if SPLIT_DATA is False and no split was supplied.
            Exception: if the train and test sets have different feature counts.
        """
        X_train, X_test, y_train, y_test = self._prepare_split()

        if self.CROSSVAL:
            # LogisticRegressionCV has no ``C`` argument; the strengths it
            # searches over are given through ``Cs``.
            self.clf = LogisticRegressionCV(Cs=10, multi_class='ovr', cv=5,
                                 penalty='l2', solver='saga', tol=0.1)
        else:
            self.clf = LogisticRegression(C=1e-2, multi_class='ovr',
                                 penalty='l2', solver='liblinear', tol=0.1)

        self.clf.fit(X_train, y_train)
        clear_output()

    def show_result(self):
        """Print split sizes, class counts, accuracy, the per-class report and the confusion matrix.

        Raises:
            RuntimeError: if called before a model has been trained.
        """
        if self.clf is None or self.X_test is None:
            raise RuntimeError('No trained model: call LR() before show_result().')
        clf = self.clf
        print('Training size : %d Testing Size : %d' %(self.training_size, self.testing_size))
        # Count each label value directly so a single-class y does not raise IndexError.
        all_labels = np.asarray(self.y)
        diabetic = int(np.sum(all_labels == 1))
        healthy = int(np.sum(all_labels == 0))
        print(f'당뇨병 판별 환자 수 : {diabetic}, 비당뇨병 판별수 : {healthy}\n')

        # Predict once and reuse the result for every metric.
        y_predict = clf.predict(self.X_test)
        print(f'Accuracy score: {accuracy_score(self.y_test, y_predict)}\n')

        # Passing labels keeps both classes in the tables even when one is absent.
        print(classification_report(self.y_test, y_predict, labels=[0, 1],
                                    target_names=['건강','당뇨']))
        cm = confusion_matrix(self.y_test, y_predict, labels=[0, 1])
        print('Confusion Matrix\n', cm)

In [54]:
# Train logistic regression on the 2017 data; LR() splits X/y itself because SPLIT_DATA defaults to True.
model2017 = DiabetesModeling(X,y)
model2017.LR()

In [55]:
# Print split sizes, class counts, accuracy, the per-class report and the confusion matrix.
model2017.show_result()

Training size : 797445 Testing Size : 199362
당뇨병 판별 환자 수 : 101692, 비당뇨병 판별수 : 895115

Accuracy score: 0.8966703785074387

              precision    recall  f1-score   support

          건강       0.90      1.00      0.95    179024
          당뇨       0.29      0.01      0.02     20338

    accuracy                           0.90    199362
   macro avg       0.59      0.50      0.48    199362
weighted avg       0.84      0.90      0.85    199362

Confusion Matrix
 [[178581    443]
 [ 20157    181]]


# Grid Search

In [14]:
def grid_search(X=None, y=None, model=LogisticRegression, default_parameters=None,
                grid_parameters=None, cross_validation:int=5):
    """Run a cross-validated grid search and return the fitted ``GridSearchCV``.

    Args:
        X, y: training data passed to ``GridSearchCV.fit``.
        model: estimator class; instantiated with ``default_parameters``.
        default_parameters: fixed keyword arguments for the estimator.
            ``None`` means no fixed arguments.
        grid_parameters: parameter grid to search. ``None`` means an empty grid.
        cross_validation: value passed to ``GridSearchCV`` as ``cv``.

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    # ``model(**None)`` raises TypeError, so fall back to empty dicts.
    fixed = default_parameters if default_parameters is not None else {}
    grid = grid_parameters if grid_parameters is not None else {}
    clf = model(**fixed)
    # n_jobs=-1: use every available core.
    estimator = GridSearchCV(clf, grid, cv=cross_validation, n_jobs=-1)
    estimator.fit(X,y)
    return estimator

In [None]:
X_train, X_test, y_train, y_test = split_set(X,y)
grid_parameters = {
    # Inverse regularization strength; 1000 stands in for "no regularization".
    'C': [1e-1, 1, 1000],
    # Candidates tried earlier and left out of this run:
    #     'penalty': ['l1', 'l2'],
    #     'C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 1000],
    #     'solver': ['saga']
}
clf = grid_search(X=X_train, y=y_train, model=LogisticRegression,
                  default_parameters=dict(solver='lbfgs'),
                  grid_parameters=grid_parameters)
best_parameters = clf.best_params_
best_estimator = clf.best_estimator_

# best_score_ is the mean cross-validated score of the best parameter set,
# not an accuracy measured on the training set.
print('Mean cross-validated accuracy (best parameters)')
print(clf.best_score_)

# Accuracy of the refitted best estimator on the held-out test split.
print('Accuracy on test set')
print(best_estimator.score(X_test, y_test))

print(f'Best Parameters : {best_parameters}')

# Support Vector Machine

In [56]:
class DiabetesModeling():
    """Fit an RBF-kernel SVM classifier on (X, y) and print its test metrics.

    This definition rebinds the name ``DiabetesModeling``; from here on the
    class offers ``SVM`` and ``show_result``.

    Behaviour is controlled by flags set on the instance after construction:
        SPLIT_DATA -- split X/y with ``split_set``; when False the caller must
                      assign ``X_train``, ``X_test``, ``y_train`` and ``y_test``
                      before calling ``SVM``.
        RESET_FBS  -- reload X/y through ``reset_fbs`` before splitting.
        CROSSVAL   -- stored but not used by ``SVM``.
    """
    def __init__(self, X=None, y=None):
        self.X = X
        self.y = y
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        self.clf = None
        self.training_size = None
        self.testing_size = None
        self.CROSSVAL=False
        self.SPLIT_DATA = True
        self.RESET_FBS = False

    def _prepare_split(self):
        """Resolve the train/test arrays according to the flags and record their sizes."""
        X, y = self.X, self.y
        if self.RESET_FBS:
            FBS = 120
            year = 2017
            X, y = reset_fbs(fbs=FBS, year=year)
            # Keep the stored data in step so show_result counts the reloaded labels.
            self.X, self.y = X, y
        if self.SPLIT_DATA:
            self.X_train, self.X_test, self.y_train, self.y_test = split_set(X, y)

        splits = (self.X_train, self.X_test, self.y_train, self.y_test)
        if any(part is None for part in splits):
            raise ValueError('SPLIT_DATA is False but X_train/X_test/y_train/y_test '
                             'have not all been assigned.')
        X_train, X_test, y_train, y_test = splits
        if X_train.shape[1] != X_test.shape[1]:
            raise Exception('Training Set과 Test Set의 Feature가 다릅니다.')

        self.training_size = len(X_train)
        self.testing_size = len(X_test)
        return X_train, X_test, y_train, y_test

    def SVM(self):
        """Train the SVC and store it in ``self.clf``.

        Raises:
            ValueError: if SPLIT_DATA is False and no split was supplied.
            Exception: if the train and test sets have different feature counts.
        """
        X_train, X_test, y_train, y_test = self._prepare_split()

        self.clf = SVC(random_state=42, decision_function_shape='ovo', kernel='rbf',
                      gamma=0.1, C=100)

        self.clf.fit(X_train, y_train)
        clear_output()

    def show_result(self):
        """Print split sizes, class counts, accuracy, the per-class report and the confusion matrix.

        Raises:
            RuntimeError: if called before a model has been trained.
        """
        if self.clf is None or self.X_test is None:
            raise RuntimeError('No trained model: call SVM() before show_result().')
        clf = self.clf
        print('Training size : %d Testing Size : %d' %(self.training_size, self.testing_size))
        # Count each label value directly so a single-class y does not raise IndexError.
        all_labels = np.asarray(self.y)
        diabetic = int(np.sum(all_labels == 1))
        healthy = int(np.sum(all_labels == 0))
        print(f'당뇨병 판별 환자 수 : {diabetic}, 비당뇨병 판별수 : {healthy}\n')

        # Predict once and reuse the result for every metric.
        y_predict = clf.predict(self.X_test)
        print(f'Accuracy score: {accuracy_score(self.y_test, y_predict)}\n')

        # Passing labels keeps both classes in the tables even when one is absent.
        print(classification_report(self.y_test, y_predict, labels=[0, 1],
                                    target_names=['건강','당뇨']))
        cm = confusion_matrix(self.y_test, y_predict, labels=[0, 1])
        print('Confusion Matrix\n', cm)

## Data resize required (reduce the number of samples before fitting the SVM)

In [None]:
# Train the SVM on the 2017 data; SVM() splits X/y itself because SPLIT_DATA defaults to True.
# NOTE(review): this fits on the whole training split — see the resize heading above before running.
model2017 = DiabetesModeling(X,y)
model2017.SVM()



In [None]:
# Print split sizes, class counts, accuracy, the per-class report and the confusion matrix.
model2017.show_result()