In [64]:
import os
import csv
import struct
import chardet
import numpy as np
from typing import *
from time import time
from numpy import random as rnd
import matplotlib.pyplot as plt
import matplotlib.pyplot as pyplot
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from IPython.display import clear_output
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# List the working directory to confirm the NHIS CSV files read below are present.
!ls

Logistic Regression.ipynb NHIS_OPEN_GJ_2017_100.csv NHIS_OPEN_GJ_2018_100.csv


# Data Reading

In [170]:
import csv
import chardet
import collections
from typing import *
class ReadData():
    """Load one year's NHIS_OPEN_GJ CSV file and derive a binary label per row.

    Only the columns listed in ``required_info_index`` are kept as features.
    The label is 1 when the fasting-blood-sugar column (``FBS_COLUMN``) is at
    least ``FBS_PIVOT`` and 0 otherwise.  Rows with an empty value in any kept
    column, or an empty fasting-blood-sugar value, are dropped and tallied.

    Attributes populated by ``read()``:
        infos: the full header row.
        data_infos: header names of the kept feature columns.
        data: kept column values (as strings) of every complete row.
        labels: 0/1 label of every complete row, aligned with ``data``.
        no_data: row numbers (header is row 0) of the dropped rows.
        no_data_count: per-column count of empty values among dropped rows.
    """

    # CSV file for each supported year, resolved relative to the working directory.
    FILENAMES = {
        2017: 'NHIS_OPEN_GJ_2017_100.csv',
        2018: 'NHIS_OPEN_GJ_2018_100.csv',
    }
    # Header name of the fasting-blood-sugar column the label is derived from.
    FBS_COLUMN = '식전혈당(공복혈당)'

    def __init__(self, year=2017):
        """Remember which year to load; no file is touched until ``read()``."""
        self.year = year
        # Positions (0-based) of the CSV columns used as features.
        self.required_info_index = [2, 3, 5, 6, 7, 12, 13, 15, 16, 17, 18,
                                    19, 20, 21, 22, 23, 24, 25, 26]
        self._reset()

    def _reset(self):
        """Clear everything a previous ``read()`` collected."""
        self.infos = []
        self.data = []
        self.labels = []
        self.no_data = []
        self.data_infos = []
        self.no_data_count = collections.defaultdict(int)

    def read(self, no_npz = True, DATASIZE=None, FBS_PIVOT=120):
        """Parse the year's CSV file into a feature matrix and a label vector.

        Args:
            no_npz: accepted for backward compatibility; not used by this method.
            DATASIZE: maximum number of data rows to scan (the header is not
                counted).  ``None`` or ``0`` scans the whole file.
            FBS_PIVOT: rows whose fasting blood sugar is >= this value are
                labelled 1, all others 0.

        Returns:
            ``[X, y]`` where ``X`` is a NumPy array of the kept column values
            (still strings, exactly as read from the file) and ``y`` is a NumPy
            array of the 0/1 labels.

        Raises:
            ValueError: if ``self.year`` is not 2017 or 2018, if the header has
                no fasting-blood-sugar column, or if a fasting-blood-sugar value
                is not an integer literal.
        """
        year = self.year
        print(f'Reading year {year}')
        limit = DATASIZE if DATASIZE else float('inf')
        if year not in self.FILENAMES:
            # ValueError is still caught by callers that catch Exception.
            raise ValueError('year should be 2017 or 2018. default=2017')
        filename = self.FILENAMES[year]

        # Start from a clean slate so calling read() again on the same
        # instance re-reads the file instead of corrupting/accumulating state.
        self._reset()

        # Guess the text encoding from the raw bytes of the header line.
        with open(filename, 'rb') as raw:
            encoding = chardet.detect(raw.readline())['encoding']

        # Set membership keeps the per-cell column filter O(1).
        wanted = set(self.required_info_index)
        fbs_index = None

        # `with` guarantees the handle is closed even if a row fails to parse;
        # newline='' is what the csv module expects from its file object.
        with open(filename, 'r', encoding=encoding, newline='') as f:
            for n, row in enumerate(csv.reader(f)):
                selected = [value for index, value in enumerate(row) if index in wanted]
                if n == 0:
                    self.infos = row
                    self.data_infos = selected
                    fbs_index = row.index(self.FBS_COLUMN)
                else:
                    # A blank or truncated line counts as missing data rather
                    # than raising IndexError or producing a ragged feature row.
                    fbs = row[fbs_index] if fbs_index < len(row) else ''
                    incomplete = (
                        len(selected) < len(self.data_infos)
                        or '' in selected
                        or fbs == ''
                    )
                    if incomplete:
                        for name, value in zip(self.data_infos, selected):
                            if value == '':
                                self.no_data_count[name] += 1
                        self.no_data.append(n)
                    else:
                        self.data.append(selected)
                        self.labels.append(1 if int(fbs) >= FBS_PIVOT else 0)

                # n counts the header as 0, so this stops after `limit` data rows.
                if n >= limit:
                    break

        X_train = np.array(self.data)
        y_train = np.array(self.labels)

        print(f"Reading year {year}'s data done")
        print(f'Size of X_train = {len(X_train)}, y_train = {len(y_train)}')
        print(f'Size of data with empty value = {len(self.no_data)}\n')
        return [X_train, y_train]

In [171]:
# One loader per year; both are built before either file is parsed.
# The 2017 file supplies the training arrays and the 2018 file the test arrays.
data2017, data2018 = ReadData(year=2017), ReadData(year=2018)
X_train, y_train = data2017.read()
X_test, y_test = data2018.read()

Reading year 2017
Reading year 2017's data done
Size of X_train = 991603, y_train = 991603
Size of data with empty value = 8397

Reading year 2018
Reading year 2018's data done
Size of X_train = 195525, y_train = 195525
Size of data with empty value = 804475



In [172]:
# Per-column count of empty values among the 2017 rows that read() dropped.
dict(data2017.no_data_count)

{'요단백': 4592,
 'LDL콜레스테롤': 2912,
 '음주여부': 536,
 '허리둘레': 266,
 '흡연상태': 144,
 '총콜레스테롤': 43,
 '트리글리세라이드': 45,
 'HDL콜레스테롤': 44,
 '혈색소': 52,
 '혈청크레아티닌': 43,
 '(혈청지오티)AST': 41,
 '(혈청지오티)ALT': 42,
 '감마지티피': 42,
 '수축기혈압': 19,
 '이완기혈압': 18}

In [173]:
# Per-column count of empty values among the 2018 rows that read() dropped.
dict(data2018.no_data_count)

{'총콜레스테롤': 667244,
 '트리글리세라이드': 667252,
 'HDL콜레스테롤': 667254,
 'LDL콜레스테롤': 674122,
 '음주여부': 354943,
 '요단백': 10570,
 '수축기혈압': 5730,
 '이완기혈압': 5730,
 '혈색소': 5914,
 '혈청크레아티닌': 5906,
 '(혈청지오티)AST': 5906,
 '(혈청지오티)ALT': 5906,
 '감마지티피': 5909,
 '허리둘레': 414,
 '흡연상태': 234}