In [1]:
# 기본
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
from matplotlib import font_manager, rc
import platform
from tqdm import tqdm
import sklearn
from sklearn import linear_model
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
from sklearn.model_selection import train_test_split

# time
import datetime

# crawling
import requests
import lxml.html
import sqlite3
from pandas.io import sql
from bs4 import BeautifulSoup

# 한글 폰트 설정
if platform.system() == 'Windows':
# 윈도우인 경우
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)

#### Cancer 생성
- baseline 1기, 1기에 진단받지 않은 사람들 대상으로 2-7기에 진단받으면 Cancer 종류명 = 1, 아니면 Cancer 종류명 = 0

#### Cancer variables list
- 폐암 : LCA, LCAAG
- 위암 : GCA, GCAAG
- 간암 : HCCCA, HCCCAAG
- 대장암 : COLCA, COLCAAG
- 췌장암 : PACA, PACAAG
- 자궁암 : UTCA, UTCAAG
- 유방암 : BRCA, BRCAAG
- 갑상선암 : THYCA, THYCAAG
- 전립선암 : PROCA, PROCAAG
- 담낭 및 기타 담도암 : GALLCA, GALLCAAG

In [2]:
final = pd.read_csv('data\\MME_final.csv', encoding = 'euc-kr', low_memory = False)
final

Unnamed: 0,기수,NIHID,SEX,VISITALL,PHYSTB,PHYSIT,PHYACTL,PHYACTM,PHYACTH,AEROBFQ,...,FMCDMAG,FMHEA,FMFHEA,FMFHEAAG,FMMHEA,FMMHEAAG,FMBHEA,FMBHEAAG,FMCHEA,FMCHEAAG
0,A01,NIH1604000095,2,6,1.0,5.0,7.0,6.0,4.0,,...,,1.0,1.0,,1.0,,1.0,,,
1,A01,NIH1604000171,1,7,0.0,3.0,8.0,8.0,0.0,,...,,1.0,1.0,,1.0,,1.0,,,
2,A01,NIH1604000338,1,7,2.0,5.0,3.0,0.0,7.0,,...,,1.0,1.0,,1.0,,1.0,,,
3,A01,NIH1604000362,2,7,0.0,5.0,4.0,2.0,2.0,,...,,1.0,1.0,,1.0,,1.0,,,
4,A01,NIH1604000424,1,4,8.0,5.0,8.0,8.0,0.0,,...,,1.0,1.0,,1.0,,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70205,A07,NIH1604999643,2,7,,,,,,,...,,1.0,1.0,,1.0,,1.0,,1.0,
70206,A07,NIH1604999732,1,6,,,,,,,...,,1.0,1.0,,1.0,,1.0,,1.0,
70207,A07,NIH1604999772,1,6,,,,,,,...,,1.0,1.0,,1.0,,1.0,,1.0,
70208,A07,NIH1604999929,2,4,,,,,,,...,,1.0,1.0,,1.0,,1.0,,1.0,


In [17]:
# cancer df

lca = final[['기수', 'NIHID', 'LCA', 'LCAAG', 'AGE']].copy()
gca = final[['기수', 'NIHID', 'GCA', 'GCAAG', 'AGE']].copy()
hccca = final[['기수', 'NIHID', 'HCCCA', 'HCCCAAG', 'AGE']].copy()
colca = final[['기수', 'NIHID', 'COLCA', 'COLCAAG', 'AGE']].copy()
paca = final[['기수', 'NIHID', 'PACA', 'PACAAG', 'AGE']].copy()
utca = final[['기수', 'NIHID', 'UTCA', 'UTCAAG', 'AGE']].copy()
brca = final[['기수', 'NIHID', 'BRCA', 'BRCAAG', 'AGE']].copy()
thyca = final[['기수', 'NIHID', 'THYCA', 'THYCAAG', 'AGE']].copy()
proca = final[['기수', 'NIHID', 'PROCA', 'PROCAAG', 'AGE']].copy()
gallca = final[['기수', 'NIHID', 'GALLCA', 'GALLCAAG', 'AGE']].copy()

In [18]:
# 자동화 위해 column 명 바꾸기 (temporarily)

lca.rename(columns = {'LCA' : 'CANCER', 'LCAAG': 'CANCER_AGE'}, inplace = True)
gca.rename(columns = {'GCA' : 'CANCER', 'GCAAG': 'CANCER_AGE'}, inplace = True)
hccca.rename(columns = {'HCCCA' : 'CANCER', 'HCCCAAG': 'CANCER_AGE'}, inplace = True)
colca.rename(columns = {'COLCA' : 'CANCER', 'COLCAAG': 'CANCER_AGE'}, inplace = True)
paca.rename(columns = {'PACA' : 'CANCER', 'PACAAG': 'CANCER_AGE'}, inplace = True)
utca.rename(columns = {'UTCA' : 'CANCER', 'UTCAAG': 'CANCER_AGE'}, inplace = True)
brca.rename(columns = {'BRCA' : 'CANCER', 'BRCAAG': 'CANCER_AGE'}, inplace = True)
thyca.rename(columns = {'THYCA' : 'CANCER', 'THYCAAG': 'CANCER_AGE'}, inplace = True)
proca.rename(columns = {'PROCA' : 'CANCER', 'PROCAAG': 'CANCER_AGE'}, inplace = True)
gallca.rename(columns = {'GALLCA' : 'CANCER', 'GALLCAAG': 'CANCER_AGE'}, inplace = True)

In [19]:
# cancer list

cancer_list = [lca, gca, hccca, colca, paca, utca, brca, thyca, proca, gallca]

In [32]:
# 1차때 진단받은 사람 제거

for cancer in cancer_list: 
    cancer_1st = cancer.loc[(cancer['기수'] == 'A01') & (cancer['CANCER'] == 2)]
    unique_id_1st = cancer_1st['NIHID'].unique().tolist()

    for id in unique_id_1st: 
        cancer = cancer.loc[cancer['NIHID'] != id].reset_index(drop = True)

In [34]:
lca

Unnamed: 0,기수,NIHID,CANCER,CANCER_AGE,AGE
0,A01,NIH1604000095,1.0,,43.0
1,A01,NIH1604000171,1.0,,42.0
2,A01,NIH1604000338,1.0,,69.0
3,A01,NIH1604000362,1.0,,48.0
4,A01,NIH1604000424,1.0,,47.0
...,...,...,...,...,...
70205,A07,NIH1604999643,1.0,,55.0
70206,A07,NIH1604999732,1.0,,75.0
70207,A07,NIH1604999772,1.0,,55.0
70208,A07,NIH1604999929,1.0,,53.0
