In [1]:
# 기본
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
from matplotlib import font_manager, rc
import platform
from tqdm import tqdm
import sklearn
from sklearn import linear_model
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
from sklearn.model_selection import train_test_split

# time
import datetime

# crawling
import requests
import lxml.html
import sqlite3
from pandas.io import sql
from bs4 import BeautifulSoup

# 한글 폰트 설정
if platform.system() == 'Windows':
# 윈도우인 경우
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)

#### Cancer 생성
- baseline 1기, 1기에 진단받지 않은 사람들 대상으로 2-7기에 진단받으면 Cancer 종류명 = 1, 아니면 Cancer 종류명 = 0

#### Cancer variables list
- 폐암 : LCA, LCAAG
- 위암 : GCA, GCAAG
- 간암 : HCCCA, HCCCAAG
- 대장암 : COLCA, COLCAAG
- 췌장암 : PACA, PACAAG
- 자궁암 : UTCA, UTCAAG
- 유방암 : BRCA, BRCAAG
- 갑상선암 : THYCA, THYCAAG
- 전립선암 : PROCA, PROCAAG
- 담낭 및 기타 담도암 : GALLCA, GALLCAAG

In [3]:
final = pd.read_csv('data\\MME_final.csv', encoding = 'euc-kr', low_memory = False)
final

Unnamed: 0,기수,NIHID,SEX,VISITALL,PHYSTB,PHYSIT,PHYACTL,PHYACTM,PHYACTH,AEROBFQ,...,FMCDMAG,FMHEA,FMFHEA,FMFHEAAG,FMMHEA,FMMHEAAG,FMBHEA,FMBHEAAG,FMCHEA,FMCHEAAG
0,A01,NIH1604000095,2,6,1.0,5.0,7.0,6.0,4.0,,...,,1.0,1.0,,1.0,,1.0,,,
1,A01,NIH1604000171,1,7,0.0,3.0,8.0,8.0,0.0,,...,,1.0,1.0,,1.0,,1.0,,,
2,A01,NIH1604000338,1,7,2.0,5.0,3.0,0.0,7.0,,...,,1.0,1.0,,1.0,,1.0,,,
3,A01,NIH1604000362,2,7,0.0,5.0,4.0,2.0,2.0,,...,,1.0,1.0,,1.0,,1.0,,,
4,A01,NIH1604000424,1,4,8.0,5.0,8.0,8.0,0.0,,...,,1.0,1.0,,1.0,,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70205,A07,NIH1604999643,2,7,,,,,,,...,,1.0,1.0,,1.0,,1.0,,1.0,
70206,A07,NIH1604999732,1,6,,,,,,,...,,1.0,1.0,,1.0,,1.0,,1.0,
70207,A07,NIH1604999772,1,6,,,,,,,...,,1.0,1.0,,1.0,,1.0,,1.0,
70208,A07,NIH1604999929,2,4,,,,,,,...,,1.0,1.0,,1.0,,1.0,,1.0,


In [4]:
# cancer df

lca = final[['기수', 'NIHID', 'LCA', 'LCAAG', 'AGE']].copy()
gca = final[['기수', 'NIHID', 'GCA', 'GCAAG', 'AGE']].copy()
hccca = final[['기수', 'NIHID', 'HCCCA', 'HCCCAAG', 'AGE']].copy()
colca = final[['기수', 'NIHID', 'COLCA', 'COLCAAG', 'AGE']].copy()
paca = final[['기수', 'NIHID', 'PACA', 'PACAAG', 'AGE']].copy()
utca = final[['기수', 'NIHID', 'UTCA', 'UTCAAG', 'AGE']].copy()
brca = final[['기수', 'NIHID', 'BRCA', 'BRCAAG', 'AGE']].copy()
thyca = final[['기수', 'NIHID', 'THYCA', 'THYCAAG', 'AGE']].copy()
proca = final[['기수', 'NIHID', 'PROCA', 'PROCAAG', 'AGE']].copy()
gallca = final[['기수', 'NIHID', 'GALLCA', 'GALLCAAG', 'AGE']].copy()

In [None]:
# # 자동화 위해 column 명 바꾸기 (temporarily)

# lca.rename(columns = {'LCA' : 'CANCER', 'LCAAG': 'CANCER_AGE'}, inplace = True)
# gca.rename(columns = {'GCA' : 'CANCER', 'GCAAG': 'CANCER_AGE'}, inplace = True)
# hccca.rename(columns = {'HCCCA' : 'CANCER', 'HCCCAAG': 'CANCER_AGE'}, inplace = True)
# colca.rename(columns = {'COLCA' : 'CANCER', 'COLCAAG': 'CANCER_AGE'}, inplace = True)
# paca.rename(columns = {'PACA' : 'CANCER', 'PACAAG': 'CANCER_AGE'}, inplace = True)
# utca.rename(columns = {'UTCA' : 'CANCER', 'UTCAAG': 'CANCER_AGE'}, inplace = True)
# brca.rename(columns = {'BRCA' : 'CANCER', 'BRCAAG': 'CANCER_AGE'}, inplace = True)
# thyca.rename(columns = {'THYCA' : 'CANCER', 'THYCAAG': 'CANCER_AGE'}, inplace = True)
# proca.rename(columns = {'PROCA' : 'CANCER', 'PROCAAG': 'CANCER_AGE'}, inplace = True)
# gallca.rename(columns = {'GALLCA' : 'CANCER', 'GALLCAAG': 'CANCER_AGE'}, inplace = True)

In [5]:
# cancer list

cancer_list = [lca, gca, hccca, colca, paca, utca, brca, thyca, proca, gallca]

In [6]:
# 1차때 진단받은 사람 제거

new_cancer_list = []
for cancer in cancer_list: 
    cancer_1st = cancer.loc[(cancer['기수'] == 'A01') & (cancer.iloc[:, 2] == 2)]
    unique_id_1st = cancer_1st['NIHID'].unique().tolist()

    for id in unique_id_1st: 
        cancer = cancer.loc[cancer['NIHID'] != id].reset_index(drop = True)

    new_cancer_list.append(cancer)

In [7]:
new_cancer_list

[        기수          NIHID  LCA  LCAAG   AGE
 0      A01  NIH1604000095  1.0    NaN  43.0
 1      A01  NIH1604000171  1.0    NaN  42.0
 2      A01  NIH1604000338  1.0    NaN  69.0
 3      A01  NIH1604000362  1.0    NaN  48.0
 4      A01  NIH1604000424  1.0    NaN  47.0
 ...    ...            ...  ...    ...   ...
 70156  A07  NIH1604999643  1.0    NaN  55.0
 70157  A07  NIH1604999732  1.0    NaN  75.0
 70158  A07  NIH1604999772  1.0    NaN  55.0
 70159  A07  NIH1604999929  1.0    NaN  53.0
 70160  A07  NIH1604999956  NaN    NaN   NaN
 
 [70161 rows x 5 columns],
         기수          NIHID  GCA  GCAAG   AGE
 0      A01  NIH1604000095  1.0    NaN  43.0
 1      A01  NIH1604000171  1.0    NaN  42.0
 2      A01  NIH1604000338  1.0    NaN  69.0
 3      A01  NIH1604000362  1.0    NaN  48.0
 4      A01  NIH1604000424  1.0    NaN  47.0
 ...    ...            ...  ...    ...   ...
 70037  A07  NIH1604999643  1.0    NaN  55.0
 70038  A07  NIH1604999732  1.0    NaN  75.0
 70039  A07  NIH1604999772

In [8]:
new_cancer_list[0]

Unnamed: 0,기수,NIHID,LCA,LCAAG,AGE
0,A01,NIH1604000095,1.0,,43.0
1,A01,NIH1604000171,1.0,,42.0
2,A01,NIH1604000338,1.0,,69.0
3,A01,NIH1604000362,1.0,,48.0
4,A01,NIH1604000424,1.0,,47.0
...,...,...,...,...,...
70156,A07,NIH1604999643,1.0,,55.0
70157,A07,NIH1604999732,1.0,,75.0
70158,A07,NIH1604999772,1.0,,55.0
70159,A07,NIH1604999929,1.0,,53.0


In [9]:
# ID별 2-7기 df 생성 후 Cancer 진단여부 열 추가

diagnosed_cancer_list = []

for cancer in new_cancer_list: 
    finalDf = pd.DataFrame()
    unique_id = cancer['NIHID'].unique().tolist()

    for id in unique_id: 
        count = 0
        df = pd.DataFrame(cancer.loc[cancer['NIHID'] == id].reset_index(drop = True))

        for i in range(1, len(df) - 1):    # 2기부터 7기까지 검사
            if df.iloc[i][2] == 2: 
                count += 1
                break; 
        if count > 0: 
            df['realCancer'] = 1
        else: 
            df['realCancer'] = 0

        finalDf = pd.concat([finalDf, df], axis = 0)
        finalDf.reset_index(drop = True, inplace = True)

    diagnosed_cancer_list.append(finalDf)

In [10]:
diagnosed_cancer_list[1]

Unnamed: 0,기수,NIHID,GCA,GCAAG,AGE,realCancer
0,A01,NIH1604000095,1.0,,43.0,0
1,A02,NIH1604000095,1.0,,45.0,0
2,A03,NIH1604000095,1.0,,47.0,0
3,A04,NIH1604000095,1.0,,,0
4,A05,NIH1604000095,1.0,,51.0,0
...,...,...,...,...,...,...
70037,A03,NIH1604999956,,,,0
70038,A04,NIH1604999956,,,,0
70039,A05,NIH1604999956,,,,0
70040,A06,NIH1604999956,,,,0


In [11]:
len(diagnosed_cancer_list)

10

In [133]:
# 전체 한꺼번에 안돌아감 (너무 많음)
# 하나씩 cancer에 담아서 실행할것

# cancer = diagnosed_cancer_list[0]
# cancer = diagnosed_cancer_list[1]
# cancer = diagnosed_cancer_list[2]
# cancer = diagnosed_cancer_list[3]
# cancer = diagnosed_cancer_list[4]
# cancer = diagnosed_cancer_list[5]
# cancer = diagnosed_cancer_list[6]
# cancer = diagnosed_cancer_list[7]
# cancer = diagnosed_cancer_list[8]
cancer = diagnosed_cancer_list[9]

In [134]:
cancer

Unnamed: 0,기수,NIHID,GALLCA,GALLCAAG,AGE,realCancer
0,A01,NIH1604000095,1.0,,43.0,0
1,A02,NIH1604000095,1.0,,45.0,0
2,A03,NIH1604000095,1.0,,47.0,0
3,A04,NIH1604000095,1.0,,,0
4,A05,NIH1604000095,1.0,,51.0,0
...,...,...,...,...,...,...
70198,A03,NIH1604999956,,,,0
70199,A04,NIH1604999956,,,,0
70200,A05,NIH1604999956,,,,0
70201,A06,NIH1604999956,,,,0


In [135]:
cancer['realCancer'].value_counts()

0    70182
1       21
Name: realCancer, dtype: int64

In [136]:
finalDf2 = pd.DataFrame()
unique_id = cancer['NIHID'].unique().tolist()

for id in unique_id: 
    df = pd.DataFrame(cancer.loc[cancer['NIHID'] == id].reset_index(drop = True))
    first_age = cancer['AGE'][cancer['기수'] == 'A01']
    last_exam_age = 0

    for i in range(len(df) - 1, -1, -1):   # last exam age 찾기
        if (pd.isnull(cancer['AGE'][i]) == False): 
            last_exam_age = cancer['AGE'][i]
            break; 

    for i in range(1, len(df)): 
        if (pd.isnull(df.iloc[:, 3][i]) == False): 
            diagnosed_age = df.iloc[:, 3][i]
            break; 

    if df['realCancer'][0] == 0:    # 진단 받지 않은 사람이라면
        time_l = last_exam_age - first_age  # TIME은 마지막 검진 나이 - 1차 검진 나이
    if df['realCancer'][0] == 1:    # 진단 받은 사람이라면
        time_l = diagnosed_age - first_age # TIME은 진단 나이 - 1차 검진 나이

    df['TIME'] = time_l
    finalDf2 = pd.concat([finalDf2, df], axis = 0)
    finalDf2.reset_index(drop = True, inplace = True)

In [137]:
finalDf2

Unnamed: 0,기수,NIHID,GALLCA,GALLCAAG,AGE,realCancer,TIME
0,A01,NIH1604000095,1.0,,43.0,0,12.0
1,A02,NIH1604000095,1.0,,45.0,0,
2,A03,NIH1604000095,1.0,,47.0,0,
3,A04,NIH1604000095,1.0,,,0,
4,A05,NIH1604000095,1.0,,51.0,0,
...,...,...,...,...,...,...,...
70198,A03,NIH1604999956,,,,0,
70199,A04,NIH1604999956,,,,0,
70200,A05,NIH1604999956,,,,0,
70201,A06,NIH1604999956,,,,0,


In [138]:
k = finalDf2[['NIHID', 'realCancer', 'TIME']]
k

Unnamed: 0,NIHID,realCancer,TIME
0,NIH1604000095,0,12.0
1,NIH1604000095,0,
2,NIH1604000095,0,
3,NIH1604000095,0,
4,NIH1604000095,0,
...,...,...,...
70198,NIH1604999956,0,
70199,NIH1604999956,0,
70200,NIH1604999956,0,
70201,NIH1604999956,0,


In [139]:
k.drop_duplicates(inplace = True, subset = ['NIHID'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k.drop_duplicates(inplace = True, subset = ['NIHID'])


In [140]:
k.reset_index(inplace = True, drop = True)
k

Unnamed: 0,NIHID,realCancer,TIME
0,NIH1604000095,0,12.0
1,NIH1604000171,0,12.0
2,NIH1604000338,0,12.0
3,NIH1604000362,0,12.0
4,NIH1604000424,0,12.0
...,...,...,...
10024,NIH1604999643,0,12.0
10025,NIH1604999732,0,12.0
10026,NIH1604999772,0,12.0
10027,NIH1604999929,0,12.0


In [141]:
k['realCancer'].value_counts()

0    10026
1        3
Name: realCancer, dtype: int64

In [142]:
k.rename(columns = {'realCancer' : 'GALLCA'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [143]:
k

Unnamed: 0,NIHID,GALLCA,TIME
0,NIH1604000095,0,12.0
1,NIH1604000171,0,12.0
2,NIH1604000338,0,12.0
3,NIH1604000362,0,12.0
4,NIH1604000424,0,12.0
...,...,...,...
10024,NIH1604999643,0,12.0
10025,NIH1604999732,0,12.0
10026,NIH1604999772,0,12.0
10027,NIH1604999929,0,12.0


In [144]:
k.to_csv('data\\Cancer\\GALLCA.csv', encoding = 'euc-kr', index = False)

-------------------

#### cancer count

In [1]:
# cancer list

cancer_list = ['LCA', 'GCA', 'HCCCA', 'COLCA', 'PACA', 'UTCA', 'BRCA', 'THYCA', 'PROCA', 'GALLCA']

In [7]:
for cancer in cancer_list: 
    df = pd.read_csv('data\\Cancer\\{}.csv'.format(cancer), encoding = 'euc-kr')
    print(df.iloc[:, 1].value_counts())

0    10005
1       18
Name: LCA, dtype: int64
0    9930
1      76
Name: GCA, dtype: int64
0    10012
1       17
Name: HCCCA, dtype: int64
0    9962
1      57
Name: COLCA, dtype: int64
0    10026
1        2
Name: PACA, dtype: int64
0    9987
1      18
Name: UTCA, dtype: int64
0    9965
1      48
Name: BRCA, dtype: int64
0    9989
1      41
Name: THYCA, dtype: int64
0    10021
1        9
Name: PROCA, dtype: int64
0    10026
1        3
Name: GALLCA, dtype: int64


### Create cancer_all

In [12]:
final = pd.read_csv('0. data\\MME_.csv', encoding = 'euc-kr')
final

Unnamed: 0,기수,EDATE,NIHID,AGE,SEX,HEIGHT,WEIGHT,WAIST,GLU0_ORI,R_GTP_TR,...,PHYACTM,PHYACTH,BODYFAT,MET_CAL,PA_NEW,SBP,DBP,eGFR,BMI,DRK_NEW
0,A01,200209.0,NIH1604000095,43.0,2,1.575,48.0,63.0,91.0,16.0,...,3.50,1.75,13400.0,2571.0,2,96.0,63.0,78.564168,19.349962,2
1,A01,200201.0,NIH1604000171,42.0,1,1.755,75.0,81.3,88.0,27.0,...,5.50,0.00,15700.0,2409.0,2,95.0,68.0,92.420166,24.350452,5
2,A01,200210.0,NIH1604000338,69.0,1,1.668,50.4,74.0,82.0,52.0,...,0.00,4.50,,2407.5,2,138.0,69.0,86.839489,18.115004,1
3,A01,200201.0,NIH1604000362,48.0,2,1.556,53.6,76.4,110.0,38.0,...,0.75,0.75,14700.0,886.5,2,102.0,74.0,102.784214,22.138368,1
4,A01,200205.0,NIH1604000424,47.0,1,1.745,88.0,94.5,81.0,5.0,...,5.50,0.00,19500.0,2409.0,2,131.0,84.0,112.380813,28.899599,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70205,A07,201303.0,NIH1604999643,55.0,2,1.641,60.9,74.0,88.0,,...,,,17900.0,,0,92.0,62.0,74.201919,22.615184,1
70206,A07,201407.0,NIH1604999732,75.0,1,1.632,50.8,81.3,110.0,,...,,,10500.0,,0,131.0,66.0,68.312024,19.073193,3
70207,A07,201409.0,NIH1604999772,55.0,1,1.635,66.2,88.3,85.0,,...,,,13900.0,,0,,,50.048887,24.764096,2
70208,A07,201403.0,NIH1604999929,53.0,2,1.530,63.2,85.3,95.0,,...,,,27100.0,,0,112.0,71.0,74.230720,26.998163,2


In [13]:
id = pd.DataFrame(final['NIHID'][final['기수'] == 'A01'])
id

Unnamed: 0,NIHID
0,NIH1604000095
1,NIH1604000171
2,NIH1604000338
3,NIH1604000362
4,NIH1604000424
...,...
10025,NIH1604999643
10026,NIH1604999732
10027,NIH1604999772
10028,NIH1604999929


In [14]:
# cancer list

cancer_list = ['LCA', 'GCA', 'HCCCA', 'COLCA', 'PACA', 'UTCA', 'BRCA', 'THYCA', 'PROCA', 'GALLCA']

In [15]:
for cancer in cancer_list: 
    df = pd.read_csv('0. data\\Cancer\\{}.csv'.format(cancer), encoding = 'euc-kr')
    timename = df.columns[2]
    cancername = df.columns[1]
    df.rename(columns = {timename : cancername + timename}, inplace = True)
    id = pd.merge(id, df, how = 'left', on = 'NIHID')

id

Unnamed: 0,NIHID,LCA,LCATIME,GCA,GCATIME,HCCCA,HCCCATIME,COLCA,COLCATIME,PACA,...,UTCA,UTCATIME,BRCA,BRCATIME,THYCA,THYCATIME,PROCA,PROCATIME,GALLCA,GALLCATIME
0,NIH1604000095,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
1,NIH1604000171,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
2,NIH1604000338,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
3,NIH1604000362,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
4,NIH1604000424,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10025,NIH1604999643,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
10026,NIH1604999732,0.0,12.0,0.0,12.0,1.0,23.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
10027,NIH1604999772,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
10028,NIH1604999929,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,1.0,-1.0,0,12.0,0,12.0,0.0,12.0


In [20]:
id.loc[id['HCCCA'] == 1]

Unnamed: 0,NIHID,LCA,LCATIME,GCA,GCATIME,HCCCA,HCCCATIME,COLCA,COLCATIME,PACA,...,UTCA,UTCATIME,BRCA,BRCATIME,THYCA,THYCATIME,PROCA,PROCATIME,GALLCA,GALLCATIME
281,NIH1604029139,0.0,12.0,0.0,12.0,1.0,8.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
486,NIH1604052206,0.0,12.0,0.0,12.0,1.0,8.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
667,NIH1604070976,0.0,12.0,0.0,12.0,1.0,11.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
967,NIH1604103191,0.0,12.0,0.0,12.0,1.0,28.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
1371,NIH1604144405,0.0,12.0,0.0,12.0,1.0,14.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
2136,NIH1604219303,0.0,12.0,0.0,12.0,1.0,21.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
2461,NIH1604251677,0.0,12.0,0.0,12.0,1.0,23.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
3274,NIH1604329991,0.0,12.0,0.0,12.0,1.0,10.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
4531,NIH1604458416,0.0,12.0,1.0,10.0,1.0,11.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0
5932,NIH1604592222,0.0,12.0,0.0,12.0,1.0,8.0,0.0,12.0,0.0,...,0.0,12.0,0.0,12.0,0,12.0,0,12.0,0.0,12.0


In [21]:
colname = id.columns.tolist()
colname.remove('NIHID')
for row in range(10): 
    i = 1
    for col in colname: 
        print(col, id.iloc[row, i])
        i += 1

LCA 0.0
LCATIME 12.0
GCA 0.0
GCATIME 12.0
HCCCA 0.0
HCCCATIME 12.0
COLCA 0.0
COLCATIME 12.0
PACA 0.0
PACATIME 12.0
UTCA 0.0
UTCATIME 12.0
BRCA 0.0
BRCATIME 12.0
THYCA 0
THYCATIME 12.0
PROCA 0
PROCATIME 12.0
GALLCA 0.0
GALLCATIME 12.0
LCA 0.0
LCATIME 12.0
GCA 0.0
GCATIME 12.0
HCCCA 0.0
HCCCATIME 12.0
COLCA 0.0
COLCATIME 12.0
PACA 0.0
PACATIME 12.0
UTCA 0.0
UTCATIME 12.0
BRCA 0.0
BRCATIME 12.0
THYCA 0
THYCATIME 12.0
PROCA 0
PROCATIME 12.0
GALLCA 0.0
GALLCATIME 12.0
LCA 0.0
LCATIME 12.0
GCA 0.0
GCATIME 12.0
HCCCA 0.0
HCCCATIME 12.0
COLCA 0.0
COLCATIME 12.0
PACA 0.0
PACATIME 12.0
UTCA 0.0
UTCATIME 12.0
BRCA 0.0
BRCATIME 12.0
THYCA 0
THYCATIME 12.0
PROCA 0
PROCATIME 12.0
GALLCA 0.0
GALLCATIME 12.0
LCA 0.0
LCATIME 12.0
GCA 0.0
GCATIME 12.0
HCCCA 0.0
HCCCATIME 12.0
COLCA 0.0
COLCATIME 12.0
PACA 0.0
PACATIME 12.0
UTCA 0.0
UTCATIME 12.0
BRCA 0.0
BRCATIME 12.0
THYCA 0
THYCATIME 12.0
PROCA 0
PROCATIME 12.0
GALLCA 0.0
GALLCATIME 12.0
LCA 0.0
LCATIME 12.0
GCA 0.0
GCATIME 12.0
HCCCA 0.0
HCCCATIME 12

In [24]:
id['Cancer'] = 0
id['Cancer Time'] = 0
id['Cancer name'] = np.nan

In [25]:
for row in range(len(id)): 
    count = 0
    i = 1
    for col in colname: 
        if (id[col][row] == 1) & (col in cancer_list): 
            id['Cancer'][row] = 1
            id['Cancer name'][row] = col
            id['Cancer Time'][row] = id.iloc[row, i + 1]
            count += 1
        i += 1  # col index
    if count == 0: 
        id['Cancer'][row] = 0
        id['Cancer Time'][row] = id.iloc[row, 2]
    i = 0  # 한명에 대해 다 돌면 col index 초기화

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  id['Cancer'][row] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  id['Cancer Time'][row] = id.iloc[row, 2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  id['Cancer'][row] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  id['Cancer name'][row] = col
A value is trying to be set on a copy of a slice from 

In [26]:
id

Unnamed: 0,NIHID,LCA,LCATIME,GCA,GCATIME,HCCCA,HCCCATIME,COLCA,COLCATIME,PACA,...,BRCATIME,THYCA,THYCATIME,PROCA,PROCATIME,GALLCA,GALLCATIME,Cancer,Cancer Time,Cancer name
0,NIH1604000095,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,0,12.0,
1,NIH1604000171,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,0,12.0,
2,NIH1604000338,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,0,12.0,
3,NIH1604000362,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,0,12.0,
4,NIH1604000424,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,0,12.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10025,NIH1604999643,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,0,12.0,
10026,NIH1604999732,0.0,12.0,0.0,12.0,1.0,23.0,0.0,12.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,1,23.0,HCCCA
10027,NIH1604999772,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,0,12.0,
10028,NIH1604999929,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,-1.0,0,12.0,0,12.0,0.0,12.0,1,-1.0,BRCA


In [27]:
id['Cancer Time'].value_counts()

 12.0    9753
 18.0      14
 11.0      13
 6.0       12
 14.0      12
 28.0      12
 8.0       12
 25.0      12
 23.0      11
 21.0      11
 16.0      11
 7.0       11
 10.0      11
 3.0       11
 5.0       10
 19.0      10
 9.0        8
 13.0       8
 17.0       8
 4.0        7
 24.0       7
 22.0       7
 26.0       6
 15.0       6
 20.0       6
 1.0        4
-1.0        4
 30.0       4
 29.0       4
 31.0       3
 2.0        3
 32.0       2
 27.0       2
 0.0        2
-2.0        1
-3.0        1
 33.0       1
-11.0       1
 34.0       1
 35.0       1
Name: Cancer Time, dtype: int64

In [15]:
# pd.reset_option("display.max_rows")

In [28]:
test = id.loc[id['Cancer'] == 1]
test['Cancer Time'].value_counts()

 18.0    14
 11.0    13
 6.0     12
 14.0    12
 25.0    12
 28.0    12
 8.0     12
 23.0    11
 3.0     11
 21.0    11
 7.0     11
 10.0    11
 16.0    11
 5.0     10
 19.0    10
 9.0      8
 17.0     8
 13.0     8
 24.0     7
 4.0      7
 12.0     7
 22.0     7
 26.0     6
 15.0     6
 20.0     6
-1.0      4
 29.0     4
 30.0     4
 1.0      4
 31.0     3
 2.0      3
 0.0      2
 32.0     2
 27.0     2
 34.0     1
-2.0      1
-3.0      1
-11.0     1
 33.0     1
 35.0     1
Name: Cancer Time, dtype: int64

In [29]:
id.loc[id['Cancer'] == 1]

Unnamed: 0,NIHID,LCA,LCATIME,GCA,GCATIME,HCCCA,HCCCATIME,COLCA,COLCATIME,PACA,...,BRCATIME,THYCA,THYCATIME,PROCA,PROCATIME,GALLCA,GALLCATIME,Cancer,Cancer Time,Cancer name
167,NIH1604016947,0.0,12.0,1.0,11.0,0.0,12.0,0.0,12.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,1,11.0,GCA
208,NIH1604021459,1.0,6.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,1,6.0,LCA
213,NIH1604022052,0.0,12.0,0.0,12.0,0.0,12.0,1.0,28.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,1,28.0,COLCA
222,NIH1604022722,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,12.0,1,5.0,0,12.0,0.0,12.0,1,5.0,THYCA
281,NIH1604029139,0.0,12.0,0.0,12.0,1.0,8.0,0.0,12.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,1,8.0,HCCCA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9979,NIH1604995551,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,1,6.0,UTCA
10002,NIH1604997534,0.0,12.0,0.0,12.0,0.0,12.0,1.0,20.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,1,20.0,COLCA
10024,NIH1604999640,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,5.0,0,12.0,0,12.0,0.0,12.0,1,5.0,BRCA
10026,NIH1604999732,0.0,12.0,0.0,12.0,1.0,23.0,0.0,12.0,0.0,...,12.0,0,12.0,0,12.0,0.0,12.0,1,23.0,HCCCA


In [17]:
id

Unnamed: 0,NIHID,LCA,LCATIME,GCA,GCATIME,HCCCA,HCCCATIME,COLCA,COLCATIME,PACA,...,BRCA,BRCATIME,THYCA,THYCATIME,PROCA,PROCATIME,GALLCA,GALLCATIME,Cancer,Cancer Time
0,NIH1604000095,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0,12.0,0,12.0,0.0,12.0,0,0
1,NIH1604000171,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0,12.0,0,12.0,0.0,12.0,0,0
2,NIH1604000338,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0,12.0,0,12.0,0.0,12.0,0,0
3,NIH1604000362,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0,12.0,0,12.0,0.0,12.0,0,0
4,NIH1604000424,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0,12.0,0,12.0,0.0,12.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10025,NIH1604999643,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0,12.0,0,12.0,0.0,12.0,0,0
10026,NIH1604999732,0.0,12.0,0.0,12.0,1.0,23.0,0.0,12.0,0.0,...,0.0,12.0,0,12.0,0,12.0,0.0,12.0,1,23
10027,NIH1604999772,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,0.0,12.0,0,12.0,0,12.0,0.0,12.0,0,0
10028,NIH1604999929,0.0,12.0,0.0,12.0,0.0,12.0,0.0,12.0,0.0,...,1.0,-1.0,0,12.0,0,12.0,0.0,12.0,1,-1


In [30]:
ff = id[['NIHID', 'Cancer', 'Cancer Time', 'Cancer name']]
ff

Unnamed: 0,NIHID,Cancer,Cancer Time,Cancer name
0,NIH1604000095,0,12.0,
1,NIH1604000171,0,12.0,
2,NIH1604000338,0,12.0,
3,NIH1604000362,0,12.0,
4,NIH1604000424,0,12.0,
...,...,...,...,...
10025,NIH1604999643,0,12.0,
10026,NIH1604999732,1,23.0,HCCCA
10027,NIH1604999772,0,12.0,
10028,NIH1604999929,1,-1.0,BRCA


In [31]:
ff.isna().sum()

NIHID             0
Cancer            0
Cancer Time       7
Cancer name    9753
dtype: int64

In [94]:
# ff.rename(columns = {'Cancer Time' : 'TIME'}, inplace = True)
# ff

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,NIHID,Cancer,TIME
0,NIH1604000095,0,0
1,NIH1604000171,0,0
2,NIH1604000338,0,0
3,NIH1604000362,0,0
4,NIH1604000424,0,0
...,...,...,...
10025,NIH1604999643,0,0
10026,NIH1604999732,1,23
10027,NIH1604999772,0,0
10028,NIH1604999929,1,-1


In [95]:
# tt = ff.loc[ff['TIME'] >= 0].reset_index(drop = True)
# tt

Unnamed: 0,NIHID,Cancer,TIME
0,NIH1604000095,0,0
1,NIH1604000171,0,0
2,NIH1604000338,0,0
3,NIH1604000362,0,0
4,NIH1604000424,0,0
...,...,...,...
10018,NIH1604999640,1,5
10019,NIH1604999643,0,0
10020,NIH1604999732,1,23
10021,NIH1604999772,0,0


In [32]:
ff['Cancer'].value_counts()

0    9753
1     277
Name: Cancer, dtype: int64

In [34]:
ff.to_csv('0. data\\Cancer\\Cancer_All.csv', encoding = 'euc-kr', index = False)