# Prepare Environment

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re

import warnings
warnings.filterwarnings('ignore')

drive_path = '/content/drive/MyDrive/Kuliah/Tugas Akhir/Final Project Shared Folder'
base_url = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?CycleBeginYear=2017"
dataset_names = ['Demographics', 'Dietary', 'Examination', 'Laboratory', 'Questionnaire']

Mounted at /content/drive


# Ingest Data

In [3]:
data_path = "Dataset/Data Versioning/Combined_All_V6.csv"

df_raw = pd.read_csv(os.path.join(drive_path, data_path), index_col = 0)
df_raw = df_raw.set_index('SEQN')
df_raw.head()

Unnamed: 0_level_0,Demog1_RIAGENDR,Demog1_RIDAGEYR,Demog1_RIDRETH3,Demog1_DMDMARTL,Demog1_DMDHHSIZ,Demog1_DMDFMSIZ,Demog1_DMDHHSZA,Demog1_DMDHHSZB,Demog1_DMDHHSZE,Demog1_DMDEDUC,...,Dieta1_DR1TPROT,Dieta1_DR1TCARB,Dieta1_DR1TSUGR,Dieta1_DR1TFIBE,Dieta1_DR1TTFAT,Dieta1_DR1TSFAT,Dieta1_DR1TMFAT,Dieta1_DR1TPFAT,Dieta1_DR1TCHOL,Dieta1_DR1TCALC
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93703.0,2.0,2.0,6.0,5.0,5.0,5.0,3.0,5.397605e-79,5.397605e-79,1.0,...,46.795385,183.161302,93.418166,11.005325,49.559349,17.118651,16.306485,10.77397,162.313609,893.39645
93704.0,1.0,2.0,3.0,5.0,4.0,4.0,2.0,5.397605e-79,5.397605e-79,1.0,...,51.58,160.46,76.97,5.9,43.24,11.372,14.333,12.506,144.0,700.0
93705.0,2.0,66.0,4.0,3.0,1.0,1.0,5.397605e-79,5.397605e-79,1.0,2.0,...,20.01,157.45,91.55,8.4,56.98,16.435,16.432,19.786,14.0,314.0
93706.0,1.0,18.0,6.0,5.0,5.0,5.0,5.397605e-79,5.397605e-79,1.0,4.0,...,94.19,89.82,14.73,7.1,137.39,35.169,45.805,49.873,462.0,869.0
93707.0,1.0,13.0,7.0,5.0,7.0,7.0,5.397605e-79,3.0,5.397605e-79,1.0,...,59.48,188.15,84.22,10.9,89.18,33.252,33.712,12.424,585.0,535.0


In [4]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9254 entries, 93703.0 to 102956.0
Data columns (total 91 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Demog1_RIAGENDR  9254 non-null   float64
 1   Demog1_RIDAGEYR  9254 non-null   float64
 2   Demog1_RIDRETH3  9254 non-null   float64
 3   Demog1_DMDMARTL  9254 non-null   float64
 4   Demog1_DMDHHSIZ  9254 non-null   float64
 5   Demog1_DMDFMSIZ  9254 non-null   float64
 6   Demog1_DMDHHSZA  9254 non-null   float64
 7   Demog1_DMDHHSZB  9254 non-null   float64
 8   Demog1_DMDHHSZE  9254 non-null   float64
 9   Demog1_DMDEDUC   9254 non-null   float64
 10  Demog1_INDIN2    9254 non-null   float64
 11  Labor1_LBXTC     9254 non-null   float64
 12  Labor1_LBDTCSI   9254 non-null   float64
 13  Labor2_URXVOL1   9254 non-null   float64
 14  Labor2_URDFLOW1  9254 non-null   float64
 15  Labor2_URDTIME1  9254 non-null   float64
 16  Exami1_BPXPLS    9254 non-null   float64
 17  Exami1_BP

# Clean Data

In [5]:
df = df_raw.copy()

# ----------------------------------------------------
# Fill in cvd status for person with less then 20 years old
# ----------------------------------------------------

conditions = (df['Quest16_MCQ160B'].isna()) | (df['Quest16_MCQ160C'].isna()) | (df['Quest16_MCQ160D'].isna()) | (df['Quest16_MCQ160E'].isna()) | (df['Quest16_MCQ160F'].isna())
df.loc[conditions, ['Quest16_MCQ160B', 'Quest16_MCQ160C', 'Quest16_MCQ160D', 'Quest16_MCQ160D', 'Quest16_MCQ160E', 'Quest16_MCQ160F']] = 1.0

# df['Quest16_MCQ160B'] = df['Quest16_MCQ160B'].fillna(1.0)

# ----------------------------------------------------
# Fill in Asthma and others
# ----------------------------------------------------

df[['Quest16_MCQ010', 'Quest16_MCQ220', 'Quest16_MCQ300C', 'Quest16_MCQ300A', 'Quest16_MCQ366A', 'Quest16_MCQ366B']] = df.loc[:, ['Quest16_MCQ010', 'Quest16_MCQ220', 'Quest16_MCQ300C', 'Quest16_MCQ300A', 'Quest16_MCQ366A', 'Quest16_MCQ366B']].fillna(value=9)

# ----------------------------------------------------
# Drop Unnecessary Column
# ----------------------------------------------------

df.drop(['Quest16_MCQ092'], axis=1, inplace=True)

# ----------------------------------------------------
# Label of heart failure disease (dropped other than heart failure)
# ----------------------------------------------------

df.drop(['Quest16_MCQ160C', 'Quest16_MCQ160D', 'Quest16_MCQ160E', 'Quest16_MCQ160F'], axis=1, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9254 entries, 93703.0 to 102956.0
Data columns (total 86 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Demog1_RIAGENDR  9254 non-null   float64
 1   Demog1_RIDAGEYR  9254 non-null   float64
 2   Demog1_RIDRETH3  9254 non-null   float64
 3   Demog1_DMDMARTL  9254 non-null   float64
 4   Demog1_DMDHHSIZ  9254 non-null   float64
 5   Demog1_DMDFMSIZ  9254 non-null   float64
 6   Demog1_DMDHHSZA  9254 non-null   float64
 7   Demog1_DMDHHSZB  9254 non-null   float64
 8   Demog1_DMDHHSZE  9254 non-null   float64
 9   Demog1_DMDEDUC   9254 non-null   float64
 10  Demog1_INDIN2    9254 non-null   float64
 11  Labor1_LBXTC     9254 non-null   float64
 12  Labor1_LBDTCSI   9254 non-null   float64
 13  Labor2_URXVOL1   9254 non-null   float64
 14  Labor2_URDFLOW1  9254 non-null   float64
 15  Labor2_URDTIME1  9254 non-null   float64
 16  Exami1_BPXPLS    9254 non-null   float64
 17  Exami1_BP

# Rearrange Column

In [6]:
var_mapping_path = 'Dataset/Variable Mapping V2.xlsx'

var_mapping = pd.read_excel(os.path.join(drive_path, var_mapping_path))
var_mapping['group'] = var_mapping['group'].replace({'Dieta2': 'Dieta1'})
var_mapping['model_var_name'] = var_mapping['group'] + "_" + var_mapping['variable']

var_mapping

Unnamed: 0,model_var_name,name,title,variable,desc,is_used,lifestyle,group
0,Demog1_RIAGENDR,Demographics,Demographic Variables and Sample Weights,RIAGENDR,Gender,True,False,Demog1
1,Demog1_RIDAGEYR,Demographics,Demographic Variables and Sample Weights,RIDAGEYR,Age In Years At Screening,True,False,Demog1
2,Demog1_RIDRETH3,Demographics,Demographic Variables and Sample Weights,RIDRETH3,Race/Hispanic Origin W/ Nh Asian,True,False,Demog1
3,Demog1_DMDEDUC3,Demographics,Demographic Variables and Sample Weights,DMDEDUC3,Education Level - Children/Youth 6-19,True,False,Demog1
4,Demog1_DMDEDUC2,Demographics,Demographic Variables and Sample Weights,DMDEDUC2,Education Level - Adults 20+,True,False,Demog1
...,...,...,...,...,...,...,...,...
148,Demog1_DMDEDUC,Demographic,Demographic Variables and Sample Weights,DMDEDUC,Education Level all age,True,False,Demog1
149,Demog1_INDIN2,Demographic,Demographic Variables and Sample Weights,INDIN2,Average income from household income & family ...,True,False,Demog1
150,Exami1_SysPulse,Examination,Blood Pressure,SysPulse,Systolic Pulse Combined,True,False,Exami1
151,Exami1_DiaPulse,Examination,Blood Pressure,DiaPulse,Diastolic Pulse Combined,True,False,Exami1


In [10]:
model_var = pd.DataFrame({'model_var_name': df.columns.tolist()})

model_var = pd.merge(model_var, var_mapping, left_on=model_var['model_var_name'].str.lower(), right_on=var_mapping['model_var_name'].str.lower(), how='outer', indicator=True).query('_merge!="right_only"')

# To Check if all variable is tagged
# model_var[model_var['_merge'] == "left_only"]

model_var = model_var[['model_var_name_x', 'lifestyle']].rename(columns={'model_var_name_x': 'model_var_name'}).drop_duplicates().reset_index(drop=True)

model_var

Unnamed: 0,model_var_name,lifestyle
0,Demog1_RIAGENDR,False
1,Demog1_RIDAGEYR,False
2,Demog1_RIDRETH3,False
3,Demog1_DMDMARTL,False
4,Demog1_DMDHHSIZ,False
...,...,...
81,Dieta1_DR1TSFAT,True
82,Dieta1_DR1TMFAT,True
83,Dieta1_DR1TPFAT,True
84,Dieta1_DR1TCHOL,True


In [38]:
label_cols = list([col for col in model_var['model_var_name'] if "Quest16" in col])

lifestyle_cols = model_var.loc[model_var['lifestyle'] == True, 'model_var_name'].tolist()
charac_cols = model_var.loc[model_var['lifestyle'] == False, 'model_var_name'].tolist()

lifestyle_cols = list(set(lifestyle_cols) - set(label_cols))
charac_cols = list(set(charac_cols) - set(label_cols))

print("Overall Cols:", len(model_var['model_var_name']))
print("Lifestyle Cols:", len(lifestyle_cols))
print(lifestyle_cols)
print("Characteristics Cols:", len(charac_cols))
print(charac_cols)
print("Label Cols:", len(label_cols))
print(label_cols)


Overall Cols: 86
Lifestyle Cols: 28
['Quest19_PAQ640', 'Quest6_DED125', 'Dieta1_DR1TTFAT', 'Quest19_PAD660', 'Dieta1_DR1TPROT', 'Dieta1_DR1TCARB', 'Dieta1_DR1TMFAT', 'Quest19_PAQ610', 'Quest19_PAQ635', 'Quest21_SLQ320', 'Quest21_SLD013', 'Dieta1_DR1TFIBE', 'Dieta1_DR1TCALC', 'Dieta1_DRDINT', 'Quest19_PAQ655', 'Quest21_SLQ330', 'Quest6_DED120', 'Quest19_PAD615', 'Quest21_SLD012', 'Dieta1_DR1TSFAT', 'Dieta1_DR1TCHOL', 'Quest19_PAD645', 'Dieta1_DR1TSUGR', 'Dieta1_DR1DAY', 'Dieta1_DR1TKCAL', 'Exami2_BMXBMI', 'Dieta1_DR1TPFAT', 'Quest21_SLQ300']
Characteristics Cols: 51
['Demog1_RIAGENDR', 'Quest4_CBD121', 'Quest20_PFQ061C', 'Exami1_BPXPLS', 'Quest17_DPQ020', 'Quest2_BPQ020', 'Demog1_RIDAGEYR', 'Quest10_ECQ020', 'Exami1_BPXPULS', 'Exami2_BMXWT', 'Exami1_DiaPulse', 'Demog1_DMDHHSZE', 'Quest22_SMQ020', 'Quest7_DIQ010', 'Quest3_CDQ008', 'Quest17_DPQ030', 'Exami2_BMXHT', 'Quest14_INQ012', 'Demog1_INDIN2', 'Labor1_LBDTCSI', 'Quest20_PFQ061B', 'Quest1_ALQ111', 'Quest14_IND235', 'Labor2_URDTIME1',

In [39]:
final = df.copy()

final = final[lifestyle_cols + charac_cols + label_cols]

final

Unnamed: 0_level_0,Quest19_PAQ640,Quest6_DED125,Dieta1_DR1TTFAT,Quest19_PAD660,Dieta1_DR1TPROT,Dieta1_DR1TCARB,Dieta1_DR1TMFAT,Quest19_PAQ610,Quest19_PAQ635,Quest21_SLQ320,...,Demog1_DMDMARTL,Demog1_RIDRETH3,Quest18_OCQ210,Quest16_MCQ010,Quest16_MCQ160B,Quest16_MCQ220,Quest16_MCQ300C,Quest16_MCQ300A,Quest16_MCQ366A,Quest16_MCQ366B
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93703.0,4.680830,148.854618,49.559349,75.630091,46.795385,183.161302,16.306485,4.020545,9.0,753.0,...,5.0,6.0,9.0,2.0,1.0,9.0,9.0,9.0,9.0,9.0
93704.0,4.680830,148.854618,43.240000,75.630091,51.580000,160.460000,14.333000,4.020545,9.0,753.0,...,5.0,3.0,9.0,2.0,1.0,9.0,9.0,9.0,9.0,9.0
93705.0,4.585774,148.854618,56.980000,71.704698,20.010000,157.450000,16.432000,3.744589,2.0,1380.0,...,3.0,4.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0
93706.0,5.000000,154.092949,137.390000,97.682927,94.190000,89.820000,45.805000,4.073034,1.0,30.0,...,5.0,6.0,9.0,2.0,1.0,9.0,9.0,9.0,2.0,2.0
93707.0,4.793651,148.854618,89.180000,103.243243,59.480000,188.150000,33.712000,3.600000,9.0,528.0,...,5.0,7.0,9.0,2.0,1.0,9.0,9.0,9.0,9.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102952.0,4.587912,148.854618,38.680000,65.134021,52.590000,139.920000,15.611000,3.522293,2.0,1350.0,...,1.0,6.0,9.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
102953.0,4.575342,120.000000,114.960000,61.261719,188.170000,333.410000,44.604000,3.000000,2.0,1380.0,...,4.0,1.0,9.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0
102954.0,4.522321,180.000000,66.580000,62.415385,72.700000,218.100000,23.098000,4.220472,2.0,0.0,...,5.0,4.0,9.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0
102955.0,4.877193,148.854618,44.110000,96.205674,29.620000,192.730000,11.406000,3.897059,9.0,487.0,...,5.0,4.0,9.0,2.0,1.0,9.0,9.0,9.0,9.0,9.0


# Save Data

In [41]:
# target_path = "Dataset/Data Versioning/Trained_V2.csv"

# final.to_csv(os.path.join(drive_path, target_path))