# Prepare Environment

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import json

import warnings
# NOTE(review): this silences *every* warning for the rest of the session, which
# can hide real problems (e.g. pandas chained-assignment or deprecation warnings).
warnings.filterwarnings('ignore')

# Root folder on the mounted Drive; the ingest cell joins its relative data path onto this.
drive_path = '/content/drive/MyDrive/Kuliah/Tugas Akhir/Final Project Shared Folder'
# CDC NHANES data-page URL for the cycle beginning in 2017 (not referenced in the cells visible here).
base_url = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?CycleBeginYear=2017"
# Names of the dataset groups (not referenced in the cells visible here).
dataset_names = ['Demographics', 'Dietary', 'Examination', 'Laboratory', 'Questionnaire']

Mounted at /content/drive


# Ingest Data

In [2]:
data_path = "Dataset/Data Versioning/Combined_All_V5.csv"

# Read the combined CSV (its first column is used as a temporary index), re-index
# the rows by SEQN, and discard Demog1_INDFMPIR -- all in one non-mutating chain.
df_raw = (
    pd.read_csv(os.path.join(drive_path, data_path), index_col = 0)
    .set_index('SEQN')
    .drop(columns = ['Demog1_INDFMPIR'])
)

df_raw.head()

Unnamed: 0_level_0,Demog1_RIAGENDR,Demog1_RIDAGEYR,Demog1_RIDRETH3,Demog1_DMDEDUC3,Demog1_DMDEDUC2,Demog1_DMDMARTL,Demog1_DMDHHSIZ,Demog1_DMDFMSIZ,Demog1_DMDHHSZA,Demog1_DMDHHSZB,...,Quest21_SLQ330,Quest21_SLD013,Quest22_SMQ020,Quest22_SMQ621,Quest22_SMQ890,Quest22_SMQ900,Quest23_SMD470,Quest24_WHD010,Quest24_WHD020,Quest24_WHD080M
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93703.0,2.0,2.0,6.0,,,,5.0,5.0,3.0,5.397605e-79,...,,,,,,,,,,
93704.0,1.0,2.0,3.0,,,,4.0,4.0,2.0,5.397605e-79,...,,,,,,,,,,
93705.0,2.0,66.0,4.0,,2.0,3.0,1.0,1.0,5.397605e-79,5.397605e-79,...,b'07:00',8.0,1.0,,2.0,2.0,,63.0,165.0,
93706.0,1.0,18.0,6.0,15.0,,,5.0,5.0,5.397605e-79,5.397605e-79,...,b'12:00',11.5,2.0,,2.0,2.0,,68.0,145.0,
93707.0,1.0,13.0,7.0,6.0,,,7.0,7.0,5.397605e-79,3.0,...,,,,1.0,,,1.0,,,


# Feature Engineering

## Demographics Data

In [3]:
# Take an explicit copy: the statements further down assign new columns to this
# frame, and doing that on a bare column slice of df_raw triggers pandas'
# SettingWithCopyWarning (currently hidden by the global warnings filter).
demog = df_raw[[col for col in df_raw.columns if "Demog" in col]].copy()

# ----------------------------
# Map education codes for respondents under 19 y.o. (DMDEDUC3) onto the 20+ y.o. education scale (DMDEDUC2)
# ----------------------------

def Demog1_DMDEDUC3(x):
  """Recode a Demog1_DMDEDUC3 value onto the coding used by Demog1_DMDEDUC2.

  Mapping applied (input codes -> returned code):
    0-8, 55, 66 -> 1
    9-12        -> 2
    13, 14      -> 3
    15          -> 4
    77          -> 7
    99          -> 9

  Any other value (including NaN) yields None.
  """
  # (input codes, output code) pairs, tested in order; the groups are disjoint.
  recode_table = (
    ((0, 1, 2, 3, 4, 5, 6, 7, 8, 55, 66), 1),
    ((9, 10, 11, 12), 2),
    ((13, 14), 3),
    ((15,), 4),
    ((77,), 7),
    ((99,), 9),
  )
  for source_codes, target_code in recode_table:
    if x in source_codes:
      return target_code
  return None

# Recode the DMDEDUC3 column, build Demog1_DMDEDUC from DMDEDUC2 with the recoded
# DMDEDUC3 as the fallback where DMDEDUC2 is missing, then drop both source columns.
demog = (
    demog
    .assign(Demog1_DMDEDUC3=lambda frame: frame['Demog1_DMDEDUC3'].apply(Demog1_DMDEDUC3))
    .assign(Demog1_DMDEDUC=lambda frame: frame['Demog1_DMDEDUC2'].combine_first(frame['Demog1_DMDEDUC3']))
    .drop(columns=['Demog1_DMDEDUC2', 'Demog1_DMDEDUC3'])
)

# ----------------------------
# Get average income from household income & family income
# ----------------------------

# Fill each income column from the other where it is missing, average the two
# into Demog1_INDIN2, and drop the source columns. Each assign step sees the
# columns produced by the steps before it.
# NOTE(review): if INDHHIN2 / INDFMIN2 hold category codes rather than amounts,
# the mean of two codes may not be a meaningful value -- confirm against the codebook.
demog = (
    demog
    .assign(Demog1_INDHHIN2=lambda frame: frame['Demog1_INDHHIN2'].combine_first(frame['Demog1_INDFMIN2']))
    .assign(Demog1_INDFMIN2=lambda frame: frame['Demog1_INDFMIN2'].combine_first(frame['Demog1_INDHHIN2']))
    .assign(Demog1_INDIN2=lambda frame: (frame['Demog1_INDHHIN2'] + frame['Demog1_INDFMIN2']) / 2)
    .drop(columns=['Demog1_INDHHIN2', 'Demog1_INDFMIN2'])
)

demog.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9254 entries, 93703.0 to 102956.0
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Demog1_RIAGENDR  9254 non-null   float64
 1   Demog1_RIDAGEYR  9254 non-null   float64
 2   Demog1_RIDRETH3  9254 non-null   float64
 3   Demog1_DMDMARTL  5569 non-null   float64
 4   Demog1_DMDHHSIZ  9254 non-null   float64
 5   Demog1_DMDFMSIZ  9254 non-null   float64
 6   Demog1_DMDHHSZA  9254 non-null   float64
 7   Demog1_DMDHHSZB  9254 non-null   float64
 8   Demog1_DMDHHSZE  9254 non-null   float64
 9   Demog1_DMDEDUC   7691 non-null   float64
 10  Demog1_INDIN2    8786 non-null   float64
dtypes: float64(11)
memory usage: 867.6 KB


## Examination Data

In [4]:
# Examination Data

# Copy the selected columns so the derived columns are written to an independent
# frame instead of a slice of df_raw (avoids pandas' SettingWithCopyWarning).
exam = df_raw[[col for col in df_raw.columns if "Exam" in col]].copy()

# ----------------------------
# Average the 1st, 2nd, and 3rd blood pressure readings
# ----------------------------

# Plain addition propagates NaN, so a row missing any one of the three readings
# gets NaN for that average. The six source columns are dropped afterwards,
# without mutating the frame in place.
exam = (
    exam
    .assign(
        Exami1_SysPulse=lambda frame: (frame['Exami1_BPXSY1'] + frame['Exami1_BPXSY2'] + frame['Exami1_BPXSY3'])/3,
        Exami1_DiaPulse=lambda frame: (frame['Exami1_BPXDI1'] + frame['Exami1_BPXDI2'] + frame['Exami1_BPXDI3'])/3,
    )
    .drop(columns=['Exami1_BPXSY1', 'Exami1_BPXSY2', 'Exami1_BPXSY3', 'Exami1_BPXDI1', 'Exami1_BPXDI2', 'Exami1_BPXDI3'])
)

exam.info()


<class 'pandas.core.frame.DataFrame'>
Index: 9254 entries, 93703.0 to 102956.0
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Exami1_BPAARM    6817 non-null   float64
 1   Exami1_BPXPLS    6742 non-null   float64
 2   Exami1_BPXPULS   8281 non-null   float64
 3   Exami1_BPXPTY    6742 non-null   float64
 4   Exami2_BMXWT     8580 non-null   float64
 5   Exami2_BMXHT     8016 non-null   float64
 6   Exami2_BMXBMI    8005 non-null   float64
 7   Exami2_BMXLEG    6703 non-null   float64
 8   Exami2_BMXARML   8177 non-null   float64
 9   Exami2_BMXARMC   8173 non-null   float64
 10  Exami2_BMXWAIST  7601 non-null   float64
 11  Exami2_BMXHIP    6039 non-null   float64
 12  Exami1_SysPulse  6077 non-null   float64
 13  Exami1_DiaPulse  6077 non-null   float64
dtypes: float64(14)
memory usage: 1.1 MB


## Questionnaire Data

In [6]:
# Take an explicit copy: the statements further down overwrite and add columns on
# this frame, and doing that on a bare column slice of df_raw triggers pandas'
# SettingWithCopyWarning (currently hidden by the global warnings filter).
quest = df_raw[[col for col in df_raw.columns if "Quest" in col]].copy()

# ------------------------
# Converting Timestamp Columns to Continuous
# ------------------------

# Clock-time columns to convert, each wrapped as {'name': <column>}.
# NOTE(review): SLQ310 is not listed here -- confirm whether that is intentional.
timestamp_cols = [
    {'name': column_name}
    for column_name in (
        'Quest21_SLQ300',  # Usual sleep time on weekdays or workdays
        'Quest21_SLQ320',  # Usual sleep time on weekends
        'Quest21_SLQ330',  # Usual wake time on weekends
    )
]

# Convert HH:MM (hour and minute) to an integer number of minutes since midnight (00:00 -> 0, 23:59 -> 1439)

def timestamp_to_int(x):
  """Convert a stringified-bytes clock time such as "b'07:00'" to minutes since midnight.

  Args:
    x: A cell value. Only strings containing the pattern b'<digits>:<digits>'
      are converted.

  Returns:
    hour * 60 + minute as an int, or None when x is not a string (e.g. NaN or
    None) or does not contain the expected pattern.
  """
  # Non-string cells (e.g. NaN for a missing answer) cannot hold a timestamp;
  # check explicitly instead of relying on a bare `except:` to swallow the TypeError.
  if not isinstance(x, str):
    return None

  match = re.search(r"b'(\d+):(\d+)'", x)
  if match is None:
    return None

  hour, minute = match.groups()
  return int(hour)*60 + int(minute)

# Replace every listed clock-time column with its timestamp_to_int conversion in one assign.
quest = quest.assign(**{
    entry['name']: quest[entry['name']].apply(timestamp_to_int)
    for entry in timestamp_cols
})

# ------------------------
# Aggregating Pain in Torso Fields
# ------------------------

# Flag rows where at least one of the CDQ009A..H columns is non-null:
# 1.0 if any is present, otherwise 0.0.
quest['Quest3_CDQ009'] = (
    quest[[
        'Quest3_CDQ009A', 'Quest3_CDQ009B', 'Quest3_CDQ009C', 'Quest3_CDQ009D',
        'Quest3_CDQ009E', 'Quest3_CDQ009F', 'Quest3_CDQ009G', 'Quest3_CDQ009H',
    ]]
    .notnull()
    .any(axis=1)
    .astype(float)
)

# ------------------------
# Remapping Family Smoker Values
# ------------------------

def family_smoker(x):
  """Return 1 when x equals 1, 2 or 3; return 0 for every other value.

  NaN, None and any other code all fall into the 0 branch.
  """
  return 1 if x in (1, 2, 3) else 0

# Recode the family-smoker column to a 0/1 flag via family_smoker, then summarise the frame.
quest = quest.assign(Quest23_SMD470=quest['Quest23_SMD470'].apply(family_smoker))

quest.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9254 entries, 93703.0 to 102956.0
Data columns (total 97 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Quest1_ALQ111    5130 non-null   float64
 1   Quest2_BPQ020    6161 non-null   float64
 2   Quest2_BPD035    2137 non-null   float64
 3   Quest2_BPQ040A   2137 non-null   float64
 4   Quest2_BPQ060    4193 non-null   float64
 5   Quest2_BPQ070    4543 non-null   float64
 6   Quest2_BPQ100D   1653 non-null   float64
 7   Quest3_CDQ009A   15 non-null     float64
 8   Quest3_CDQ009B   39 non-null     float64
 9   Quest3_CDQ009C   13 non-null     float64
 10  Quest3_CDQ009D   134 non-null    float64
 11  Quest3_CDQ009E   62 non-null     float64
 12  Quest3_CDQ009F   87 non-null     float64
 13  Quest3_CDQ009G   14 non-null     float64
 14  Quest3_CDQ009H   6 non-null      float64
 15  Quest3_CDQ008    1136 non-null   float64
 16  Quest3_CDQ010    3882 non-null   float64
 17  Quest4_CB