# Prepare Environment

In [None]:
# Mount Google Drive inside the Colab runtime; every file path used below is
# resolved relative to a folder on this mount.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re

# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas deprecation / dtype warnings that may point at real problems.
import warnings
warnings.filterwarnings('ignore')

# Root folder (on the mounted drive) that the dataset paths below are joined to.
drive_path = '/content/drive/MyDrive/Kuliah/Tugas Akhir/Final Project Shared Folder'
# NOTE(review): base_url and dataset_names are not referenced by any cell visible
# in this notebook — presumably left over from the download step; confirm.
base_url = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?CycleBeginYear=2017"
dataset_names = ['Demographics', 'Dietary', 'Examination', 'Laboratory', 'Questionnaire']

Mounted at /content/drive


# Ingest Data

In [None]:
# Location of the combined dataset, relative to the shared drive folder.
data_path = "Dataset/Data Versioning/Combined_All_V6.csv"
raw_csv_file = os.path.join(drive_path, data_path)

# The first CSV column is read as a throw-away index and is immediately
# replaced by the SEQN column, which becomes the index of the raw frame.
df_raw = pd.read_csv(raw_csv_file, index_col=0).set_index('SEQN')

# Preview the first rows.
df_raw.head()

Unnamed: 0_level_0,Demog1_RIAGENDR,Demog1_RIDAGEYR,Demog1_RIDRETH3,Demog1_DMDMARTL,Demog1_DMDHHSIZ,Demog1_DMDFMSIZ,Demog1_DMDHHSZA,Demog1_DMDHHSZB,Demog1_DMDHHSZE,Demog1_DMDEDUC,...,Dieta1_DR1TPROT,Dieta1_DR1TCARB,Dieta1_DR1TSUGR,Dieta1_DR1TFIBE,Dieta1_DR1TTFAT,Dieta1_DR1TSFAT,Dieta1_DR1TMFAT,Dieta1_DR1TPFAT,Dieta1_DR1TCHOL,Dieta1_DR1TCALC
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93703.0,2.0,2.0,6.0,5.0,5.0,5.0,3.0,5.397605e-79,5.397605e-79,1.0,...,46.795385,183.161302,93.418166,11.005325,49.559349,17.118651,16.306485,10.77397,162.313609,893.39645
93704.0,1.0,2.0,3.0,5.0,4.0,4.0,2.0,5.397605e-79,5.397605e-79,1.0,...,51.58,160.46,76.97,5.9,43.24,11.372,14.333,12.506,144.0,700.0
93705.0,2.0,66.0,4.0,3.0,1.0,1.0,5.397605e-79,5.397605e-79,1.0,2.0,...,20.01,157.45,91.55,8.4,56.98,16.435,16.432,19.786,14.0,314.0
93706.0,1.0,18.0,6.0,5.0,5.0,5.0,5.397605e-79,5.397605e-79,1.0,4.0,...,94.19,89.82,14.73,7.1,137.39,35.169,45.805,49.873,462.0,869.0
93707.0,1.0,13.0,7.0,5.0,7.0,7.0,5.397605e-79,3.0,5.397605e-79,1.0,...,59.48,188.15,84.22,10.9,89.18,33.252,33.712,12.424,585.0,535.0


In [None]:
# Print column names, dtypes and non-null counts of the raw frame before cleaning.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9254 entries, 93703.0 to 102956.0
Data columns (total 91 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Demog1_RIAGENDR  9254 non-null   float64
 1   Demog1_RIDAGEYR  9254 non-null   float64
 2   Demog1_RIDRETH3  9254 non-null   float64
 3   Demog1_DMDMARTL  9254 non-null   float64
 4   Demog1_DMDHHSIZ  9254 non-null   float64
 5   Demog1_DMDFMSIZ  9254 non-null   float64
 6   Demog1_DMDHHSZA  9254 non-null   float64
 7   Demog1_DMDHHSZB  9254 non-null   float64
 8   Demog1_DMDHHSZE  9254 non-null   float64
 9   Demog1_DMDEDUC   9254 non-null   float64
 10  Demog1_INDIN2    9254 non-null   float64
 11  Labor1_LBXTC     9254 non-null   float64
 12  Labor1_LBDTCSI   9254 non-null   float64
 13  Labor2_URXVOL1   9254 non-null   float64
 14  Labor2_URDFLOW1  9254 non-null   float64
 15  Labor2_URDTIME1  9254 non-null   float64
 16  Exami1_BPXPLS    9254 non-null   float64
 17  Exami1_BP

# Clean Data

In [None]:
# Work on a copy so the raw frame stays available for re-running this cell.
df = df_raw.copy()

# ----------------------------------------------------
# Fill in CVD status for persons less than 20 years old
# ----------------------------------------------------
# Missing answers to the MCQ160B-F questions are replaced with 1.0.
# Each column is filled independently: a row that is missing only one of the
# answers keeps its valid answers in the other columns (the previous
# "any column is NaN -> overwrite all columns" mask could clobber them).
# NOTE(review): in the public NHANES MCQ codebook the code 1 means "Yes" and
# 2 means "No". Verify that 1.0 is really the intended fill value for
# respondents who were not asked these questions in this (possibly recoded)
# dataset.
cvd_cols = [
    'Quest16_MCQ160B',
    'Quest16_MCQ160C',
    'Quest16_MCQ160D',
    'Quest16_MCQ160E',
    'Quest16_MCQ160F',
]
df[cvd_cols] = df[cvd_cols].fillna(1.0)

# ----------------------------------------------------
# Fill in Asthma and others
# ----------------------------------------------------
# Missing answers in these questionnaire columns are replaced with 9.
other_condition_cols = [
    'Quest16_MCQ010',
    'Quest16_MCQ220',
    'Quest16_MCQ300C',
    'Quest16_MCQ300A',
    'Quest16_MCQ366A',
    'Quest16_MCQ366B',
]
df[other_condition_cols] = df[other_condition_cols].fillna(value=9)

# ----------------------------------------------------
# Drop Unnecessary Column
# ----------------------------------------------------

df = df.drop(columns=['Quest16_MCQ092'])

# ----------------------------------------------------
# Label of heart failure disease (dropped other than heart failure)
# ----------------------------------------------------
# Only Quest16_MCQ160B is kept as the label; the remaining MCQ160 columns are removed.

df = df.drop(columns=['Quest16_MCQ160C', 'Quest16_MCQ160D', 'Quest16_MCQ160E', 'Quest16_MCQ160F'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9254 entries, 93703.0 to 102956.0
Data columns (total 86 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Demog1_RIAGENDR  9254 non-null   float64
 1   Demog1_RIDAGEYR  9254 non-null   float64
 2   Demog1_RIDRETH3  9254 non-null   float64
 3   Demog1_DMDMARTL  9254 non-null   float64
 4   Demog1_DMDHHSIZ  9254 non-null   float64
 5   Demog1_DMDFMSIZ  9254 non-null   float64
 6   Demog1_DMDHHSZA  9254 non-null   float64
 7   Demog1_DMDHHSZB  9254 non-null   float64
 8   Demog1_DMDHHSZE  9254 non-null   float64
 9   Demog1_DMDEDUC   9254 non-null   float64
 10  Demog1_INDIN2    9254 non-null   float64
 11  Labor1_LBXTC     9254 non-null   float64
 12  Labor1_LBDTCSI   9254 non-null   float64
 13  Labor2_URXVOL1   9254 non-null   float64
 14  Labor2_URDFLOW1  9254 non-null   float64
 15  Labor2_URDTIME1  9254 non-null   float64
 16  Exami1_BPXPLS    9254 non-null   float64
 17  Exami1_BP

# Select Feature

## Variables that can be used


1. RIDAGEYR - Age
2. Smoking
- SMQ890 - Cigarette
- SMQ900 - E-cigarette
3. Sleep Time (Average for weekend & weekdays)
- SLQ300 - Usual sleep time on weekdays or workdays
- SLQ320 - Usual sleep time on weekends
- SLD012 - Sleep hours - weekdays or workdays
- SLD013 - Sleep hours - weekends
- SLQ330 - Usual wake time on weekends
4. Pain in Chest Area
- CDQ008	Severe pain in chest more than half hour
5. Dietary
- DRDINT	Number of days of intake
- DR1DAY	Intake day of the week
- DR1TKCAL	Energy (kcal)
- DR1TPROT	Protein (gm)
- DR1TCARB	Carbohydrate (gm)
- DR1TSUGR	Total sugars (gm)
- DR1TFIBE	Dietary fiber (gm)
- DR1TTFAT	Total fat (gm)
- DR1TSFAT	Total saturated fatty acids (gm)
- DR1TMFAT	Total monounsaturated fatty acids (gm)
- DR1TPFAT	Total polyunsaturated fatty acids (gm)
- DR1TCHOL	Cholesterol (mg)
- DR1TCALC	Calcium (mg)
6. Activity (SUM)
- PAD615	Minutes vigorous-intensity work
- PAD645	Minutes walk/bicycle for transportation
- PAD660	Minutes vigorous recreational activities
7. Height & Weight
- BMXWT	Weight (kg)
- BMXHT	Standing Height (cm)
- BMXBMI	Body Mass Index (kg/m**2)
8. Systolic & Diastolic Pressure
- BPXSY1	Systolic: Blood pres (1st rdg) mm Hg
- BPXDI1	Diastolic: Blood pres (1st rdg) mm Hg
- BPXSY2	Systolic: Blood pres (2nd rdg) mm Hg
- BPXDI2	Diastolic: Blood pres (2nd rdg) mm Hg
- BPXSY3	Systolic: Blood pres (3rd rdg) mm Hg
- BPXDI3	Diastolic: Blood pres (3rd rdg) mm Hg


## Feature Selection & Engineering

In [None]:
# Each feature group is taken as its own small DataFrame (column subset first,
# then copy) so the later edits never touch `df` and the full frame is not
# duplicated once per group.

# -----------------
# 1. RIDAGEYR - Age
# -----------------

age = df[['Demog1_RIDAGEYR']].copy()

# -----------------
# 2. Smoking
# -----------------

smoking = df[['Quest22_SMQ890', 'Quest22_SMQ900']].copy()

# -----------------
# 3. Sleep Time (Average)
# -----------------

sleep = df[['Quest21_SLQ300', 'Quest21_SLQ320', 'Quest21_SLQ330', 'Quest21_SLD012', 'Quest21_SLD013']].copy()

## Averaging sleep timestamp for weekend & weekdays (result stored in SLQ300)
# NOTE(review): if these columns encode a clock time, a plain arithmetic mean is
# misleading for values on either side of midnight — confirm the encoding.
sleep['Quest21_SLQ300'] = (sleep['Quest21_SLQ300'] + sleep['Quest21_SLQ320']) / 2

## Averaging sleep duration for weekend & weekdays (result stored in SLD012)
sleep['Quest21_SLD012'] = (sleep['Quest21_SLD012'] + sleep['Quest21_SLD013']) / 2

# The weekend columns are now folded into the averages above.
sleep = sleep.drop(columns=['Quest21_SLQ320', 'Quest21_SLD013'])

# -----------------
# 4. Pain in Chest Area
# -----------------

# Kept as a one-column DataFrame, like every other feature group.
pain = df[['Quest3_CDQ008']].copy()

# -----------------
# 5. Dietary
# -----------------

food = df[['Dieta1_DR1TKCAL', 'Dieta1_DR1TPROT', 'Dieta1_DR1TCARB', 'Dieta1_DR1TSUGR', 'Dieta1_DR1TFIBE', 'Dieta1_DR1TTFAT', 'Dieta1_DR1TSFAT', 'Dieta1_DR1TMFAT', 'Dieta1_DR1TPFAT', 'Dieta1_DR1TCHOL', 'Dieta1_DR1TCALC']].copy()

# -----------------
# 6. Activity
# -----------------

activity = df[['Quest19_PAD615', 'Quest19_PAD645', 'Quest19_PAD660']].copy()

# Sum of the three minute columns; a missing value in any of them makes the sum NaN.
# NOTE(review): the variable list in this notebook describes PAD645 as
# walk/bicycle for transportation, so the total is not purely "vigorous"
# activity. The column name is kept unchanged because it is written to the
# output file.
activity['Quest19_VigorousActivity'] = activity['Quest19_PAD615'] + activity['Quest19_PAD645'] + activity['Quest19_PAD660']
activity = activity.drop(columns=['Quest19_PAD615', 'Quest19_PAD645', 'Quest19_PAD660'])

# -----------------
# 7. Height & Weight
# -----------------

height_weight = df[['Exami2_BMXWT', 'Exami2_BMXHT', 'Exami2_BMXBMI']].copy()

# -----------------
# 8. Systolic & Diastolic Pressure
# -----------------

pressure = df[['Exami1_SysPulse', 'Exami1_DiaPulse']].copy()


# -----------------
# 9. Label
# -----------------

label = df[['Quest16_MCQ160B']].copy()


# Rearrange Column

Columns are ordered as: Lifestyle features, then Characteristics, then the Label.

In [None]:

# Lifestyle
final = smoking.copy()
final = final.join(sleep)
final = final.join(food)
final = final.join(activity)

# Characteristics
final = final.join(age)
final = final.join(height_weight)
final = final.join(pain)
final = final.join(pressure)

# Label
final = final.join(label)

final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9254 entries, 93703.0 to 102956.0
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Quest22_SMQ890            9254 non-null   float64
 1   Quest22_SMQ900            9254 non-null   float64
 2   Quest21_SLQ300            9254 non-null   float64
 3   Quest21_SLQ330            9254 non-null   float64
 4   Quest21_SLD012            9254 non-null   float64
 5   Dieta1_DR1TKCAL           9254 non-null   float64
 6   Dieta1_DR1TPROT           9254 non-null   float64
 7   Dieta1_DR1TCARB           9254 non-null   float64
 8   Dieta1_DR1TSUGR           9254 non-null   float64
 9   Dieta1_DR1TFIBE           9254 non-null   float64
 10  Dieta1_DR1TTFAT           9254 non-null   float64
 11  Dieta1_DR1TSFAT           9254 non-null   float64
 12  Dieta1_DR1TMFAT           9254 non-null   float64
 13  Dieta1_DR1TPFAT           9254 non-null   float64
 14  Die

# Save Data

In [None]:
# Destination of the assembled feature table, relative to the shared drive folder.
target_path = "Dataset/Data Versioning/Trained_App.csv"
output_csv_file = os.path.join(drive_path, target_path)

# Write the frame (index included) as CSV.
final.to_csv(output_csv_file)