In [1]:
# Mount Google Drive so files under /content/drive are reachable from this
# Colab runtime (prints "Mounted at /content/drive" on success).
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import json

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning,
# so silent chained-assignment mistakes will not be reported.
import warnings
warnings.filterwarnings('ignore')

# Root of the shared project folder on the mounted Drive; data paths used with
# os.path.join are relative to this directory.
drive_path = '/content/drive/MyDrive/Kuliah/Tugas Akhir/Final Project Shared Folder'
# NHANES data page URL with CycleBeginYear=2017 (not used in the visible cells).
base_url = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?CycleBeginYear=2017"
# Presumably the NHANES component names matching the column prefixes
# (Demog*, Labor*, Exami*, Quest*) -- TODO confirm against later cells.
dataset_names = ['Demographics', 'Dietary', 'Examination', 'Laboratory', 'Questionnaire']

Mounted at /content/drive


In [2]:
data_path = "Dataset/Data Versioning/Combined_All_V5.csv"

# Read the combined CSV from the shared Drive folder. The file's first column
# is consumed as a throw-away index, then the respondent id (SEQN) becomes the
# row index.
df_raw = (
    pd.read_csv(os.path.join(drive_path, data_path), index_col=0)
    .set_index('SEQN')
)
df_raw.head()

Unnamed: 0_level_0,Demog1_RIAGENDR,Demog1_RIDAGEYR,Demog1_RIDRETH3,Demog1_DMDEDUC3,Demog1_DMDEDUC2,Demog1_DMDMARTL,Demog1_DMDHHSIZ,Demog1_DMDFMSIZ,Demog1_DMDHHSZA,Demog1_DMDHHSZB,...,Quest21_SLQ330,Quest21_SLD013,Quest22_SMQ020,Quest22_SMQ621,Quest22_SMQ890,Quest22_SMQ900,Quest23_SMD470,Quest24_WHD010,Quest24_WHD020,Quest24_WHD080M
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93703.0,2.0,2.0,6.0,,,,5.0,5.0,3.0,5.397605e-79,...,,,,,,,,,,
93704.0,1.0,2.0,3.0,,,,4.0,4.0,2.0,5.397605e-79,...,,,,,,,,,,
93705.0,2.0,66.0,4.0,,2.0,3.0,1.0,1.0,5.397605e-79,5.397605e-79,...,b'07:00',8.0,1.0,,2.0,2.0,,63.0,165.0,
93706.0,1.0,18.0,6.0,15.0,,,5.0,5.0,5.397605e-79,5.397605e-79,...,b'12:00',11.5,2.0,,2.0,2.0,,68.0,145.0,
93707.0,1.0,13.0,7.0,6.0,,,7.0,7.0,5.397605e-79,3.0,...,,,,1.0,,,1.0,,,


In [3]:
def window_mean_by_age(data, col_name, window):
  """Return, for every row, the mean of ``col_name`` over a window of ages.

  For each distinct age ``a`` found in ``data['Demog1_RIDAGEYR']``, the
  non-null values of ``col_name`` from all rows whose age lies in
  ``[a - window, a + window]`` (both ends inclusive) are pooled and averaged.
  Each row then receives the pooled mean that belongs to its own age.

  Args:
    data: DataFrame containing ``col_name`` and ``Demog1_RIDAGEYR``.
    col_name: Name of the numeric column to average.
    window: Half-width of the age window, in the same unit as the age column.

  Returns:
    A list with one float per row of ``data``, in row order. The entry is NaN
    when the window holds no non-null value, or when the row's age is missing
    (the previous implementation raised ``KeyError`` for a missing age).
  """
  age_col = 'Demog1_RIDAGEYR'

  def precalculate_window_mean(data, window):
    # Per-age count of non-null values and their sum; pooling these gives the
    # exact mean over the window without revisiting individual rows.
    summary = data.groupby(age_col)[col_name].agg(['count', 'sum'])
    ages = summary.index.to_numpy()
    counts = summary['count'].to_numpy()
    sums = summary['sum'].to_numpy()

    age_map = {}
    for age in ages:
      # The mask is computed once and reused for both the sum and the count.
      in_window = (ages >= age - window) & (ages <= age + window)
      n_values = counts[in_window].sum()
      # No observation in the whole window: report NaN explicitly rather than
      # relying on a 0/0 division warning.
      age_map[age] = sums[in_window].sum() / n_values if n_values else np.nan
    return age_map

  age_map = precalculate_window_mean(data, window)
  # Vectorised lookup; ages that are not keys of the map (e.g. NaN) become NaN.
  return data[age_col].map(age_map).tolist()


In [None]:
# Demographics Data
#
# Take an explicit copy: the column selection is derived from df_raw and is
# written to through .loc below, which is the pattern pandas reports as
# SettingWithCopyWarning when done on a non-copied selection.
demog = df_raw[[col for col in df_raw.columns if "Demog" in col]].copy()

# Missing education level is set to code 1. Per the original author's note the
# affected rows are aged 2 to 9 and code 1 stands for "less than 9th grade".
# NOTE(review): the df_raw preview only shows Demog1_DMDEDUC3 / Demog1_DMDEDUC2;
# confirm that a 'Demog1_DMDEDUC' column really exists, otherwise this raises KeyError.
demog.loc[demog['Demog1_DMDEDUC'].isnull(), 'Demog1_DMDEDUC'] = 1

# Everyone younger than 20 gets marital-status code 5 ("never married" per the
# original note). The value is overwritten even if one was already present.
demog.loc[demog['Demog1_RIDAGEYR'] < 20, 'Demog1_DMDMARTL'] = 5

# Missing income is set to code 99 ("Don't Know" per the original note).
# NOTE(review): the original comment also mentioned averaging household and
# family income; no averaging happens in this cell -- presumably done upstream, confirm.
demog.loc[demog['Demog1_INDIN2'].isnull(), 'Demog1_INDIN2'] = 99

demog.info()

In [None]:
# Laboratory Data
#
# Missing lab values are replaced by the mean of the same column over
# respondents within +/- 5 years of age; whatever is still missing afterwards
# becomes 0.

labor = df_raw[[col for col in df_raw.columns if "Labor" in col]]
col_to_fill = labor.columns

# Demographic columns are attached only to have the age available.
labor = labor.join(demog)

# Same shape as `labor`, but every lab column holds the age-window mean.
labor_mean = labor.copy()
for col in col_to_fill:
  labor_mean[col] = window_mean_by_age(
      labor[[col, 'Demog1_RIDAGEYR']],
      col_name=col,
      window=5,
  )

# Cell-wise fill from the window means first, then 0 as the last resort.
labor = labor.fillna(labor_mean).fillna(0)

# Remove the helper demographic columns again.
labor = labor.drop(columns=list(labor.filter(regex='Demog').columns))

labor.info()

In [None]:
# Examination Data

exam = df_raw[[col for col in df_raw.columns if "Exam" in col]]
col_to_fill = exam.columns

# Demographic columns are attached so age and gender can drive the imputation.
exam = exam.join(demog)

# First pass: fill each missing value with the mean of its column among
# respondents of exactly the same age. Values stay missing when nobody of that
# age has a measurement; the fixed-value rules below cover those cases.
for col in col_to_fill:
  exam[col] = exam[col].fillna(exam.groupby('Demog1_RIDAGEYR')[col].transform('mean'))

# ============================
# Cleaning from RB
# ============================

def _fill_missing_by_rule(frame, column, rules):
  """Fill still-missing values of ``column`` in ``frame`` (in place).

  ``rules`` is a sequence of ``(mask, value)`` pairs applied in order: rows
  where ``column`` is NaN and ``mask`` is True receive ``value``.
  """
  for mask, value in rules:
    frame.loc[frame[column].isna() & mask, column] = value


# Age and gender masks shared by all rules. They depend only on demographic
# columns, which are not modified in this cell, so they can be built once.
exam_age = exam['Demog1_RIDAGEYR']
is_male = exam['Demog1_RIAGENDR'] == 1
is_female = exam['Demog1_RIAGENDR'] == 2

age_up_to_1 = exam_age <= 1.0
age_2_to_6 = (exam_age >= 2.0) & (exam_age < 7.0)
age_7_plus = exam_age >= 7.0

# NOTE(review): the constants below are fixed fallback values taken over
# unchanged from the original cell; their source is not recorded here.

# ----------------------------
# Fill Systolic Null Values
# ----------------------------
_fill_missing_by_rule(exam, 'Exami1_SysPulse', [
    (age_up_to_1 & is_male, 85.0),    # 1 year and below
    (age_up_to_1 & is_female, 86.0),
    (age_2_to_6 & is_male, 88.0),     # >= 2 years, < 7 years
    (age_2_to_6 & is_female, 86.0),
    (age_7_plus & is_male, 97.0),     # 7 years and older
    (age_7_plus & is_female, 96.0),
])

# ----------------------------
# Fill Diastolic Null Values
# ----------------------------
_fill_missing_by_rule(exam, 'Exami1_DiaPulse', [
    (age_up_to_1 & is_male, 37.0),    # 1 year and below
    (age_up_to_1 & is_female, 46.0),
    (age_2_to_6 & is_male, 42.0),     # >= 2 years, < 7 years
    (age_2_to_6 & is_female, 40.0),
    (age_7_plus & is_male, 57.0),     # 7 years and older
    (age_7_plus & is_female, 57.0),
])

# ----------------------------
# Fill 60 Sec Pulse
# ----------------------------
_fill_missing_by_rule(exam, 'Exami1_BPXPLS', [
    (exam_age <= 7.0, 85.0),          # 7 years and below, both genders
])

# ----------------------------
# Fill Height and BMI
# ----------------------------
age_5_to_7 = (exam_age >= 5.0) & (exam_age <= 7.0)
age_3_to_4 = (exam_age >= 3.0) & (exam_age < 5.0)
age_1_to_2 = (exam_age >= 1.0) & (exam_age < 3.0)
age_below_1 = exam_age < 1.0

_fill_missing_by_rule(exam, 'Exami2_BMXHT', [
    (age_5_to_7 & is_male, 115.5),    # 5-7 yrs
    (age_5_to_7 & is_female, 115.5),
    (age_3_to_4 & is_male, 101.6),    # 3-4 yrs
    (age_3_to_4 & is_female, 101.6),
    (age_1_to_2 & is_male, 81.28),    # 1-2 yrs
    (age_1_to_2 & is_female, 78.74),
    (age_below_1 & is_male, 71.12),   # below 1 year
    (age_below_1 & is_female, 68.58),
])

# Fill BMI column: where BMI is missing, derive it as
# weight / (height / 100) ** 2 from the (possibly imputed) weight and height.
# Rows whose weight or height is still missing keep a missing BMI.
computed_bmi = exam['Exami2_BMXWT'] / ((exam['Exami2_BMXHT'] / 100) ** 2)
exam['Exami2_BMXBMI'] = exam['Exami2_BMXBMI'].fillna(computed_bmi)


# ----------------------------
# Drop Unnecessary Column
# ----------------------------

exam = exam.drop(columns=['Exami1_BPAARM', 'Exami2_BMXARML', 'Exami2_BMXARMC', 'Exami2_BMXLEG', 'Exami2_BMXWAIST', 'Exami2_BMXHIP', 'Exami1_BPXPTY'])

# Remove the helper demographic columns again.
exam = exam.drop(columns=list(exam.filter(regex='Demog').columns))

exam.info()
