# Prep Environment & Ingest Data

In [None]:
# Mount Google Drive so the project folder under /content/drive is readable.
# This only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import random

from sklearn.metrics import confusion_matrix

import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings from pandas / imbalanced-learn. Consider narrowing it.
warnings.filterwarnings('ignore')

# Root of the shared project folder on the mounted Drive; the relative paths
# used for reading and writing files below are joined onto it.
drive_path = '/content/drive/MyDrive/Kuliah/Tugas Akhir/Final Project Shared Folder'
# Folder (relative to drive_path) holding the versioned datasets.
data_path = "Dataset/Data Versioning/"
# NOTE(review): model_path, base_url and dataset_names are not referenced in
# the visible cells of this notebook.
model_path = "Model/ML Model/"
# File name of the dataset version loaded as df_raw.
data_version = "Trained_V2-3.csv"
base_url = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?CycleBeginYear=2017"
dataset_names = ['Demographics', 'Dietary', 'Examination', 'Laboratory', 'Questionnaire']

Mounted at /content/drive


In [None]:

# Load the selected dataset version; the first CSV column becomes the index.
df_raw = pd.read_csv(os.path.join(drive_path, data_path + data_version), index_col=0)

# If SEQN was stored as a regular column (i.e. the first CSV column was not
# SEQN), promote it to the index and remove a leftover 'Unnamed: 0' column if
# one exists. Explicit checks replace the previous bare `except: pass`, which
# would also have hidden unrelated errors.
if 'SEQN' in df_raw.columns:
    df_raw = df_raw.set_index('SEQN', drop=True)
    df_raw = df_raw.drop(columns="Unnamed: 0", errors='ignore')

df_raw.head()

Unnamed: 0_level_0,Dieta1_DRDINT,Dieta1_DR1TFIBE,Quest21_SLQ300,Quest19_PAD660,Quest19_PAQ635,Dieta1_DR1TCHOL,Quest19_PAQ655,Dieta1_DR1TSFAT,Dieta1_DR1TKCAL,Exami2_BMXBMI,...,Quest1_ALQ111,Quest10_ECQ020,Quest16_MCQ220,Quest4_CBD121,Quest16_MCQ366A,Labor2_URDFLOW1,Demog1_DMDEDUC,Quest9_DLQ050,Quest20_PFQ061C,Quest16_MCQ160B
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93705.0,1,0,2,2,2.0,0,2,1,0,2,...,1.0,9.0,2.0,75.0,2.0,1.204,2.0,2.0,1.0,2.0
93708.0,1,2,2,2,2.0,0,2,1,0,0,...,2.0,9.0,2.0,5.397605e-79,2.0,0.5,1.0,2.0,2.0,2.0
93709.0,0,2,2,1,2.0,2,1,2,1,3,...,9.0,9.0,2.0,40.0,2.0,0.107,4.0,1.0,9.0,2.0
93711.0,1,3,1,0,1.0,3,3,3,3,0,...,1.0,9.0,2.0,857.0,2.0,0.605,5.0,2.0,9.0,2.0
93713.0,1,1,2,1,1.0,1,2,3,2,0,...,1.0,9.0,2.0,40.0,2.0,0.706,3.0,2.0,1.0,2.0


# Oversampling

## Define Target Variable

In [None]:
# Target: Quest16_MCQ160B with codes 2 and 9 mapped to 0; all other codes are
# left unchanged, then the column is cast to int.
# NOTE(review): presumably 2 = "No" and 9 = "Don't know" in the NHANES
# codebook, so "Don't know" is treated as a negative label — confirm.
y = df_raw['Quest16_MCQ160B'].replace({2: 0, 9: 0}).astype(int)

# Features: every column except the target.
X = df_raw.drop(columns=['Quest16_MCQ160B'])

In [None]:
# Class balance of the recoded target before oversampling.
y.value_counts()

Quest16_MCQ160B
0    5368
1     201
Name: count, dtype: int64

## Oversampling Using SMOTENC

In [None]:
def get_categorical(df, var_desc=None, max_unique=20):
    """Summarise the low-cardinality columns of ``df``.

    A column is reported when it has fewer than ``max_unique`` distinct
    values (as returned by ``Series.unique()``, so NaN counts as a value).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    var_desc : mapping-like, optional
        Object with a ``.get(column_name)`` method (e.g. a ``dict``) that
        returns a human-readable description for a column. When omitted, or
        when a column is missing from it, ``desc`` is ``None``. This replaces
        the former lookup in a global ``model_var`` frame, which is not
        defined in this notebook and therefore always fell back to ``None``.
    max_unique : int, default 20
        Exclusive upper bound on the number of distinct values.

    Returns
    -------
    list of dict
        One ``{'desc', 'column', 'possible_values'}`` entry per qualifying
        column, in the column order of ``df``.
    """
    categorical = []
    for col in df.columns:
        # Compute the distinct values once and reuse them in the summary.
        values = df[col].unique().tolist()
        if len(values) >= max_unique:
            continue
        categorical.append({
            'desc': None if var_desc is None else var_desc.get(col),
            'column': col,
            'possible_values': values,
        })
    return categorical

categorical_summary = get_categorical(X)

# Boolean mask in X's column order: True where get_categorical reported the
# column. This is the mask later passed to SMOTENC(categorical_features=...).
categorical_cols = X.columns.isin(
    [entry['column'] for entry in categorical_summary]
).tolist()
print(categorical_cols)

[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, False, True, True, False, True, True, False, True, True, True, True, False, True, True, False, True, True, False, False, True, True, False, True, False, False, True, True, True, True, True, True, False, False, True, True, True, True, True, True, True, True, True, True, True, True, False, True, False, True, True, True]


In [None]:
from imblearn.over_sampling import SMOTENC

# SMOTENC handles mixed feature types: columns flagged True in
# `categorical_cols` are treated as categorical, the rest as continuous.
# The fixed random_state keeps the synthetic samples reproducible.
# NOTE(review): resampling happens on the full table; if a train/test split
# is done later on the exported file, synthetic rows can leak into the test
# set — confirm the downstream workflow.
smotenc = SMOTENC(categorical_features=categorical_cols, sampling_strategy='auto', random_state=37)

# Fit on the original data and return the resampled features and target.
X_resampled, y_resampled = smotenc.fit_resample(X, y)

# Report how many rows each class has after resampling.
print("Class distribution after oversampling:", dict(zip(*np.unique(y_resampled, return_counts=True))))


Class distribution after oversampling: {0: 5368, 1: 5368}


# Evaluate Oversampling Result

### Summary Data Before vs After Oversampling

In [None]:
# Summary statistics of the first 20 feature columns before oversampling.
X.iloc[:, :20].describe()

Unnamed: 0,Dieta1_DRDINT,Dieta1_DR1TFIBE,Quest21_SLQ300,Quest19_PAD660,Quest19_PAQ635,Dieta1_DR1TCHOL,Quest19_PAQ655,Dieta1_DR1TSFAT,Dieta1_DR1TKCAL,Exami2_BMXBMI,Dieta1_DR1TSUGR,Quest21_SLQ320,Quest21_SLQ330,Quest19_PAD615,Quest21_SLD012,Quest21_SLD013,Dieta1_DR1DAY,Quest6_DED120,Quest19_PAQ610,Quest6_DED125
count,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0
mean,0.749865,1.495062,1.114383,1.436883,1.762076,1.476567,1.461304,1.499731,1.497755,1.49596,1.499731,1.436883,1.360029,1.467948,1.475669,1.430598,1.303466,1.16951,1.477465,0.883821
std,0.433129,1.120771,0.835859,1.078465,0.425851,1.118611,1.119792,1.118215,1.118052,1.118689,1.118215,1.145649,1.059741,1.116308,1.102583,1.11863,1.083041,1.102829,1.119593,0.774484
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
max,1.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0


In [None]:
# Same 20 columns after oversampling, for side-by-side comparison.
X_resampled.iloc[:, :20].describe()

Unnamed: 0,Dieta1_DRDINT,Dieta1_DR1TFIBE,Quest21_SLQ300,Quest19_PAD660,Quest19_PAQ635,Dieta1_DR1TCHOL,Quest19_PAQ655,Dieta1_DR1TSFAT,Dieta1_DR1TKCAL,Exami2_BMXBMI,Dieta1_DR1TSUGR,Quest21_SLQ320,Quest21_SLQ330,Quest19_PAD615,Quest21_SLD012,Quest21_SLD013,Dieta1_DR1DAY,Quest6_DED120,Quest19_PAQ610,Quest6_DED125
count,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0
mean,0.788376,1.341841,1.04117,1.574143,1.87174,1.435637,1.052534,1.44402,1.27487,1.793126,1.530458,1.420268,1.190387,1.073864,1.543778,1.253446,1.148007,1.091841,0.939735,0.938897
std,0.408479,1.087655,0.798217,0.867623,0.334395,1.114645,1.053146,1.058967,1.053389,1.05249,1.041032,1.125466,1.066614,0.976216,1.186535,1.115892,0.990021,0.808401,1.057103,0.56706
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
50%,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
75%,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,1.0,2.0,1.0
max,1.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0


In [None]:
# Summary statistics of feature columns 20-39 before oversampling.
X.iloc[:, 20:40].describe()

Unnamed: 0,Dieta1_DR1TPROT,Quest19_PAQ640,Dieta1_DR1TPFAT,Dieta1_DR1TMFAT,Dieta1_DR1TCALC,Dieta1_DR1TCARB,Dieta1_DR1TTFAT,Quest19_PAD645,Exami1_BPXPLS,Demog1_RIDRETH3,Demog1_DMDHHSZA,Demog1_DMDHHSZE,Quest14_INQ020,Quest18_OCQ210,Demog1_INDIN2,Quest12_HEQ030,Quest22_SMQ900,Exami2_BMXHT,Quest3_CDQ009,Quest3_CDQ010
count,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0
mean,1.498474,1.416233,1.499192,1.498474,1.495421,1.497935,1.499551,1.480158,71.664099,3.509248,0.249596,0.7240079,1.780212,7.859939,17.408332,2.017418,1.811456,166.246497,0.039145,3.878614
std,1.118214,1.11475,1.118214,1.118374,1.118366,1.118213,1.118215,1.11852,11.133882,1.640114,0.6051535,0.8342093,1.990773,2.680441,26.055169,0.513697,0.391181,9.777816,0.193958,3.421848
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,1.0,5.397605e-79,5.397605e-79,1.0,1.0,1.0,1.0,1.0,138.3,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,3.0,5.397605e-79,5.397605e-79,1.0,9.0,6.0,2.0,2.0,159.2,0.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,70.810811,3.0,5.397605e-79,5.397605e-79,1.0,9.0,9.0,2.0,2.0,165.8,0.0,2.0
75%,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,78.0,4.0,5.397605e-79,1.0,2.0,9.0,15.0,2.0,2.0,172.9,0.0,9.0
max,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,136.0,7.0,3.0,3.0,9.0,9.0,99.0,9.0,2.0,197.7,1.0,9.0


In [None]:
# Feature columns 20-39 after oversampling.
X_resampled.iloc[:, 20:40].describe()

Unnamed: 0,Dieta1_DR1TPROT,Quest19_PAQ640,Dieta1_DR1TPFAT,Dieta1_DR1TMFAT,Dieta1_DR1TCALC,Dieta1_DR1TCARB,Dieta1_DR1TTFAT,Quest19_PAD645,Exami1_BPXPLS,Demog1_RIDRETH3,Demog1_DMDHHSZA,Demog1_DMDHHSZE,Quest14_INQ020,Quest18_OCQ210,Demog1_INDIN2,Quest12_HEQ030,Quest22_SMQ900,Exami2_BMXHT,Quest3_CDQ009,Quest3_CDQ010
count,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0
mean,1.228484,1.258849,1.35516,1.411233,1.349292,1.276732,1.362239,1.340723,70.752955,3.440294,0.1294709,1.048249,1.885153,8.408625,15.310658,2.009035,1.893536,166.787402,0.049739,2.53288
std,1.081087,0.925429,1.086513,1.067861,1.058078,1.063465,1.103336,0.931148,9.700859,1.309481,0.4533203,0.7829701,1.812121,2.01273,23.78846,0.370063,0.308445,9.092116,0.217416,2.839314
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,1.0,5.397605e-79,5.397605e-79,1.0,1.0,1.0,1.0,1.0,138.3,0.0,1.0
25%,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,64.713513,3.0,5.397605e-79,5.397605e-79,1.0,9.0,5.171529,2.0,2.0,160.841829,0.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,70.0,3.0,5.397605e-79,1.0,1.0,9.0,7.5,2.0,2.0,166.544311,0.0,1.0
75%,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,76.0,4.0,5.397605e-79,2.0,2.0,9.0,14.0,2.0,2.0,172.808903,0.0,2.0
max,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,136.0,7.0,3.0,3.0,9.0,9.0,99.0,9.0,2.0,197.7,1.0,9.0


In [None]:
# Summary statistics of the remaining feature columns (40 onward) before oversampling.
X.iloc[:, 40:].describe()

Unnamed: 0,Exami2_BMXWT,Quest3_CDQ008,Quest20_PFQ061H,Quest7_DIQ010,Quest20_PFQ061B,Labor1_LBDTCSI,Quest17_DPQ040,Demog1_RIAGENDR,Labor2_URDTIME1,Quest22_SMQ890,...,Quest12_HEQ010,Quest1_ALQ111,Quest10_ECQ020,Quest16_MCQ220,Quest4_CBD121,Quest16_MCQ366A,Labor2_URDFLOW1,Demog1_DMDEDUC,Quest9_DLQ050,Quest20_PFQ061C
count,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,...,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0
mean,82.73995,7.527743,4.921889,1.878614,6.086191,4.859067,1.83067,1.514814,147.109676,1.639073,...,2.017418,2.075417,9.0,1.896929,28265.13,1.720776,0.9147597,3.525768,1.831388,6.025139
std,22.163483,2.921444,3.900593,0.456818,3.720251,1.008385,2.914901,0.499825,96.053089,0.500814,...,0.451154,2.609262,0.0,0.335527,164245.3,0.470541,1.162899,1.240231,0.412771,3.783673
min,32.6,1.0,1.0,1.0,1.0,1.97,5.397605e-79,1.0,4.0,1.0,...,1.0,1.0,9.0,1.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0
25%,68.0,9.0,1.0,2.0,1.0,4.22,5.397605e-79,1.0,89.0,1.0,...,2.0,1.0,9.0,2.0,40.0,1.0,0.439,3.0,2.0,1.0
50%,79.6,9.0,2.0,2.0,9.0,4.781616,1.0,2.0,138.857977,2.0,...,2.0,1.0,9.0,2.0,100.0,2.0,0.764,4.0,2.0,9.0
75%,93.3,9.0,9.0,2.0,9.0,5.4,2.0,2.0,164.0,2.0,...,2.0,1.0,9.0,2.0,200.0,2.0,1.018974,4.0,2.0,9.0
max,242.6,9.0,9.0,9.0,9.0,11.53,9.0,2.0,1243.0,9.0,...,9.0,9.0,9.0,9.0,999999.0,9.0,50.5,9.0,9.0,9.0


In [None]:
# Remaining feature columns (40 onward) after oversampling.
X_resampled.iloc[:, 40:].describe()

Unnamed: 0,Exami2_BMXWT,Quest3_CDQ008,Quest20_PFQ061H,Quest7_DIQ010,Quest20_PFQ061B,Labor1_LBDTCSI,Quest17_DPQ040,Demog1_RIAGENDR,Labor2_URDTIME1,Quest22_SMQ890,...,Quest12_HEQ010,Quest1_ALQ111,Quest10_ECQ020,Quest16_MCQ220,Quest4_CBD121,Quest16_MCQ366A,Labor2_URDFLOW1,Demog1_DMDEDUC,Quest9_DLQ050,Quest20_PFQ061C
count,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,...,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0
mean,87.034262,6.091002,3.288003,1.681073,6.783625,4.626942,2.441971,1.436941,147.814433,1.541729,...,2.009035,2.262016,9.0,1.893536,24575.93,1.561662,0.9083769,3.427627,1.577776,6.531017
std,21.358663,3.57435,3.371287,0.524215,3.458993,0.976295,3.404562,0.496031,96.977034,0.50864,...,0.325034,2.856903,0.0,0.324918,152094.2,0.50661,1.578674,1.125881,0.509533,3.641244
min,32.6,1.0,1.0,1.0,1.0,1.97,5.397605e-79,1.0,4.0,1.0,...,1.0,1.0,9.0,1.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0
25%,73.3,2.0,1.0,1.0,2.0,3.91783,5.397605e-79,1.0,92.0,1.0,...,2.0,1.0,9.0,2.0,25.0,1.0,0.429,3.0,1.0,1.0
50%,83.7,9.0,1.0,2.0,9.0,4.598256,1.0,1.0,138.423154,2.0,...,2.0,1.0,9.0,2.0,80.0,2.0,0.7427424,4.0,2.0,9.0
75%,98.8,9.0,9.0,2.0,9.0,5.15,3.0,2.0,158.0,2.0,...,2.0,1.0,9.0,2.0,180.0,2.0,0.9881101,4.0,2.0,9.0
max,242.6,9.0,9.0,9.0,9.0,11.53,9.0,2.0,1243.0,9.0,...,9.0,9.0,9.0,9.0,999999.0,9.0,50.5,9.0,9.0,9.0


### Get Variables Info

In [None]:
var_mapping_path = 'Dataset/Variable Mapping V2.xlsx'

# Read the variable dictionary, fold group 'Dieta2' into 'Dieta1', and index
# it by "<group>_<variable>", the naming scheme used for the model columns.
# NOTE(review): if folding 'Dieta2' into 'Dieta1' produces duplicate
# model_var_name values, joining on this index later will duplicate rows —
# confirm the index is unique.
var_mapping = (
    pd.read_excel(os.path.join(drive_path, var_mapping_path))
    .assign(group=lambda vm: vm['group'].replace({'Dieta2': 'Dieta1'}))
    .assign(model_var_name=lambda vm: vm['group'] + "_" + vm['variable'])
    .set_index('model_var_name')
)

var_mapping

Unnamed: 0_level_0,name,title,variable,desc,is_used,lifestyle,group
model_var_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Demog1_RIAGENDR,Demographics,Demographic Variables and Sample Weights,RIAGENDR,Gender,True,False,Demog1
Demog1_RIDAGEYR,Demographics,Demographic Variables and Sample Weights,RIDAGEYR,Age In Years At Screening,True,False,Demog1
Demog1_RIDRETH3,Demographics,Demographic Variables and Sample Weights,RIDRETH3,Race/Hispanic Origin W/ Nh Asian,True,False,Demog1
Demog1_DMDEDUC3,Demographics,Demographic Variables and Sample Weights,DMDEDUC3,Education Level - Children/Youth 6-19,True,False,Demog1
Demog1_DMDEDUC2,Demographics,Demographic Variables and Sample Weights,DMDEDUC2,Education Level - Adults 20+,True,False,Demog1
...,...,...,...,...,...,...,...
Demog1_DMDEDUC,Demographic,Demographic Variables and Sample Weights,DMDEDUC,Education Level all age,True,False,Demog1
Demog1_INDIN2,Demographic,Demographic Variables and Sample Weights,INDIN2,Average income from household income & family ...,True,False,Demog1
Exami1_SysPulse,Examination,Blood Pressure,SysPulse,Systolic Pulse Combined,True,False,Exami1
Exami1_DiaPulse,Examination,Blood Pressure,DiaPulse,Diastolic Pulse Combined,True,False,Exami1


### Summarize Relative Changes

In [None]:
# Per-feature mean and std, one row per feature, before and after oversampling.
before = X.describe().loc[['mean', 'std']].T
after = X_resampled.describe().loc[['mean', 'std']].T

# Put both side by side, add the relative change of each statistic with
# respect to its original value, then attach the variable descriptions.
# A feature whose original mean/std is 0 yields NaN or inf here.
before_after = (
    before.join(after, rsuffix='_oversampled')
    .assign(
        mean_relative_change=lambda s: (s['mean_oversampled'] - s['mean']) / s['mean'],
        std_relative_change=lambda s: (s['std_oversampled'] - s['std']) / s['std'],
    )
    .join(var_mapping['desc'])
)

before_after

Unnamed: 0,mean,std,mean_oversampled,std_oversampled,mean_relative_change,std_relative_change,desc
Demog1_DMDEDUC,3.525768,1.240231,3.427627,1.125881,-0.027835,-0.092201,Education Level all age
Demog1_DMDFMSIZ,2.971988,1.667688,2.449236,1.459860,-0.175893,-0.124621,Total Number Of People In The Family
Demog1_DMDHHSIZ,3.112408,1.625256,2.558867,1.445317,-0.177850,-0.110714,Total Number Of People In The Household
Demog1_DMDHHSZA,0.249596,0.605154,0.129471,0.453320,-0.481278,-0.250900,# Of Children 5 Years Or Younger In Hh
Demog1_DMDHHSZB,0.490573,0.849670,0.264903,0.666492,-0.460013,-0.215587,# Of Children 6-17 Years Old In Hh
...,...,...,...,...,...,...,...
Quest4_CBD121,28265.128097,164245.278134,24575.927644,152094.174623,-0.130521,-0.073981,Money Spent On Eating Out
Quest6_DED120,1.169510,1.102829,1.091841,0.808401,-0.066412,-0.266976,Minutes Outdoors 9Am - 5Pm Work Day
Quest6_DED125,0.883821,0.774484,0.938897,0.567060,0.062316,-0.267822,Minutes Outdoors 9Am - 5Pm Not Work Day
Quest7_DIQ010,1.878614,0.456818,1.681073,0.524215,-0.105152,0.147537,Doctor Told You Have Diabetes


#### Mean Relative Changes

In [None]:
# Ten features with the lowest (most negative) relative change in mean.
before_after.sort_values(by=['mean_relative_change']).head(10)

Unnamed: 0,mean,std,mean_oversampled,std_oversampled,mean_relative_change,std_relative_change,desc
Demog1_DMDHHSZA,0.249596,0.605154,0.129471,0.45332,-0.481278,-0.2509,# Of Children 5 Years Or Younger In Hh
Demog1_DMDHHSZB,0.490573,0.84967,0.264903,0.666492,-0.460013,-0.215587,# Of Children 6-17 Years Old In Hh
Quest4_CBD111,16612.48333,126081.94761,9793.447034,95014.987972,-0.410477,-0.246403,Money Spent On Food At Other Stores
Quest19_PAQ610,1.477465,1.119593,0.939735,1.057103,-0.363954,-0.055815,Number Of Days Vigorous Work
Quest3_CDQ010,3.878614,3.421848,2.53288,2.839314,-0.346963,-0.17024,Shortness Of Breath On Stairs/Inclines
Quest20_PFQ061H,4.921889,3.900593,3.288003,3.371287,-0.331963,-0.135699,Difficulty Walking Between Rooms
Quest19_PAQ655,1.461304,1.119792,1.052534,1.053146,-0.27973,-0.059517,Days Vigorous Recreational Activities
Quest19_PAD615,1.467948,1.116308,1.073864,0.976216,-0.268459,-0.125496,Minutes Vigorous-Intensity Work
Quest3_CDQ008,7.527743,2.921444,6.091002,3.57435,-0.190859,0.223487,Severe Pain In Chest More Than Half Hour
Dieta1_DR1TPROT,1.498474,1.118214,1.228484,1.081087,-0.180177,-0.033201,Protein (Gm)


In [None]:
# Last ten rows of the ascending sort: the largest relative increases in mean.
before_after.sort_values(by=['mean_relative_change']).tail(10)

Unnamed: 0,mean,std,mean_oversampled,std_oversampled,mean_relative_change,std_relative_change,desc
Quest1_ALQ111,2.075417,2.609262,2.262016,2.856903,0.089909,0.094909,Ever Had A Drink Of Any Kind Of Alcohol
Quest19_PAD660,1.436883,1.078465,1.574143,0.867623,0.095526,-0.195502,Minutes Vigorous Recreational Activities
Quest20_PFQ061B,6.086191,3.720251,6.783625,3.458993,0.114593,-0.070226,Difficulty Walking For A Quarter Mile
Demog1_RIDAGEYR,51.503681,17.812855,59.848812,16.576309,0.16203,-0.069419,Age In Years At Screening
Exami2_BMXBMI,1.49596,1.118689,1.793126,1.05249,0.198646,-0.059176,Body Mass Index (Kg/M**2)
Quest3_CDQ009,0.039145,0.193958,0.049739,0.217416,0.270631,0.120943,Pain In Body (Combined)
Quest17_DPQ020,1.480697,2.988309,1.936103,3.482663,0.307562,0.165429,"Feeling Down, Depressed, Or Hopeless"
Quest17_DPQ030,1.725983,2.955484,2.282507,3.462113,0.322439,0.17142,Trouble Sleeping Or Sleeping Too Much
Quest17_DPQ040,1.83067,2.914901,2.441971,3.404562,0.333922,0.167986,Feeling Tired Or Having Little Energy
Demog1_DMDHHSZE,0.724008,0.834209,1.048249,0.78297,0.447842,-0.061423,# Of Adults 60 Years Or Older In Hh


#### Standard Deviation Relative Changes

In [None]:
# Ten features with the lowest (most negative) relative change in std.
before_after.sort_values(by=['std_relative_change']).head(10)

Unnamed: 0,mean,std,mean_oversampled,std_oversampled,mean_relative_change,std_relative_change,desc
Quest12_HEQ030,2.017418,0.513697,2.009035,0.370063,-0.004155,-0.279608,Ever Told You Have Hepatitis C?
Quest12_HEQ010,2.017418,0.451154,2.009035,0.325034,-0.004155,-0.279549,Ever Told You Have Hepatitis B?
Quest6_DED125,0.883821,0.774484,0.938897,0.56706,0.062316,-0.267822,Minutes Outdoors 9Am - 5Pm Not Work Day
Quest6_DED120,1.16951,1.102829,1.091841,0.808401,-0.066412,-0.266976,Minutes Outdoors 9Am - 5Pm Work Day
Quest11_HIQ011,1.175435,0.561837,1.091002,0.414017,-0.071831,-0.263101,Covered By Health Insurance
Demog1_DMDHHSZA,0.249596,0.605154,0.129471,0.45332,-0.481278,-0.2509,# Of Children 5 Years Or Younger In Hh
Quest18_OCQ210,7.859939,2.680441,8.408625,2.01273,0.069808,-0.249105,Usually Work 35 Or More Hours Per Week
Quest4_CBD111,16612.48333,126081.94761,9793.447034,95014.987972,-0.410477,-0.246403,Money Spent On Food At Other Stores
Quest15_KIQ026,1.914347,0.434698,1.938618,0.33912,0.012678,-0.219873,Ever Had Kidney Stones?
Demog1_DMDHHSZB,0.490573,0.84967,0.264903,0.666492,-0.460013,-0.215587,# Of Children 6-17 Years Old In Hh


In [None]:
# Last ten rows of the ascending sort: the largest relative increases in std.
# sort_values places NaN last by default, so features whose original std is 0
# (relative change undefined) also show up at the end of this listing.
before_after.sort_values(by=['std_relative_change']).tail(10)

Unnamed: 0,mean,std,mean_oversampled,std_oversampled,mean_relative_change,std_relative_change,desc
Quest7_DIQ010,1.878614,0.456818,1.681073,0.524215,-0.105152,0.147537,Doctor Told You Have Diabetes
Quest17_DPQ020,1.480697,2.988309,1.936103,3.482663,0.307562,0.165429,"Feeling Down, Depressed, Or Hopeless"
Quest17_DPQ040,1.83067,2.914901,2.441971,3.404562,0.333922,0.167986,Feeling Tired Or Having Little Energy
Quest17_DPQ030,1.725983,2.955484,2.282507,3.462113,0.322439,0.17142,Trouble Sleeping Or Sleeping Too Much
Quest3_CDQ008,7.527743,2.921444,6.091002,3.57435,-0.190859,0.223487,Severe Pain In Chest More Than Half Hour
Quest9_DLQ050,1.831388,0.412771,1.577776,0.509533,-0.138481,0.234421,Have Serious Difficulty Walking?
Labor2_URDFLOW1,0.91476,1.162899,0.908377,1.578674,-0.006978,0.357533,Urine #1 Flow Rate (Ml/Min)
Exami1_BPXPULS,1.038546,0.181753,1.103204,0.253753,0.062258,0.396144,Pulse Regular Or Irregular?
Quest10_ECQ020,9.0,0.0,9.0,0.0,0.0,,Mother Smoked When Pregnant
Quest20_PFQ020,9.0,0.0,9.0,0.0,0.0,,"Crawl, Walk, Run, Play Limitations"


## Export Data

In [None]:
# Re-attach the resampled target to the resampled features as the last column.
df = X_resampled.assign(Quest16_MCQ160B=y_resampled)
df

Unnamed: 0,Dieta1_DRDINT,Dieta1_DR1TFIBE,Quest21_SLQ300,Quest19_PAD660,Quest19_PAQ635,Dieta1_DR1TCHOL,Quest19_PAQ655,Dieta1_DR1TSFAT,Dieta1_DR1TKCAL,Exami2_BMXBMI,...,Quest1_ALQ111,Quest10_ECQ020,Quest16_MCQ220,Quest4_CBD121,Quest16_MCQ366A,Labor2_URDFLOW1,Demog1_DMDEDUC,Quest9_DLQ050,Quest20_PFQ061C,Quest16_MCQ160B
0,1,0,2,2,2.0,0,2,1,0,2,...,1.0,9.0,2.0,7.500000e+01,2.0,1.204000,2.0,2.0,1.0,0
1,1,2,2,2,2.0,0,2,1,0,0,...,2.0,9.0,2.0,5.397605e-79,2.0,0.500000,1.0,2.0,2.0,0
2,0,2,2,1,2.0,2,1,2,1,3,...,9.0,9.0,2.0,4.000000e+01,2.0,0.107000,4.0,1.0,9.0,0
3,1,3,1,0,1.0,3,3,3,3,0,...,1.0,9.0,2.0,8.570000e+02,2.0,0.605000,5.0,2.0,9.0,0
4,1,1,2,1,1.0,1,2,3,2,0,...,1.0,9.0,2.0,4.000000e+01,2.0,0.706000,3.0,2.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10731,1,2,1,2,2.0,3,1,3,3,1,...,1.0,9.0,2.0,4.691092e+01,2.0,0.837825,3.0,1.0,9.0,1
10732,1,1,2,2,2.0,3,0,3,0,3,...,1.0,9.0,2.0,3.546832e+01,1.0,0.209351,2.0,1.0,9.0,1
10733,1,0,0,2,2.0,0,0,1,2,3,...,1.0,9.0,2.0,2.991748e+01,1.0,0.963971,3.0,1.0,9.0,1
10734,1,2,1,2,2.0,2,0,2,1,2,...,9.0,9.0,2.0,4.415803e+00,1.0,1.027300,2.0,1.0,9.0,1


In [None]:
# Destination of the oversampled table, relative to drive_path.
target_path = "Dataset/Data Versioning/Trained_Oversampled-2.csv"
final = df.copy()

# NOTE(review): to_csv writes the row index by default, so the positional
# index of the resampled frame is saved as an unnamed first column. The
# original SEQN ids do not appear to be part of df — confirm that downstream
# readers expect this layout.
final.to_csv(os.path.join(drive_path, target_path))