# Prep Environment & Ingest Data

In [3]:
# Mount Google Drive so the project folder below is reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import random

from sklearn.metrics import confusion_matrix

import warnings
# NOTE(review): this silences every warning for the whole session, so
# deprecation and dtype warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

# Root of the shared project folder on the mounted Drive.
drive_path = '/content/drive/MyDrive/Kuliah/Tugas Akhir/Final Project Shared Folder'
# Dataset folder, joined onto drive_path when the input CSV is read.
data_path = "Dataset/Data Versioning/"
# NOTE(review): model_path, base_url and dataset_names are not referenced by
# any cell visible in this notebook section - confirm they are still needed.
model_path = "Model/ML Model/"
# File name of the dataset version loaded by this notebook.
data_version = "Trained_V2-3.csv"
base_url = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?CycleBeginYear=2017"
dataset_names = ['Demographics', 'Dietary', 'Examination', 'Laboratory', 'Questionnaire']

Mounted at /content/drive


In [4]:

# Load the versioned dataset; the first CSV column becomes the row index.
df_raw = pd.read_csv(os.path.join(drive_path, data_path + data_version), index_col=0)

# Key the rows by SEQN only when it arrived as an ordinary column. When SEQN
# is already the index (first CSV column) there is nothing to do, and no
# exception has to be swallowed to find that out.
if 'SEQN' in df_raw.columns:
  df_raw = df_raw.set_index('SEQN', drop=True)

# Remove a leftover positional-index column written by an earlier to_csv().
# errors="ignore" makes this a no-op when the column is absent, so genuine
# failures (bad path, parse errors, interrupts) are no longer hidden by a
# blanket `except: pass`.
df_raw = df_raw.drop(columns="Unnamed: 0", errors="ignore")

df_raw.head()

Unnamed: 0_level_0,Dieta1_DRDINT,Dieta1_DR1TFIBE,Quest21_SLQ300,Quest19_PAD660,Quest19_PAQ635,Dieta1_DR1TCHOL,Quest19_PAQ655,Dieta1_DR1TSFAT,Dieta1_DR1TKCAL,Exami2_BMXBMI,...,Quest1_ALQ111,Quest10_ECQ020,Quest16_MCQ220,Quest4_CBD121,Quest16_MCQ366A,Labor2_URDFLOW1,Demog1_DMDEDUC,Quest9_DLQ050,Quest20_PFQ061C,Quest16_MCQ160B
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93705.0,1,0,2,2,2.0,0,2,1,0,2,...,1.0,9.0,2.0,75.0,2.0,1.204,2.0,2.0,1.0,2.0
93708.0,1,2,2,2,2.0,0,2,1,0,0,...,2.0,9.0,2.0,5.397605e-79,2.0,0.5,1.0,2.0,2.0,2.0
93709.0,0,2,2,1,2.0,2,1,2,1,3,...,9.0,9.0,2.0,40.0,2.0,0.107,4.0,1.0,9.0,2.0
93711.0,1,3,1,0,1.0,3,3,3,3,0,...,1.0,9.0,2.0,857.0,2.0,0.605,5.0,2.0,9.0,2.0
93713.0,1,1,2,1,1.0,1,2,3,2,0,...,1.0,9.0,2.0,40.0,2.0,0.706,3.0,2.0,1.0,2.0


# Oversampling

## Define Target Variable

In [5]:
# Feature matrix: every column except the outcome. drop() returns a new
# frame, so df_raw itself is left unchanged.
X = df_raw.drop(columns=['Quest16_MCQ160B'])

# Binary outcome: codes 2 and 9 are both folded into 0, then the column is
# cast to int (presumably 2 = "No" and 9 = "Don't know" in the NHANES
# codebook - TODO confirm). Any other code is passed through unchanged.
y = (
    df_raw['Quest16_MCQ160B']
    .replace({2: 0, 9: 0})
    .astype(int)
)

In [6]:
# Class balance of the recoded target before any oversampling.
y.value_counts()

Quest16_MCQ160B
0    5368
1     201
Name: count, dtype: int64

## Oversampling Using SMOTE

In [7]:
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic minority rows are reproducible between runs.
smote = SMOTE(random_state=37)

# Balance the two classes by synthesising extra minority-class rows.
# NOTE(review): plain SMOTE interpolates between neighbouring rows, so
# integer-coded columns receive fractional values in the synthetic rows;
# if those codes are categorical, SMOTENC may be the better fit - confirm.
X_resampled, y_resampled = smote.fit_resample(X, y)

# Report how many rows each class has after resampling.
print(
    "Class distribution after oversampling:",
    dict(zip(*np.unique(y_resampled, return_counts=True))),
)


Class distribution after oversampling: {0: 5368, 1: 5368}


# Evaluate Oversampling Result

### Summary Data Before vs After Oversampling

In [8]:
# Summary statistics for the first 20 feature columns before oversampling.
X.iloc[:, :20].describe()

Unnamed: 0,Dieta1_DRDINT,Dieta1_DR1TFIBE,Quest21_SLQ300,Quest19_PAD660,Quest19_PAQ635,Dieta1_DR1TCHOL,Quest19_PAQ655,Dieta1_DR1TSFAT,Dieta1_DR1TKCAL,Exami2_BMXBMI,Dieta1_DR1TSUGR,Quest21_SLQ320,Quest21_SLQ330,Quest19_PAD615,Quest21_SLD012,Quest21_SLD013,Dieta1_DR1DAY,Quest6_DED120,Quest19_PAQ610,Quest6_DED125
count,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0
mean,0.749865,1.495062,1.114383,1.436883,1.762076,1.476567,1.461304,1.499731,1.497755,1.49596,1.499731,1.436883,1.360029,1.467948,1.475669,1.430598,1.303466,1.16951,1.477465,0.883821
std,0.433129,1.120771,0.835859,1.078465,0.425851,1.118611,1.119792,1.118215,1.118052,1.118689,1.118215,1.145649,1.059741,1.116308,1.102583,1.11863,1.083041,1.102829,1.119593,0.774484
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
max,1.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0


In [9]:
# Summary statistics for the same first 20 feature columns after oversampling.
X_resampled.iloc[:, :20].describe()

Unnamed: 0,Dieta1_DRDINT,Dieta1_DR1TFIBE,Quest21_SLQ300,Quest19_PAD660,Quest19_PAQ635,Dieta1_DR1TCHOL,Quest19_PAQ655,Dieta1_DR1TSFAT,Dieta1_DR1TKCAL,Exami2_BMXBMI,Dieta1_DR1TSUGR,Quest21_SLQ320,Quest21_SLQ330,Quest19_PAD615,Quest21_SLD012,Quest21_SLD013,Dieta1_DR1DAY,Quest6_DED120,Quest19_PAQ610,Quest6_DED125
count,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0
mean,0.648845,1.241803,0.928558,1.35516,1.82828,1.249162,1.038562,1.280551,1.184985,1.559054,1.32582,1.261736,1.114475,0.976621,1.344355,1.187966,1.079545,1.093797,0.94402,0.865406
std,0.477354,1.04149,0.786365,0.908008,0.359316,1.053505,1.036908,1.015712,1.02646,1.027274,1.017604,1.04534,0.994269,1.035069,1.06605,1.04935,0.984749,0.852435,1.064801,0.613048
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0
max,1.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0


In [10]:
# Summary statistics for feature columns 20-39 before oversampling.
X.iloc[:, 20:40].describe()

Unnamed: 0,Dieta1_DR1TPROT,Quest19_PAQ640,Dieta1_DR1TPFAT,Dieta1_DR1TMFAT,Dieta1_DR1TCALC,Dieta1_DR1TCARB,Dieta1_DR1TTFAT,Quest19_PAD645,Exami1_BPXPLS,Demog1_RIDRETH3,Demog1_DMDHHSZA,Demog1_DMDHHSZE,Quest14_INQ020,Quest18_OCQ210,Demog1_INDIN2,Quest12_HEQ030,Quest22_SMQ900,Exami2_BMXHT,Quest3_CDQ009,Quest3_CDQ010
count,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0
mean,1.498474,1.416233,1.499192,1.498474,1.495421,1.497935,1.499551,1.480158,71.664099,3.509248,0.249596,0.7240079,1.780212,7.859939,17.408332,2.017418,1.811456,166.246497,0.039145,3.878614
std,1.118214,1.11475,1.118214,1.118374,1.118366,1.118213,1.118215,1.11852,11.133882,1.640114,0.6051535,0.8342093,1.990773,2.680441,26.055169,0.513697,0.391181,9.777816,0.193958,3.421848
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,1.0,5.397605e-79,5.397605e-79,1.0,1.0,1.0,1.0,1.0,138.3,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,3.0,5.397605e-79,5.397605e-79,1.0,9.0,6.0,2.0,2.0,159.2,0.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,70.810811,3.0,5.397605e-79,5.397605e-79,1.0,9.0,9.0,2.0,2.0,165.8,0.0,2.0
75%,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,78.0,4.0,5.397605e-79,1.0,2.0,9.0,15.0,2.0,2.0,172.9,0.0,9.0
max,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,136.0,7.0,3.0,3.0,9.0,9.0,99.0,9.0,2.0,197.7,1.0,9.0


In [11]:
# Summary statistics for feature columns 20-39 after oversampling.
X_resampled.iloc[:, 20:40].describe()

Unnamed: 0,Dieta1_DR1TPROT,Quest19_PAQ640,Dieta1_DR1TPFAT,Dieta1_DR1TMFAT,Dieta1_DR1TCALC,Dieta1_DR1TCARB,Dieta1_DR1TTFAT,Quest19_PAD645,Exami1_BPXPLS,Demog1_RIDRETH3,Demog1_DMDHHSZA,Demog1_DMDHHSZE,Quest14_INQ020,Quest18_OCQ210,Demog1_INDIN2,Quest12_HEQ030,Quest22_SMQ900,Exami2_BMXHT,Quest3_CDQ009,Quest3_CDQ010
count,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0
mean,1.167008,1.152478,1.23994,1.257731,1.235283,1.207619,1.236028,1.23696,70.766285,3.525106,0.1497923,1.016463,1.901112,8.137534,15.353435,2.026254,1.811829,166.822748,0.114988,2.736217
std,1.037605,0.966095,1.036183,1.020875,1.030041,1.026079,1.02797,0.978539,9.703803,1.408098,0.4695196,0.7692484,1.834359,2.247411,23.82807,0.523717,0.357447,9.141039,0.276351,2.858649
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,1.0,5.397605e-79,5.397605e-79,1.0,1.0,1.0,1.0,1.0,138.3,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,64.588864,3.0,5.397605e-79,5.397605e-79,1.0,9.0,5.202276,2.0,1.909767,160.8,0.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,70.0,3.045384,5.397605e-79,1.0,1.335667,9.0,7.597855,2.0,2.0,166.574052,0.0,1.62913
75%,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,76.0,4.0,5.397605e-79,1.764484,2.0,9.0,14.0,2.0,2.0,172.935649,0.0,2.0
max,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,136.0,7.0,3.0,3.0,9.0,9.0,99.0,9.0,2.0,197.7,1.0,9.0


In [12]:
# Summary statistics for feature columns 40 onward before oversampling.
X.iloc[:, 40:].describe()

Unnamed: 0,Exami2_BMXWT,Quest3_CDQ008,Quest20_PFQ061H,Quest7_DIQ010,Quest20_PFQ061B,Labor1_LBDTCSI,Quest17_DPQ040,Demog1_RIAGENDR,Labor2_URDTIME1,Quest22_SMQ890,...,Quest12_HEQ010,Quest1_ALQ111,Quest10_ECQ020,Quest16_MCQ220,Quest4_CBD121,Quest16_MCQ366A,Labor2_URDFLOW1,Demog1_DMDEDUC,Quest9_DLQ050,Quest20_PFQ061C
count,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,...,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0,5569.0
mean,82.73995,7.527743,4.921889,1.878614,6.086191,4.859067,1.83067,1.514814,147.109676,1.639073,...,2.017418,2.075417,9.0,1.896929,28265.13,1.720776,0.9147597,3.525768,1.831388,6.025139
std,22.163483,2.921444,3.900593,0.456818,3.720251,1.008385,2.914901,0.499825,96.053089,0.500814,...,0.451154,2.609262,0.0,0.335527,164245.3,0.470541,1.162899,1.240231,0.412771,3.783673
min,32.6,1.0,1.0,1.0,1.0,1.97,5.397605e-79,1.0,4.0,1.0,...,1.0,1.0,9.0,1.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0
25%,68.0,9.0,1.0,2.0,1.0,4.22,5.397605e-79,1.0,89.0,1.0,...,2.0,1.0,9.0,2.0,40.0,1.0,0.439,3.0,2.0,1.0
50%,79.6,9.0,2.0,2.0,9.0,4.781616,1.0,2.0,138.857977,2.0,...,2.0,1.0,9.0,2.0,100.0,2.0,0.764,4.0,2.0,9.0
75%,93.3,9.0,9.0,2.0,9.0,5.4,2.0,2.0,164.0,2.0,...,2.0,1.0,9.0,2.0,200.0,2.0,1.018974,4.0,2.0,9.0
max,242.6,9.0,9.0,9.0,9.0,11.53,9.0,2.0,1243.0,9.0,...,9.0,9.0,9.0,9.0,999999.0,9.0,50.5,9.0,9.0,9.0


In [13]:
# Summary statistics for feature columns 40 onward after oversampling.
X_resampled.iloc[:, 40:].describe()

Unnamed: 0,Exami2_BMXWT,Quest3_CDQ008,Quest20_PFQ061H,Quest7_DIQ010,Quest20_PFQ061B,Labor1_LBDTCSI,Quest17_DPQ040,Demog1_RIAGENDR,Labor2_URDTIME1,Quest22_SMQ890,...,Quest12_HEQ010,Quest1_ALQ111,Quest10_ECQ020,Quest16_MCQ220,Quest4_CBD121,Quest16_MCQ366A,Labor2_URDFLOW1,Demog1_DMDEDUC,Quest9_DLQ050,Quest20_PFQ061C
count,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,...,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0,10736.0
mean,87.044032,5.980851,3.523076,1.710196,6.151693,4.62762,2.193551,1.461468,147.972704,1.566777,...,2.02748,2.234349,9.0,1.858007,24575.99,1.582318,0.905842,3.363583,1.608454,6.06736
std,21.277567,3.319317,3.329755,0.492177,3.372938,0.980144,2.890899,0.461003,96.871123,0.466529,...,0.497464,2.611374,0.0,0.518368,152094.2,0.468022,1.579921,1.116858,0.467864,3.455533
min,32.6,1.0,1.0,1.0,1.0,1.97,5.397605e-79,1.0,4.0,1.0,...,1.0,1.0,9.0,1.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0
25%,73.3,2.0,1.0,1.182478,2.0,3.916416,5.397605e-79,1.0,92.0,1.0,...,2.0,1.0,9.0,1.941018,24.68815,1.0,0.425,2.730257,1.0,2.0
50%,83.79669,8.183003,1.782466,2.0,9.0,4.588571,1.0,1.318486,138.312992,1.78256,...,2.0,1.0,9.0,2.0,80.0,1.890349,0.7390375,3.428536,2.0,9.0
75%,98.874769,9.0,8.721978,2.0,9.0,5.15,2.85879,2.0,158.516093,2.0,...,2.0,1.650529,9.0,2.0,183.0162,2.0,0.9838996,4.0,2.0,9.0
max,242.6,9.0,9.0,9.0,9.0,11.53,9.0,2.0,1243.0,9.0,...,9.0,9.0,9.0,9.0,999999.0,9.0,50.5,9.0,9.0,9.0


### Get Variables Info

In [14]:
# Spreadsheet holding one row of metadata per survey variable.
var_mapping_path = 'Dataset/Variable Mapping V2.xlsx'

# Build the lookup in one chain:
#   1. relabel group 'Dieta2' as 'Dieta1',
#   2. derive the model column name as "<group>_<variable>",
#   3. index the table by that derived name.
var_mapping = (
    pd.read_excel(os.path.join(drive_path, var_mapping_path))
    .assign(
        group=lambda m: m['group'].replace({'Dieta2': 'Dieta1'}),
        model_var_name=lambda m: m['group'] + "_" + m['variable'],
    )
    .set_index('model_var_name')
)

var_mapping

Unnamed: 0_level_0,name,title,variable,desc,is_used,lifestyle,group
model_var_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Demog1_RIAGENDR,Demographics,Demographic Variables and Sample Weights,RIAGENDR,Gender,True,False,Demog1
Demog1_RIDAGEYR,Demographics,Demographic Variables and Sample Weights,RIDAGEYR,Age In Years At Screening,True,False,Demog1
Demog1_RIDRETH3,Demographics,Demographic Variables and Sample Weights,RIDRETH3,Race/Hispanic Origin W/ Nh Asian,True,False,Demog1
Demog1_DMDEDUC3,Demographics,Demographic Variables and Sample Weights,DMDEDUC3,Education Level - Children/Youth 6-19,True,False,Demog1
Demog1_DMDEDUC2,Demographics,Demographic Variables and Sample Weights,DMDEDUC2,Education Level - Adults 20+,True,False,Demog1
...,...,...,...,...,...,...,...
Demog1_DMDEDUC,Demographic,Demographic Variables and Sample Weights,DMDEDUC,Education Level all age,True,False,Demog1
Demog1_INDIN2,Demographic,Demographic Variables and Sample Weights,INDIN2,Average income from household income & family ...,True,False,Demog1
Exami1_SysPulse,Examination,Blood Pressure,SysPulse,Systolic Pulse Combined,True,False,Exami1
Exami1_DiaPulse,Examination,Blood Pressure,DiaPulse,Diastolic Pulse Combined,True,False,Exami1


### Summarize Relative Changes

In [15]:
# Per-feature mean and standard deviation before and after oversampling,
# transposed so that each feature is one row.
before = X.describe().loc[['mean', 'std'], :].transpose()
after = X_resampled.describe().loc[['mean', 'std'], :].transpose()

# Resulting columns: mean, std, mean_oversampled, std_oversampled.
before_after = before.join(after, rsuffix='_oversampled')

# Relative change = (after - before) / before.
# A zero baseline divides by zero: the result is NaN when the value did not
# move (e.g. the std of a constant column) and +/-inf when it did.
before_after['mean_relative_change'] = (before_after['mean_oversampled'] - before_after['mean']) / before_after['mean']
before_after['std_relative_change'] = (before_after['std_oversampled'] - before_after['std']) / before_after['std']

# Get Variable's description.
# The mapping is indexed by "<group>_<variable>" built after 'Dieta2' was
# relabelled to 'Dieta1', so two mapping rows can end up sharing one label.
# Joining on a non-unique index would repeat the matching feature rows, so
# only the first description per label is used.
before_after = before_after.join(
    var_mapping.loc[~var_mapping.index.duplicated(keep='first'), 'desc']
)

before_after

Unnamed: 0,mean,std,mean_oversampled,std_oversampled,mean_relative_change,std_relative_change,desc
Demog1_DMDEDUC,3.525768,1.240231,3.363583,1.116858,-0.046000,-0.099476,Education Level all age
Demog1_DMDFMSIZ,2.971988,1.667688,2.656389,1.465244,-0.106191,-0.121392,Total Number Of People In The Family
Demog1_DMDHHSIZ,3.112408,1.625256,2.757286,1.448298,-0.114099,-0.108880,Total Number Of People In The Household
Demog1_DMDHHSZA,0.249596,0.605154,0.149792,0.469520,-0.399861,-0.224131,# Of Children 5 Years Or Younger In Hh
Demog1_DMDHHSZB,0.490573,0.849670,0.345738,0.696285,-0.295235,-0.180522,# Of Children 6-17 Years Old In Hh
...,...,...,...,...,...,...,...
Quest4_CBD121,28265.128097,164245.278134,24575.985774,152094.165405,-0.130519,-0.073982,Money Spent On Eating Out
Quest6_DED120,1.169510,1.102829,1.093797,0.852435,-0.064739,-0.227047,Minutes Outdoors 9Am - 5Pm Work Day
Quest6_DED125,0.883821,0.774484,0.865406,0.613048,-0.020836,-0.208444,Minutes Outdoors 9Am - 5Pm Not Work Day
Quest7_DIQ010,1.878614,0.456818,1.710196,0.492177,-0.089650,0.077405,Doctor Told You Have Diabetes


#### Mean Relative Changes

In [16]:
# The 10 features whose mean decreased the most, relative to the baseline mean.
before_after.sort_values(by=['mean_relative_change']).head(10)

Unnamed: 0,mean,std,mean_oversampled,std_oversampled,mean_relative_change,std_relative_change,desc
Quest4_CBD111,16612.48333,126081.94761,9793.44662,95014.988068,-0.410477,-0.246403,Money Spent On Food At Other Stores
Demog1_DMDHHSZA,0.249596,0.605154,0.149792,0.46952,-0.399861,-0.224131,# Of Children 5 Years Or Younger In Hh
Quest19_PAQ610,1.477465,1.119593,0.94402,1.064801,-0.361054,-0.048939,Number Of Days Vigorous Work
Quest19_PAD615,1.467948,1.116308,0.976621,1.035069,-0.334703,-0.072775,Minutes Vigorous-Intensity Work
Demog1_DMDHHSZB,0.490573,0.84967,0.345738,0.696285,-0.295235,-0.180522,# Of Children 6-17 Years Old In Hh
Quest3_CDQ010,3.878614,3.421848,2.736217,2.858649,-0.294537,-0.164589,Shortness Of Breath On Stairs/Inclines
Quest19_PAQ655,1.461304,1.119792,1.038562,1.036908,-0.289291,-0.074018,Days Vigorous Recreational Activities
Quest20_PFQ061H,4.921889,3.900593,3.523076,3.329755,-0.284202,-0.146346,Difficulty Walking Between Rooms
Dieta1_DR1TPROT,1.498474,1.118214,1.167008,1.037605,-0.221202,-0.072087,Protein (Gm)
Dieta1_DR1TKCAL,1.497755,1.118052,1.184985,1.02646,-0.208826,-0.081921,Energy (Kcal)


In [17]:
# The last 10 rows of the ascending sort, i.e. the largest relative increases in mean.
before_after.sort_values(by=['mean_relative_change']).tail(10)

Unnamed: 0,mean,std,mean_oversampled,std_oversampled,mean_relative_change,std_relative_change,desc
Exami2_BMXWT,82.73995,22.163483,87.044032,21.277567,0.052019,-0.039972,Weight (Kg)
Exami1_BPXPULS,1.038546,0.181753,1.103892,0.252918,0.06292,0.39155,Pulse Regular Or Irregular?
Quest14_INQ020,1.780212,1.990773,1.901112,1.834359,0.067913,-0.07857,Income From Wages/Salaries
Quest1_ALQ111,2.075417,2.609262,2.234349,2.611374,0.076578,0.000809,Ever Had A Drink Of Any Kind Of Alcohol
Demog1_RIDAGEYR,51.503681,17.812855,59.773668,16.507971,0.160571,-0.073255,Age In Years At Screening
Quest17_DPQ040,1.83067,2.914901,2.193551,2.890899,0.198223,-0.008234,Feeling Tired Or Having Little Energy
Quest17_DPQ030,1.725983,2.955484,2.070084,2.934825,0.199365,-0.00699,Trouble Sleeping Or Sleeping Too Much
Quest17_DPQ020,1.480697,2.988309,1.783586,2.989779,0.204559,0.000492,"Feeling Down, Depressed, Or Hopeless"
Demog1_DMDHHSZE,0.724008,0.834209,1.016463,0.769248,0.403939,-0.077871,# Of Adults 60 Years Or Older In Hh
Quest3_CDQ009,0.039145,0.193958,0.114988,0.276351,1.937468,0.424799,Pain In Body (Combined)


#### Standard Deviation Relative Changes

In [19]:
# The 10 features whose standard deviation shrank the most, relative to the baseline std.
before_after.sort_values(by=['std_relative_change']).head(10)

Unnamed: 0,mean,std,mean_oversampled,std_oversampled,mean_relative_change,std_relative_change,desc
Quest4_CBD111,16612.48333,126081.94761,9793.44662,95014.988068,-0.410477,-0.246403,Money Spent On Food At Other Stores
Quest6_DED120,1.16951,1.102829,1.093797,0.852435,-0.064739,-0.227047,Minutes Outdoors 9Am - 5Pm Work Day
Demog1_DMDMARTL,2.688813,3.073212,2.484129,2.376573,-0.076124,-0.226681,Marital Status
Demog1_DMDHHSZA,0.249596,0.605154,0.149792,0.46952,-0.399861,-0.224131,# Of Children 5 Years Or Younger In Hh
Quest6_DED125,0.883821,0.774484,0.865406,0.613048,-0.020836,-0.208444,Minutes Outdoors 9Am - 5Pm Not Work Day
Demog1_DMDHHSZB,0.490573,0.84967,0.345738,0.696285,-0.295235,-0.180522,# Of Children 6-17 Years Old In Hh
Quest3_CDQ010,3.878614,3.421848,2.736217,2.858649,-0.294537,-0.164589,Shortness Of Breath On Stairs/Inclines
Quest14_IND235,16.606213,27.866392,13.559997,23.319649,-0.183438,-0.163162,Monthly Family Income
Quest18_OCQ210,7.859939,2.680441,8.137534,2.247411,0.035318,-0.161552,Usually Work 35 Or More Hours Per Week
Quest19_PAD660,1.436883,1.078465,1.35516,0.908008,-0.056875,-0.158055,Minutes Vigorous Recreational Activities


In [20]:
# The last 10 rows of the ascending sort by relative change in standard deviation.
# NOTE(review): features with a baseline std of 0 get NaN for this metric, and
# sort_values puts NaN last, so those rows take up slots at the end of this
# tail without representing a real increase.
before_after.sort_values(by=['std_relative_change']).tail(10)

Unnamed: 0,mean,std,mean_oversampled,std_oversampled,mean_relative_change,std_relative_change,desc
Quest12_HEQ010,2.017418,0.451154,2.02748,0.497464,0.004988,0.102647,Ever Told You Have Hepatitis B?
Quest16_MCQ300A,2.03322,1.117299,2.060381,1.251612,0.013359,0.120212,
Quest9_DLQ050,1.831388,0.412771,1.608454,0.467864,-0.121729,0.133473,Have Serious Difficulty Walking?
Quest3_CDQ008,7.527743,2.921444,5.980851,3.319317,-0.205492,0.136191,Severe Pain In Chest More Than Half Hour
Labor2_URDFLOW1,0.91476,1.162899,0.905842,1.579921,-0.009749,0.358606,Urine #1 Flow Rate (Ml/Min)
Exami1_BPXPULS,1.038546,0.181753,1.103892,0.252918,0.06292,0.39155,Pulse Regular Or Irregular?
Quest3_CDQ009,0.039145,0.193958,0.114988,0.276351,1.937468,0.424799,Pain In Body (Combined)
Quest16_MCQ220,1.896929,0.335527,1.858007,0.518368,-0.020519,0.544936,Ever Told You Had Cancer Or Malignancy
Quest10_ECQ020,9.0,0.0,9.0,0.0,0.0,,Mother Smoked When Pregnant
Quest20_PFQ020,9.0,0.0,9.0,0.0,0.0,,"Crawl, Walk, Run, Play Limitations"


## Export Data

In [18]:
# Recombine the resampled features with the resampled target into a single
# frame; assign() returns a new frame, so X_resampled is left unmodified.
df = X_resampled.assign(Quest16_MCQ160B=y_resampled)
df

Unnamed: 0,Dieta1_DRDINT,Dieta1_DR1TFIBE,Quest21_SLQ300,Quest19_PAD660,Quest19_PAQ635,Dieta1_DR1TCHOL,Quest19_PAQ655,Dieta1_DR1TSFAT,Dieta1_DR1TKCAL,Exami2_BMXBMI,...,Quest1_ALQ111,Quest10_ECQ020,Quest16_MCQ220,Quest4_CBD121,Quest16_MCQ366A,Labor2_URDFLOW1,Demog1_DMDEDUC,Quest9_DLQ050,Quest20_PFQ061C,Quest16_MCQ160B
0,1,0,2,2,2.000000,0,2,1,0,2,...,1.000000,9.0,2.000000,7.500000e+01,2.0,1.204000,2.000000,2.000000,1.000000,0
1,1,2,2,2,2.000000,0,2,1,0,0,...,2.000000,9.0,2.000000,5.397605e-79,2.0,0.500000,1.000000,2.000000,2.000000,0
2,0,2,2,1,2.000000,2,1,2,1,3,...,9.000000,9.0,2.000000,4.000000e+01,2.0,0.107000,4.000000,1.000000,9.000000,0
3,1,3,1,0,1.000000,3,3,3,3,0,...,1.000000,9.0,2.000000,8.570000e+02,2.0,0.605000,5.000000,2.000000,9.000000,0
4,1,1,2,1,1.000000,1,2,3,2,0,...,1.000000,9.0,2.000000,4.000000e+01,2.0,0.706000,3.000000,2.000000,1.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10731,1,0,0,1,2.000000,0,1,0,0,1,...,1.000000,9.0,1.308908,4.000000e+01,2.0,0.579171,4.308908,1.691092,3.471260,1
10732,1,1,1,1,2.000000,0,0,0,0,2,...,1.000000,9.0,2.000000,3.546832e+01,1.0,0.209351,2.773416,1.386708,5.906337,1
10733,1,0,0,1,2.000000,0,0,0,0,3,...,1.000000,9.0,1.003438,2.991748e+01,1.0,0.510536,3.000000,1.000000,3.020630,1
10734,0,1,1,2,2.000000,2,1,2,1,2,...,2.766321,9.0,2.000000,4.415803e+00,1.0,1.026967,3.441580,1.000000,7.233679,1


In [22]:
# Snapshot of the oversampled table that gets written out.
final = df.copy()

# Destination of the export, relative to the shared project folder.
target_path = "Dataset/Data Versioning/Trained_Oversampled.csv"

# to_csv() keeps its default of writing the row index as the first,
# unnamed CSV column; readers of this file need index_col=0 (or must drop
# the resulting "Unnamed: 0" column).
final.to_csv(path_or_buf=os.path.join(drive_path, target_path))