# Prepare Environment

In [None]:
!pip install joblib



In [2]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import random

from sklearn.metrics import confusion_matrix
import tensorflow as tf
from tensorflow.keras import layers, models
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin

import joblib

import warnings
warnings.filterwarnings('ignore')

drive_path = '/content/drive/MyDrive/Kuliah/Tugas Akhir/Final Project Shared Folder'
data_path = "Dataset/Data Versioning/"
model_path = "Model/ML Model/"
scaler_path = "Model/Scaler/"
data_version = "Trained_V2-3.csv"
base_url = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?CycleBeginYear=2017"
dataset_names = ['Demographics', 'Dietary', 'Examination', 'Laboratory', 'Questionnaire']

Mounted at /content/drive


In [3]:
df_raw = pd.read_csv(os.path.join(drive_path, data_path+data_version), index_col = 0)
try:
  df_raw = df_raw.set_index('SEQN', drop=True)
  df_raw = df_raw.drop(columns = "Unnamed: 0")
except:
  pass

df_raw.head()

Unnamed: 0_level_0,Dieta1_DRDINT,Dieta1_DR1TFIBE,Quest21_SLQ300,Quest19_PAD660,Quest19_PAQ635,Dieta1_DR1TCHOL,Quest19_PAQ655,Dieta1_DR1TSFAT,Dieta1_DR1TKCAL,Exami2_BMXBMI,...,Quest1_ALQ111,Quest10_ECQ020,Quest16_MCQ220,Quest4_CBD121,Quest16_MCQ366A,Labor2_URDFLOW1,Demog1_DMDEDUC,Quest9_DLQ050,Quest20_PFQ061C,Quest16_MCQ160B
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93705.0,1,0,2,2,2.0,0,2,1,0,2,...,1.0,9.0,2.0,75.0,2.0,1.204,2.0,2.0,1.0,2.0
93708.0,1,2,2,2,2.0,0,2,1,0,0,...,2.0,9.0,2.0,5.397605e-79,2.0,0.5,1.0,2.0,2.0,2.0
93709.0,0,2,2,1,2.0,2,1,2,1,3,...,9.0,9.0,2.0,40.0,2.0,0.107,4.0,1.0,9.0,2.0
93711.0,1,3,1,0,1.0,3,3,3,3,0,...,1.0,9.0,2.0,857.0,2.0,0.605,5.0,2.0,9.0,2.0
93713.0,1,1,2,1,1.0,1,2,3,2,0,...,1.0,9.0,2.0,40.0,2.0,0.706,3.0,2.0,1.0,2.0


In [4]:
naive_model = tf.keras.models.load_model(os.path.join(drive_path, model_path+'model_cnn_naive.h5'))
naive_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_9 (Conv1D)           (None, 83, 8)             32        
                                                                 
 conv1d_10 (Conv1D)          (None, 81, 12)            300       
                                                                 
 conv1d_11 (Conv1D)          (None, 79, 16)            592       
                                                                 
 flatten_3 (Flatten)         (None, 1264)              0         
                                                                 
 dense_9 (Dense)             (None, 4)                 5060      
                                                                 
 dense_10 (Dense)            (None, 16)                80        
                                                                 
 dense_11 (Dense)            (None, 2)                

In [5]:
proper_model = tf.keras.models.load_model(os.path.join(drive_path, model_path+'model_cnn_proper.h5'))
proper_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_6 (Conv1D)           (None, 83, 8)             32        
                                                                 
 conv1d_7 (Conv1D)           (None, 81, 12)            300       
                                                                 
 conv1d_8 (Conv1D)           (None, 79, 16)            592       
                                                                 
 flatten_2 (Flatten)         (None, 1264)              0         
                                                                 
 dense_6 (Dense)             (None, 4)                 5060      
                                                                 
 dense_7 (Dense)             (None, 16)                80        
                                                                 
 dense_8 (Dense)             (None, 2)                

In [6]:
scaler = joblib.load(os.path.join(drive_path, scaler_path+'/standard_scaler.gz'))
scaler

In [20]:
def predict_risk(data, model, version='custom_threshold'):
  scaler = joblib.load(os.path.join(drive_path, scaler_path+'/standard_scaler.gz'))
  data.iloc[:, 28:] = scaler.fit_transform(data.iloc[:, 28:])

  data = np.array(data).reshape(data.shape[0], data.shape[1], 1)

  prob = model.predict(data)

  if(version == 'custom_threshold'):
    return (1 if prob[0][1] >= 0.0006083620246499777 else 0), min(prob[0][1] / 0.0006083620246499777, 1)
  else:
    return np.argmax(prob[0]), prob[0][1]

# Sampling Data

In [26]:
def sample_predict():
  X = df_raw.sample()
  y = X.pop('Quest16_MCQ160B')

  y = y.replace({2: 0})
  y = y.replace({9: 0})
  y = y.astype(int)

  print(y)
  print(X)

  print(predict_risk(X, naive_model, version='normal'))
  print(predict_risk(X, proper_model, version='normal'))
  print(predict_risk(X, proper_model, version='custom_threshold'))

In [11]:
X = df_raw.sample()
y = X.pop('Quest16_MCQ160B')

y = y.replace({2: 0})
y = y.replace({9: 0})
y = y.astype(int)

print(y)
X

SEQN
99537.0    1
Name: Quest16_MCQ160B, dtype: int64


Unnamed: 0_level_0,Dieta1_DRDINT,Dieta1_DR1TFIBE,Quest21_SLQ300,Quest19_PAD660,Quest19_PAQ635,Dieta1_DR1TCHOL,Quest19_PAQ655,Dieta1_DR1TSFAT,Dieta1_DR1TKCAL,Exami2_BMXBMI,...,Quest12_HEQ010,Quest1_ALQ111,Quest10_ECQ020,Quest16_MCQ220,Quest4_CBD121,Quest16_MCQ366A,Labor2_URDFLOW1,Demog1_DMDEDUC,Quest9_DLQ050,Quest20_PFQ061C
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
99537.0,0,2,1,1,2.0,1,2,1,1,2,...,2.0,9.0,9.0,1.0,5.397605e-79,2.0,0.909839,2.0,1.0,9.0


In [21]:
predict_risk(X, naive_model, version='normal')



(0, 0.31557038)

In [22]:
predict_risk(X, proper_model, version='normal')



(0, 0.22311062)

In [23]:
predict_risk(X, proper_model, version='custom_threshold')



(1, 1)

In [24]:
X = df_raw.sample()
y = X.pop('Quest16_MCQ160B')

y = y.replace({2: 0})
y = y.replace({9: 0})
y = y.astype(int)

print(y)
X

SEQN
96066.0    0
Name: Quest16_MCQ160B, dtype: int64


Unnamed: 0_level_0,Dieta1_DRDINT,Dieta1_DR1TFIBE,Quest21_SLQ300,Quest19_PAD660,Quest19_PAQ635,Dieta1_DR1TCHOL,Quest19_PAQ655,Dieta1_DR1TSFAT,Dieta1_DR1TKCAL,Exami2_BMXBMI,...,Quest12_HEQ010,Quest1_ALQ111,Quest10_ECQ020,Quest16_MCQ220,Quest4_CBD121,Quest16_MCQ366A,Labor2_URDFLOW1,Demog1_DMDEDUC,Quest9_DLQ050,Quest20_PFQ061C
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
96066.0,1,3,1,1,1.0,3,2,3,3,0,...,2.0,1.0,9.0,2.0,5.397605e-79,2.0,2.138,3.0,2.0,2.0


In [25]:
print(predict_risk(X, naive_model, version='normal'))
print(predict_risk(X, proper_model, version='normal'))
print(predict_risk(X, proper_model, version='custom_threshold'))

(0, 0.02732069)
(0, 0.067523256)
(1, 1)


In [27]:
sample_predict()

SEQN
101282.0    0
Name: Quest16_MCQ160B, dtype: int64
          Dieta1_DRDINT  Dieta1_DR1TFIBE  Quest21_SLQ300  Quest19_PAD660  \
SEQN                                                                       
101282.0              1                1               1               2   

          Quest19_PAQ635  Dieta1_DR1TCHOL  Quest19_PAQ655  Dieta1_DR1TSFAT  \
SEQN                                                                         
101282.0             2.0                0               0                1   

          Dieta1_DR1TKCAL  Exami2_BMXBMI  ...  Quest12_HEQ010  Quest1_ALQ111  \
SEQN                                      ...                                  
101282.0                1              3  ...             2.0            1.0   

          Quest10_ECQ020  Quest16_MCQ220  Quest4_CBD121  Quest16_MCQ366A  \
SEQN                                                                       
101282.0             9.0             2.0          400.0              1.0   

          L

In [29]:
sample_predict()

SEQN
95471.0    0
Name: Quest16_MCQ160B, dtype: int64
         Dieta1_DRDINT  Dieta1_DR1TFIBE  Quest21_SLQ300  Quest19_PAD660  \
SEQN                                                                      
95471.0              1                1               0               2   

         Quest19_PAQ635  Dieta1_DR1TCHOL  Quest19_PAQ655  Dieta1_DR1TSFAT  \
SEQN                                                                        
95471.0             1.0                3               0                3   

         Dieta1_DR1TKCAL  Exami2_BMXBMI  ...  Quest12_HEQ010  Quest1_ALQ111  \
SEQN                                     ...                                  
95471.0                2              0  ...             2.0            1.0   

         Quest10_ECQ020  Quest16_MCQ220  Quest4_CBD121  Quest16_MCQ366A  \
SEQN                                                                      
95471.0             9.0             1.0          150.0              2.0   

         Labor2_URDFLOW1

In [31]:
sample_predict()

SEQN
94933.0    0
Name: Quest16_MCQ160B, dtype: int64
         Dieta1_DRDINT  Dieta1_DR1TFIBE  Quest21_SLQ300  Quest19_PAD660  \
SEQN                                                                      
94933.0              1                3               0               1   

         Quest19_PAQ635  Dieta1_DR1TCHOL  Quest19_PAQ655  Dieta1_DR1TSFAT  \
SEQN                                                                        
94933.0             1.0                0               2                0   

         Dieta1_DR1TKCAL  Exami2_BMXBMI  ...  Quest12_HEQ010  Quest1_ALQ111  \
SEQN                                     ...                                  
94933.0                0              3  ...             2.0            1.0   

         Quest10_ECQ020  Quest16_MCQ220  Quest4_CBD121  Quest16_MCQ366A  \
SEQN                                                                      
94933.0             9.0             2.0           40.0              2.0   

         Labor2_URDFLOW1

In [32]:
sample_predict()

SEQN
95543.0    0
Name: Quest16_MCQ160B, dtype: int64
         Dieta1_DRDINT  Dieta1_DR1TFIBE  Quest21_SLQ300  Quest19_PAD660  \
SEQN                                                                      
95543.0              0                1               0               3   

         Quest19_PAQ635  Dieta1_DR1TCHOL  Quest19_PAQ655  Dieta1_DR1TSFAT  \
SEQN                                                                        
95543.0             2.0                1               3                2   

         Dieta1_DR1TKCAL  Exami2_BMXBMI  ...  Quest12_HEQ010  Quest1_ALQ111  \
SEQN                                     ...                                  
95543.0                2              0  ...             2.0            1.0   

         Quest10_ECQ020  Quest16_MCQ220  Quest4_CBD121  Quest16_MCQ366A  \
SEQN                                                                      
95543.0             9.0             2.0         1000.0              1.0   

         Labor2_URDFLOW1

In [33]:
sample_predict()

SEQN
98250.0    0
Name: Quest16_MCQ160B, dtype: int64
         Dieta1_DRDINT  Dieta1_DR1TFIBE  Quest21_SLQ300  Quest19_PAD660  \
SEQN                                                                      
98250.0              0                1               1               2   

         Quest19_PAQ635  Dieta1_DR1TCHOL  Quest19_PAQ655  Dieta1_DR1TSFAT  \
SEQN                                                                        
98250.0             2.0                1               0                1   

         Dieta1_DR1TKCAL  Exami2_BMXBMI  ...  Quest12_HEQ010  Quest1_ALQ111  \
SEQN                                     ...                                  
98250.0                1              1  ...             2.0            1.0   

         Quest10_ECQ020  Quest16_MCQ220  Quest4_CBD121  Quest16_MCQ366A  \
SEQN                                                                      
98250.0             9.0             1.0           50.0              1.0   

         Labor2_URDFLOW1

In [34]:
sample_predict()

SEQN
97305.0    0
Name: Quest16_MCQ160B, dtype: int64
         Dieta1_DRDINT  Dieta1_DR1TFIBE  Quest21_SLQ300  Quest19_PAD660  \
SEQN                                                                      
97305.0              1                3               1               2   

         Quest19_PAQ635  Dieta1_DR1TCHOL  Quest19_PAQ655  Dieta1_DR1TSFAT  \
SEQN                                                                        
97305.0             2.0                3               1                2   

         Dieta1_DR1TKCAL  Exami2_BMXBMI  ...  Quest12_HEQ010  Quest1_ALQ111  \
SEQN                                     ...                                  
97305.0                3              2  ...             2.0            1.0   

         Quest10_ECQ020  Quest16_MCQ220  Quest4_CBD121  Quest16_MCQ366A  \
SEQN                                                                      
97305.0             9.0             2.0          321.0              2.0   

         Labor2_URDFLOW1

In [35]:
sample_predict()

SEQN
101109.0    0
Name: Quest16_MCQ160B, dtype: int64
          Dieta1_DRDINT  Dieta1_DR1TFIBE  Quest21_SLQ300  Quest19_PAD660  \
SEQN                                                                       
101109.0              1                0               1               2   

          Quest19_PAQ635  Dieta1_DR1TCHOL  Quest19_PAQ655  Dieta1_DR1TSFAT  \
SEQN                                                                         
101109.0             2.0                3               0                1   

          Dieta1_DR1TKCAL  Exami2_BMXBMI  ...  Quest12_HEQ010  Quest1_ALQ111  \
SEQN                                      ...                                  
101109.0                1              3  ...             2.0            1.0   

          Quest10_ECQ020  Quest16_MCQ220  Quest4_CBD121  Quest16_MCQ366A  \
SEQN                                                                       
101109.0             9.0             2.0           50.0              1.0   

          L

In [36]:
sample_predict()

SEQN
102720.0    0
Name: Quest16_MCQ160B, dtype: int64
          Dieta1_DRDINT  Dieta1_DR1TFIBE  Quest21_SLQ300  Quest19_PAD660  \
SEQN                                                                       
102720.0              1                1               1               2   

          Quest19_PAQ635  Dieta1_DR1TCHOL  Quest19_PAQ655  Dieta1_DR1TSFAT  \
SEQN                                                                         
102720.0             2.0                3               1                2   

          Dieta1_DR1TKCAL  Exami2_BMXBMI  ...  Quest12_HEQ010  Quest1_ALQ111  \
SEQN                                      ...                                  
102720.0                3              2  ...             2.0            1.0   

          Quest10_ECQ020  Quest16_MCQ220  Quest4_CBD121  Quest16_MCQ366A  \
SEQN                                                                       
102720.0             9.0             2.0          300.0              2.0   

          L

In [37]:
sample_predict()

SEQN
96226.0    0
Name: Quest16_MCQ160B, dtype: int64
         Dieta1_DRDINT  Dieta1_DR1TFIBE  Quest21_SLQ300  Quest19_PAD660  \
SEQN                                                                      
96226.0              0                2               3               3   

         Quest19_PAQ635  Dieta1_DR1TCHOL  Quest19_PAQ655  Dieta1_DR1TSFAT  \
SEQN                                                                        
96226.0             2.0                3               0                3   

         Dieta1_DR1TKCAL  Exami2_BMXBMI  ...  Quest12_HEQ010  Quest1_ALQ111  \
SEQN                                     ...                                  
96226.0                3              2  ...             2.0            1.0   

         Quest10_ECQ020  Quest16_MCQ220  Quest4_CBD121  Quest16_MCQ366A  \
SEQN                                                                      
96226.0             9.0             2.0          100.0              2.0   

         Labor2_URDFLOW1

In [38]:
sample_predict()

SEQN
94927.0    0
Name: Quest16_MCQ160B, dtype: int64
         Dieta1_DRDINT  Dieta1_DR1TFIBE  Quest21_SLQ300  Quest19_PAD660  \
SEQN                                                                      
94927.0              1                2               0               3   

         Quest19_PAQ635  Dieta1_DR1TCHOL  Quest19_PAQ655  Dieta1_DR1TSFAT  \
SEQN                                                                        
94927.0             2.0                2               3                3   

         Dieta1_DR1TKCAL  Exami2_BMXBMI  ...  Quest12_HEQ010  Quest1_ALQ111  \
SEQN                                     ...                                  
94927.0                3              3  ...             2.0            1.0   

         Quest10_ECQ020  Quest16_MCQ220  Quest4_CBD121  Quest16_MCQ366A  \
SEQN                                                                      
94927.0             9.0             2.0     189.273769              2.0   

         Labor2_URDFLOW1

In [39]:
sample_predict()

SEQN
97124.0    0
Name: Quest16_MCQ160B, dtype: int64
         Dieta1_DRDINT  Dieta1_DR1TFIBE  Quest21_SLQ300  Quest19_PAD660  \
SEQN                                                                      
97124.0              1                1               1               1   

         Quest19_PAQ635  Dieta1_DR1TCHOL  Quest19_PAQ655  Dieta1_DR1TSFAT  \
SEQN                                                                        
97124.0             1.0                0               1                0   

         Dieta1_DR1TKCAL  Exami2_BMXBMI  ...  Quest12_HEQ010  Quest1_ALQ111  \
SEQN                                     ...                                  
97124.0                1              1  ...             2.0            1.0   

         Quest10_ECQ020  Quest16_MCQ220  Quest4_CBD121  Quest16_MCQ366A  \
SEQN                                                                      
97124.0             9.0             1.0   5.397605e-79              2.0   

         Labor2_URDFLOW1

In [40]:
sample_predict()

SEQN
95374.0    0
Name: Quest16_MCQ160B, dtype: int64
         Dieta1_DRDINT  Dieta1_DR1TFIBE  Quest21_SLQ300  Quest19_PAD660  \
SEQN                                                                      
95374.0              1                0               0               3   

         Quest19_PAQ635  Dieta1_DR1TCHOL  Quest19_PAQ655  Dieta1_DR1TSFAT  \
SEQN                                                                        
95374.0             2.0                0               3                0   

         Dieta1_DR1TKCAL  Exami2_BMXBMI  ...  Quest12_HEQ010  Quest1_ALQ111  \
SEQN                                     ...                                  
95374.0                0              0  ...             2.0            1.0   

         Quest10_ECQ020  Quest16_MCQ220  Quest4_CBD121  Quest16_MCQ366A  \
SEQN                                                                      
95374.0             9.0             2.0   5.397605e-79              2.0   

         Labor2_URDFLOW1