In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import random
import itertools
import pprint
from imblearn.over_sampling import SMOTENC

from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')

drive_path = '/content/drive/MyDrive/Kuliah/Tugas Akhir/Final Project Shared Folder'
data_path = "Dataset/Data Versioning/"
model_path = "Model/ML Model/"
data_version = "Trained_V2-3.csv"
base_url = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?CycleBeginYear=2017"
dataset_names = ['Demographics', 'Dietary', 'Examination', 'Laboratory', 'Questionnaire']

# Ingest Data

df_raw = pd.read_csv(os.path.join(drive_path, data_path+data_version), index_col = 0)
try:
  df_raw = df_raw.set_index('SEQN', drop=True)
  df_raw = df_raw.drop(columns = "Unnamed: 0")
except:
  pass

df_raw.head()

Unnamed: 0_level_0,Dieta1_DRDINT,Dieta1_DR1TFIBE,Quest21_SLQ300,Quest19_PAD660,Quest19_PAQ635,Dieta1_DR1TCHOL,Quest19_PAQ655,Dieta1_DR1TSFAT,Dieta1_DR1TKCAL,Exami2_BMXBMI,...,Quest1_ALQ111,Quest10_ECQ020,Quest16_MCQ220,Quest4_CBD121,Quest16_MCQ366A,Labor2_URDFLOW1,Demog1_DMDEDUC,Quest9_DLQ050,Quest20_PFQ061C,Quest16_MCQ160B
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93705.0,1,0,2,2,2.0,0,2,1,0,2,...,1.0,9.0,2.0,75.0,2.0,1.204,2.0,2.0,1.0,2.0
93708.0,1,2,2,2,2.0,0,2,1,0,0,...,2.0,9.0,2.0,5.397605e-79,2.0,0.5,1.0,2.0,2.0,2.0
93709.0,0,2,2,1,2.0,2,1,2,1,3,...,9.0,9.0,2.0,40.0,2.0,0.107,4.0,1.0,9.0,2.0
93711.0,1,3,1,0,1.0,3,3,3,3,0,...,1.0,9.0,2.0,857.0,2.0,0.605,5.0,2.0,9.0,2.0
93713.0,1,1,2,1,1.0,1,2,3,2,0,...,1.0,9.0,2.0,40.0,2.0,0.706,3.0,2.0,1.0,2.0


In [3]:
log_raw = pd.DataFrame({
    'convolution_part':[],
    'convolution_hyperparameter':[],
    'dense_part':[],
    'dense_hyperparameter':[],
    'input_shape':[],
    'layer_metadata':[]
})

log_raw.head()

Unnamed: 0,convolution_part,convolution_hyperparameter,dense_part,dense_hyperparameter,input_shape,layer_metadata


In [4]:
def get_categorical(df):
    categorical = []
    for col in df.columns.tolist():
        try:
            if len(df[col].unique().tolist()) < 20:
                categorical.append({
                    'desc': model_var[model_var['model_var_name'] ==  col]['desc'].values[0],
                    'column': col,
                    'possible_values': df[col].unique().tolist()}
                            )
        except Exception as error:
            if len(df[col].unique().tolist()) < 20:
                categorical.append({
                    'desc': None,
                    'column': col,
                    'possible_values': df[col].unique().tolist()})
            # print(f'{col} error: {error}')
    return categorical

X = df_raw.copy().drop(columns=['Quest16_MCQ160B'])
y = df_raw['Quest16_MCQ160B']

categorical_summary = get_categorical(X)
categorical_cols = [True if data_col in [col['column'] for col in categorical_summary] else False for data_col in X.columns.tolist()]
print(categorical_cols)

[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, False, True, True, False, True, True, False, True, True, True, True, False, True, True, False, True, True, False, False, True, True, False, True, False, False, True, True, True, True, True, True, False, False, True, True, True, True, True, True, True, True, True, True, True, True, False, True, False, True, True, True]


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTENC

def prep_data_cross_validation(df, num_folds=5):
  X = df.copy().drop(columns=['Quest16_MCQ160B'])
  y = df['Quest16_MCQ160B']

  y = y.replace({2: 0})
  y = y.replace({9: 0})

  y = y.astype(int)

  kf = KFold(n_splits=num_folds, shuffle=True, random_state=37)

  for train_index, val_index in kf.split(X):
    X_train, X_val, y_train, y_val = X.iloc[train_index, :], X.iloc[val_index, :], y.iloc[train_index], y.iloc[val_index]
    # Instantiate the SMOTE oversampler
    smotenc = SMOTENC(
          categorical_features=categorical_cols,
          sampling_strategy='auto',
          random_state=37
        )

      # Apply SMOTE to generate synthetic samples
    X_resampled, y_resampled = smotenc.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_resampled.iloc[:, 28:] = scaler.fit_transform(X_resampled.iloc[:, 28:])
    X_val.iloc[:, 28:] = scaler.transform(X_val.iloc[:, 28:])

    X_resampled = np.array(X_resampled).reshape(X_resampled.shape[0], X_resampled.shape[1], 1)
    X_val = np.array(X_val).reshape(X_val.shape[0], X_val.shape[1], 1)

    yield X_resampled, X_val, y_resampled, y_val

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC

def prep_data(df):
  X = df.copy().drop(columns=['Quest16_MCQ160B'])
  y = df['Quest16_MCQ160B']

  y = y.replace({2: 0})
  y = y.replace({9: 0})

  y = y.astype(int)

  X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

  # Instantiate the SMOTE oversampler
  smotenc = SMOTENC(
      categorical_features=categorical_cols,
      sampling_strategy='auto',
      random_state=37
    )

  # Apply SMOTE to generate synthetic samples
  X_resampled, y_resampled = smotenc.fit_resample(X_train, y_train)

  scaler = StandardScaler()
  X_resampled.iloc[:, 28:] = scaler.fit_transform(X_resampled.iloc[:, 28:])
  X_val.iloc[:, 28:] = scaler.transform(X_val.iloc[:, 28:])

  X_resampled = np.array(X_resampled).reshape(X_resampled.shape[0], X_resampled.shape[1], 1)
  X_val = np.array(X_val).reshape(X_val.shape[0], X_val.shape[1], 1)

  print("Train: ", X_resampled.shape, " ", y_resampled.shape)
  print("Val: ", X_val.shape, " ", y_val.shape)
  print("Column Used: ", X.columns.tolist())

  return X_resampled, X_val, y_resampled, y_val

In [6]:
import tensorflow as tf
from tensorflow.keras import layers, models
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin

class CNNModel(BaseEstimator, ClassifierMixin):
  def __init__(
      self,
      input_shape,
      layer_metadata,
      convolution_part=['conv1d'],
      intermediate_part=['flatten'],
      dense_part=['dense'],
      convolution_hyperparameter=[{'filters':16, 'kernel_size':3, 'activation':'relu'}],
      intermediate_hyperparameter=[{}],
      dense_hyperparameter=[{'units':32, 'activation':'relu'}],
      optimizer='adam'
  ):
    self.input_shape = input_shape
    self.layer_metadata = layer_metadata
    self.convolution_part = convolution_part
    self.intermediate_part = intermediate_part
    self.dense_part = dense_part
    self.convolution_hyperparameter = convolution_hyperparameter
    self.intermediate_hyperparameter = intermediate_hyperparameter
    self.dense_hyperparameter = dense_hyperparameter
    self.optimizer = optimizer
    self.model = self.build_model(input_shape, convolution_part, intermediate_part, dense_part, convolution_hyperparameter, intermediate_hyperparameter, dense_hyperparameter, optimizer)

  def build_model(
      self,
      input_shape,
      convolution_part,
      intermediate_part,
      dense_part,
      convolution_hyperparameter,
      intermediate_hyperparameter,
      dense_hyperparameter,
      optimizer,
  ):

    def fix_hyperparameter_type(layer, hyperparameter, layer_metadata, warnings=True):
      for layer_info in layer_metadata:
        if(layer == layer_info['layer']):
          for params in layer_info['hyperparameter']:
            val = hyperparameter.get(params['param'], params['default'])
            if(type(val) != params['type']):
              if(warnings): print("Hyperparameter ", params['param'], " Have Invalid Data Type! Using Default..")
              hyperparameter[params['param']] = params['default']
            else:
              hyperparameter[params['param']] = val
        else:
          pass
      return hyperparameter

    model = models.Sequential()
    model.add(layers.Input(input_shape))

    # Convolution Part
    for layer, hyperparameter in zip(convolution_part, convolution_hyperparameter):
      hyperparameter = fix_hyperparameter_type(layer, hyperparameter, self.layer_metadata)

      if(layer == 'conv1d'):
        model.add(layers.Conv1D(filters=hyperparameter['filters'], kernel_size=hyperparameter['kernel_size'], activation=hyperparameter['activation']))
      elif(layer == 'maxpooling1d'):
        model.add(layers.MaxPooling1D(pool_size=hyperparameter['pool_size']))
      else:
        print("'Convolution Part' Layer Invalid!")
        return None

    # Intermediate Part
    for layer, hyperparameter in zip(intermediate_part, intermediate_hyperparameter):
      hyperparameter = fix_hyperparameter_type(layer, hyperparameter, self.layer_metadata)

      if(layer == 'flatten'):
        model.add(layers.Flatten())
      else:
        print("'Intermediate Part' Layer Invalid!")
        return None

    # Dense Part
    for layer, hyperparameter in zip(dense_part, dense_hyperparameter):
      hyperparameter = fix_hyperparameter_type(layer, hyperparameter, self.layer_metadata)

      if(layer == 'dense'):
        model.add(layers.Dense(units=hyperparameter['units'], activation=hyperparameter['activation']))
      else:
        print("'Dense Part' Layer Invalid!")
        return None

    model.add(layers.Dense(2, activation='softmax'))

    # Compile the model
    model.compile(optimizer=optimizer,
                loss='sparse_categorical_crossentropy',
                metrics=[
                    'accuracy'
                ])

    return model

  def fit(self, X_train, y_train, X_test, y_test, epochs, verbose=1, callbacks=[]):
    self.model.fit(X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), verbose=verbose, callbacks=callbacks)

  def score(self, X, y):
    loss, accuracy = self.model.evaluate(X, y, verbose=0)
    loss_inverse = 1/(loss+1e-20)

    return loss_inverse

  def evaluate(self, X, y, verbose=1):
    # Evaluate the model on the test set
    loss, accuracy = self.model.evaluate(X, y, verbose=0)

    if(verbose != 0):
      print(f"Loss: %.3f%%" % (loss*100) )
      print(f"Accuracy: %.3f%%" % (accuracy*100) )

    return loss, accuracy

  def summary(self):
    self.model.summary()

  def predict(self, X):
    return self.model.predict(X, verbose=0)

  def save(self, path, save_format='keras'):
    return self.model.save(path, save_format)

In [7]:
layer_metadata = [
  {'layer': 'conv1d', 'hyperparameter': [
      {'param':'filters', 'type':int, 'default':16},
      {'param':'kernel_size', 'type':int, 'default':3},
      {'param':'activation', 'type':str, 'default':'linear'}
    ]
  },
  {'layer': 'maxpooling1d', 'hyperparameter': [
      {'param':'pool_size', 'type':int, 'default':2}
    ]
  },
  {'layer': 'conv2d', 'hyperparameter': [
      {'param':'filters', 'type':int, 'default':16},
      {'param':'kernel_size', 'type':tuple, 'default':(3,3)},
      {'param':'activation', 'type':str, 'default':'linear'}
    ]
  },
  {'layer': 'maxpooling2d', 'hyperparameter': [
      {'param':'pool_size', 'type':tuple, 'default':(2,2)}
    ]
  },
  {'layer': 'flatten', 'hyperparameter': [

    ]
  },
  {'layer': 'dense', 'hyperparameter': [
      {'param':'units', 'type':int, 'default':8},
      {'param':'activation', 'type':str, 'default':'linear'}
    ]
  }
]

In [8]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score

class MetricsCallback(tf.keras.callbacks.Callback):
    def __init__(self, validation_data=()):
        super(MetricsCallback, self).__init__()
        self.validation_data = validation_data
        self.logs = {}

    def on_epoch_end(self, epoch, logs={}):
        if not self.validation_data:
            raise RuntimeError("Requires validation_data.")

        X_val, y_val = self.validation_data
        y_proba = self.model.predict(X_val, verbose=0)

        # Convert one-hot encoded labels to class labels
        y_pred = np.argmax(y_proba, axis=1)

        # Compute Metrics score
        f1score = f1_score(y_val, y_pred, average='weighted')
        roc_auc = roc_auc_score(y_val, y_pred, average='weighted')

        # Add Metrics score to logs
        self.logs['roc_auc'] = roc_auc
        self.logs['f1score'] = f1score

        # Calculate ROC curve
        fpr, tpr, thresholds = roc_curve(y_val, y_proba[:, 1])

        # Compute the optimal threshold by maximizing the Youden's J statistic
        J = tpr - fpr
        optimal_idx = np.argmax(J)
        optimal_threshold = thresholds[optimal_idx]

        # Alternatively, you can minimize the distance to the top-left corner (0,1)
        distances = np.sqrt(fpr**2 + (1-tpr)**2)
        optimal_idx = np.argmin(distances)
        optimal_threshold = thresholds[optimal_idx]

        y_pred_optimal_threshold = [1 if prob[1] >= optimal_threshold else 0 for prob in y_proba]

        # Compute Metrics score
        f1score_optimal_threshold = f1_score(y_val, y_pred_optimal_threshold, average='weighted')
        roc_auc_optimal_threshold = roc_auc_score(y_val, y_pred_optimal_threshold, average='weighted')
        accuracy_optimal_threshold = accuracy_score(y_val, y_pred_optimal_threshold)

        self.logs['roc_auc_optimal_threshold'] = roc_auc_optimal_threshold
        self.logs['f1score_optimal_threshold'] = f1score_optimal_threshold
        self.logs['accuracy_optimal_threshold'] = accuracy_optimal_threshold

In [9]:
tf.random.set_seed(37)
random.seed(37)

X_train, X_val, y_train, y_val = prep_data(df_raw)

log = None

schema_log = [
    row.to_dict()
    for index, row in
    log_raw[
        ['convolution_part', 'convolution_hyperparameter', 'dense_part', 'dense_hyperparameter', 'input_shape', 'layer_metadata']
    ].iterrows()
]

input_shape = X_train.shape[1:]
output_shape = 1

param_grid = {
  'n_conv': [0],
  'n_dense':[1, 2, 3, 4],
  'dense_layer':['dense'],
  'units': [4, 16, 32, 64, 128],
  'activation': ['relu']
}

def save_result(save_state, target_path):
  log.to_csv(target_path, index=False)

def get_schema_combination(part_name, n_layer, layer_values, layer_metadata, param_grid):
  schema_combination = []
  for n in n_layer:
    part_combination=itertools.product(layer_values, repeat=n)

    for part in part_combination:
      hyperparameters = []

      for selected_layer in part:

        # Get selected layers's possible hyperparameters
        hyperparameter_values = {}
        for layer in layer_metadata:
          if(selected_layer == layer['layer']):
            for params in layer['hyperparameter']:
              hyperparameter_values[params['param']] = [params['default']]
              if params['param'] in list(param_grid):
                hyperparameter_values[params['param']] = param_grid[params['param']]

        # get hyperparameter combination to grid search
        hyperparameter_combination = []
        for combination in list(itertools.product(*hyperparameter_values.values())):
          layer_hyperparameter = {}
          for key, val in zip(hyperparameter_values.keys(), combination):
            layer_hyperparameter[key] = val
          hyperparameter_combination.append(layer_hyperparameter)

        hyperparameters.append(hyperparameter_combination)

      for hyperparameter in list(itertools.product(*hyperparameters)):
        schema = {part_name+'_part': list(part), part_name+'_hyperparameter': list(hyperparameter)}
        schema_combination.append(schema)

  return schema_combination

conv_shcema = get_schema_combination(part_name='convolution',
                       n_layer = param_grid.get('n_conv', [1]),
                       layer_values = param_grid.get('conv_layer', ['conv1d']),
                       layer_metadata=layer_metadata,
                       param_grid=param_grid)

dense_schema = get_schema_combination(part_name='dense',
                       n_layer = param_grid.get('n_dense', [1]),
                       layer_values = param_grid.get('dense_layer', ['dense']),
                       layer_metadata=layer_metadata,
                       param_grid=param_grid)

model_schema_combination = []
for schema in itertools.product(conv_shcema, dense_schema):
  model_schema = {}
  for schema_dict in schema:
    model_schema.update(schema_dict)

  model_schema_combination.append(model_schema)

print(len(model_schema_combination))
print(model_schema_combination[0])
print(model_schema_combination[-1])

Train:  (7522, 85, 1)   (7522,)
Val:  (1671, 85, 1)   (1671,)
Column Used:  ['Dieta1_DRDINT', 'Dieta1_DR1TFIBE', 'Quest21_SLQ300', 'Quest19_PAD660', 'Quest19_PAQ635', 'Dieta1_DR1TCHOL', 'Quest19_PAQ655', 'Dieta1_DR1TSFAT', 'Dieta1_DR1TKCAL', 'Exami2_BMXBMI', 'Dieta1_DR1TSUGR', 'Quest21_SLQ320', 'Quest21_SLQ330', 'Quest19_PAD615', 'Quest21_SLD012', 'Quest21_SLD013', 'Dieta1_DR1DAY', 'Quest6_DED120', 'Quest19_PAQ610', 'Quest6_DED125', 'Dieta1_DR1TPROT', 'Quest19_PAQ640', 'Dieta1_DR1TPFAT', 'Dieta1_DR1TMFAT', 'Dieta1_DR1TCALC', 'Dieta1_DR1TCARB', 'Dieta1_DR1TTFAT', 'Quest19_PAD645', 'Exami1_BPXPLS', 'Demog1_RIDRETH3', 'Demog1_DMDHHSZA', 'Demog1_DMDHHSZE', 'Quest14_INQ020', 'Quest18_OCQ210', 'Demog1_INDIN2', 'Quest12_HEQ030', 'Quest22_SMQ900', 'Exami2_BMXHT', 'Quest3_CDQ009', 'Quest3_CDQ010', 'Exami2_BMXWT', 'Quest3_CDQ008', 'Quest20_PFQ061H', 'Quest7_DIQ010', 'Quest20_PFQ061B', 'Labor1_LBDTCSI', 'Quest17_DPQ040', 'Demog1_RIAGENDR', 'Labor2_URDTIME1', 'Quest22_SMQ890', 'Demog1_DMDMARTL', '

In [12]:
import time

random.seed(37)

skipped = 0
# Randomized to get 30 sample
for i, model_schema in enumerate(random.sample(model_schema_combination, min(len(model_schema_combination), 30))):
  start_time = time.time()
  model_schema['input_shape'] = input_shape
  model_schema['layer_metadata'] = layer_metadata

  print(model_schema)

  loss = []
  accuracy = []
  precision = []
  recall = []
  f1score = []
  auc = []
  accuracy_optimal_threshold = []
  f1score_optimal_threshold = []
  auc_optimal_threshold = []
  train_loss = []
  train_accuracy = []
  train_precision = []
  train_recall = []
  train_f1score = []
  train_auc = []
  train_accuracy_optimal_threshold = []
  train_f1score_optimal_threshold = []
  train_auc_optimal_threshold = []

  model = CNNModel(**model_schema)
  model.summary()

  print("Training Model...")
  for X_train, X_val, y_train, y_val in prep_data_cross_validation(df_raw, num_folds=4):

    model = CNNModel(**model_schema)
    metrics_callback = MetricsCallback(validation_data=(X_val, y_val))
    train_metrics_callback = MetricsCallback(validation_data=(X_train, y_train))

    model.fit(X_train, y_train, X_val, y_val, epochs=20, verbose=0, callbacks=[metrics_callback, train_metrics_callback])
    _loss, _accuracy = model.evaluate(X_val, y_val, verbose=0)
    loss.append(_loss)
    accuracy.append(_accuracy)
    f1score.append(metrics_callback.logs.get('f1score'))
    auc.append(metrics_callback.logs.get('roc_auc'))
    accuracy_optimal_threshold.append(metrics_callback.logs.get('accuracy_optimal_threshold'))
    f1score_optimal_threshold.append(metrics_callback.logs.get('f1score_optimal_threshold'))
    auc_optimal_threshold.append(metrics_callback.logs.get('roc_auc_optimal_threshold'))

    _train_loss, _train_accuracy = model.evaluate(X_train, y_train, verbose=0)
    train_loss.append(_train_loss)
    train_accuracy.append(_train_accuracy)
    train_f1score.append(train_metrics_callback.logs.get('f1score'))
    train_auc.append(train_metrics_callback.logs.get('roc_auc'))
    train_accuracy_optimal_threshold.append(train_metrics_callback.logs.get('accuracy_optimal_threshold'))
    train_f1score_optimal_threshold.append(train_metrics_callback.logs.get('f1score_optimal_threshold'))
    train_auc_optimal_threshold.append(train_metrics_callback.logs.get('roc_auc_optimal_threshold'))

  print(f"Loss: %.3f%%" %  (np.mean(loss)*100) )
  print(f"Accuracy: %.3f%%" % (np.mean(accuracy)*100) )
  print(f"F1 Score: %.3f%%" % (np.mean(f1score)*100) )
  print(f"AUC: %.3f%%" % (np.mean(auc)*100) )
  print(f"Accuracy Optimal Threshold: %.3f%%" % (np.mean(accuracy_optimal_threshold)*100) )
  print(f"F1 Score Optimal Threshold: %.3f%%" % (np.mean(f1score_optimal_threshold)*100) )
  print(f"AUC Optimal Threshold: %.3f%%" % (np.mean(auc_optimal_threshold)*100) )

  loss, accuracy, f1score, auc, accuracy_optimal_threshold, f1score_optimal_threshold, auc_optimal_threshold = np.mean(loss), np.mean(accuracy), np.mean(f1score), np.mean(auc), np.mean(accuracy_optimal_threshold), np.mean(f1score_optimal_threshold), np.mean(auc_optimal_threshold)
  train_loss, train_accuracy, train_f1score, train_auc, train_accuracy_optimal_threshold, train_f1score_optimal_threshold, train_auc_optimal_threshold = np.mean(train_loss), np.mean(accuracy), np.mean(train_f1score), np.mean(train_auc), np.mean(train_accuracy_optimal_threshold), np.mean(train_f1score_optimal_threshold), np.mean(train_auc_optimal_threshold)

  end_time = time.time()
  print("Duration (Second):", (end_time - start_time))
  save_state = {}
  save_state.update(model_schema)
  save_state.update({'duration': (end_time - start_time)})
  save_state.update({'loss': loss, 'accuracy': accuracy, 'f1score': f1score, 'auc': auc, 'accuracy_optimal_threshold': accuracy_optimal_threshold, 'f1score_optimal_threshold':f1score_optimal_threshold, 'auc_optimal_threshold':auc_optimal_threshold})
  save_state.update({'train_loss': train_loss, 'train_accuracy': train_accuracy, 'train_f1score': train_f1score, 'train_auc': train_auc, 'train_accuracy_optimal_threshold': train_accuracy_optimal_threshold, 'train_f1score_optimal_threshold':train_f1score_optimal_threshold, 'train_auc_optimal_threshold':train_auc_optimal_threshold})

  log = pd.concat([log, pd.DataFrame({key: [value] for key, value in save_state.items()})])

  save_result(log, drive_path+"/Notebook/Arif's Workspace/Model Comparison/hyperparameter_tuning_log.csv")

{'convolution_part': [], 'convolution_hyperparameter': [], 'dense_part': ['dense', 'dense', 'dense', 'dense'], 'dense_hyperparameter': [{'units': 128, 'activation': 'relu'}, {'units': 16, 'activation': 'relu'}, {'units': 64, 'activation': 'relu'}, {'units': 64, 'activation': 'relu'}], 'input_shape': (85, 1), 'layer_metadata': [{'layer': 'conv1d', 'hyperparameter': [{'param': 'filters', 'type': <class 'int'>, 'default': 16}, {'param': 'kernel_size', 'type': <class 'int'>, 'default': 3}, {'param': 'activation', 'type': <class 'str'>, 'default': 'linear'}]}, {'layer': 'maxpooling1d', 'hyperparameter': [{'param': 'pool_size', 'type': <class 'int'>, 'default': 2}]}, {'layer': 'conv2d', 'hyperparameter': [{'param': 'filters', 'type': <class 'int'>, 'default': 16}, {'param': 'kernel_size', 'type': <class 'tuple'>, 'default': (3, 3)}, {'param': 'activation', 'type': <class 'str'>, 'default': 'linear'}]}, {'layer': 'maxpooling2d', 'hyperparameter': [{'param': 'pool_size', 'type': <class 'tuple'

In [17]:
hp_log_sorted = pd.read_csv( drive_path+"/Notebook/Arif's Workspace/Model Comparison/hyperparameter_tuning_log.csv")
hp_log_sorted['avg_metrics'] = (hp_log_sorted['accuracy_optimal_threshold'] + hp_log_sorted['f1score_optimal_threshold'] + hp_log_sorted['auc_optimal_threshold']) / 3

hp_log_sorted = hp_log_sorted.sort_values(by=['avg_metrics', 'accuracy_optimal_threshold', 'f1score_optimal_threshold', 'auc_optimal_threshold'], ascending=[False, False, False, False])

hp_log_sorted

Unnamed: 0,convolution_part,convolution_hyperparameter,dense_part,dense_hyperparameter,input_shape,layer_metadata,duration,loss,accuracy,f1score,...,f1score_optimal_threshold,auc_optimal_threshold,train_loss,train_accuracy,train_f1score,train_auc,train_accuracy_optimal_threshold,train_f1score_optimal_threshold,train_auc_optimal_threshold,avg_metrics
1,[],[],"['dense', 'dense', 'dense', 'dense']","[{'units': 64, 'activation': 'relu'}, {'units'...","(85, 1)","[{'layer': 'conv1d', 'hyperparameter': [{'para...",182.060262,0.507082,0.940204,0.942976,...,0.86549,0.789361,0.002373,0.940204,0.999473,0.999473,0.999659,0.999659,0.999659,0.820426
23,[],[],"['dense', 'dense', 'dense']","[{'units': 4, 'activation': 'relu'}, {'units':...","(85, 1)","[{'layer': 'conv1d', 'hyperparameter': [{'para...",163.559588,0.250262,0.922607,0.931561,...,0.863238,0.770089,0.088941,0.922607,0.975189,0.97519,0.976712,0.976711,0.976712,0.812354
3,[],[],"['dense', 'dense', 'dense', 'dense']","[{'units': 64, 'activation': 'relu'}, {'units'...","(85, 1)","[{'layer': 'conv1d', 'hyperparameter': [{'para...",176.472398,0.419361,0.941641,0.942325,...,0.870992,0.741619,0.007524,0.941641,0.998665,0.998665,0.998696,0.998696,0.998696,0.81036
8,[],[],"['dense', 'dense', 'dense']","[{'units': 4, 'activation': 'relu'}, {'units':...","(85, 1)","[{'layer': 'conv1d', 'hyperparameter': [{'para...",173.386961,0.245717,0.928712,0.935931,...,0.856628,0.775347,0.090189,0.928712,0.973858,0.973859,0.974542,0.974542,0.974542,0.80837
27,[],[],"['dense', 'dense', 'dense', 'dense']","[{'units': 16, 'activation': 'relu'}, {'units'...","(85, 1)","[{'layer': 'conv1d', 'hyperparameter': [{'para...",175.815647,0.293654,0.939665,0.942591,...,0.868137,0.741633,0.039424,0.939665,0.990092,0.990092,0.99062,0.99062,0.99062,0.808088
14,[],[],"['dense', 'dense', 'dense', 'dense']","[{'units': 128, 'activation': 'relu'}, {'units...","(85, 1)","[{'layer': 'conv1d', 'hyperparameter': [{'para...",163.674135,0.52268,0.93356,0.937885,...,0.851108,0.775958,0.003819,0.93356,0.998759,0.998759,0.999379,0.999379,0.999379,0.803982
22,[],[],"['dense', 'dense', 'dense']","[{'units': 128, 'activation': 'relu'}, {'units...","(85, 1)","[{'layer': 'conv1d', 'hyperparameter': [{'para...",179.881797,0.482417,0.94649,0.944591,...,0.844323,0.780676,5.2e-05,0.94649,1.0,1.0,1.0,1.0,1.0,0.800419
17,[],[],"['dense', 'dense', 'dense', 'dense']","[{'units': 16, 'activation': 'relu'}, {'units'...","(85, 1)","[{'layer': 'conv1d', 'hyperparameter': [{'para...",169.263867,0.249476,0.941283,0.940896,...,0.84571,0.772042,0.037509,0.941283,0.989286,0.989286,0.990652,0.990652,0.990652,0.798005
16,[],[],"['dense', 'dense', 'dense', 'dense']","[{'units': 64, 'activation': 'relu'}, {'units'...","(85, 1)","[{'layer': 'conv1d', 'hyperparameter': [{'para...",170.158511,0.492403,0.942539,0.944577,...,0.846244,0.764469,0.001039,0.942539,0.999689,0.999689,0.999782,0.999782,0.999782,0.796614
21,[],[],"['dense', 'dense', 'dense', 'dense']","[{'units': 32, 'activation': 'relu'}, {'units'...","(85, 1)","[{'layer': 'conv1d', 'hyperparameter': [{'para...",184.567027,0.368777,0.939487,0.942011,...,0.840802,0.778262,0.010563,0.939487,0.996553,0.996553,0.997018,0.997018,0.997018,0.796166


In [18]:
# Top 3 Model

for i in range(3):
  print("Best Model #", i+1)
  print("-"*10)
  pprint.pprint(hp_log_sorted.iloc[-i-1].to_dict())
  print("-"*10)

Best Model # 1
----------
{'accuracy': 0.9335605800151824,
 'accuracy_optimal_threshold': 0.7300984344959609,
 'auc': 0.617797987994414,
 'auc_optimal_threshold': 0.7135034447725397,
 'avg_metrics': 0.7524480189945075,
 'convolution_hyperparameter': '[]',
 'convolution_part': '[]',
 'dense_hyperparameter': "[{'units': 16, 'activation': 'relu'}, {'units': 64, "
                         "'activation': 'relu'}, {'units': 32, 'activation': "
                         "'relu'}, {'units': 128, 'activation': 'relu'}]",
 'dense_part': "['dense', 'dense', 'dense', 'dense']",
 'duration': 173.2511863708496,
 'f1score': 0.9386478497205052,
 'f1score_optimal_threshold': 0.8137421777150223,
 'input_shape': '(85, 1)',
 'layer_metadata': "[{'layer': 'conv1d', 'hyperparameter': [{'param': "
                   "'filters', 'type': <class 'int'>, 'default': 16}, "
                   "{'param': 'kernel_size', 'type': <class 'int'>, 'default': "
                   "3}, {'param': 'activation', 'type': <class

In [19]:
# Bottom 3 Model

for i in range(3):
  print("Worst Model #", i+1)
  print("-"*10)
  pprint.pprint(hp_log_sorted.iloc[i].to_dict())
  print("-"*10)

Worst Model # 1
----------
{'accuracy': 0.9402037560939788,
 'accuracy_optimal_threshold': 0.80642642605474,
 'auc': 0.6199351362289272,
 'auc_optimal_threshold': 0.7893607189914458,
 'avg_metrics': 0.8204256644844793,
 'convolution_hyperparameter': '[]',
 'convolution_part': '[]',
 'dense_hyperparameter': "[{'units': 64, 'activation': 'relu'}, {'units': 64, "
                         "'activation': 'relu'}, {'units': 64, 'activation': "
                         "'relu'}, {'units': 32, 'activation': 'relu'}]",
 'dense_part': "['dense', 'dense', 'dense', 'dense']",
 'duration': 182.060261964798,
 'f1score': 0.9429761657984452,
 'f1score_optimal_threshold': 0.8654898484072522,
 'input_shape': '(85, 1)',
 'layer_metadata': "[{'layer': 'conv1d', 'hyperparameter': [{'param': "
                   "'filters', 'type': <class 'int'>, 'default': 16}, "
                   "{'param': 'kernel_size', 'type': <class 'int'>, 'default': "
                   "3}, {'param': 'activation', 'type': <class '