# Original Dataset - ML Test

In [52]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers

import sqlalchemy
from sqlalchemy import create_engine, inspect

import pandas as pd
import tensorflow as tf
import keras_tuner as kt
from pprint import pprint

import os
import time
from datetime import datetime

import numpy as np
from joblib import dump, load
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score

from imblearn.over_sampling import SMOTE
import json
import pickle

%run functions.ipynb

In [2]:
# Record the start time so the total runtime can be stored with the
# results in the performance tracker at the end of the notebook
start_time = time.time()

In [3]:
# Pause the run if an "untitled_project" folder exists in the working directory.
# The prompt only blocks until Enter is pressed; it does not delete anything,
# so the folder has to be removed manually before continuing.
# NOTE(review): presumably this is the Keras Tuner project folder (the tuner in
# this notebook is created without directory/project_name) - confirm.
if os.path.exists("untitled_project"):
    input("DELETE untitled_project")

## Import datasets

In [57]:
# Connect to the SQLite database file that holds the data
engine = create_engine("sqlite:///voice.sqlite")

# List the names of all tables in the database
inspector = inspect(engine)
table_names = inspector.get_table_names()
table_names

['alexaval',
 'alexbval',
 'alexgval',
 'alexrval',
 'aval',
 'bval',
 'chroma1',
 'chroma10',
 'chroma11',
 'chroma12',
 'chroma2',
 'chroma3',
 'chroma4',
 'chroma5',
 'chroma6',
 'chroma7',
 'chroma8',
 'chroma9',
 'chromastd',
 'deltachroma1',
 'deltachroma10',
 'deltachroma11',
 'deltachroma12',
 'deltachroma2',
 'deltachroma3',
 'deltachroma4',
 'deltachroma5',
 'deltachroma6',
 'deltachroma7',
 'deltachroma8',
 'deltachroma9',
 'deltachromastd',
 'deltaenergy',
 'deltaenergyentropy',
 'deltamfcc1',
 'deltamfcc10',
 'deltamfcc11',
 'deltamfcc12',
 'deltamfcc13',
 'deltamfcc2',
 'deltamfcc3',
 'deltamfcc4',
 'deltamfcc5',
 'deltamfcc6',
 'deltamfcc7',
 'deltamfcc8',
 'deltamfcc9',
 'deltaspectralcentroid',
 'deltaspectralentropy',
 'deltaspectralflux',
 'deltaspectralrolloff',
 'deltaspectralspread',
 'deltazcr',
 'demographic',
 'diagnosis',
 'energy',
 'energyentropy',
 'gval',
 'habits',
 'mfcc1',
 'mfcc10',
 'mfcc11',
 'mfcc12',
 'mfcc13',
 'mfcc2',
 'mfcc3',
 'mfcc4',
 'mfcc5

In [58]:
# Tables to import (only these three are read into dataframes below)
import_tables = ['diagnosis', 'demographic', 'habits']

In [59]:
# Read every selected table in full and keep it under the key '<table>_df'
dataframes = {
    f'{table_name}_df': pd.read_sql(f'SELECT * FROM {table_name}', engine)
    for table_name in import_tables
}

In [60]:
# Inner-join the three tables on the shared 'id' column, in the order
# demographic -> diagnosis -> habits
merged_df = (
    dataframes['demographic_df']
    .merge(dataframes['diagnosis_df'], how='inner', on='id')
    .merge(dataframes['habits_df'], how='inner', on='id')
)

# Preview the first rows of the merged data
merged_df.head()

Unnamed: 0,id,age,gender,occupation_status,diagnosis,subtype,vhi_score,rsi_score,reflux_indicated,vhi_zscore,...,chocolate,chocolate_grams_pd,coffee,coffee_pd,citrus_fruits,citrus_fruits_pd,soft_cheese,soft_cheese_pd,tomatoes,water_litres_pd
0,voice100,24,m,unknown,healthy,no subtype,0,5,0,-0.58,...,sometimes,30,always,3,never,0.0,almost always,100,never,1.5
1,voice101,60,m,unknown,healthy,no subtype,80,10,0,4.76,...,sometimes,30,always,4,never,0.0,sometimes,100,sometimes,1.5
2,voice192,22,m,cook,hyperkinetic dysphonia,no subtype,0,10,0,-0.58,...,always,14,always,3,almost always,1.17,sometimes,100,sometimes,2.5
3,voice193,46,f,housewife,hyperkinetic dysphonia,no subtype,0,36,1,-0.58,...,sometimes,30,always,2,sometimes,1.0,sometimes,100,sometimes,1.0
4,voice008,51,f,researcher,reflux laryngitis,no subtype,19,15,1,0.68,...,almost always,20,always,2,almost always,1.0,sometimes,100,almost always,1.0


In [61]:
# Count how many records fall under each diagnosis label
merged_df['diagnosis'].value_counts()

diagnosis
hyperkinetic dysphonia    70
healthy                   57
hypokinetic dysphonia     39
reflux laryngitis         38
Name: count, dtype: int64

## Preprocessing

### Separate the target and feature variables

In [62]:
# Drop the 'id' column so the record identifier is not carried into the features
no_id_df = merged_df.drop(columns=['id'])

In [63]:
# Define the target variables (both label columns are kept out of the features)
target_var = ['diagnosis', 'subtype']
y = no_id_df[target_var]

# Define the feature variables: every column except the two label columns
X = no_id_df.drop(columns=target_var)

### Binary classification - `diagnosis`

In [64]:
# Encode the target variable, ignore subtype.
# encode_binary is not defined in this notebook (presumably loaded by
# `%run functions.ipynb`); in the recorded output the 'healthy' rows become 0
# and the other diagnoses become 1.
# NOTE: y is overwritten with a Series here, so this cell cannot be re-run
# without first re-running the cell that defines y.
y = y['diagnosis'].apply(encode_binary)
y

0      0
1      0
2      1
3      1
4      1
      ..
199    0
200    1
201    1
202    0
203    0
Name: diagnosis, Length: 204, dtype: int64

### Bin `occupation_status` column

In [65]:
# Use limit_unique() function to bin the column.
# limit_unique is defined outside this notebook section; judging by the
# recorded output it reports the value counts and the number of unique values.
# NOTE(review): the return value is not assigned, so this presumably modifies
# X in place - confirm against functions.ipynb.
limit_unique(X, 10, ['occupation_status'])

occupation_status
unknown               41
researcher            41
other                 25
employee              25
housewife             23
student               16
technical operator    12
singer                10
pensioner              6
doctor                 5
Name: count, dtype: int64
Number of unique values: 10



### Encode feature columns

#### Encoding
- `smoker` column
	- `0` for `no`
	- `1` for `casual`
	- `2` for `yes`
- `alcohol_consumption` column
	- `0` for `nondrinker`
	- `1` for `casual`
	- `2` for `habitual`
- `carbonated_beverages`, `tomatoes`, `coffee`, `chocolate`, `soft_cheese`, `citrus_fruits` columns
	- `0` for `never`
	- `1` for `almost never`
	- `2` for `sometimes`
	- `3` for `almost always`
	- `4` for `always`

In [66]:
# Ordinal encodings: each category label is mapped to an integer code
smoker_map = {'no': 0, 'casual': 1, 'yes': 2}

alcohol_map = {'nondrinker': 0, 'casual': 1, 'habitual': 2}

# Frequency scale used for the habit columns, numbered in list order
habit_map = {
    level: code
    for code, level in enumerate(
        ['never', 'almost never', 'sometimes', 'almost always', 'always']
    )
}

In [67]:
# Apply the label encoding using the maps.

# Habit columns (all share habit_map)
habit_cols = [
    'carbonated_beverages', 'tomatoes',
    'coffee', 'chocolate',
    'soft_cheese', 'citrus_fruits'
]

# Column -> mapping lookup for every label-encoded feature
ordinal_maps = {
    'smoker': smoker_map,
    'alcohol_consumption': alcohol_map,
    **{habit: habit_map for habit in habit_cols}
}

for column, mapping in ordinal_maps.items():
    encoded = X[column].map(mapping)

    # Series.map() turns every value that is missing from the mapping into
    # NaN. Fail loudly instead of silently corrupting the feature, e.g. when
    # the data holds an unexpected category or this cell is re-run on values
    # that are already encoded.
    unmapped = X.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column '{column}' has values with no encoding: {list(unmapped)}"
        )

    X[column] = encoded

In [68]:
# Remove the occupation_status column
# X = X.drop(columns=['occupation_status'])

In [69]:
# Categorical columns to expand into indicator (0/1) columns
categorical_hot = ['gender', 'occupation_status']
# categorical_hot = ['gender']

# One-hot encode them and store the indicators as integers
encoded_columns = pd.get_dummies(X[categorical_hot]).astype(int)

# Replace the original categorical columns with their indicator columns
X = pd.concat([X.drop(columns=categorical_hot), encoded_columns], axis=1)

# Fold the 'unknown' occupation into 'other', then remove the
# 'occupation_status_unknown' indicator
is_unknown = X['occupation_status_unknown'] == 1
X.loc[is_unknown, 'occupation_status_other'] = 1
X = X.drop(columns=['occupation_status_unknown'])

In [70]:
# Print the column dtypes and non-null counts, then preview the first rows
X.info()
X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 33 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   age                                   204 non-null    int64  
 1   vhi_score                             204 non-null    int64  
 2   rsi_score                             204 non-null    int64  
 3   reflux_indicated                      204 non-null    int64  
 4   vhi_zscore                            204 non-null    float64
 5   vhi_impact                            204 non-null    int64  
 6   alcohol_consumption                   204 non-null    int64  
 7   alcohol_pd                            204 non-null    float64
 8   smoker                                204 non-null    int64  
 9   cigarettes_pd                         204 non-null    int64  
 10  carbonated_beverages                  204 non-null    int64  
 11  carbonated_pd      

Unnamed: 0,age,vhi_score,rsi_score,reflux_indicated,vhi_zscore,vhi_impact,alcohol_consumption,alcohol_pd,smoker,cigarettes_pd,...,gender_m,occupation_status_doctor,occupation_status_employee,occupation_status_housewife,occupation_status_other,occupation_status_pensioner,occupation_status_researcher,occupation_status_singer,occupation_status_student,occupation_status_technical operator
0,24,0,5,0,-0.58,0,1,0.36,0,0,...,1,0,0,0,1,0,0,0,0,0
1,60,80,10,0,4.76,4,0,0.0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,22,0,10,0,-0.58,0,0,0.0,0,0,...,1,0,0,0,1,0,0,0,0,0
3,46,0,36,1,-0.58,0,1,0.36,2,15,...,0,0,0,1,0,0,0,0,0,0
4,51,19,15,1,0.68,1,1,0.36,0,0,...,0,0,0,0,0,0,1,0,0,0


In [71]:
# Metadata to export: the feature column order and the encoding maps
# that were applied during preprocessing
output_dict = {
    'feature_names': X.columns.tolist(),
    'smoker_map': smoker_map,
    'alcohol_map': alcohol_map,
    'habit_map': habit_map
}

# Serialise the metadata to a JSON string
json_data = json.dumps(output_dict)

# Destination file inside the app's assets folder
file_path = '../voice_app/assets/model_meta.json'

# Write the JSON string to the file
with open(file_path, 'w') as json_file:
    json_file.write(json_data)

### Split and Scale

In [19]:
# Split the preprocessed data to training and testing datasets.
# stratify=y keeps the class proportions the same in both subsets, and the
# fixed random_state makes the split itself reproducible when the notebook is
# re-run from a fresh kernel (without it every run evaluates on a different
# test set).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify = y,
    random_state = 42
)

In [20]:
# Fit a StandardScaler on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(subset) for subset in (X_train, X_test)
)

In [21]:
# # Instantiate SMOTE
# smote = SMOTE()

# # Fit and transform the data
# X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

In [22]:
# Save the fitted scaler to the app's assets folder with joblib
dump(X_scaler, '../voice_app/assets/scaler.joblib')

['../voice_app/assets/scaler.joblib']

## Hyperparameter Tuning

In [23]:
# ---- Tuning search space ----
# Number of input features (one per column of the training data)
number_input_features = X_train.shape[1]

# Maximum hidden layers (min. 2 for DL)
max_hidden_layers = 5

# Maximum neurons per hidden layer
max_num_neurons = 2 * number_input_features - 1

# Step count
step_count = 5

# Activation functions to try in the hidden layers
activation_functions = [
    'relu', 'leaky_relu', 'tanh', 'elu',
    'selu', 'exponential', 'softmax', 'softplus'
]

# ---- Output layer ----
output_layer_neurons = 1 # 4
output_layer_activation = 'sigmoid' # 'softmax'

# ---- Model compilation ----
compile_loss = "binary_crossentropy"
# compile_loss = "sparse_categorical_crossentropy"
compile_opt = "adam"

# ---- Epoch limits for the tuner and for the search ----
tuner_max_epochs = 20
search_max_epochs = 20

# Hyperband iterations
hp_iterations = 2

# Kernel regularizer applied to the hidden layers
reg_kernel = regularizers.L1(0.01)

In [24]:
# Build a short text description of the regularizer for the performance tracker
class_name = type(reg_kernel).__name__
reg_config = reg_kernel.get_config()

if class_name != "L1L2":
    # Single-penalty regularizer: the config key is the lower-cased class name
    param_value = reg_config[class_name.lower()]
    reg_kernel_string = f"{class_name}({param_value:.3f})"

else:
    # Combined regularizer: report both penalty values
    value_l1 = reg_config['l1']
    value_l2 = reg_config['l2']
    reg_kernel_string = f"{class_name}(l1={value_l1:.3f})(l2={value_l2:.3f})"

print(reg_kernel_string)

L1(0.010)


In [25]:
# Initialise the Hyperband tuner.
# No directory/project_name is given, so the tuner uses its default project
# folder. overwrite=True discards any trials saved there by an earlier run;
# without it the tuner resumes from those stale results instead of running a
# fresh search on the current data split.
tuner = kt.Hyperband(
    create_model,
    objective = "val_accuracy",
    max_epochs = tuner_max_epochs,
    hyperband_iterations = hp_iterations,
    overwrite = True
)

In [26]:
# Find the best hyperparameters.
# NOTE(review): the test split is passed as the validation data, so the
# hyperparameters are selected on the same samples that are later used for the
# final evaluation. The reported test metrics are therefore likely optimistic;
# consider holding out a separate validation split for the search.
tuner.search(
    X_train_scaled,
    y_train,
    epochs = search_max_epochs,
    validation_data = (X_test_scaled, y_test)
)
# tuner.search(
#     X_resampled,
#     y_resampled,
#     epochs = search_max_epochs,
#     validation_data = (X_test_scaled, y_test)
# )

Trial 60 Complete [00h 00m 01s]
val_accuracy: 0.7254902124404907

Best val_accuracy So Far: 0.8627451062202454
Total elapsed time: 00h 00m 32s


## Compile, Train, Evaluate the Best Model

In [27]:
# Retrieve the top 3 hyperparameter sets found by the search
top3_hyper = tuner.get_best_hyperparameters(3)

# Pretty-print the values of each set
for hyper_set in top3_hyper:
    pprint(hyper_set.values)

{'activation_layer_0': 'exponential',
 'activation_layer_1': 'leaky_relu',
 'activation_layer_2': 'leaky_relu',
 'activation_layer_3': 'tanh',
 'activation_layer_4': 'selu',
 'num_layers': 1,
 'tuner/bracket': 1,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 7,
 'tuner/round': 1,
 'tuner/trial_id': '0053',
 'units_layer_0': 51,
 'units_layer_1': 41,
 'units_layer_2': 6,
 'units_layer_3': 1,
 'units_layer_4': 31}
{'activation_layer_0': 'exponential',
 'activation_layer_1': 'elu',
 'activation_layer_2': 'relu',
 'activation_layer_3': 'elu',
 'activation_layer_4': 'softmax',
 'num_layers': 1,
 'tuner/bracket': 1,
 'tuner/epochs': 7,
 'tuner/initial_epoch': 0,
 'tuner/round': 0,
 'units_layer_0': 51,
 'units_layer_1': 26,
 'units_layer_2': 46,
 'units_layer_3': 31,
 'units_layer_4': 56}
{'activation_layer_0': 'selu',
 'activation_layer_1': 'tanh',
 'activation_layer_2': 'exponential',
 'activation_layer_3': 'elu',
 'activation_layer_4': 'selu',
 'num_layers': 4,
 'tuner/bracket': 2,
 'tuner

In [28]:
# Keep the first entry of the top-3 list as the hyperparameters to train with
best_hyper = top3_hyper[0]
best_hyper.values

{'activation_layer_0': 'exponential',
 'units_layer_0': 51,
 'num_layers': 1,
 'units_layer_1': 41,
 'activation_layer_1': 'leaky_relu',
 'units_layer_2': 6,
 'activation_layer_2': 'leaky_relu',
 'units_layer_3': 1,
 'activation_layer_3': 'tanh',
 'units_layer_4': 31,
 'activation_layer_4': 'selu',
 'tuner/epochs': 20,
 'tuner/initial_epoch': 7,
 'tuner/bracket': 1,
 'tuner/round': 1,
 'tuner/trial_id': '0053'}

In [29]:
# Define the model parameters from the selected hyperparameters
number_input_features = len(X_train.columns)
hidden_layer0_neurons = best_hyper.values['units_layer_0']
hidden_layer0_activation = best_hyper.values['activation_layer_0']

# Number of hidden layers to build: the tuned 'num_layers' value plus one,
# because hidden layer 0 is always added in addition to layers 1..num_layers
total_hidden = best_hyper.values['num_layers'] + 1

# Number of epochs for the final training run
model_train_epochs = 100

In [30]:
# Collect the (units, activation) pair of every hidden layer after the first
extra_layers = [
    (
        best_hyper.values[f'units_layer_{layer}'],
        best_hyper.values[f'activation_layer_{layer}']
    )
    for layer in range(1, total_hidden)
]

# Start from an empty sequential model
nn = Sequential()

# First hidden layer; input_dim sets the number of input features
nn.add(Dense(
    units = hidden_layer0_neurons,
    activation = hidden_layer0_activation,
    kernel_regularizer = reg_kernel,
    input_dim = number_input_features
))

# Remaining hidden layers, each with the same kernel regularizer
for layer_units, layer_activation in extra_layers:
    nn.add(Dense(
        units = layer_units,
        activation = layer_activation,
        kernel_regularizer = reg_kernel
    ))

# Output layer (no regularizer)
nn.add(Dense(
    units = output_layer_neurons,
    activation = output_layer_activation
))

# Print the layer structure and parameter counts
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 51)                1734      
                                                                 
 dense_4 (Dense)             (None, 41)                2132      
                                                                 
 dense_5 (Dense)             (None, 1)                 42        
                                                                 
Total params: 3908 (15.27 KB)
Trainable params: 3908 (15.27 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [31]:
# Compile the model with the loss and optimizer defined in the tuning
# configuration, reporting accuracy as the metric
nn.compile(
    loss = compile_loss,
    optimizer = compile_opt,
    metrics = ["accuracy"]
)

In [32]:
# Train the model on the scaled training data for model_train_epochs epochs.
# No validation data is passed to this final fit.
fit_model = nn.fit(
    X_train_scaled,
    y_train,
    shuffle = True,
    epochs = model_train_epochs,
    verbose = 1
)
# fit_model = nn.fit(
#     X_resampled,
#     y_resampled,
#     shuffle = True,
#     epochs = model_train_epochs,
#     verbose = 1
# )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Evaluate the Model Results

In [33]:
# Evaluate the model using the test data; the two returned values are the
# loss and the accuracy metric the model was compiled with
model_loss, model_accuracy = nn.evaluate(
    X_test_scaled,
    y_test,
    verbose = 2
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2/2 - 0s - loss: 1.1740 - accuracy: 0.8235 - 46ms/epoch - 23ms/step
Loss: 1.1740237474441528, Accuracy: 0.8235294222831726


In [34]:
# Model output probabilities for the test samples
predicted_prob = nn.predict(X_test_scaled)

# Copy rounded to two decimals, used for display only
clean_prob = np.round(predicted_prob, 2)

# Predicted labels: round each probability to the nearest integer, as a 1-D array
clean_predicted = np.round(predicted_prob).astype(int).reshape(-1)

# Table of actual label, predicted label and probability, for readability
output_prob = pd.DataFrame({
    'Actual': y_test,
    'Predicted': clean_predicted,
    'Probability': clean_prob.reshape(-1)
})

output_prob.head(10)



Unnamed: 0,Actual,Predicted,Probability
22,1,1,0.66
73,1,1,0.98
33,1,1,0.98
31,1,1,0.89
182,1,1,0.61
10,1,1,0.9
153,1,1,0.75
121,1,1,0.89
20,0,0,0.42
64,1,1,0.89


In [35]:
# Compute the confusion matrix from the actual and predicted labels
cmatrix = confusion_matrix(y_test, clean_predicted)

# Wrap it in a DataFrame with labelled rows (actual) and columns (predicted)
class_labels = [0, 1]
cmatrix_df = pd.DataFrame(
    cmatrix,
    index = [f"Actual {label}" for label in class_labels],
    columns = [f"Predicted {label}" for label in class_labels]
)

# Show the labelled matrix
print("Confusion Matrix:")
cmatrix_df

Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,7,7
Actual 1,2,35


In [36]:
# Build the classification report for the model, naming class 0 "Healthy"
# and class 1 "Pathological"
creport = classification_report(
    y_test,
    clean_predicted,
    target_names = ["Healthy (0)", "Pathological (1)"]
)

print("Classification Report:\n", creport)

Classification Report:
                   precision    recall  f1-score   support

     Healthy (0)       0.78      0.50      0.61        14
Pathological (1)       0.83      0.95      0.89        37

        accuracy                           0.82        51
       macro avg       0.81      0.72      0.75        51
    weighted avg       0.82      0.82      0.81        51



In [37]:
# Extract the precision and recall. With average=None the scores come back
# per class (index 0 and index 1), which is how they are stored in the
# results dictionary later.
precision = precision_score(y_test, clean_predicted, average=None)
recall = recall_score(y_test, clean_predicted, average=None)
print(precision, recall)

[0.77777778 0.83333333] [0.5        0.94594595]


## Save Results to Performance Tracker

In [38]:
# Timestamp of this run
current_time = datetime.now()

# Start the results record with the run timing and the evaluation metrics
results_dict = {
    'timestamp': current_time,
    'runtime': time.time() - start_time,
    'model_loss': model_loss,
    'model_accuracy': model_accuracy,
    'precision_0': precision[0],
    'precision_1': precision[1],
    'recall_0': recall[0],
    'recall_1': recall[1]
}

In [39]:
# Report the input size and the first hidden layer
print(f"Input features: {number_input_features}")
print(f"Hidden Layer 0: {hidden_layer0_activation}, {hidden_layer0_neurons}")

# Record the model details
results_dict.update({
    'input_features': number_input_features,
    'num_layers': best_hyper.values['num_layers'],
    'hlayer_0_activation': hidden_layer0_activation,
    'hlayer_0_neurons': hidden_layer0_neurons,
    'kernel_regularizer': reg_kernel_string
})

# Report and record every additional hidden layer
for layer in range(1, total_hidden):
    activation = best_hyper.values[f'activation_layer_{layer}']
    neurons = best_hyper.values[f'units_layer_{layer}']
    print(f"Hidden Layer {layer}: {activation}, {neurons}")

    results_dict[f'hlayer_{layer}_activation'] = activation
    results_dict[f'hlayer_{layer}_neurons'] = neurons

# Record the output layer, tuning settings and compilation settings
# (key order is kept because it becomes the tracker's column order)
results_dict.update({
    'olayer_neurons': output_layer_neurons,
    'olayer_activation': output_layer_activation,
    'tuning_max_hidden': max_hidden_layers,
    'tuning_max_neurons': max_num_neurons,
    'tuning_step_count': step_count,
    'activation_functions': str(activation_functions),
    'tuning_tuner_epochs': tuner_max_epochs,
    'tuning_search_epochs': search_max_epochs,
    'tuning_hp_iterations': hp_iterations,
    'compile_loss': compile_loss,
    'compile_optimizer': compile_opt
})

Input features: 33
Hidden Layer 0: exponential, 51
Hidden Layer 1: leaky_relu, 41


In [40]:
# Ask for a free-text note describing what changed since the previous run.
# This blocks until the user types a message and presses Enter.
change_message = input("Changes from previous iteration: ")

# Store the note with the other results
results_dict['change_message'] = change_message

Changes from previous iteration:  no changes, rerun


In [41]:
# Display the complete results record before it is written to the tracker
results_dict

{'timestamp': datetime.datetime(2024, 1, 9, 10, 36, 19, 558145),
 'runtime': 33.478758096694946,
 'model_loss': 1.1740237474441528,
 'model_accuracy': 0.8235294222831726,
 'precision_0': 0.7777777777777778,
 'precision_1': 0.8333333333333334,
 'recall_0': 0.5,
 'recall_1': 0.9459459459459459,
 'input_features': 33,
 'num_layers': 1,
 'hlayer_0_activation': 'exponential',
 'hlayer_0_neurons': 51,
 'kernel_regularizer': 'L1(0.010)',
 'hlayer_1_activation': 'leaky_relu',
 'hlayer_1_neurons': 41,
 'olayer_neurons': 1,
 'olayer_activation': 'sigmoid',
 'tuning_max_hidden': 5,
 'tuning_max_neurons': 65,
 'tuning_step_count': 5,
 'activation_functions': "['relu', 'leaky_relu', 'tanh', 'elu', 'selu', 'exponential', 'softmax', 'softplus']",
 'tuning_tuner_epochs': 20,
 'tuning_search_epochs': 20,
 'tuning_hp_iterations': 2,
 'compile_loss': 'binary_crossentropy',
 'compile_optimizer': 'adam',
 'change_message': 'no changes, rerun'}

In [42]:
# Convert the dictionary to a single-row dataframe (every value is a scalar,
# so an explicit one-element index is supplied)
results_df = pd.DataFrame(results_dict, index=[0])
results_df.head()

Unnamed: 0,timestamp,runtime,model_loss,model_accuracy,precision_0,precision_1,recall_0,recall_1,input_features,num_layers,...,tuning_max_hidden,tuning_max_neurons,tuning_step_count,activation_functions,tuning_tuner_epochs,tuning_search_epochs,tuning_hp_iterations,compile_loss,compile_optimizer,change_message
0,2024-01-09 10:36:19.558145,33.478758,1.174024,0.823529,0.777778,0.833333,0.5,0.945946,33,1,...,5,65,5,"['relu', 'leaky_relu', 'tanh', 'elu', 'selu', ...",20,20,2,binary_crossentropy,adam,"no changes, rerun"


In [43]:
# Location of the performance tracker CSV
tracker_path = "../resources/tracker/dl_performance_tracker.csv"

# Accuracy rounded to three decimals, used in the model file name
model_pct = round(model_accuracy, 3)

if os.path.exists(tracker_path):
    # Tracker already exists: append this run below the earlier rows
    tracker_df = pd.read_csv(tracker_path)
    updated_df = pd.concat([tracker_df, results_df], ignore_index=True)

    # The run number is the count of rows recorded before this run
    run_number = len(tracker_df)

else:
    # First run: the tracker holds only this row
    updated_df = results_df
    run_number = 0

# Write the tracker CSV
updated_df.to_csv(tracker_path, index=False)

# Export model to HDF5 file, named after the run number and accuracy
nn.save(f'../models/dl/run_{run_number}_{model_pct}.h5', save_format='h5')

  saving_api.save_model(


In [51]:
# Serialize the model with pickle.
# NOTE(review): the file name is hard-coded (run 41, accuracy 0.824) rather than
# derived from the current run, and it uses an '.h5' extension although the
# content is a pickle - confirm this is intended.
# Pickle files should only ever be loaded from a trusted source.
with open('../models/pickled_run_41_0.824.h5', 'wb') as file:
    pickle.dump(nn, file)

## Understand the Predictions

In [44]:
# Recompute the model output probabilities for the test samples
predicted_prob = nn.predict(X_test_scaled)

# Copy rounded to two decimals, used for display only
clean_prob = np.round(predicted_prob, 2)

# Predicted labels: round each probability to the nearest integer, as a 1-D array
clean_predicted = np.round(predicted_prob).astype(int).reshape(-1)

# Table of actual label, predicted label and probability; shown in full here
output_prob = pd.DataFrame({
    'Actual': y_test,
    'Predicted': clean_predicted,
    'Probability': clean_prob.reshape(-1)
})
output_prob



Unnamed: 0,Actual,Predicted,Probability
22,1,1,0.66
73,1,1,0.98
33,1,1,0.98
31,1,1,0.89
182,1,1,0.61
10,1,1,0.9
153,1,1,0.75
121,1,1,0.89
20,0,0,0.42
64,1,1,0.89


In [45]:
# Identify incorrect predictions: rows where the predicted label differs
# from the actual label
output_prob.loc[output_prob['Actual'] != output_prob['Predicted']]

Unnamed: 0,Actual,Predicted,Probability
135,0,1,0.75
110,0,1,0.8
166,0,1,0.87
145,1,0,0.02
54,0,1,0.89
150,0,1,0.93
34,1,0,0.01
61,0,1,0.92
156,0,1,0.77


In [46]:
# Collect the index labels of the misclassified rows. output_prob takes its
# index from y_test, so these are row labels of the original (merged) data.
incorrect_idx = output_prob.loc[output_prob['Actual'] != output_prob['Predicted']].index
incorrect_idx

Index([135, 110, 166, 145, 54, 150, 34, 61, 156], dtype='int64')

In [47]:
# Display the diagnosis details of the incorrect predictions.
# incorrect_idx holds index labels that originate from merged_df (X and y were
# derived from it), so the rows are looked up by label on merged_df itself.
# Using them as positions on diagnosis_df is only correct when that table
# happens to share merged_df's row order. The column selection keeps the
# output limited to the diagnosis table's columns.
merged_df.loc[incorrect_idx, dataframes['diagnosis_df'].columns]

Unnamed: 0,id,diagnosis,subtype,vhi_score,rsi_score,reflux_indicated,vhi_zscore,vhi_impact
135,voice109,healthy,no subtype,11,14,1,0.15,1
110,voice120,healthy,no subtype,22,4,0,0.89,1
166,voice196,healthy,no subtype,0,22,1,-0.58,0
145,voice191,hyperkinetic dysphonia,no subtype,0,19,1,-0.58,0
54,voice051,healthy,no subtype,2,13,1,-0.45,0
150,voice034,healthy,no subtype,19,16,1,0.68,1
34,voice160,hyperkinetic dysphonia,no subtype,4,0,0,-0.32,0
61,voice019,healthy,no subtype,45,10,0,2.42,3
156,voice095,healthy,no subtype,29,0,0,1.35,2
