# Original Dataset - Stacking Classifier

In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers

import sqlalchemy
from sqlalchemy import create_engine, inspect

import pandas as pd
import tensorflow as tf
import keras_tuner as kt
from pprint import pprint

import os
import time
from datetime import datetime

import numpy as np
from joblib import dump, load
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score

from imblearn.over_sampling import SMOTE
import json

import joblib

from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

%run functions.ipynb

In [2]:
# Time the run
start_time = time.time()

## Import datasets

In [4]:
# Import the data
engine = create_engine("sqlite:///voice.sqlite")

# View all of the classes
inspector = inspect(engine)
table_names = inspector.get_table_names()
table_names

['alexaval',
 'alexbval',
 'alexgval',
 'alexrval',
 'aval',
 'bval',
 'chroma1',
 'chroma10',
 'chroma11',
 'chroma12',
 'chroma2',
 'chroma3',
 'chroma4',
 'chroma5',
 'chroma6',
 'chroma7',
 'chroma8',
 'chroma9',
 'chromastd',
 'deltachroma1',
 'deltachroma10',
 'deltachroma11',
 'deltachroma12',
 'deltachroma2',
 'deltachroma3',
 'deltachroma4',
 'deltachroma5',
 'deltachroma6',
 'deltachroma7',
 'deltachroma8',
 'deltachroma9',
 'deltachromastd',
 'deltaenergy',
 'deltaenergyentropy',
 'deltamfcc1',
 'deltamfcc10',
 'deltamfcc11',
 'deltamfcc12',
 'deltamfcc13',
 'deltamfcc2',
 'deltamfcc3',
 'deltamfcc4',
 'deltamfcc5',
 'deltamfcc6',
 'deltamfcc7',
 'deltamfcc8',
 'deltamfcc9',
 'deltaspectralcentroid',
 'deltaspectralentropy',
 'deltaspectralflux',
 'deltaspectralrolloff',
 'deltaspectralspread',
 'deltazcr',
 'demographic',
 'diagnosis',
 'energy',
 'energyentropy',
 'gval',
 'habits',
 'mfcc1',
 'mfcc10',
 'mfcc11',
 'mfcc12',
 'mfcc13',
 'mfcc2',
 'mfcc3',
 'mfcc4',
 'mfcc5

In [5]:
# Tables to import
import_tables = ['diagnosis', 'demographic', 'habits']

In [6]:
# Initialise a dictionary to hold dataframes
dataframes = dict()

# Loop through each table
for table in import_tables:
    
    # Dataframe name
    df_name = f'{table}_df'
    
    # Create dataframe
    dataframes[df_name] = pd.read_sql(
        f'SELECT * FROM {table}',
        engine
    )

In [7]:
# Merge dataframes
merged_df = pd.merge(
    dataframes['demographic_df'],
    dataframes['diagnosis_df'],
    how = 'inner',
    on = 'id'
)

merged_df = pd.merge(
    merged_df,
    dataframes['habits_df'],
    how = 'inner',
    on = 'id'
)

# Display merged_df
merged_df.head()

Unnamed: 0,id,age,gender,occupation_status,diagnosis,subtype,vhi_score,rsi_score,reflux_indicated,vhi_zscore,...,chocolate,chocolate_grams_pd,coffee,coffee_pd,citrus_fruits,citrus_fruits_pd,soft_cheese,soft_cheese_pd,tomatoes,water_litres_pd
0,voice100,24,m,unknown,healthy,no subtype,0,5,0,-0.58,...,sometimes,30,always,3,never,0.0,almost always,100,never,1.5
1,voice101,60,m,unknown,healthy,no subtype,80,10,0,4.76,...,sometimes,30,always,4,never,0.0,sometimes,100,sometimes,1.5
2,voice192,22,m,cook,hyperkinetic dysphonia,no subtype,0,10,0,-0.58,...,always,14,always,3,almost always,1.17,sometimes,100,sometimes,2.5
3,voice193,46,f,housewife,hyperkinetic dysphonia,no subtype,0,36,1,-0.58,...,sometimes,30,always,2,sometimes,1.0,sometimes,100,sometimes,1.0
4,voice008,51,f,researcher,reflux laryngitis,no subtype,19,15,1,0.68,...,almost always,20,always,2,almost always,1.0,sometimes,100,almost always,1.0


## Preprocessing

### Separate the target and feature variables

In [8]:
# Drop the 'id' column
no_id_df = merged_df.drop(columns=['id'])

In [9]:
# Define the target variables
target_var = ['diagnosis', 'subtype']
y = no_id_df[target_var]

# Define the feature variables
X = no_id_df.drop(columns=target_var)

### Binary classification - `diagnosis`

In [10]:
# Encode the target variable, ignore subtype
y = y['diagnosis'].apply(encode_binary)
y

0      0
1      0
2      1
3      1
4      1
      ..
199    0
200    1
201    1
202    0
203    0
Name: diagnosis, Length: 204, dtype: int64

### Bin `occupation_status` column

In [11]:
# Use limit_unique() function to bin the column
limit_unique(X, 10, ['occupation_status'])

occupation_status
unknown               41
researcher            41
other                 25
employee              25
housewife             23
student               16
technical operator    12
singer                10
pensioner              6
doctor                 5
Name: count, dtype: int64
Number of unique values: 10



### Encode feature columns

#### Encoding
- `smoker` column
	- `0` for `no`
	- `1` for `casual`
	- `2` for `yes`
- `alcohol_consumption` column
	- `0` for `nondrinker`
	- `1` for `casual`
	- `2` for `habitual`
- `carbonated_beverages`, `tomatoes`, `coffee`, `chocolate`, `soft_cheese`, `citrus_fruits` columns
	- `0` for `never`
	- `1` for `almost never`
	- `2` for `sometimes`
	- `3` for `almost always`
	- `4` for `always`

In [12]:
# Define the maps
smoker_map = {
    'no': 0,
    'casual': 1,
    'yes': 2
}

alcohol_map = {
    'nondrinker': 0,
    'casual': 1,
    'habitual': 2
}

habit_map = {
    'never': 0,
    'almost never': 1,
    'sometimes': 2,
    'almost always': 3,
    'always': 4
}

In [13]:
# Apply the label encoding using the maps
X['smoker'] = X['smoker'].map(smoker_map)
X['alcohol_consumption'] = X['alcohol_consumption'].map(alcohol_map)

# Habit columns
habit_cols = [
    'carbonated_beverages', 'tomatoes',
    'coffee', 'chocolate',
    'soft_cheese', 'citrus_fruits'
]

# Use a loop for the habit columns
for habit in habit_cols:
    X[habit] = X[habit].map(habit_map)

In [14]:
# Encode the categorical columns using get_dummies
categorical_hot = ['gender', 'occupation_status']

# One-hot encoding
encoded_columns = pd.get_dummies(X[categorical_hot]).astype(int)

# Update the feature dataframe
X.drop(categorical_hot, axis=1, inplace=True)
X = pd.concat([X, encoded_columns], axis=1)

# Convert the 'occupation_status_unknown' to 'occupation_status_other'
X.loc[X['occupation_status_unknown'] == 1, 'occupation_status_other'] = 1

# Drop the 'occupation_status_unknown' column
X = X.drop(columns=['occupation_status_unknown'])

In [15]:
# Display the dataframe
X.info()
X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 33 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   age                                   204 non-null    int64  
 1   vhi_score                             204 non-null    int64  
 2   rsi_score                             204 non-null    int64  
 3   reflux_indicated                      204 non-null    int64  
 4   vhi_zscore                            204 non-null    float64
 5   vhi_impact                            204 non-null    int64  
 6   alcohol_consumption                   204 non-null    int64  
 7   alcohol_pd                            204 non-null    float64
 8   smoker                                204 non-null    int64  
 9   cigarettes_pd                         204 non-null    int64  
 10  carbonated_beverages                  204 non-null    int64  
 11  carbonated_pd      

Unnamed: 0,age,vhi_score,rsi_score,reflux_indicated,vhi_zscore,vhi_impact,alcohol_consumption,alcohol_pd,smoker,cigarettes_pd,...,gender_m,occupation_status_doctor,occupation_status_employee,occupation_status_housewife,occupation_status_other,occupation_status_pensioner,occupation_status_researcher,occupation_status_singer,occupation_status_student,occupation_status_technical operator
0,24,0,5,0,-0.58,0,1,0.36,0,0,...,1,0,0,0,1,0,0,0,0,0
1,60,80,10,0,4.76,4,0,0.0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,22,0,10,0,-0.58,0,0,0.0,0,0,...,1,0,0,0,1,0,0,0,0,0
3,46,0,36,1,-0.58,0,1,0.36,2,15,...,0,0,0,1,0,0,0,0,0,0
4,51,19,15,1,0.68,1,1,0.36,0,0,...,0,0,0,0,0,0,1,0,0,0


### Split and Scale

In [16]:
# Split the preprocessed data to training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [17]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# # Instantiate SMOTE
# smote = SMOTE()

# # Fit and transform the data
# X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

In [19]:
# Save the scaler
dump(X_scaler, '../voice_app/assets/scaler.joblib')

['../voice_app/assets/scaler.joblib']

## Stacking Model

In [20]:
# Define the parameters
rfc_estimators = 1000
rfc_criterion = 'gini'
xgb_objective = 'binary:logistic'
svc_kernel = 'poly'
lr_solver = 'liblinear'

# Meta model
hidden_layer = (80,)
max_iter = 1500

In [21]:
# Define the base models
base_models = [
    ('rf', RandomForestClassifier(
        n_estimators = rfc_estimators,
        criterion = rfc_criterion
    )),
    ('xgb', xgb.XGBClassifier(objective = xgb_objective)),
    # ('svm', SVC(
    #     kernel = svc_kernel,
    #     class_weight = 'balanced'
    # )),
    ('lr', LogisticRegression(solver = lr_solver))
]

In [23]:
# Create the meta-model
meta_model = MLPClassifier(
    hidden_layer_sizes = hidden_layer,
    max_iter = max_iter
)

In [24]:
# Create the Stacking Classifier
stacking_model = StackingClassifier(
    estimators = base_models,
    final_estimator = meta_model
)

In [25]:
# Fit the model
fit_model = stacking_model.fit(X_train_scaled, y_train)

In [26]:
# Make predictions
predictions = stacking_model.predict(X_test_scaled)

In [27]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm,
    index = ["Actual 0", "Actual 1"],
    columns = ["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
model_accuracy = accuracy_score(y_test, predictions)

In [28]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"\nAccuracy Score: {model_accuracy}")
print("\nClassification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8,6
Actual 1,0,37



Accuracy Score: 0.8823529411764706

Classification Report
              precision    recall  f1-score   support

           0       1.00      0.57      0.73        14
           1       0.86      1.00      0.92        37

    accuracy                           0.88        51
   macro avg       0.93      0.79      0.83        51
weighted avg       0.90      0.88      0.87        51



## Save Results to Performance Tracker

In [29]:
# Extract the precision and recall
precision = precision_score(y_test, predictions, average=None)
recall = recall_score(y_test, predictions, average=None)
print(precision, recall)

[1.         0.86046512] [0.57142857 1.        ]


In [30]:
# Create dictionary to save the results
results_dict = dict()

current_time = datetime.now()
results_dict['timestamp'] = current_time
results_dict['runtime'] = time.time() - start_time
results_dict['model_accuracy'] = model_accuracy
results_dict['precision_0'] = precision[0]
results_dict['precision_1'] = precision[1]
results_dict['recall_0'] = recall[0]
results_dict['recall_1'] = recall[1]
results_dict['rfc_estimators'] = rfc_estimators
results_dict['rfc_criterion'] = rfc_criterion
results_dict['xgb_objective'] = xgb_objective
results_dict['svc_kernel'] = svc_kernel
results_dict['lr_solver'] = lr_solver

# Meta model
results_dict['meta_type'] = 'MLPClassifier'
results_dict['meta_nn_hiddenlayer'] = hidden_layer
results_dict['meta_nn_maxiter'] = max_iter

In [31]:
# Change message
change_message = input("Changes from previous iteration: ")

# Append to results_dict
results_dict['change_message'] = change_message

Changes from previous iteration:  no changes, rerun


In [32]:
# Display the dictionary
results_dict

{'timestamp': datetime.datetime(2024, 1, 9, 9, 41, 21, 161555),
 'runtime': 3.2743847370147705,
 'model_accuracy': 0.8823529411764706,
 'precision_0': 1.0,
 'precision_1': 0.8604651162790697,
 'recall_0': 0.5714285714285714,
 'recall_1': 1.0,
 'rfc_estimators': 1000,
 'rfc_criterion': 'gini',
 'xgb_objective': 'binary:logistic',
 'svc_kernel': 'poly',
 'lr_solver': 'liblinear',
 'meta_type': 'MLPClassifier',
 'meta_nn_hiddenlayer': (80,),
 'meta_nn_maxiter': 1500,
 'change_message': 'no changes, rerun'}

In [33]:
# Convert the dictionary to a dataframe
results_df = pd.DataFrame(results_dict, index=[0])
results_df.head()

Unnamed: 0,timestamp,runtime,model_accuracy,precision_0,precision_1,recall_0,recall_1,rfc_estimators,rfc_criterion,xgb_objective,svc_kernel,lr_solver,meta_type,meta_nn_hiddenlayer,meta_nn_maxiter,change_message
0,2024-01-09 09:41:21.161555,3.274385,0.882353,1.0,0.860465,0.571429,1.0,1000,gini,binary:logistic,poly,liblinear,MLPClassifier,80,1500,"no changes, rerun"


In [34]:
# Performance tracker
tracker_path = "../resources/tracker/stack_performance_tracker.csv"

# Model percentage
model_pct = round(model_accuracy, 3)

# Check if the CSV exists
if os.path.exists(tracker_path):
    
    # Read the existing CSV
    tracker_df = pd.read_csv(tracker_path)
    
    # Append the new row of data
    updated_df = pd.concat([tracker_df, results_df], ignore_index=True)
    
    # Update the CSV file
    updated_df.to_csv(tracker_path, index=False)
    
    # Export model to HDF5 file
    joblib.dump(stacking_model, f'../models/stack/run_{len(tracker_df)}_{model_pct}.h5')

else:    
    # Export to CSV
    results_df.to_csv(tracker_path, index=False)
    
    # Export model to HDF5 file
    joblib.dump(stacking_model, f'../models/stack/run_0_{model_pct}.h5')

## Understand the Predictions

In [35]:
# Check the prediction's output probabilities
predicted_prob = stacking_model.predict(X_test_scaled)
clean_prob = np.round(predicted_prob, 2)

# Round to the nearest integer and flatten
clean_predicted = np.round(predicted_prob).astype(int).flatten()

# Convert to a dataframe for readability
output_prob = pd.DataFrame({
    'Actual': y_test,
    'Predicted': clean_predicted,
    'Probability': clean_prob.flatten()
})
output_prob

Unnamed: 0,Actual,Predicted,Probability
67,1,1,1
19,1,1,1
141,0,1,1
32,1,1,1
109,1,1,1
102,1,1,1
159,0,0,0
61,0,1,1
79,1,1,1
179,1,1,1


In [36]:
# Display incorrect indices
incorrect_idx = output_prob.loc[output_prob['Actual'] != output_prob['Predicted']].index
incorrect_idx

Index([141, 61, 203, 136, 46, 170], dtype='int64')

In [37]:
# Display incorrect predictions as a complete dataframe
dataframes['diagnosis_df'].iloc[incorrect_idx]

Unnamed: 0,id,diagnosis,subtype,vhi_score,rsi_score,reflux_indicated,vhi_zscore,vhi_impact
141,voice103,healthy,no subtype,27,22,1,1.22,2
61,voice019,healthy,no subtype,45,10,0,2.42,3
203,voice024,healthy,no subtype,6,12,0,-0.18,0
136,voice108,healthy,no subtype,0,4,0,-0.58,0
46,voice123,healthy,no subtype,0,3,0,-0.58,0
170,voice045,healthy,no subtype,3,20,1,-0.38,0
