In [25]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import sqlalchemy
from sqlalchemy import create_engine, inspect

import pandas as pd
import tensorflow as tf
import keras_tuner as kt
from pprint import pprint

%run functions.ipynb

## Import datasets

In [2]:
# Import the data
engine = create_engine("sqlite:///voice.sqlite")

# View all of the classes
inspector = inspect(engine)
table_names = inspector.get_table_names()
table_names

['demographic', 'diagnosis', 'habits']

In [3]:
# Initialise a dictionary to hold dataframes
dataframes = dict()

# Loop through each table
for table in table_names:
    
    # Dataframe name
    df_name = f'{table}_df'
    
    # Create dataframe
    dataframes[df_name] = pd.read_sql(
        f'SELECT * FROM {table}',
        engine
    )

In [4]:
# Merge dataframes
merged_df = pd.merge(
    dataframes['demographic_df'],
    dataframes['diagnosis_df'],
    how = 'inner',
    on = 'id'
)

merged_df = pd.merge(
    merged_df,
    dataframes['habits_df'],
    how = 'inner',
    on = 'id'
)

# Display merged_df
merged_df.head()

Unnamed: 0,id,age,gender,occupation_status,diagnosis,subtype,vhi_score,rsi_score,alcohol_consumption,alcohol_pd,...,chocolate,chocolate_grams_pd,coffee,coffee_pd,citrus_fruits,citrus_fruits_pd,soft_cheese,soft_cheese_pd,tomatoes,water_litres_pd
0,voice100,24,m,unknown,healthy,no subtype,0,5,casual,0.36,...,sometimes,30,always,3,never,0.0,almost always,100,never,1.5
1,voice101,60,m,unknown,healthy,no subtype,80,10,nondrinker,0.0,...,sometimes,30,always,4,never,0.0,sometimes,100,sometimes,1.5
2,voice192,22,m,cook,hyperkinetic dysphonia,no subtype,0,10,nondrinker,0.0,...,always,14,always,3,almost always,1.0,sometimes,100,sometimes,2.5
3,voice193,46,f,housewife,hyperkinetic dysphonia,no subtype,0,36,casual,0.36,...,sometimes,30,always,2,sometimes,1.0,sometimes,100,sometimes,1.0
4,voice008,51,f,researcher,reflux laryngitis,no subtype,19,15,casual,0.36,...,almost always,20,always,2,almost always,1.0,sometimes,100,almost always,1.0


## Preprocessing

### Separate the target and feature variables

In [5]:
# Drop the 'id' column
no_id_df = merged_df.drop(columns=['id'])

In [6]:
# Define the target variables
target_var = ['diagnosis', 'subtype']
y = no_id_df[target_var]

# Define the feature variables
X = no_id_df.drop(columns=target_var)

### Binary Classification - `diagnosis`

In [7]:
# Encode the target variable, ignore subtype
y = y['diagnosis'].apply(encode_binary)
y

0      0
1      0
2      1
3      1
4      1
      ..
201    0
202    1
203    1
204    0
205    0
Name: diagnosis, Length: 206, dtype: int64

### Bin `occupation_status` column

In [9]:
# Use limit_unique() function to bin the column
limit_unique(X, 10, ['occupation_status'])

occupation_status
researcher            42
unknown               41
other                 25
employee              25
housewife             23
student               16
technical operator    13
singer                10
pensioner              6
doctor                 5
Name: count, dtype: int64
Number of unique values: 10



### Encode feature columns

#### Encoding
- `smoker` column
	- `0` for `no`
	- `1` for `casual`
	- `2` for `yes`
- `alcohol_consumption` column
	- `0` for `nondrinker`
	- `1` for `casual`
	- `2` for `habitual`
- `carbonated_beverages`, `tomatoes`, `coffee`, `chocolate`, `soft_cheese`, `citrus_fruits` columns
	- `0` for `never`
	- `1` for `almost never`
	- `2` for `sometimes`
	- `3` for `almost always`
	- `4` for `always`

In [10]:
# Define the maps
smoker_map = {
    'no': 0,
    'casual': 1,
    'yes': 2
}

alcohol_map = {
    'nondrinker': 0,
    'casual': 1,
    'habitual': 2
}

habit_map = {
    'never': 0,
    'almost never': 1,
    'sometimes': 2,
    'almost always': 3,
    'always': 4
}

In [11]:
# Apply the label encoding using the maps
X['smoker'] = X['smoker'].map(smoker_map)
X['alcohol_consumption'] = X['alcohol_consumption'].map(alcohol_map)

# Habit columns
habit_cols = [
    'carbonated_beverages', 'tomatoes',
    'coffee', 'chocolate',
    'soft_cheese', 'citrus_fruits'
]

# Use a loop for the habit columns
for habit in habit_cols:
    X[habit] = X[habit].map(habit_map)

In [12]:
# Encode the categorical columns using get_dummies
categorical_hot = ['gender', 'occupation_status']

# One-hot encoding
encoded_columns = pd.get_dummies(X[categorical_hot]).astype(int)

# Update the feature dataframe
X.drop(categorical_hot, axis=1, inplace=True)
X = pd.concat([X, encoded_columns], axis=1)

In [13]:
# Display the dataframe
X.info()
X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 31 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   age                                   206 non-null    int64  
 1   vhi_score                             206 non-null    int64  
 2   rsi_score                             206 non-null    int64  
 3   alcohol_consumption                   206 non-null    int64  
 4   alcohol_pd                            206 non-null    float64
 5   smoker                                206 non-null    int64  
 6   cigarettes_pd                         206 non-null    int64  
 7   carbonated_beverages                  206 non-null    int64  
 8   carbonated_pd                         206 non-null    float64
 9   chocolate                             206 non-null    int64  
 10  chocolate_grams_pd                    206 non-null    int64  
 11  coffee             

Unnamed: 0,age,vhi_score,rsi_score,alcohol_consumption,alcohol_pd,smoker,cigarettes_pd,carbonated_beverages,carbonated_pd,chocolate,...,occupation_status_doctor,occupation_status_employee,occupation_status_housewife,occupation_status_other,occupation_status_pensioner,occupation_status_researcher,occupation_status_singer,occupation_status_student,occupation_status_technical operator,occupation_status_unknown
0,24,0,5,1,0.36,0,0,3,3.0,2,...,0,0,0,0,0,0,0,0,0,1
1,60,80,10,0,0.0,0,0,3,3.0,2,...,0,0,0,0,0,0,0,0,0,1
2,22,0,10,0,0.0,0,0,0,0.0,4,...,0,0,0,1,0,0,0,0,0,0
3,46,0,36,1,0.36,2,15,2,0.61,2,...,0,0,1,0,0,0,0,0,0,0
4,51,19,15,1,0.36,0,0,1,0.09,3,...,0,0,0,0,0,1,0,0,0,0


### Split and Scale

In [14]:
# Split the preprocessed data to training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [15]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Hyperparameter Tuning

In [34]:
# Define the model parameters
number_input_features = len(X_train.columns)

# Maximum hidden layers (min. 2 for DL)
max_hidden_layers = 3

# Maximum neurons per hidden layer
max_num_neurons = number_input_features * 2 - 1

# Step count
step_count = 5

# Hidden layer activation functions
activation_functions = [
    'relu', 'leaky_relu', 'tanh',
    'elu', 'selu', 'exponential',
    'softmax', 'softplus'
]

In [35]:
# Initialise the Hyperband tuner
tuner = kt.Hyperband(
    create_model,
    objective = "val_accuracy",
    max_epochs = 20,
    hyperband_iterations = 2
)

In [36]:
# Find the best hyperparameters
tuner.search(
    X_train_scaled,
    y_train,
    epochs = 20,
    validation_data = (X_test_scaled, y_test)
)

Trial 60 Complete [00h 00m 01s]
val_accuracy: 0.7307692170143127

Best val_accuracy So Far: 0.8269230723381042
Total elapsed time: 00h 00m 31s


## Compile, Train, Evaluate the Best Model

In [37]:
# Get the top 3 model hyperparameters
top3_hyper = tuner.get_best_hyperparameters(3)

for model in top3_hyper:
    pprint(model.values)

{'activation_layer_0': 'selu',
 'activation_layer_1': 'elu',
 'activation_layer_2': 'tanh',
 'num_layers': 2,
 'tuner/bracket': 2,
 'tuner/epochs': 7,
 'tuner/initial_epoch': 3,
 'tuner/round': 1,
 'tuner/trial_id': '0035',
 'units_layer_0': 6,
 'units_layer_1': 11,
 'units_layer_2': 41}
{'activation_layer_0': 'selu',
 'activation_layer_1': 'elu',
 'activation_layer_2': 'tanh',
 'num_layers': 2,
 'tuner/bracket': 2,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 7,
 'tuner/round': 2,
 'tuner/trial_id': '0042',
 'units_layer_0': 6,
 'units_layer_1': 11,
 'units_layer_2': 41}
{'activation_layer_0': 'tanh',
 'activation_layer_1': 'selu',
 'activation_layer_2': 'relu',
 'num_layers': 2,
 'tuner/bracket': 2,
 'tuner/epochs': 3,
 'tuner/initial_epoch': 0,
 'tuner/round': 0,
 'units_layer_0': 56,
 'units_layer_1': 41,
 'units_layer_2': 61}


In [38]:
# Get the top model
best_hyper = top3_hyper[0]
best_hyper.values

{'activation_layer_0': 'selu',
 'units_layer_0': 6,
 'num_layers': 2,
 'units_layer_1': 11,
 'activation_layer_1': 'elu',
 'units_layer_2': 41,
 'activation_layer_2': 'tanh',
 'tuner/epochs': 7,
 'tuner/initial_epoch': 3,
 'tuner/bracket': 2,
 'tuner/round': 1,
 'tuner/trial_id': '0035'}

In [39]:
# Define the model parameters
number_input_features = len(X_train.columns)
hidden_layer0_neurons = best_hyper.values['units_layer_0']
hidden_layer0_activation = best_hyper.values['activation_layer_0']

# Return the number of hidden layers
total_hidden = best_hyper.values['num_layers'] + 1

In [40]:
# Initialise the sequential model
nn = tf.keras.models.Sequential()

# Create the input layer and first hidden layer
nn.add(tf.keras.layers.Dense(
    units = hidden_layer0_neurons,
    activation = hidden_layer0_activation,
    input_dim = number_input_features
))
        
# Create additional hidden layers
for layer in range(1, total_hidden):
    nn.add(tf.keras.layers.Dense(
        units = best_hyper.values[f'units_layer_{layer}'],
        activation = best_hyper.values[f'activation_layer_{layer}']
    ))

# Create the output layer
nn.add(tf.keras.layers.Dense(
    units = 1,
    activation = "sigmoid"
))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 6)                 192       
                                                                 
 dense_4 (Dense)             (None, 11)                77        
                                                                 
 dense_5 (Dense)             (None, 41)                492       
                                                                 
 dense_6 (Dense)             (None, 1)                 42        
                                                                 
Total params: 803 (3.14 KB)
Trainable params: 803 (3.14 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [41]:
# Compile the model
nn.compile(
    loss = "binary_crossentropy",
    optimizer = "adam",
    metrics = ["accuracy"]
)

In [42]:
# Train the model
fit_model = nn.fit(
    X_train_scaled,
    y_train,
    epochs = 100,
    verbose = 1
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [43]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(
    X_test_scaled,
    y_test,
    verbose = 2
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2/2 - 0s - loss: 0.7689 - accuracy: 0.7885 - 52ms/epoch - 26ms/step
Loss: 0.7689491510391235, Accuracy: 0.7884615659713745
