# Temporal Features - RNN Test

In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

import sqlalchemy
from sqlalchemy import create_engine, inspect

import math
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_tuner as kt
from pprint import pprint

import os
import sys
import time
from datetime import datetime
from contextlib import redirect_stdout

%run functions.ipynb

In [2]:
# Time the run
start_time = time.time()

## Import datasets

In [3]:
# Import the data
engine = create_engine("sqlite:///voice.sqlite")

# View all of the classes
inspector = inspect(engine)
table_names = inspector.get_table_names()
table_names

['alexaval',
 'alexbval',
 'alexgval',
 'alexrval',
 'aval',
 'bval',
 'chroma1',
 'chroma10',
 'chroma11',
 'chroma12',
 'chroma2',
 'chroma3',
 'chroma4',
 'chroma5',
 'chroma6',
 'chroma7',
 'chroma8',
 'chroma9',
 'chromastd',
 'delta',
 'deltachroma1',
 'deltachroma10',
 'deltachroma11',
 'deltachroma12',
 'deltachroma2',
 'deltachroma3',
 'deltachroma4',
 'deltachroma5',
 'deltachroma6',
 'deltachroma7',
 'deltachroma8',
 'deltachroma9',
 'deltachromastd',
 'deltaenergy',
 'deltaenergyentropy',
 'deltamfcc1',
 'deltamfcc10',
 'deltamfcc11',
 'deltamfcc12',
 'deltamfcc13',
 'deltamfcc2',
 'deltamfcc3',
 'deltamfcc4',
 'deltamfcc5',
 'deltamfcc6',
 'deltamfcc7',
 'deltamfcc8',
 'deltamfcc9',
 'deltaspectralcentroid',
 'deltaspectralentropy',
 'deltaspectralflux',
 'deltaspectralrolloff',
 'deltaspectralspread',
 'deltazcr',
 'demographic',
 'diagnosis',
 'energy',
 'energyentropy',
 'gval',
 'habits',
 'mfcc1',
 'mfcc10',
 'mfcc11',
 'mfcc12',
 'mfcc13',
 'mfcc2',
 'mfcc3',
 'mfcc4

In [4]:
# Initialise a dictionary to hold dataframes
dataframes = dict()

# Loop through each table
for table in table_names:
    
    # Dataframe name
    df_name = f'{table}_df'
    
    # Create dataframe
    dataframes[df_name] = pd.read_sql(
        f'SELECT * FROM {table}',
        engine
    )

## Preprocessing

### Define the target variable

In [5]:
# Isolate the diagnosis column
diagnosis_df = dataframes['diagnosis_df'].sort_values(by='id').reset_index(drop=True)
y = diagnosis_df['diagnosis'].copy()

# Encode the target variable, ignore subtype
y = y.apply(encode_binary)
y

0      1
1      0
2      1
3      1
4      1
      ..
199    0
200    1
201    0
202    0
203    1
Name: diagnosis, Length: 204, dtype: int64

### Recombine the feature variables

In [6]:
# Define non-temporal features
non_temporal = [
    'rval_df', 'gval_df', 'bval_df', 'aval_df',
    'demographic_df', 'diagnosis_df', 'habits_df',
    'alexrval_df', 'alexgval_df', 'alexbval_df', 'alexaval_df',
    'pylabrval_df', 'pylabgval_df', 'pylabbval_df', 'pylabaval_df'
]

# Define subset of only temporal features
temporal_tables = list(set(dataframes.keys()).difference(set(non_temporal)))

# Initialise a dictionary to hold all features
all_feats = dict()

# Loop through each temporal feature
for table in sorted(temporal_tables):
    
    # Get the dataframe
    df = dataframes[table]
    
    # Get the feature array and voice IDs
    feat_array = df.values[:, 1:]
    voice_list = df.values[:, 0]
    
    # Initialise a feature dictionary
    feat_dict = dict()
    
    # Use a for-loop to populate the dictionary
    for idx, feat in enumerate(feat_array):
        feat_dict[voice_list[idx]] = feat
        # if len(feat) != 192:
            # print(len(feat))
    
    # Append to the all feature dictionary
    feat_name = table.split("_")[0]
    all_feats[feat_name] = feat_dict

# Convert to a dataframe
X = pd.DataFrame(all_feats).sort_index().reset_index(drop=True)
# X = pd.DataFrame(all_feats)
X.head()

Unnamed: 0,chroma10,chroma11,chroma12,chroma1,chroma2,chroma3,chroma4,chroma5,chroma6,chroma7,...,mfcc6,mfcc7,mfcc8,mfcc9,spectralcentroid,spectralentropy,spectralflux,spectralrolloff,spectralspread,zcr
0,"[8.160939658571425e-36, 8.160939658571425e-36,...","[6.259135778625184e-35, 6.259135778625184e-35,...","[6.521327966022236e-37, 6.521327966022236e-37,...","[1.981977565762179e-36, 1.981977565762179e-36,...","[1.3575144582002011e-36, 1.3575144582002011e-3...","[0.2, 0.2, 0.2, 0.0083557589856543, 0.00571074...","[4.104873703767106e-36, 4.104873703767106e-36,...","[2.155332585060541e-35, 2.155332585060541e-35,...","[5.559684454245396e-36, 5.559684454245396e-36,...","[1.3576749403691584e-33, 1.3576749403691584e-3...",...,"[1.1190399446969557e-07, 1.1190399446969557e-0...","[8.413179620042933e-08, 8.413179620042933e-08,...","[6.508014845220068e-08, 6.508014845220068e-08,...","[4.8439279586533656e-08, 4.8439279586533656e-0...","[0.005, 0.005, 0.005, 0.3465049654087774, 0.28...","[1.223008975714376e-10, 1.223008975714376e-10,...","[0.0, 0.0, 0.0, 0.9996124728099652, 0.00645907...","[0.0, 0.0, 0.0, 0.42, 0.325, 0.33, 0.325, 0.32...","[4.770690588753296e-09, 4.770690588753296e-09,...","[0.0, 0.0, 0.0, 0.0701754385964912, 0.20300751..."
1,"[7.013453879267504e-36, 7.013453879267504e-36,...","[0.0, 0.0, 0.0, 0.0166532558508933, 0.07869997...","[0.0, 0.0, 0.0, 0.0129537558999756, 0.00446014...","[0.0, 0.0, 0.0, 0.0030468925055301, 0.00093043...","[7.408793082486265e-37, 7.408793082486265e-37,...","[0.1999999999999999, 0.1999999999999999, 0.199...","[3.4822079082192796e-36, 3.4822079082192796e-3...","[3.341581545922013e-36, 3.341581545922013e-36,...","[0.0, 0.0, 0.0, 0.0251245180249349, 0.02791182...","[8.629140939612992e-35, 8.629140939612992e-35,...",...,"[5.570574598767376e-09, 5.570574598767376e-09,...","[2.3875995972436553e-09, 2.3875995972436553e-0...","[2.749349885292186e-10, 2.749349885292186e-10,...","[-1.6617109241744871e-09, -1.6617109241744871e...","[0.005, 0.005, 0.005, 0.2686992943604466, 0.24...","[1.6891478942563823e-09, 1.6891478942563823e-0...","[0.0, 0.0, 0.0, 1.015672065306574, 0.012061107...","[0.0, 0.0, 0.0, 0.305, 0.295, 0.295, 0.295, 0....","[3.0750895132596103e-09, 3.0750895132596103e-0...","[0.0, 0.0, 0.0, 0.0701754385964912, 0.18045112..."
2,"[0.0, 0.0, 0.0, 0.0254328104050869, 0.00097298...","[0.0, 0.0, 0.0, 0.0193505451629107, 0.00262743...","[0.0, 0.0, 0.0, 0.0083292562363225, 0.00160357...","[0.0, 0.0, 0.0, 0.0080703610497697, 0.01283563...","[0.0, 0.0, 0.0, 0.0070286755905986, 0.00614136...","[0.2, 0.2, 0.2, 0.0085329179281302, 0.00461212...","[0.0, 0.0, 0.0, 0.0052980532581862, 0.00109933...","[0.0, 0.0, 0.0, 0.00504925682982, 0.0052632757...","[0.0, 0.0, 0.0, 0.0059614712279919, 0.01369267...","[0.0, 0.0, 0.0, 0.0128160916733445, 0.01380721...",...,"[0.0, 0.0, 0.0, -0.2599778999084198, -0.290567...","[0.0, 0.0, 0.0, 0.1582810230304283, 0.22599124...","[0.0, 0.0, 0.0, -0.1396072459697239, -0.235683...","[0.0, 0.0, 0.0, -0.2101056367575636, -0.212658...","[0.0049999999999999, 0.0049999999999999, 0.004...","[7.084185505857156e-10, 7.084185505857156e-10,...","[0.0, 0.0, 0.0, 1.0014504084715852, 0.01066764...","[0.0, 0.0, 0.0, 0.33, 0.32, 0.32, 0.32, 0.32, ...","[8.88178419700125e-19, 8.88178419700125e-19, 8...","[0.0, 0.0, 0.0, 0.0651629072681704, 0.17293233..."
3,"[2.702574583604019e-36, 2.702574583604019e-36,...","[4.776934641543469e-37, 4.776934641543469e-37,...","[0.0, 0.0, 0.0, 0.0090083795612209, 0.00081537...","[1.1698336326834081e-34, 1.1698336326834081e-3...","[7.2024558647616085e-37, 7.2024558647616085e-3...","[0.2, 0.2, 0.2, 0.0356911864166217, 0.00370847...","[1.3418390923476396e-36, 1.3418390923476396e-3...","[1.2876499240617028e-36, 1.2876499240617028e-3...","[0.0, 0.0, 0.0, 0.0065263846329425, 0.00703412...","[7.234653246625582e-35, 7.234653246625582e-35,...",...,"[-9.094534884871259e-08, -9.094534884871259e-0...","[-5.831674649803382e-07, -5.831674649803382e-0...","[-8.326572895673644e-07, -8.326572895673644e-0...","[-6.521214770955914e-07, -6.521214770955914e-0...","[0.005, 0.005, 0.005, 0.2888106496790997, 0.29...","[1.0169918121625369e-11, 1.0169918121625369e-1...","[0.0, 0.0, 0.0, 0.9866939562783864, 0.00961743...","[0.0, 0.0, 0.0, 0.425, 0.425, 0.43, 0.385, 0.3...","[3.904414395649892e-09, 3.904414395649892e-09,...","[0.0, 0.0, 0.0, 0.025062656641604, 0.082706766..."
4,"[0.0, 0.0, 0.0, 0.0041882695402504, 0.02049149...","[0.0, 0.0, 0.0, 0.0040731803154477, 0.00261262...","[0.0, 0.0, 0.0, 0.0026445650113565, 0.00700864...","[0.0, 0.0, 0.0, 0.0029177614915496, 0.01259147...","[0.0, 0.0, 0.0, 0.0023704152160725, 0.00010023...","[0.2, 0.2, 0.2, 0.0093191724292075, 0.01320134...","[0.0, 0.0, 0.0, 0.0110916645774918, 0.00755240...","[0.0, 0.0, 0.0, 0.021437601576933, 0.009915651...","[0.0, 0.0, 0.0, 0.0344103741912122, 0.00904430...","[0.0, 0.0, 0.0, 0.0300217717324319, 0.06891515...",...,"[0.0, 0.0, 0.0, 0.1528818094724874, 0.05941509...","[0.0, 0.0, 0.0, 0.5976081964535674, 0.33569853...","[0.0, 0.0, 0.0, 0.1015572440453905, 0.04580434...","[0.0, 0.0, 0.0, -0.9055532969635782, -0.709582...","[0.0049999999999999, 0.0049999999999999, 0.004...","[9.842672086828249e-11, 9.842672086828249e-11,...","[0.0, 0.0, 0.0, 1.0073867109950227, 0.00740300...","[0.0, 0.0, 0.0, 0.305, 0.31, 0.285, 0.305, 0.3...","[8.88178419700125e-19, 8.88178419700125e-19, 8...","[0.0, 0.0, 0.0, 0.0275689223057644, 0.11528822..."


In [7]:
# Split the preprocessed data to training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [9]:
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

In [15]:
len(X_test)

51

In [11]:
X_train = np.array([X[feature][i] for feature in X.keys() for i in range(len(X_train))])

In [14]:
X_train.shape

(10557, 192)

In [16]:
X_test = np.array([X[feature][i] for feature in X.keys() for i in range(len(X_test))])

In [17]:
X_test.shape

(3519, 192)

In [18]:
# Initialize the MinMaxScaler
scaler = MinMaxScaler()

In [20]:
X_train = scaler.fit_transform(
    X_train.reshape(153*69*192, -1)).reshape(153, 69, 192)

In [None]:
X_train.shape

In [21]:
X_test = scaler.transform(X_test.reshape(51*69*192, -1)).reshape(51, 69, 192)

In [59]:
rnn = Sequential()
# rnn.add(LSTM(units=50, input_shape=(X_train.shape[1], 1), activation='relu'))
rnn.add(LSTM(
    units = 128,
    input_shape = (69, 192)
    # return_sequences = True
))
rnn.add(Dense(units=1, activation='sigmoid'))
rnn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 128)               164352    
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 164481 (642.50 KB)
Trainable params: 164481 (642.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [60]:
# Compile the model
rnn.compile(
    loss = "binary_crossentropy",
    optimizer = "adam",
    metrics = ["accuracy"]
)

In [61]:
fit_model = rnn.fit(
    X_train,
    y_train,
    shuffle = True,
    epochs = 20,
    verbose = 1
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [62]:
# Evaluate the model using the test data
model_loss, model_accuracy = rnn.evaluate(
    X_test,
    y_test,
    verbose = 2
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2/2 - 0s - loss: 0.5878 - accuracy: 0.7255 - 188ms/epoch - 94ms/step
Loss: 0.5878248810768127, Accuracy: 0.7254902124404907


In [64]:
# Check the prediction's output probabilities
predicted_prob = rnn.predict(X_test)
clean_prob = np.round(predicted_prob, 2)

# Round to the nearest integer and flatten
clean_predicted = np.round(predicted_prob).astype(int).flatten()

# Convert to a dataframe for readability
output_prob = pd.DataFrame({
    'Actual': y_test,
    'Predicted': clean_predicted,
    'Probability': clean_prob.flatten()
})

output_prob.head(10)



Unnamed: 0,Actual,Predicted,Probability
177,0,1,0.72
196,1,1,0.72
140,1,1,0.72
64,1,1,0.72
137,1,1,0.72
37,0,1,0.72
95,0,1,0.72
32,1,1,0.72
188,1,1,0.72
108,1,1,0.72


In [None]:
X.shape

In [None]:
type(X_train.to_numpy())

In [None]:
X_train.to_numpy().shape

In [None]:
X_train.to_numpy()[0]

In [None]:
# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Reshape the 1D arrays into 2D arrays and apply the scaler
# X_train_reshaped = np.stack([x.reshape(-1, 1) for x in X_train.values.flatten()])
# X_train_scaled = scaler.fit_transform(X_train_reshaped)
# X_test_reshaped = np.stack([x.reshape(-1, 1) for x in X_test.values.flatten()])
# X_test_scaled = scaler.transform(X_test_reshaped)

# Reshape the 1D arrays into 2D arrays and apply the scaler
X_train_scaled = np.array([scaler.fit_transform(x.reshape(-1, 1)) for x in X_train.values.flatten()])
X_test_scaled = np.array([scaler.transform(x.reshape(-1, 1)) for x in X_test.values.flatten()])
                          
# Reshape the scaled 2D arrays back into 1D arrays
# X_train_scaled = np.concatenate([x.flatten() for x in X_train_scaled])
# X_test_scaled = np.concatenate([x.flatten() for x in X_test_scaled])
# X_train_scaled = X_train_scaled.reshape(153, 69, 192)
# X_test_scaled = X_test_scaled.reshape(51, 69, 192)

In [None]:
X_train_scaled

In [None]:
X_train_scaled.shape

In [None]:
X_test_scaled.shape

In [None]:
model = Sequential()
model.add(LSTM(units=50, input_shape=(X_train_scaled.shape[1], 1), activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

In [None]:
# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X_train_scaled, y_train, epochs=10, verbose=2)

In [None]:
# Reshape the data
X_train_reshaped = X_train.to_numpy().reshape((
    X_train.shape[0], # total number of samples
    -1 # total number flattened
))

X_test_reshaped = X_test.to_numpy().reshape((
    X_test.shape[0],
    -1
))

In [None]:
X_train_reshaped.shape

In [None]:
reshaped_array = np.array(X.apply(lambda x: np.array(x)).tolist())

In [None]:
min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# Flatten the 2D arrays into 1D arrays
flat_df = X.values.flatten()

# Apply the MinMaxScaler and StandardScaler to the flattened 1D arrays
flat_df_normalized = min_max_scaler.fit_transform(flat_df)
flat_df_standardized = standard_scaler.fit_transform(flat_df)

# Reshape the scaled 1D arrays back into 2D arrays
df_normalized = flat_df_normalized.reshape(X.shape)
df_standardized = flat_df_standardized.reshape(X.shape)

In [None]:
X.columns

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# Apply the MinMaxScaler and StandardScaler to each cell in the DataFrame
df_normalized = df.apply(lambda x: min_max_scaler.fit_transform(np.expand_dims(x, axis=0)), axis=1)
df_standardized = df.apply(lambda x: standard_scaler.fit_transform(np.expand_dims(x, axis=0)), axis=1)

In [None]:
def find_string(df, target):
    for i, item in df.iterrows():
        for j, elem in item.items():
            if isinstance(elem, np.ndarray):
                if target in elem:
                    print(f"Found '{target}' in {i}, {j}")
                else:
                    find_string(pd.DataFrame([elem]), target)
            else:
                print(f"Unexpected type: {type(elem)}")

In [None]:
find_string(X, 'voice094')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# Initialize the MinMaxScaler and StandardScaler
min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# Apply the MinMaxScaler and StandardScaler to the training data
X_train_normalized = X_train.apply(lambda x: min_max_scaler.fit_transform(x.values.reshape(1, -1)))
X_train_standardized = X_train.apply(lambda x: standard_scaler.fit_transform(x.values.reshape(1, -1)))

# Use the same scalers to transform the testing data
X_test_normalized = X_test.apply(lambda x: min_max_scaler.transform(x.values.reshape(1, -1)))
X_test_standardized = X_test.apply(lambda x: standard_scaler.transform(x.values.reshape(1, -1)))

In [None]:
# Combine the data and labels to a single 2D array
combined = np.column_stack((X.values, y))

In [None]:
# Split the preprocessed data to training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(
    combined[:, :-1],
    combined[:, -1],
    stratify = y
)

In [None]:
# Normalize training data to be between 0 and 1
X_scaler = StandardScaler()

# Scale the data
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.fit_transform(X_test)

In [None]:
# Flatten the features dataframe
flat_df = X.values.flatten().reshape((df.shape[0], -1))

In [None]:
# Split the preprocessed data to training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(flat_df, y, stratify=y)

In [None]:
# Normalize training data to be between 0 and 1
X_scaler = MinMaxScaler()

# Scale the data
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.fit_transform(X_test)

In [None]:
# Convert to numpy arrays
X_train_array = X_train.to_numpy()
X_test_array = X_test.to_numpy()

In [None]:
# Reshape the data
X_train_reshaped = X_train_array.reshape((X_train_array.shape[0], -1))
X_test_reshaped = X_test_array.reshape((X_test_array.shape[0], -1))

In [None]:
# Normalize training data to be between 0 and 1
X_scaler = MinMaxScaler()

# Scale the data
X_train_scaled = X_scaler.fit_transform(X_train_reshaped)
X_test_scaled = X_scaler.fit_transform(X_test_reshaped)

# # Reshape the data back to the original
# X_train_scaled = X_train_scaled.reshape(())

# X_test_scaled = X_test_scaled.reshape(())