##### Import


In [57]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from keras.utils.np_utils import to_categorical 
from tensorflow.keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization,\
Flatten, LSTM
# from scikeras.wrappers import KerasClassifier
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import load_model

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder

# Input files: the CSV used for training and the CSV of unseen samples.
DATASET_LINK = "3976184250-1655049260.csv"
TEST_DATA_LINK = "3976184250-1655049260-test.csv"

# Shape of one model input: TIMESTEPS steps of NUM_FEATURES values each.
TIMESTEPS = 1
NUM_FEATURES = 51

##### Initialise Seed and Load Dataset

In [58]:
# Random seed for reproducibility. NumPy alone is not enough: the Keras model
# draws from TensorFlow's global seed, so that is set as well.
seed = 10
np.random.seed(seed)
tf.random.set_seed(seed)

# Load the dataset.
df = pd.read_csv(DATASET_LINK)

# Reduce the subject labels to bare names. regex=False makes the patterns
# literal; interpreted as regular expressions the "." would match any
# character.
df['Subject'] = df['Subject'].str.replace('@yopmail.com', '', regex=False)
df['Subject'] = df['Subject'].str.replace('@gmail.com', '', regex=False)
df['Subject'] = df['Subject'].str.replace('alithnks', 'ali', regex=False)

# Move 'Subject' to the first column so the label sits at position 0.
column_to_reorder = df.pop('Subject')
df.insert(0, 'Subject', column_to_reorder)

# Remove rows that contain missing values.
df = df.dropna()

# Keep only the first occurrence of each column label. The boolean mask is
# applied with .loc because selecting by a list of labels would return every
# column that carries a repeated label.
df = df.loc[:, ~df.columns.duplicated()]

df.head()

Unnamed: 0,Subject,__id__,AvA,AvH,AvP,`DD.0.5`,`DD.1.6`,`DD.2.4`,`DD.4.8`,`DD.5.2`,...,`Size.9`,`UD.0.5`,`UD.1.6`,`UD.2.4`,`UD.4.8`,`UD.5.2`,`UD.6.7`,`UD.7.9`,`UD.8.1`,`UD.9.3`
0,aisha,32EyOh61auQrtt845qqu,30.386073,0.11732,0.207566,0.316371,0.525669,0.183837,0.574917,1.299633,...,29.588976,0.316371,0.525669,0.183837,0.574917,1.299633,0.333369,1.700769,0.949691,0.607718
1,aisha,4bA0GNx1E4csaTigdnUk,29.442656,0.11742,0.212496,0.349458,3.484031,0.219371,0.813483,0.342324,...,24.210612,0.349458,3.484031,0.219371,0.813483,0.342324,0.608039,0.625773,0.416927,0.908202
2,aisha,8XMHJ2b4PLrT9XU667dv,29.863961,0.098838,0.219027,0.282859,0.284138,0.192723,0.191222,0.224901,...,28.748535,0.282859,0.284138,0.192723,0.191222,0.224901,0.232967,0.466518,0.207866,0.426501
3,aisha,8eU5uqGw95a2Uao4QD08,31.369481,0.107199,0.287219,0.232834,0.208091,0.121858,0.141128,0.204205,...,27.236084,0.232834,0.208091,0.121858,0.141128,0.204205,0.183798,0.340828,0.183782,0.442837
4,aisha,AmDTWz3WvysQAblug2WG,30.882376,0.101829,0.261792,0.258754,0.21603,0.168339,0.59079,0.215137,...,29.588976,0.258754,0.21603,0.168339,0.59079,0.215137,0.226295,0.350204,0.19236,1.215338


##### Divide dataset into X and Y
##### Normalise features within range 0 (minimum) and 1 (maximum)


In [59]:
# Work on the raw array behind the frame.
dataset = df.values

# Column 0 holds the class label (Subject); columns 2 onward hold the
# features, cast to float. Column 1 is left out of both.
Y = dataset[:, 0]
X = dataset[:, 2:].astype(float)

# Number of samples per class, to check for class imbalance.
print(df.groupby(Y).size())

aisha    30
ali      30
ramla    40
saad     30
umer     30
dtype: int64


In [60]:
# OneHotEncoder needs a 2-D input, so turn the labels into a single column.
Y = np.reshape(Y, (-1, 1))

# Fit the encoder on the full label set; it is reused later to encode the
# train/test labels and to decode predictions.
encoder = OneHotEncoder()
encoder.fit(Y)

# Names of the one-hot columns, one per class.
print(encoder.get_feature_names_out())

# Shapes of the feature matrix and the label column.
print("X dataset shape: " + str(X.shape))
print("Y dataset shape: " + str(Y.shape))

['x0_aisha' 'x0_ali' 'x0_ramla' 'x0_saad' 'x0_umer']
X dataset shape: (160, 51)
Y dataset shape: (160, 1)


##### Preparing dataset

In [61]:
# Hold out 20% of the samples for testing; the other 80% are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, random_state=seed, test_size=0.2)

# Scale every feature to the range [0, 1]. The scaler is fitted on the
# training split only and then applied unchanged to the test split.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Cast to float32 and insert the TIMESTEPS axis so the arrays match the
# LSTM input_shape of (TIMESTEPS, NUM_FEATURES).
X_train = X_train.astype(np.float32).reshape(
    (X_train.shape[0], TIMESTEPS, X_train.shape[1]))
X_test = X_test.astype(np.float32).reshape(
    (X_test.shape[0], TIMESTEPS, X_test.shape[1]))

# One-hot encode the labels with the encoder fitted earlier.
y_train = encoder.transform(y_train).toarray()
y_test = encoder.transform(y_test).toarray()

# Width of the one-hot encoding, i.e. the number of output units of the model.
num_classes = y_train.shape[1]

print("X train shape: " + str(X_train.shape))
print("Y train shape: " + str(y_train.shape))
print("X test shape: " + str(X_test.shape))
print("Y test shape: " + str(y_test.shape))

X train shape: (128, 1, 51)
Y train shape: (128, 5)
X test shape: (32, 1, 51)
Y test shape: (32, 5)


### Create Model

In [62]:
def create_model():
    """Build and compile the stacked-LSTM classifier.

    The network is three LSTM layers (128, 128 and 64 units, each returning
    the full sequence), every one followed by Dropout(0.2) and
    BatchNormalization, then Flatten and a softmax Dense layer.

    Reads the notebook-level names TIMESTEPS, NUM_FEATURES and num_classes.

    Returns:
        A Sequential model compiled with categorical cross-entropy loss, the
        Adam optimizer and the accuracy metric.
    """
    stack = []
    for position, width in enumerate((128, 128, 64)):
        # Only the first layer declares the input shape.
        extra = {'input_shape': (TIMESTEPS, NUM_FEATURES)} if position == 0 else {}
        stack.append(LSTM(units=width, return_sequences=True, **extra))
        stack.append(Dropout(0.2))
        stack.append(BatchNormalization())

    # Collapse the (timesteps, units) output and classify with softmax.
    stack.append(Flatten())
    stack.append(Dense(num_classes, activation='softmax'))

    network = Sequential(stack)
    network.compile(optimizer='adam', loss='categorical_crossentropy',
                    metrics=['accuracy'])
    return network

##### Wrap Model in KerasClassifier

In [63]:
# Wrap the builder so the network can be used through the scikit-learn API;
# every fit trains for 100 epochs with mini-batches of 10 samples.
model = KerasClassifier(build_fn=create_model, batch_size=10, epochs=100)

### Perform KFold Validation

In [64]:
# 10 shuffled folds; the shuffle is seeded so the fold assignment is repeatable.
num_folds = 10
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)

##### Get Accuracy from KFold Validation

In [65]:
# Score the wrapped model on every fold of the training split.
# error_score="raise" surfaces a failing fold as an exception instead of
# recording a placeholder score.
# NOTE(review): X_train was already scaled with a scaler fitted on all of
# X_train, so each validation fold has influenced the scaling of its own
# training folds.
results = cross_val_score(
    model,
    X_train,
    y_train,
    cv=kfold,
    verbose=1,
    error_score="raise",
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.9min finished


##### Get Validation Accuracy

In [66]:
# Mean and spread of the per-fold scores, shown as percentages.
print(
    "Validation Accuracy of %.2f%% (with standard deviation of %.2f%%)"
    % (100 * results.mean(), 100 * results.std())
)

Validation Accuracy of 95.26% (with standard deviation of 6.39%)


##### Fit Model

In [67]:
# Fit the wrapped model on the whole training split.
# Early stopping watches the training loss: training halts once the loss has
# failed to improve by at least 0.001 for 50 consecutive epochs.
es = EarlyStopping(monitor='loss', mode='min', min_delta=0.001, patience=50,
                   verbose=0)

# Keras documents `callbacks` as a list of callback instances, so the single
# callback is wrapped in a list rather than passed bare.
model.fit(X_train, y_train, callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x15eac0040>

In [68]:
# The wrapper exposes the fitted Keras network as `.model`.
keras_model = model.model

# Print the layer-by-layer summary.
keras_model.summary()

# Persist the network to an HDF5 file.
keras_model.save("model/key_classifier.h5")

Model: "sequential_34"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_102 (LSTM)             (None, 1, 128)            92160     
                                                                 
 dropout_102 (Dropout)       (None, 1, 128)            0         
                                                                 
 batch_normalization_102 (Ba  (None, 1, 128)           512       
 tchNormalization)                                               
                                                                 
 lstm_103 (LSTM)             (None, 1, 128)            131584    
                                                                 
 dropout_103 (Dropout)       (None, 1, 128)            0         
                                                                 
 batch_normalization_103 (Ba  (None, 1, 128)           512       
 tchNormalization)                                   

In [69]:
# Reload the saved network into the wrapper, so the evaluation below runs on
# the file written to disk rather than on the in-memory model.
model.model = load_model("model/key_classifier.h5")

# The predictions are one-hot encoded so they can be compared with y_test.
# num_classes is passed explicitly: without it to_categorical sizes the output
# from the largest predicted index, giving too few columns whenever the last
# class is never predicted.
y_pred = model.predict(X_test)
y_pred = to_categorical(y_pred, num_classes=num_classes)

# Share of test samples whose predicted one-hot row matches the true row.
acc = accuracy_score(y_test, y_pred)
print("Testing accuracy: %.3f%%" % (acc*100))

Testing accuracy: 90.625%


In [70]:
# Import unseen data to check that the model works, cleaning it the same way
# as the training data.
pred_df = pd.read_csv(TEST_DATA_LINK)

# Reduce the subject labels to bare names. regex=False makes the patterns
# literal; interpreted as regular expressions the "." would match any
# character.
pred_df['Subject'] = pred_df['Subject'].str.replace('@yopmail.com', '', regex=False)
pred_df['Subject'] = pred_df['Subject'].str.replace('@gmail.com', '', regex=False)
pred_df['Subject'] = pred_df['Subject'].str.replace('alithnks', 'ali', regex=False)

# Move 'Subject' to the first column, matching the training frame layout.
column_to_reorder = pred_df.pop('Subject')
pred_df.insert(0, 'Subject', column_to_reorder)

# Drop incomplete rows, as was done for the training data, so missing values
# never reach the scaler or the model.
pred_df = pred_df.dropna()

pred_df.head()

Unnamed: 0,Subject,__id__,AvA,AvH,AvP,`DD.0.5`,`DD.1.6`,`DD.2.4`,`DD.4.8`,`DD.5.2`,...,`Size.9`,`UD.0.5`,`UD.1.6`,`UD.2.4`,`UD.4.8`,`UD.5.2`,`UD.6.7`,`UD.7.9`,`UD.8.1`,`UD.9.3`
0,aisha,32EyOh61auQrtt845qqu,30.386073,0.11732,0.207566,0.316371,0.525669,0.183837,0.574917,1.299633,...,29.588976,0.316371,0.525669,0.183837,0.574917,1.299633,0.333369,1.700769,0.949691,0.607718
1,aisha,4bA0GNx1E4csaTigdnUk,29.442656,0.11742,0.212496,0.349458,3.484031,0.219371,0.813483,0.342324,...,24.210612,0.349458,3.484031,0.219371,0.813483,0.342324,0.608039,0.625773,0.416927,0.908202
2,aisha,8XMHJ2b4PLrT9XU667dv,29.863961,0.098838,0.219027,0.282859,0.284138,0.192723,0.191222,0.224901,...,28.748535,0.282859,0.284138,0.192723,0.191222,0.224901,0.232967,0.466518,0.207866,0.426501
3,ali,x1FEHtJP9xDxTOeMGxGD,29.888982,0.088453,0.324981,0.267652,0.21547,0.12351,0.207815,0.200449,...,27.667969,0.267652,0.21547,0.12351,0.207815,0.200449,0.117066,0.200477,0.143154,0.207918
4,ali,y5GzBnwpg43QOeOzQ28R,27.80526,0.077486,0.211683,0.233024,0.932978,0.163148,0.449982,0.194738,...,27.667969,0.233024,0.932978,0.163148,0.449982,0.194738,0.165819,0.199959,0.192599,0.201954


In [71]:
# Raw array behind the unseen-data frame.
pred_dataset = pred_df.values

# Feature columns only (column 0 is Subject; column 1 is also left out).
pred_row = pred_df.iloc[:, 2:]

# True subject of the first 15 rows, for comparison with the predictions.
print(pred_df.iloc[0:15, 0:1])

# Scale with the scaler fitted on the training split, then cast to float32
# and insert the TIMESTEPS axis to match the model input.
pred_row = scaler.transform(pred_row.values.tolist())
pred_arr = pred_row.astype(np.float32).reshape(
    (pred_row.shape[0], TIMESTEPS, pred_row.shape[1]))

   Subject
0    aisha
1    aisha
2    aisha
3      ali
4      ali
5      ali
6    ramla
7    ramla
8    ramla
9     saad
10    saad
11    saad
12    umer
13    umer
14    umer


In [72]:
# Get the prediction for every unseen sample and decode it to a subject name.
# num_classes is passed explicitly: without it to_categorical sizes the output
# from the largest predicted index, and inverse_transform then fails whenever
# the last class is never predicted.
pred = model.predict(pred_arr)
pred = to_categorical(pred, num_classes=num_classes)
pred = encoder.inverse_transform(pred)

# inverse_transform returns one column; drop just that axis so a single-sample
# input still yields a 1-D array instead of collapsing to a 0-d scalar.
pred = np.squeeze(pred, axis=1)

# Highest class probability per sample. Despite the name, `acc` here is the
# model's confidence in each prediction, not an accuracy score.
pred_proba = model.predict_proba(pred_arr)
acc = np.max(pred_proba, axis=1)

print(pred)
print(acc)

['aisha' 'aisha' 'aisha' 'ali' 'ali' 'ali' 'ramla' 'ramla' 'ramla' 'saad'
 'saad' 'saad' 'umer' 'umer' 'umer']
[0.9995486  0.9374214  0.994348   0.9996966  0.99866736 0.9992955
 0.99937516 0.9998752  0.99844307 0.9988722  0.999752   0.9969543
 0.9999713  0.9999256  0.999519  ]
