Google Colab link : https://colab.research.google.com/drive/120IqNdb_44K1kSgqTTsZosf7VOaLO0ki?usp=sharing


# Preliminaries

In [85]:
from google.colab import drive

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [86]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import LSTM, GRU, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

Dataset link : https://www.kaggle.com/datasets/tunguz/big-five-personality-test

In [87]:
dataset = pd.read_csv("/content/drive/MyDrive/SEM4/Research Method/RM Kel 19 Experiment/data-final.csv", sep='\t')

print(dataset.shape)
dataset.head()

(1015341, 110)


Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,dateload,screenw,screenh,introelapse,testelapse,endelapse,IPC,country,lat_appx_lots_of_err,long_appx_lots_of_err
0,4.0,1.0,5.0,2.0,5.0,1.0,5.0,2.0,4.0,1.0,...,2016-03-03 02:01:01,768.0,1024.0,9.0,234.0,6,1,GB,51.5448,0.1991
1,3.0,5.0,3.0,4.0,3.0,3.0,2.0,5.0,1.0,5.0,...,2016-03-03 02:01:20,1360.0,768.0,12.0,179.0,11,1,MY,3.1698,101.706
2,2.0,3.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,5.0,...,2016-03-03 02:01:56,1366.0,768.0,3.0,186.0,7,1,GB,54.9119,-1.3833
3,2.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,1.0,4.0,...,2016-03-03 02:02:02,1920.0,1200.0,186.0,219.0,7,1,GB,51.75,-1.25
4,3.0,3.0,3.0,3.0,5.0,3.0,3.0,5.0,3.0,4.0,...,2016-03-03 02:02:57,1366.0,768.0,8.0,315.0,17,2,KE,1.0,38.0


# Data Preprocessing

In [88]:
data = dataset.drop(list(dataset)[50:], axis=1)

print(data.shape)
data.head()

(1015341, 50)


Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,OPN1,OPN2,OPN3,OPN4,OPN5,OPN6,OPN7,OPN8,OPN9,OPN10
0,4.0,1.0,5.0,2.0,5.0,1.0,5.0,2.0,4.0,1.0,...,5.0,1.0,4.0,1.0,4.0,1.0,5.0,3.0,4.0,5.0
1,3.0,5.0,3.0,4.0,3.0,3.0,2.0,5.0,1.0,5.0,...,1.0,2.0,4.0,2.0,3.0,1.0,4.0,2.0,5.0,3.0
2,2.0,3.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,5.0,...,5.0,1.0,2.0,1.0,4.0,2.0,5.0,3.0,4.0,4.0
3,2.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,1.0,4.0,...,4.0,2.0,5.0,2.0,3.0,1.0,4.0,4.0,3.0,3.0
4,3.0,3.0,3.0,3.0,5.0,3.0,3.0,5.0,3.0,4.0,...,5.0,1.0,5.0,1.0,5.0,1.0,5.0,3.0,5.0,5.0


In [89]:
for i in data.columns:
  data = data[(data[i].notna()) & (data[i] != 0)]

print(data.shape)

(874434, 50)


In [90]:
x = data.drop(columns=['EST9'])
y = data['EST9']

## Random oversampling

In [91]:
from imblearn.over_sampling import RandomOverSampler

In [99]:
data['EST9'].value_counts()

4.0    247851
2.0    199050
3.0    182001
5.0    133152
1.0    112380
Name: EST9, dtype: int64

In [100]:
oversampler = RandomOverSampler(random_state=42)

In [101]:
x_resampled, y_resampled = oversampler.fit_resample(x, y)

In [102]:
y_resampled.value_counts()

3.0    247851
1.0    247851
4.0    247851
2.0    247851
5.0    247851
Name: EST9, dtype: int64

## XGBoost feature selection

In [103]:
x_resampled = x_resampled[['EST6', 'EST8', 'AGR3', 'EST7', 'EST1', 'EST5', 'CSN9', 'EST10', 'AGR6', 'EST3']]

## Shaping dataset

In [104]:
x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size = 0.1)

In [105]:
timesteps = 1
input_dim = 10

x_1 = np.resize(x, (x.shape[0], timesteps, input_dim))
x_train_1 = np.resize(x_train, (x_train.shape[0], timesteps, input_dim))
x_test_1 = np.resize(x_test, (x_test.shape[0], timesteps, input_dim))

In [106]:
y_train_1 = y_train - 1
y_test_1 = y_test - 1

In [107]:
y_train_1 = to_categorical(y_train_1)
y_test_1 = to_categorical(y_test_1)

print(y_test_1)

[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]]


# LSTM training

In [108]:
x_train_1.shape

(1115329, 1, 10)

In [109]:
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(timesteps, input_dim)))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))

In [110]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [111]:
model.fit(x_train_1, y_train_1, epochs=10, batch_size=256, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=7, min_delta=0.0001)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2a4d8c8eb0>

In [112]:
model.evaluate(x_test_1,y_test_1)



[1.1366690397262573, 0.5299453139305115]

In [None]:
import sklearn
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score

In [113]:
y_pred_lstm = model.predict(x_test_1)
y_pred_lstm



array([[0.8494539 , 0.1048221 , 0.0292339 , 0.01119855, 0.00529161],
       [0.4202517 , 0.07579874, 0.08195388, 0.08591467, 0.33608103],
       [0.3093462 , 0.48616105, 0.13897547, 0.05442817, 0.01108907],
       ...,
       [0.03561417, 0.13808198, 0.18595648, 0.33083284, 0.30951452],
       [0.44467765, 0.40566325, 0.11017065, 0.03250328, 0.00698514],
       [0.00304246, 0.00277093, 0.01063944, 0.07669214, 0.90685505]],
      dtype=float32)

In [114]:
accuracy_score(np.argmax(y_test_1, axis=1), np.argmax(y_pred_lstm, axis=1))

0.5299452899310879

In [115]:
precision, recall, fscore, support = score(np.argmax(y_test_1, axis=1), np.argmax(y_pred_lstm, axis=1))

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print('################################')
print(sklearn.metrics.classification_report(np.argmax(y_test_1, axis=1), np.argmax(y_pred_lstm, axis=1)))

precision: [0.62059317 0.4511665  0.4336694  0.44452314 0.66979505]
recall: [0.70993038 0.45559986 0.37476192 0.43140188 0.67630503]
fscore: [0.66226253 0.45337234 0.40206948 0.43786424 0.6730343 ]
support: [24994 24831 24677 24731 24693]
################################
              precision    recall  f1-score   support

           0       0.62      0.71      0.66     24994
           1       0.45      0.46      0.45     24831
           2       0.43      0.37      0.40     24677
           3       0.44      0.43      0.44     24731
           4       0.67      0.68      0.67     24693

    accuracy                           0.53    123926
   macro avg       0.52      0.53      0.53    123926
weighted avg       0.52      0.53      0.53    123926



# GRU training

In [116]:
modelGRU = Sequential()
modelGRU.add(GRU(128, return_sequences=True, input_shape=(timesteps, input_dim)))
modelGRU.add(Dropout(0.5))
modelGRU.add(GRU(64))
modelGRU.add(Dropout(0.5))
modelGRU.add(Dense(5, activation='softmax'))

In [117]:
modelGRU.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [118]:
modelGRU.fit(x_train_1, y_train_1, epochs=10, batch_size=256, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=7, min_delta=0.0001)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2a568c04c0>

In [119]:
modelGRU.evaluate(x_test_1,y_test_1)



[1.138167142868042, 0.5296628475189209]

In [120]:
y_pred_gru = modelGRU.predict(x_test_1)
y_pred_gru



array([[0.88122046, 0.08891194, 0.01888859, 0.00686503, 0.00411395],
       [0.42496186, 0.07020062, 0.07512976, 0.07969684, 0.35001084],
       [0.314827  , 0.4708372 , 0.13722014, 0.06411583, 0.0129998 ],
       ...,
       [0.0364347 , 0.14547902, 0.21133353, 0.33968982, 0.2670629 ],
       [0.45545197, 0.395923  , 0.10823404, 0.03313216, 0.0072588 ],
       [0.00235792, 0.00276659, 0.01149214, 0.07886389, 0.9045195 ]],
      dtype=float32)

In [121]:
accuracy_score(np.argmax(y_test_1, axis=1), np.argmax(y_pred_gru, axis=1))

0.5296628633216597

In [122]:
precision, recall, fscore, support = score(np.argmax(y_test_1, axis=1), np.argmax(y_pred_gru, axis=1))

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print('################################')
print(sklearn.metrics.classification_report(np.argmax(y_test_1, axis=1), np.argmax(y_pred_gru, axis=1)))

precision: [0.61939806 0.45717911 0.42062549 0.45224313 0.68265276]
recall: [0.71389133 0.43534292 0.41585282 0.42962274 0.65196614]
fscore: [0.66329622 0.44599389 0.41822554 0.44064282 0.66695667]
support: [24994 24831 24677 24731 24693]
################################
              precision    recall  f1-score   support

           0       0.62      0.71      0.66     24994
           1       0.46      0.44      0.45     24831
           2       0.42      0.42      0.42     24677
           3       0.45      0.43      0.44     24731
           4       0.68      0.65      0.67     24693

    accuracy                           0.53    123926
   macro avg       0.53      0.53      0.53    123926
weighted avg       0.53      0.53      0.53    123926

