Google Colab link : https://colab.research.google.com/drive/120IqNdb_44K1kSgqTTsZosf7VOaLO0ki?usp=sharing


# Preliminaries

In [85]:
from google.colab import drive

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import LSTM, GRU, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

Dataset link : https://www.kaggle.com/datasets/tunguz/big-five-personality-test

In [3]:
dataset = pd.read_csv("/content/drive/MyDrive/SEM4/Research Method/RM Kel 19 Experiment/data-final.csv", sep='\t')

print(dataset.shape)
dataset.head()

(1015341, 110)


Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,dateload,screenw,screenh,introelapse,testelapse,endelapse,IPC,country,lat_appx_lots_of_err,long_appx_lots_of_err
0,4.0,1.0,5.0,2.0,5.0,1.0,5.0,2.0,4.0,1.0,...,2016-03-03 02:01:01,768.0,1024.0,9.0,234.0,6,1,GB,51.5448,0.1991
1,3.0,5.0,3.0,4.0,3.0,3.0,2.0,5.0,1.0,5.0,...,2016-03-03 02:01:20,1360.0,768.0,12.0,179.0,11,1,MY,3.1698,101.706
2,2.0,3.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,5.0,...,2016-03-03 02:01:56,1366.0,768.0,3.0,186.0,7,1,GB,54.9119,-1.3833
3,2.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,1.0,4.0,...,2016-03-03 02:02:02,1920.0,1200.0,186.0,219.0,7,1,GB,51.75,-1.25
4,3.0,3.0,3.0,3.0,5.0,3.0,3.0,5.0,3.0,4.0,...,2016-03-03 02:02:57,1366.0,768.0,8.0,315.0,17,2,KE,1.0,38.0


# Data Preprocessing

In [4]:
data = dataset.drop(list(dataset)[50:], axis=1)

print(data.shape)
data.head()

(1015341, 50)


Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,OPN1,OPN2,OPN3,OPN4,OPN5,OPN6,OPN7,OPN8,OPN9,OPN10
0,4.0,1.0,5.0,2.0,5.0,1.0,5.0,2.0,4.0,1.0,...,5.0,1.0,4.0,1.0,4.0,1.0,5.0,3.0,4.0,5.0
1,3.0,5.0,3.0,4.0,3.0,3.0,2.0,5.0,1.0,5.0,...,1.0,2.0,4.0,2.0,3.0,1.0,4.0,2.0,5.0,3.0
2,2.0,3.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,5.0,...,5.0,1.0,2.0,1.0,4.0,2.0,5.0,3.0,4.0,4.0
3,2.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,1.0,4.0,...,4.0,2.0,5.0,2.0,3.0,1.0,4.0,4.0,3.0,3.0
4,3.0,3.0,3.0,3.0,5.0,3.0,3.0,5.0,3.0,4.0,...,5.0,1.0,5.0,1.0,5.0,1.0,5.0,3.0,5.0,5.0


In [5]:
for i in data.columns:
  data = data[(data[i].notna()) & (data[i] != 0)]

print(data.shape)

(874434, 50)


In [6]:
x = data.drop(columns=['EST9'])
y = data['EST9']

## Random oversampling

In [7]:
from imblearn.over_sampling import RandomOverSampler

In [8]:
data['EST9'].value_counts()

4.0    247851
2.0    199050
3.0    182001
5.0    133152
1.0    112380
Name: EST9, dtype: int64

In [9]:
oversampler = RandomOverSampler(random_state=42)

In [10]:
x_resampled, y_resampled = oversampler.fit_resample(x, y)

In [11]:
y_resampled.value_counts()

3.0    247851
1.0    247851
4.0    247851
2.0    247851
5.0    247851
Name: EST9, dtype: int64

## XGBoost feature selection

In [103]:
x_resampled = x_resampled[['EST6', 'EST8', 'AGR3', 'EST7', 'EST1', 'EST5', 'CSN9', 'EST10', 'AGR6', 'EST3']]

## Shaping dataset

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size = 0.1)

In [20]:
timesteps = 1
input_dim = 49

x_1 = np.resize(x, (x.shape[0], timesteps, input_dim))
x_train_1 = np.resize(x_train, (x_train.shape[0], timesteps, input_dim))
x_test_1 = np.resize(x_test, (x_test.shape[0], timesteps, input_dim))

In [21]:
y_train_1 = y_train - 1
y_test_1 = y_test - 1

In [22]:
y_train_1 = to_categorical(y_train_1)
y_test_1 = to_categorical(y_test_1)

print(y_test_1)

[[0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0.]
 ...
 [1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]]


# LSTM training

In [23]:
x_train_1.shape

(1115329, 1, 49)

In [24]:
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(timesteps, input_dim)))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))

In [25]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [26]:
model.fit(x_train_1, y_train_1, epochs=10, batch_size=256, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=7, min_delta=0.0001)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f29e5598d60>

In [27]:
model.evaluate(x_test_1,y_test_1)



[1.1262249946594238, 0.5293400883674622]

In [28]:
import sklearn
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score

In [29]:
y_pred_lstm = model.predict(x_test_1)
y_pred_lstm



array([[0.12208755, 0.37514737, 0.29477605, 0.17234486, 0.03564414],
       [0.01575274, 0.03641831, 0.09792258, 0.24976192, 0.60014445],
       [0.61960167, 0.28279737, 0.06901712, 0.02311829, 0.0054655 ],
       ...,
       [0.13014077, 0.34089932, 0.31266937, 0.167421  , 0.04886961],
       [0.25661787, 0.43983576, 0.21498619, 0.07263479, 0.01592533],
       [0.00977751, 0.03231284, 0.09643286, 0.41578683, 0.44569   ]],
      dtype=float32)

In [30]:
accuracy_score(np.argmax(y_test_1, axis=1), np.argmax(y_pred_lstm, axis=1))

0.5293400900537417

In [31]:
precision, recall, fscore, support = score(np.argmax(y_test_1, axis=1), np.argmax(y_pred_lstm, axis=1))

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print('################################')
print(sklearn.metrics.classification_report(np.argmax(y_test_1, axis=1), np.argmax(y_pred_lstm, axis=1)))

precision: [0.61343056 0.45085323 0.44825591 0.43297249 0.64494285]
recall: [0.72913811 0.4504333  0.31085526 0.43239179 0.72313502]
fscore: [0.66629828 0.45064317 0.36712069 0.43268194 0.68180441]
support: [24806 24694 24928 24605 24893]
################################
              precision    recall  f1-score   support

           0       0.61      0.73      0.67     24806
           1       0.45      0.45      0.45     24694
           2       0.45      0.31      0.37     24928
           3       0.43      0.43      0.43     24605
           4       0.64      0.72      0.68     24893

    accuracy                           0.53    123926
   macro avg       0.52      0.53      0.52    123926
weighted avg       0.52      0.53      0.52    123926



# GRU training

In [32]:
modelGRU = Sequential()
modelGRU.add(GRU(128, return_sequences=True, input_shape=(timesteps, input_dim)))
modelGRU.add(Dropout(0.5))
modelGRU.add(GRU(64))
modelGRU.add(Dropout(0.5))
modelGRU.add(Dense(5, activation='softmax'))

In [33]:
modelGRU.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [34]:
modelGRU.fit(x_train_1, y_train_1, epochs=10, batch_size=256, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=7, min_delta=0.0001)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f29bc10dd80>

In [35]:
modelGRU.evaluate(x_test_1,y_test_1)



[1.1233220100402832, 0.5316721200942993]

In [36]:
y_pred_gru = modelGRU.predict(x_test_1)
y_pred_gru



array([[0.15757632, 0.42506188, 0.2634018 , 0.13058974, 0.02337027],
       [0.01153164, 0.0355784 , 0.09409884, 0.2802484 , 0.5785427 ],
       [0.59029216, 0.26967525, 0.09362952, 0.03592737, 0.01047568],
       ...,
       [0.08428197, 0.38758084, 0.38790467, 0.12259613, 0.01763637],
       [0.21560532, 0.50036156, 0.20911445, 0.06388856, 0.01103005],
       [0.01442893, 0.04262112, 0.10296519, 0.42654178, 0.4134429 ]],
      dtype=float32)

In [37]:
accuracy_score(np.argmax(y_test_1, axis=1), np.argmax(y_pred_gru, axis=1))

0.531672126914449

In [38]:
precision, recall, fscore, support = score(np.argmax(y_test_1, axis=1), np.argmax(y_pred_gru, axis=1))

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print('################################')
print(sklearn.metrics.classification_report(np.argmax(y_test_1, axis=1), np.argmax(y_pred_gru, axis=1)))

precision: [0.61231225 0.43943966 0.44355119 0.44650774 0.69854265]
recall: [0.73296783 0.47764639 0.35855263 0.43389555 0.65468204]
fscore: [0.66722936 0.45774716 0.39654828 0.44011131 0.67590154]
support: [24806 24694 24928 24605 24893]
################################
              precision    recall  f1-score   support

           0       0.61      0.73      0.67     24806
           1       0.44      0.48      0.46     24694
           2       0.44      0.36      0.40     24928
           3       0.45      0.43      0.44     24605
           4       0.70      0.65      0.68     24893

    accuracy                           0.53    123926
   macro avg       0.53      0.53      0.53    123926
weighted avg       0.53      0.53      0.53    123926

