Google Colab link: https://colab.research.google.com/drive/120IqNdb_44K1kSgqTTsZosf7VOaLO0ki?usp=sharing


# Preliminaries

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab runtime; the dataset is read from a path
# under this mount point.
DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import LSTM, GRU, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

Dataset link: https://www.kaggle.com/datasets/tunguz/big-five-personality-test

In [None]:
# Load the tab-separated questionnaire export from the mounted Drive folder.
dataset_path = "/content/drive/MyDrive/SEM4/Research Method/RM Kel 19 Experiment/data-final.csv"
dataset = pd.read_csv(dataset_path, sep='\t')

# Quick sanity check: dimensions first, then the leading rows.
print(dataset.shape)
dataset.head()

# Data Preprocessing

In [None]:
# Keep only the first 50 columns (the item responses); every column from
# position 50 onwards is discarded.
trailing_columns = dataset.columns[50:]
data = dataset.drop(columns=trailing_columns)

print(data.shape)
data.head()

(1015341, 50)


Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,OPN1,OPN2,OPN3,OPN4,OPN5,OPN6,OPN7,OPN8,OPN9,OPN10
0,4.0,1.0,5.0,2.0,5.0,1.0,5.0,2.0,4.0,1.0,...,5.0,1.0,4.0,1.0,4.0,1.0,5.0,3.0,4.0,5.0
1,3.0,5.0,3.0,4.0,3.0,3.0,2.0,5.0,1.0,5.0,...,1.0,2.0,4.0,2.0,3.0,1.0,4.0,2.0,5.0,3.0
2,2.0,3.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,5.0,...,5.0,1.0,2.0,1.0,4.0,2.0,5.0,3.0,4.0,4.0
3,2.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,1.0,4.0,...,4.0,2.0,5.0,2.0,3.0,1.0,4.0,4.0,3.0,3.0
4,3.0,3.0,3.0,3.0,5.0,3.0,3.0,5.0,3.0,4.0,...,5.0,1.0,5.0,1.0,5.0,1.0,5.0,3.0,5.0,5.0


In [None]:
# Keep only fully answered rows: a row survives when every column is present
# (not NaN) and non-zero.
# A single vectorised mask selects the same rows as filtering column by column,
# without re-indexing (and copying) the whole frame once per column.
answered = data.notna() & (data != 0)
data = data[answered.all(axis=1)]

print(data.shape)

(874434, 50)


In [None]:
# EST9 is the prediction target; every other remaining column is a feature.
target_column = 'EST9'
y = data[target_column]
x = data.drop(target_column, axis=1)

## Random oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Class distribution of the target before any resampling.
target_counts = data['EST9'].value_counts()
target_counts

In [None]:
# Random oversampler with a fixed seed so the resampling is repeatable.
oversampler = RandomOverSampler(random_state=42)

In [None]:
# Oversample the full feature matrix / target so every EST9 class has the same
# number of rows.
# NOTE(review): the train/test split further down is built from the original
# x / y, so x_resampled / y_resampled do not reach the models — confirm whether
# that is intended. If the resampled data is meant to be used, oversampling
# should presumably be applied to the training split only, otherwise duplicated
# rows can end up in both train and test.
x_resampled, y_resampled = oversampler.fit_resample(x, y)

In [None]:
# Class distribution after oversampling.
resampled_counts = y_resampled.value_counts()
resampled_counts

3.0    247851
1.0    247851
4.0    247851
2.0    247851
5.0    247851
Name: EST9, dtype: int64

## XGBoost feature selection

In [None]:
# Hard-coded subset of ten feature columns to keep in the resampled matrix.
# NOTE(review): the code that ranked these features is not part of this
# notebook — presumably an earlier XGBoost importance run; confirm the source.
selected_features = [
    'EST6', 'EST8', 'AGR3', 'EST7', 'EST1',
    'EST5', 'CSN9', 'EST10', 'AGR6', 'EST3',
]
x_resampled = x_resampled[selected_features]

## Shaping dataset

In [None]:
# Hold out 10% of the rows for testing. The fixed random_state keeps the split
# identical across kernel restarts (same seed as the oversampler).
# NOTE(review): this splits the original x / y, so the oversampled and
# feature-selected x_resampled / y_resampled are not used for training or
# evaluation — confirm that this is intended.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

In [None]:
# The recurrent layers expect 3-D input: (samples, timesteps, features).
# Each row is treated as a sequence of length 1.
timesteps = 1
# Derive the feature count from the data instead of hard-coding it, so the
# tensors and the models' input_shape always agree with the feature matrix.
input_dim = x.shape[1]


def _as_sequences(frame):
    """Return `frame` as a (rows, timesteps, input_dim) array.

    Uses a strict reshape: unlike np.resize, which silently repeats or
    truncates values when the element count does not match, this raises a
    ValueError if the frame's width is incompatible with the target shape.
    """
    return np.asarray(frame).reshape(len(frame), timesteps, input_dim)


x_1 = _as_sequences(x)
x_train_1 = _as_sequences(x_train)
x_test_1 = _as_sequences(x_test)

In [None]:
# Shift the labels down by one so they start at 0 before one-hot encoding.
y_train_1, y_test_1 = y_train - 1, y_test - 1

In [None]:
# One-hot encode the zero-based class labels.
# Encoding directly from y_train / y_test (rather than from y_train_1 /
# y_test_1) keeps this cell idempotent: re-running it no longer one-hot encodes
# an already encoded matrix. The explicit num_classes pins the width to the
# 5 units of the softmax output layer even if a split happens to miss a class.
y_train_1 = to_categorical(y_train - 1, num_classes=5)
y_test_1 = to_categorical(y_test - 1, num_classes=5)

print(y_test_1)

[[0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 ...
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]]


# LSTM training

In [None]:
# Confirm the training tensor is (samples, timesteps, features).
np.shape(x_train_1)

(786990, 1, 49)

In [None]:
# Two stacked LSTM layers, each followed by dropout, ending in a 5-way softmax.
# The first LSTM returns the full sequence so the second one has a sequence
# to consume.
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(timesteps, input_dim)),
    Dropout(0.5),
    LSTM(64),
    Dropout(0.5),
    Dense(5, activation='softmax'),
])

In [None]:
# Multi-class setup: categorical cross-entropy on the one-hot targets, Adam
# optimiser, accuracy reported during training.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for up to 10 epochs, holding out 20% of the training data for
# validation; stop early if val_loss has not improved by at least 1e-4 for
# 7 consecutive epochs.
lstm_early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=7)
model.fit(
    x_train_1,
    y_train_1,
    batch_size=256,
    epochs=10,
    validation_split=0.2,
    callbacks=[lstm_early_stopping],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f23f208d180>

In [None]:
# Loss and accuracy of the LSTM on the held-out test split.
model.evaluate(x=x_test_1, y=y_test_1)



[1.1518917083740234, 0.5128768086433411]

In [None]:
import sklearn
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score

In [None]:
# Per-class softmax probabilities predicted by the LSTM for each test row.
y_pred_lstm = model.predict(x=x_test_1)
y_pred_lstm



array([[0.14376506, 0.35302565, 0.2334133 , 0.2246047 , 0.04519135],
       [0.0410495 , 0.4406325 , 0.31062928, 0.19262013, 0.01506868],
       [0.00420265, 0.04181146, 0.17969042, 0.6131048 , 0.16119064],
       ...,
       [0.272303  , 0.45826367, 0.18267886, 0.07699922, 0.00975527],
       [0.4574017 , 0.32994148, 0.11378101, 0.08011462, 0.01876132],
       [0.29034737, 0.33091387, 0.16857842, 0.16444144, 0.0457189 ]],
      dtype=float32)

In [None]:
# Collapse one-hot targets and softmax outputs to class indices, then score.
true_labels = y_test_1.argmax(axis=1)
lstm_labels = y_pred_lstm.argmax(axis=1)
accuracy_score(true_labels, lstm_labels)

0.5128768125886282

In [None]:
# Per-class precision / recall / F1 / support for the LSTM, followed by
# scikit-learn's formatted classification report. The class indices are
# computed once and reused for both.
true_labels = np.argmax(y_test_1, axis=1)
lstm_labels = np.argmax(y_pred_lstm, axis=1)
precision, recall, fscore, support = score(true_labels, lstm_labels)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print('################################')
print(sklearn.metrics.classification_report(true_labels, lstm_labels))

precision: [0.57152594 0.46276908 0.43656336 0.50538433 0.69284431]
recall: [0.59903554 0.55457967 0.3040271  0.60247897 0.49768431]
fscore: [0.58495749 0.50453165 0.35843587 0.5496769  0.57926829]
support: [11198 19925 18301 24849 13171]
################################
              precision    recall  f1-score   support

           0       0.57      0.60      0.58     11198
           1       0.46      0.55      0.50     19925
           2       0.44      0.30      0.36     18301
           3       0.51      0.60      0.55     24849
           4       0.69      0.50      0.58     13171

    accuracy                           0.51     87444
   macro avg       0.53      0.51      0.52     87444
weighted avg       0.52      0.51      0.51     87444



# GRU training

In [None]:
# Same layout as the LSTM model, with GRU cells instead: two stacked recurrent
# layers, each followed by dropout, ending in a 5-way softmax.
modelGRU = Sequential([
    GRU(128, return_sequences=True, input_shape=(timesteps, input_dim)),
    Dropout(0.5),
    GRU(64),
    Dropout(0.5),
    Dense(5, activation='softmax'),
])

In [None]:
# Multi-class setup: categorical cross-entropy on the one-hot targets, Adam
# optimiser, accuracy reported during training.
modelGRU.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for up to 10 epochs, holding out 20% of the training data for
# validation; stop early if val_loss has not improved by at least 1e-4 for
# 7 consecutive epochs.
gru_early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=7)
modelGRU.fit(
    x_train_1,
    y_train_1,
    batch_size=256,
    epochs=10,
    validation_split=0.2,
    callbacks=[gru_early_stopping],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f23214aead0>

In [None]:
# Loss and accuracy of the GRU on the held-out test split.
modelGRU.evaluate(x=x_test_1, y=y_test_1)



[1.1541283130645752, 0.5131055116653442]

In [None]:
# Per-class softmax probabilities predicted by the GRU for each test row.
y_pred_gru = modelGRU.predict(x=x_test_1)
y_pred_gru



array([[0.18099582, 0.31237036, 0.21597154, 0.22785175, 0.06281058],
       [0.04033691, 0.4025733 , 0.31611332, 0.22124636, 0.01973002],
       [0.00891864, 0.05591503, 0.15661131, 0.5469789 , 0.23157607],
       ...,
       [0.27708575, 0.48245645, 0.16222192, 0.06856579, 0.00967013],
       [0.38952866, 0.34466738, 0.13804354, 0.10265088, 0.02510963],
       [0.30249047, 0.26654583, 0.16528188, 0.18818969, 0.07749214]],
      dtype=float32)

In [None]:
# Collapse one-hot targets and softmax outputs to class indices, then score.
true_labels = y_test_1.argmax(axis=1)
gru_labels = y_pred_gru.argmax(axis=1)
accuracy_score(true_labels, gru_labels)

0.5131055303965967

In [None]:
# Per-class precision / recall / F1 / support for the GRU, followed by
# scikit-learn's formatted classification report. The class indices are
# computed once and reused for both.
true_labels = np.argmax(y_test_1, axis=1)
gru_labels = np.argmax(y_pred_gru, axis=1)
precision, recall, fscore, support = score(true_labels, gru_labels)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print('################################')
print(sklearn.metrics.classification_report(true_labels, gru_labels))

precision: [0.60801257 0.46953282 0.45396433 0.48191743 0.70362992]
recall: [0.55295588 0.55031368 0.24058795 0.68212    0.4827272 ]
fscore: [0.57917875 0.50672397 0.3145     0.56480232 0.57261224]
support: [11198 19925 18301 24849 13171]
################################
              precision    recall  f1-score   support

           0       0.61      0.55      0.58     11198
           1       0.47      0.55      0.51     19925
           2       0.45      0.24      0.31     18301
           3       0.48      0.68      0.56     24849
           4       0.70      0.48      0.57     13171

    accuracy                           0.51     87444
   macro avg       0.54      0.50      0.51     87444
weighted avg       0.52      0.51      0.50     87444

