In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV

import statistics
import csv
import xgboost as xgb
import re
import string
import numpy as np
import json

In [2]:
def load_dataset(label_path, personalities_path='./dataset_work/personalities.json'):
    """Load a label CSV and attach the personality scores of every sample.

    Parameters
    ----------
    label_path : str or path-like
        CSV file that contains at least an ``id`` column. An ``Unnamed: 0``
        column is dropped when it is present.
    personalities_path : str or path-like, optional
        JSON file mapping each sample id to a list of six scores, given in
        the order of ``trait_columns`` below. Defaults to the path this
        notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The rows of the label file with the six trait columns appended.
        The join is a left join on ``id``: ids that are missing from the
        personalities file keep their row and get NaN trait values.
    """
    trait_columns = ['playfulness', 'chase-proneness', 'curiosity',
                     'sociability', 'aggressiveness', 'shyness']

    # errors='ignore' keeps the function usable on CSVs that do not carry
    # the extra 'Unnamed: 0' column instead of raising KeyError.
    labels = pd.read_csv(label_path).drop(columns=['Unnamed: 0'], errors='ignore')

    # The context manager closes the file even when json.load raises.
    with open(personalities_path) as p_file:
        personalities = json.load(p_file)

    all_personalities = pd.DataFrame.from_dict(personalities, orient='index', columns=trait_columns)
    # Expose the JSON keys (the frame index) as a regular column to merge on.
    all_personalities['id'] = all_personalities.index

    return pd.merge(labels, all_personalities, on='id', how='left')

In [3]:
def split_dataset(d_df):
    """Separate a merged dataset into a feature matrix and a label vector.

    Parameters
    ----------
    d_df : pandas.DataFrame
        Frame holding the six trait columns and a ``label`` column.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` holds the values of the six trait columns in
        the fixed order listed below and ``y`` holds the values of the
        ``label`` column.
    """
    trait_order = (
        'playfulness',
        'chase-proneness',
        'curiosity',
        'sociability',
        'aggressiveness',
        'shyness',
    )
    # Selecting by an explicit list fixes the column order of the matrix.
    features = d_df[list(trait_order)].values
    targets = d_df['label'].values
    return features, targets

In [4]:
# Load the personality-only train / validation / test splits, in that order.
train, valid, test = map(load_dataset, (
    './dataset_work/labels/personality_only/alice_train_personalityTrue_imageFalse_labels.csv',
    './dataset_work/labels/personality_only/alice_valid_personalityTrue_imageFalse_labels.csv',
    './dataset_work/labels/personality_only/alice_test_personalityTrue_imageFalse_labels.csv',
))

In [5]:
# Sanity check: report how many rows each of the three splits contains.
print(f'train = {len(train)}, valid = {len(valid)}, test = {len(test)}')

train = 375, valid = 124, test = 125


In [6]:
# Display the merged training frame (id, label and the six trait columns).
train

Unnamed: 0,id,label,playfulness,chase-proneness,curiosity,sociability,aggressiveness,shyness
0,n02107908_3311,0,6,5,7,5,9,8
1,n02106030_17232,1,3,6,7,5,5,4
2,n02106030_16338,1,5,6,4,4,5,5
3,n02101006_1249,0,7,5,7,4,8,3
4,n02096585_6855,1,6,4,6,6,5,4
...,...,...,...,...,...,...,...,...
370,n02106030_19383,1,5,4,4,5,6,6
371,n02097298_3301,1,7,3,6,6,5,5
372,n02091635_452,0,10,6,3,4,6,4
373,n02097047_5257,1,7,3,5,2,5,5


In [7]:
# NOTE(review): y_train is rebound by the split_dataset(train) call in the
# following cell, so this assignment looks redundant — confirm before removing.
y_train = train['label']

In [8]:
# Turn each split into a (feature matrix, label vector) pair.
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = map(
    split_dataset, (train, valid, test)
)

In [40]:
# Binary classifier over the six personality traits.
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    n_estimators=50,
    # NOTE(review): 0.8 is an unexplained constant; presumably chosen to
    # offset the class imbalance — confirm how it was derived.
    scale_pos_weight=0.8,
    random_state=42,  # fixed seed so repeated fits are comparable
)
# Fit on the training split only.
xgb_model.fit(X_train, y_train)

In [41]:
# Predict class labels for the validation split with the fitted model.
valid_preds = xgb_model.predict(X_valid)

In [42]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_valid, valid_preds))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83        35
           1       0.93      0.93      0.93        89

    accuracy                           0.90       124
   macro avg       0.88      0.88      0.88       124
weighted avg       0.90      0.90      0.90       124



In [43]:
# Mean of the validation labels, i.e. the share of class 1 when labels are 0/1.
np.mean(y_valid)

0.717741935483871

In [32]:
# Predict class labels for the held-out test split.
# NOTE(review): this cell's execution count (32) is lower than that of the cell
# that fits xgb_model (40), so the recorded output may come from an earlier
# model — re-run top to bottom to confirm.
test_preds = xgb_model.predict(X_test)

In [33]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

           0       0.70      0.84      0.76        31
           1       0.94      0.88      0.91        94

    accuracy                           0.87       125
   macro avg       0.82      0.86      0.84       125
weighted avg       0.88      0.87      0.88       125



In [27]:
# Load the test labels from the image_and_personality label set and split them
# into features / labels the same way as the personality-only splits.
img_and_personality_test = load_dataset('./dataset_work/labels/image_and_personality/alice_test_personalityTrue_imageTrue_labels.csv')
X_iandp_test, y_iandp_test = split_dataset(img_and_personality_test)

In [28]:
# Score the image_and_personality test labels with xgb_model and report
# per-class metrics.
# NOTE(review): this cell's execution count (28) is lower than that of the cell
# that fits xgb_model (40), so the recorded output may come from an earlier
# model — re-run top to bottom to confirm.
iandp_test_preds = xgb_model.predict(X_iandp_test)
print(classification_report(y_iandp_test, iandp_test_preds))

              precision    recall  f1-score   support

           0       0.78      0.37      0.50        79
           1       0.43      0.83      0.57        46

    accuracy                           0.54       125
   macro avg       0.61      0.60      0.53       125
weighted avg       0.65      0.54      0.52       125

