In [261]:
import numpy as np
import pandas as pd
import json

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report

import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV


## Image Label (Neural Network)

In [466]:
class Net(nn.Module):
    def __init__(self, INPUT_SIZE):
        super(Net, self).__init__()
        # Input is output of resnet18 image classification model
        self.fc1 = nn.Linear(INPUT_SIZE, 500) 
        self.fc11 = nn.Linear(500, 100)
        self.fc2 = nn.Linear(100, 1)
        self.act_out = nn.Sigmoid()

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc11(x))
        x = self.act_out(self.fc2(x))
        return x


def load_dataset_df(label_path):
    label = pd.read_csv(label_path)
    label = label.drop(columns=['Unnamed: 0'])
    dataset = label

    # load embeddings
    with open('./dataset_work/embeddings.json') as f:
        embeddings = json.load(f)
    all_embeddings = pd.DataFrame.from_dict(embeddings, orient='index', columns=['embedding'])
    all_embeddings.index.name = 'id'
    dataset = pd.merge(dataset, all_embeddings, on='id', how='left')

    # load personalities
    with open('./dataset_work/personalities.json') as f:
        personalities = json.load(f)
    all_personalities = pd.DataFrame.from_dict(personalities, orient='index', columns=['playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness'])
    all_personalities.index.name = 'id'
    dataset = pd.merge(dataset, all_personalities, on='id', how='left')

    return dataset

def vectorize_dataset(df, with_personalities, with_embeddings):
    if with_personalities and with_embeddings:
        cols = df[['embedding','playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness']]
        x = cols.values
        embeddings = x[:,0]
        expanded = np.array(embeddings.tolist())
        x = np.concatenate((expanded, x[:,1:]), axis=1)
        x = np.array(x, dtype='float64')
        y = df['label'].values
    elif with_embeddings:
        cols = df[['embedding','playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness']]
        x = cols.values
        embeddings = x[:,0]
        expanded = np.array(embeddings.tolist())
        x = expanded
        y = df['label'].values
    else:
        cols = df[['embedding','playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness']]
        x = cols.values
        x = x[:,1:]
        x = np.array(x, dtype="float64")
        y = df['label'].values
    return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

def load(label_path, with_personalities, with_embeddings):
    df = load_dataset_df(label_path)
    return vectorize_dataset(df, with_personalities, with_embeddings)

def nn_pred(df):
    # load the weights, trained on only images
    PATH = "trained_models/alice_medium_images_True_personalities_False_optm_adam_loss_MSE_EPOCHS_150_BATCH_1000_REG_0.001_LR_0.0001_f1_0.7230682829943155"
    #train_x_t, train_y_t = load(label_path, with_personalities=False, with_embeddings=True)
    train_x_t, train_y_t = vectorize_dataset(df, with_personalities=False, with_embeddings=True)
    model = Net(train_x_t.shape[1])
    model.load_state_dict(torch.load(PATH))
    model.eval()
    return model(train_x_t)

In [263]:
nn_preds = nn_pred('dataset_work/labels/image_only/alice_train_personalityFalse_imageTrue_labels.csv')

In [264]:
len(nn_preds)

375

## Personality Label (Decision Trees)

In [418]:
def load_dataset_p(label_path):
    label = pd.read_csv(label_path)
    label = label.drop(columns=['Unnamed: 0'])
    p_file = open('./dataset_work/personalities.json')
    personalities = json.load(p_file)
    p_file.close()
    all_personalities = pd.DataFrame.from_dict(personalities, orient='index', columns=['playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness'])
    all_personalities['id'] = all_personalities.index
    dataset = pd.merge(label, all_personalities, on='id', how='left')
    return dataset

In [419]:
# laod the personality datasets
train_p = load_dataset_p('./dataset_work/labels/personality_only/alice_train_personalityTrue_imageFalse_labels.csv')
valid_p = load_dataset_p('./dataset_work/labels/personality_only/alice_valid_personalityTrue_imageFalse_labels.csv')
test_p = load_dataset_p('./dataset_work/labels/personality_only/alice_test_personalityTrue_imageFalse_labels.csv')

In [420]:
def split_dataset(d_df):
    _feats = d_df[['playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness']]
    x = _feats.values
    return x, d_df['label'].values

In [421]:
X_train_p, y_train_p = split_dataset(train_p)
X_valid_p, y_valid_p = split_dataset(valid_p)
X_test_p, y_test_p = split_dataset(test_p)

In [422]:
# add regularization with 'lambda'
personality_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42, n_estimators=50, scale_pos_weight=0.8)
personality_model.fit(X_train_p, y_train_p)

In [413]:
valid_preds = personality_model.predict_proba(X_valid_p)
p_probs = [v[1] for v in valid_preds] # probability of a output label '1'

## Putting them together on images + personalities

In [549]:
def split_dataset_full(d_df):
    _feats = d_df[['id', 'embedding','playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness']]
    x = _feats
    return x, d_df['label']

In [550]:
train_b = load_dataset_df('./dataset_work/labels/image_and_personality/alice_train_personalityTrue_imageTrue_labels.csv')
valid_b = load_dataset_df('./dataset_work/labels/image_and_personality/alice_valid_personalityTrue_imageTrue_labels.csv')
test_b = load_dataset_df('./dataset_work/labels/image_and_personality/alice_test_personalityTrue_imageTrue_labels.csv')

# X_train_b, y_train_b = split_dataset(train_b)
# X_valid_b, y_valid_b = split_dataset(valid_b)
# X_test_b, y_test_b = split_dataset(test_b)

In [551]:
a = train_b._append(valid_b)._append(test_b)

In [552]:
print(len(pd.unique(a['id']))  == len(a))

True


In [553]:
all_data_X, all_data_y = split_dataset_full(a)
#all_data_X

In [554]:
eng_X_train, eng_X_test, eng_y_train, eng_y_test = train_test_split(all_data_X, all_data_y, test_size=0.3, random_state=30)

In [555]:
# predictions for the neural network

idf = eng_X_train
idf['label'] = eng_y_train
len(nn_pred(idf))

436

In [556]:
p_out = personality_model.predict_proba(eng_X_train[['playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness']])
len(p_out)

436

In [557]:
def gen_final_data(p_model, together_df, out):
    img_preds = nn_pred(together_df)
    personality_preds = personality_model.predict_proba(together_df[['playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness']])
    personality_preds = [v[1] for v in personality_preds]
    
    dog_ids = together_df['id'].values
    breed_ids = [i[1:i.find('_')] for i in dog_ids]
    
    dog_ids = together_df['id'].values
    breed_ids = [i[1:i.find('_')] for i in dog_ids]
    breed_to_a = {}
    ctr = 0
    for breed in breed_ids:
        if breed not in breed_to_a:
            breed_to_a[breed] = ctr
            ctr += 1

    processed_breed_ids = [breed_to_a[b] for b in breed_ids]
    out['breed'] = processed_breed_ids
    out['p_out'] = personality_preds
    out['img_out'] = img_preds.detach().numpy()

In [558]:
train_master = pd.DataFrame()
test_master = pd.DataFrame()

together_train = eng_X_train
together_train['label'] = eng_y_train

together_test = eng_X_test
together_test['label'] = eng_y_test

gen_final_data(personality_model, together_train, train_master)
gen_final_data(personality_model, together_test, test_master)
len(test_master) + len(train_master) == len(a)

True

In [559]:
master_model = xgb.XGBClassifier(objective="binary:logistic",  booster='gbtree', random_state=42, n_estimators=20, scale_pos_weight=1, reg_lambda=1.1)
master_model.fit(train_master, eng_y_train)

In [560]:
master_pred_test = master_model.predict(train_master)
print(classification_report(eng_y_train, master_pred_test))

              precision    recall  f1-score   support

           0       0.94      0.93      0.93       260
           1       0.89      0.91      0.90       176

    accuracy                           0.92       436
   macro avg       0.92      0.92      0.92       436
weighted avg       0.92      0.92      0.92       436



In [561]:
master_preds = master_model.predict(test_master)
print(classification_report(eng_y_test, master_preds))

              precision    recall  f1-score   support

           0       0.77      0.82      0.79       114
           1       0.69      0.62      0.65        74

    accuracy                           0.74       188
   macro avg       0.73      0.72      0.72       188
weighted avg       0.74      0.74      0.74       188



## 

In [544]:
def gen_final_data(nn_path, p_model, d_b, X_b, out):
    # create a 3-vector [breed, NN output, tree output] for the training set  
    nn_preds = nn_pred(nn_path)
    preds = p_model.predict_proba(X_b) # using the model trained on only personality data
    p_preds = [v[1] for v in preds]
    dog_ids = d_b['id'].values
    breed_ids = [i[1:i.find('_')] for i in dog_ids]
    
    dog_ids = d_b['id'].values
    breed_ids = [i[1:i.find('_')] for i in dog_ids]
    breed_to_a = {}
    ctr = 0
    for breed in breed_ids:
        if breed not in breed_to_a:
            breed_to_a[breed] = ctr
            ctr += 1

    processed_breed_ids = [breed_to_a[b] for b in breed_ids]
    
    out['breed'] = processed_breed_ids
    out['p_out'] = p_preds
    out['img_out'] = nn_preds.detach().numpy()


In [545]:
BOTH_TRAIN_PATH = './dataset_work/labels/image_and_personality/alice_train_personalityTrue_imageTrue_labels.csv'
BOTH_VALID_PATH = './dataset_work/labels/image_and_personality/alice_valid_personalityTrue_imageTrue_labels.csv'
BOTH_TEST_PATH = './dataset_work/labels/image_and_personality/alice_test_personalityTrue_imageTrue_labels.csv'
train_master = pd.DataFrame()
valid_master = pd.DataFrame()
test_master = pd.DataFrame()
gen_final_data(BOTH_TRAIN_PATH, personality_model, train_b, X_train_b, train_master)
gen_final_data(BOTH_VALID_PATH, personality_model, valid_b, X_valid_b, valid_master)
gen_final_data(BOTH_TEST_PATH, personality_model, test_b, X_test_b, test_master)

train_master_X = train_master.values
valid_master_X = valid_master.values
test_master_X = test_master.values

TypeError: string indices must be integers, not 'list'

In [372]:
master_model = xgb.XGBClassifier(objective="binary:logistic",  booster='gbtree', random_state=42, n_estimators=20, scale_pos_weight=1 reg_lambda=1.1)
master_model.fit(train_master_X, y_train_b)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2252057175.py, line 1)

In [373]:
sum(y_train_b) / len(y_train_b)

0.41333333333333333

In [374]:
final_train_preds = master_model.predict(train_master_X)

In [375]:
print(classification_report(y_train_b, final_train_preds))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95       220
           1       0.95      0.90      0.92       155

    accuracy                           0.94       375
   macro avg       0.94      0.93      0.93       375
weighted avg       0.94      0.94      0.94       375



In [376]:
final_valid_preds = master_model.predict(valid_master_X)

In [377]:
print(classification_report(y_valid_b, final_valid_preds))

              precision    recall  f1-score   support

           0       0.92      0.75      0.82        75
           1       0.70      0.90      0.79        49

    accuracy                           0.81       124
   macro avg       0.81      0.82      0.80       124
weighted avg       0.83      0.81      0.81       124



In [378]:
final_test_preds = master_model.predict(test_master_X)

In [379]:
print(classification_report(y_test_b, final_test_preds))

              precision    recall  f1-score   support

           0       0.77      0.67      0.72        79
           1       0.54      0.65      0.59        46

    accuracy                           0.66       125
   macro avg       0.65      0.66      0.65       125
weighted avg       0.68      0.66      0.67       125



In [339]:
# Vrushank to-do: 
# refactor data, use k-fold