In [476]:
import numpy as np
import pandas as pd
import json

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report

import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import GaussianNB

## Image Label (Neural Network)

In [None]:
"""
2 LAYER

class Net(nn.Module):

    def __init__(self, INPUT_SIZE):
        super(Net, self).__init__()
        # Input is output of resnet18 image classification model
        self.fc1 = nn.Linear(INPUT_SIZE, 500) 
        self.fc11 = nn.Linear(500, 100)
        self.fc2 = nn.Linear(100, 1)
        self.act_out = nn.Sigmoid()
        self.dropout = nn.Dropout(p=0.25)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc11(x))
        x = self.dropout(x)
        x = self.act_out(self.fc2(x))
        return x
"""

In [None]:
"""
3 LAYER

class Net(nn.Module):

    def __init__(self, INPUT_SIZE):
        super(Net, self).__init__()
        # Input is output of resnet18 image classification model
        self.fc1 = nn.Linear(INPUT_SIZE, 750) 
        self.fc11 = nn.Linear(750, 250)
        self.fc2 = nn.Linear(250, 100)
        self.fc3 = nn.Linear(100, 1)
        self.act_out = nn.Sigmoid()

        self.dropout = nn.Dropout(p=0.25)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc11(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.act_out(self.fc3(x))
        return x
"""

In [729]:

class Net(nn.Module):

    def __init__(self, INPUT_SIZE):
        super(Net, self).__init__()
        # Input is output of resnet18 image classification model
        self.fc1 = nn.Linear(INPUT_SIZE, 500) 
        self.fc11 = nn.Linear(500, 100)
        self.fc2 = nn.Linear(100, 1)
        self.act_out = nn.Sigmoid()
        self.dropout = nn.Dropout(p=0.25)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc11(x))
        x = self.dropout(x)
        x = self.act_out(self.fc2(x))
        return x


def load_dataset_df(label_path):
    label = pd.read_csv(label_path)
    label = label.drop(columns=['Unnamed: 0'])
    dataset = label

    # load embeddings
    with open('./dataset_work/embeddings.json') as f:
        embeddings = json.load(f)
    all_embeddings = pd.DataFrame.from_dict(embeddings, orient='index', columns=['embedding'])
    all_embeddings.index.name = 'id'
    dataset = pd.merge(dataset, all_embeddings, on='id', how='left')

    # load personalities
    with open('./dataset_work/personalities.json') as f:
        personalities = json.load(f)
    all_personalities = pd.DataFrame.from_dict(personalities, orient='index', columns=['playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness'])
    all_personalities.index.name = 'id'
    dataset = pd.merge(dataset, all_personalities, on='id', how='left')

    return dataset

def vectorize_dataset(df, with_personalities, with_embeddings):
    if with_personalities and with_embeddings:
        cols = df[['embedding','playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness']]
        x = cols.values
        embeddings = x[:,0]
        expanded = np.array(embeddings.tolist())
        x = np.concatenate((expanded, x[:,1:]), axis=1)
        x = np.array(x, dtype='float64')
        y = df['label'].values
    elif with_embeddings:
        cols = df[['embedding','playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness']]
        x = cols.values
        embeddings = x[:,0]
        expanded = np.array(embeddings.tolist())
        x = expanded
        y = df['label'].values
    else:
        cols = df[['embedding','playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness']]
        x = cols.values
        x = x[:,1:]
        x = np.array(x, dtype="float64")
        y = df['label'].values
    return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

def load(label_path, with_personalities, with_embeddings):
    df = load_dataset_df(label_path)
    return vectorize_dataset(df, with_personalities, with_embeddings)

def nn_pred(label_path):
    # load the weights, trained on only images
    # 3 layer train
    #PATH = "trained_models/BEST_MODEL_ALICE/alice_medium_images_True_personalities_False_optm_adam_loss_MSE_EPOCHS_100_BATCH_1000_REG_0.095_LR_0.0001_f1_0.788327619047619"
    # 2 layer train
    PATH = "trained_models/BEST_MODEL_ALICE/ALICE_2_LAYER_BEST_optm_adam_loss_MSE_EPOCHS_100_BATCH_1000_REG_0.09_LR_0.0001_f1_0.7920085561497326"
    train_x_t, train_y_t = load(label_path, with_personalities=False, with_embeddings=True)
    model = Net(train_x_t.shape[1])
    model.load_state_dict(torch.load(PATH))
    model.eval()
    return model(train_x_t)

In [730]:
nn_preds_t = nn_pred('dataset_work/labels/image_only/alice_train_personalityFalse_imageTrue_labels.csv')
train_i = load_dataset_df('./dataset_work/labels/image_only/alice_train_personalityFalse_imageTrue_labels.csv')

In [731]:
print(classification_report(nn_preds_t.round().detach().numpy(), train_i['label']))

              precision    recall  f1-score   support

         0.0       0.94      0.85      0.89       253
         1.0       0.73      0.89      0.80       122

    accuracy                           0.86       375
   macro avg       0.84      0.87      0.85       375
weighted avg       0.87      0.86      0.86       375



In [734]:
nn_preds = nn_pred('dataset_work/labels/image_only/alice_valid_personalityFalse_imageTrue_labels.csv')
valid_i = load_dataset_df('./dataset_work/labels/image_only/alice_valid_personalityFalse_imageTrue_labels.csv')
#valid_i = load_dataset_df('./dataset_work/labels/image_and_personality/alice_valid_personalityFalse_imageTrue_labels.csv')

In [735]:
print(classification_report(nn_preds.round().detach().numpy(), valid_i['label']))

              precision    recall  f1-score   support

         0.0       0.84      0.75      0.80        85
         1.0       0.57      0.70      0.63        40

    accuracy                           0.74       125
   macro avg       0.71      0.73      0.71       125
weighted avg       0.76      0.74      0.74       125



In [736]:
len(nn_preds)

125

## Personality Label (Decision Trees)

In [737]:
def load_dataset_p(label_path):
    label = pd.read_csv(label_path)
    label = label.drop(columns=['Unnamed: 0'])
    p_file = open('./dataset_work/personalities.json')
    personalities = json.load(p_file)
    p_file.close()
    all_personalities = pd.DataFrame.from_dict(personalities, orient='index', columns=['playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness'])
    all_personalities['id'] = all_personalities.index
    dataset = pd.merge(label, all_personalities, on='id', how='left')
    return dataset

In [738]:
# laod the personality datasets
train_p = load_dataset_p('./dataset_work/labels/personality_only/alice_train_personalityTrue_imageFalse_labels.csv')
valid_p = load_dataset_p('./dataset_work/labels/personality_only/alice_valid_personalityTrue_imageFalse_labels.csv')
test_p = load_dataset_p('./dataset_work/labels/personality_only/alice_test_personalityTrue_imageFalse_labels.csv')

In [739]:
def split_dataset(d_df):
    _feats = d_df[['playfulness', 'chase-proneness', 'curiosity', 'sociability', 'aggressiveness', 'shyness']]
    x = _feats.values
    return x, d_df['label'].values

In [740]:
X_train_p, y_train_p = split_dataset(train_p)
X_valid_p, y_valid_p = split_dataset(valid_p)
X_test_p, y_test_p = split_dataset(test_p)

In [934]:
# add regularization with 'lambda'
#personality_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42, n_estimators=35, scale_pos_weight=0.8, reg_lambda=3)

personality_model = GaussianNB() # likelihood of the features is assumed to be gaussian
personality_model.fit(X_train_p, y_train_p)

In [935]:
valid_preds = personality_model.predict(X_valid_p)
train_preds = personality_model.predict(X_train_p)
test_preds = personality_model.predict(X_test_p)
#p_probs = [v[1] for v in valid_preds] # probability of a output label '1'

In [936]:
sum(y_valid_p) / len(y_valid_p)

0.752

In [937]:
print(classification_report(train_preds, y_train_p))

              precision    recall  f1-score   support

           0       0.56      0.89      0.69        57
           1       0.98      0.87      0.92       318

    accuracy                           0.88       375
   macro avg       0.77      0.88      0.81       375
weighted avg       0.92      0.88      0.89       375



In [938]:
print(classification_report(valid_preds, y_valid_p))

              precision    recall  f1-score   support

           0       0.55      0.81      0.65        21
           1       0.96      0.87      0.91       104

    accuracy                           0.86       125
   macro avg       0.75      0.84      0.78       125
weighted avg       0.89      0.86      0.87       125



In [939]:
print(classification_report(test_preds, y_test_p))

              precision    recall  f1-score   support

           0       0.66      0.92      0.77        25
           1       0.98      0.88      0.93        99

    accuracy                           0.89       124
   macro avg       0.82      0.90      0.85       124
weighted avg       0.91      0.89      0.89       124



## Putting them together on images + personalities

In [940]:
def gen_final_data(nn_path, p_model, d_b, X_b, out):
    # create a 3-vector [breed, NN output, tree output] for the training set  
    nn_preds = nn_pred(nn_path)
    preds = p_model.predict_proba(X_b) # using the model trained on only personality data
    p_preds = [v[1] for v in preds]
    dog_ids = d_b['id'].values
    breed_ids = [i[1:i.find('_')] for i in dog_ids]
    
    dog_ids = d_b['id'].values
    breed_ids = [i[1:i.find('_')] for i in dog_ids]
    breed_to_a = {}
    ctr = 0
    for breed in breed_ids:
        if breed not in breed_to_a:
            breed_to_a[breed] = ctr
            ctr += 1

    processed_breed_ids = [breed_to_a[b] for b in breed_ids]
    
    #out['breed'] = processed_breed_ids
    out['p_out'] = p_preds
    out['img_out'] = nn_preds.detach().numpy()
    

    for breed in breed_to_a.keys():
        nc = []
        for i in breed_ids:
            if i == breed:
                #print('y')
                nc.append(1)
            else:
                nc.append(0)
        out[breed] = nc
 

In [941]:
train_b = load_dataset_df('./dataset_work/labels/image_and_personality/alice_train_personalityTrue_imageTrue_labels.csv')
valid_b = load_dataset_df('./dataset_work/labels/image_and_personality/alice_valid_personalityTrue_imageTrue_labels.csv')
test_b = load_dataset_df('./dataset_work/labels/image_and_personality/alice_test_personalityTrue_imageTrue_labels.csv')

X_train_b, y_train_b = split_dataset(train_b)
X_valid_b, y_valid_b = split_dataset(valid_b)
X_test_b, y_test_b = split_dataset(test_b)

In [942]:
BOTH_TRAIN_PATH = './dataset_work/labels/image_and_personality/alice_train_personalityTrue_imageTrue_labels.csv'
BOTH_VALID_PATH = './dataset_work/labels/image_and_personality/alice_valid_personalityTrue_imageTrue_labels.csv'
BOTH_TEST_PATH = './dataset_work/labels/image_and_personality/alice_test_personalityTrue_imageTrue_labels.csv'
train_master = pd.DataFrame()
valid_master = pd.DataFrame()
test_master = pd.DataFrame()
gen_final_data(BOTH_TRAIN_PATH, personality_model, train_b, X_train_b, train_master)
gen_final_data(BOTH_VALID_PATH, personality_model, valid_b, X_valid_b, valid_master)
gen_final_data(BOTH_TEST_PATH, personality_model, test_b, X_test_b, test_master)

train_master_X = train_master.values
valid_master_X = valid_master.values
test_master_X = test_master.values

In [943]:
test_master

Unnamed: 0,p_out,img_out,02097209,02099267,02091467,02087046,02096585,02106382,02107908,02091635,...,02088364,02100236,02094114,02115641,02099712,02100877,02106030,02113624,02110185,02093647
0,0.934127,0.561628,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.010844,0.270346,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.592269,0.633707,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.829378,0.296303,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.743417,0.575307,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,0.856508,0.446371,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
120,0.397103,0.187637,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
121,0.520248,0.531577,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
122,0.917519,0.151803,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [971]:
master_model = xgb.XGBClassifier(objective="binary:logistic",  booster='gbtree', random_state=42, n_estimators=20, scale_pos_weight=1.15, reg_lambda=1.5)
#master_model = LogisticRegression()
master_model.fit(train_master_X, y_train_b)

In [972]:
sum(y_train_b) / len(y_train_b)

0.41333333333333333

In [973]:
final_train_preds = master_model.predict(train_master_X)

In [974]:
print(classification_report(y_train_b, final_train_preds))

              precision    recall  f1-score   support

           0       0.95      0.92      0.94       220
           1       0.89      0.93      0.91       155

    accuracy                           0.93       375
   macro avg       0.92      0.93      0.92       375
weighted avg       0.93      0.93      0.93       375



In [975]:
final_valid_preds = master_model.predict(valid_master_X)

In [976]:
print(classification_report(y_valid_b, final_valid_preds))

              precision    recall  f1-score   support

           0       0.73      0.73      0.73        79
           1       0.54      0.54      0.54        46

    accuracy                           0.66       125
   macro avg       0.64      0.64      0.64       125
weighted avg       0.66      0.66      0.66       125



In [977]:
final_test_preds = master_model.predict(test_master_X)

In [978]:
sum(final_test_preds) / len(final_test_preds)

0.4596774193548387

In [979]:
print(classification_report(y_test_b, final_test_preds))

              precision    recall  f1-score   support

           0       0.85      0.76      0.80        75
           1       0.68      0.80      0.74        49

    accuracy                           0.77       124
   macro avg       0.77      0.78      0.77       124
weighted avg       0.78      0.77      0.78       124



In [870]:
# Vrushank to-do: 
# refactor data, use k-fold