# Set up and Read in Data

In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix

import matplotlib as mpl
%matplotlib inline

from matplotlib import pyplot as plt

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem.
# NOTE(review): the results path used below is relative ('drive/MyDrive/...'),
# so it presumably resolves under this mount only while the working directory
# is /content - confirm.
colab_mount_point = '/content/drive'
drive.mount(colab_mount_point)

Mounted at /content/drive


In [None]:
# Folder that holds one prediction CSV per classifier (IE, JP, NS, TF).
BERT_MODEL_RESULTS_PATH = 'drive/MyDrive/4B Projects/MSCI 598 (Deep Learning for NLP) Project/Code/BERT Code/FINAL RUN'

# Read the four prediction files in a fixed order; each unpacked name receives
# the frame loaded from the file at the same position in the tuple.
df_ie_pred, df_jp_pred, df_ns_pred, df_tf_pred = (
  pd.read_csv(BERT_MODEL_RESULTS_PATH+csv_name)
  for csv_name in ("/IE_pred.csv", "/JP_pred.csv", "/NS_pred.csv", "/TF_pred.csv")
)

In [None]:
# Report the row count of each classifier's prediction frame so that a
# mismatch between the four files is visible before they are merged.
for frame_label, frame in (
  ("df_ie_pred: ", df_ie_pred),
  ("df_jp_pred: ", df_jp_pred),
  ("df_ns_pred: ", df_ns_pred),
  ("df_tf_pred: ", df_tf_pred),
):
  print(frame_label, len(frame))

df_ie_pred:  4621
df_jp_pred:  4621
df_ns_pred:  4621
df_tf_pred:  4621


In [None]:
# Columns that all four prediction files share; everything else in each file
# is that classifier's own prediction column. Keeping the list in one place
# guarantees the three merges below always join on exactly the same keys.
MERGE_KEYS = ["Unnamed: 0", "Unnamed: 0.1", "Unnamed: 0.1.1", "text", "role", "movie_clean", "mbti", "IE", "NS", "TF", "JP"]

# Attach the JP, NS and TF prediction columns to the IE frame one at a time.
# validate="one_to_one" makes pandas raise a MergeError if the key combination
# is duplicated on either side, instead of silently multiplying rows and
# skewing every accuracy figure computed from the merged frame.
df_ie_jp_pred = pd.merge(df_ie_pred, df_jp_pred, how="left", on=MERGE_KEYS, validate="one_to_one")
df_ie_jp_ns_pred = pd.merge(df_ie_jp_pred, df_ns_pred, how="left", on=MERGE_KEYS, validate="one_to_one")
df_ie_jp_ns_tf_pred = pd.merge(df_ie_jp_ns_pred, df_tf_pred, how="left", on=MERGE_KEYS, validate="one_to_one")


In [None]:
# Row count after merging; compare with the per-file counts printed earlier
# to confirm the merges neither dropped nor duplicated rows.
merged_row_count = len(df_ie_jp_ns_tf_pred.index)
print("df_ie_jp_ns_tf_pred: ", merged_row_count)

df_ie_jp_ns_tf_pred:  4621


In [None]:
# Display the merged frame: the shared columns followed by the four
# *_pred columns (IE_pred, JP_pred, NS_pred, TF_pred).
df_ie_jp_ns_tf_pred

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,text,role,movie_clean,mbti,IE,NS,TF,JP,IE_pred,JP_pred,NS_pred,TF_pred
0,0,860,6334,I feel wrong. I feel wrong being here.,Constanze Mozart,amadeus,ESFJ,0,0,0,1,0,1,1,0
1,1,861,6336,Yes!,Constanze Mozart,amadeus,ESFJ,0,0,0,1,0,1,1,1
2,2,862,6338,I want to go back to Vienna.,Constanze Mozart,amadeus,ESFJ,0,0,0,1,1,0,0,0
3,3,863,6340,I want to go!,Constanze Mozart,amadeus,ESFJ,0,0,0,1,1,1,1,0
4,4,864,6342,Please! Let me sit here. Let me stay here with...,Constanze Mozart,amadeus,ESFJ,0,0,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4616,4616,62696,300591,Take your time. Enjoy L.A.,Redfoot,the usual suspects,ESTP,0,0,1,0,0,0,1,0
4617,4617,62697,300592,I've got a ton of work and no good people.,Redfoot,the usual suspects,ESTP,0,0,1,0,1,0,0,0
4618,4618,63144,305902,:Oh,Nick Marshall,what women want,ENTP,0,1,1,0,0,0,1,1
4619,4619,63145,305904,:Well,Nick Marshall,what women want,ENTP,0,1,1,0,0,0,1,1


### Accuracy Values

In [None]:
def get_accuracy_for_category(y_test_category, y_pred_category):
  """Print and return the accuracy of one category's predictions.

  Args:
    y_test_category: True labels for the category.
    y_pred_category: Predicted labels, aligned element-wise with the true
      labels.

  Returns:
    The score computed by `accuracy_score` for the two label sequences. The
    same score is printed as a percentage with two decimal places.
  """
  score = accuracy_score(y_test_category, y_pred_category)
  score_percent = score * 100.0

  print("* Accuracy: %.2f%%" % score_percent)
  return score

In [None]:
# Score each classifier against its true-label column. The pairs are evaluated
# in the order listed, so the printed lines appear as IE, NS, TF, JP.
ie_accuracy, ns_accuracy, tf_accuracy, jp_accuracy = (
  get_accuracy_for_category(df_ie_jp_ns_tf_pred[true_col], df_ie_jp_ns_tf_pred[pred_col])
  for true_col, pred_col in (
    ("IE", "IE_pred"),
    ("NS", "NS_pred"),
    ("TF", "TF_pred"),
    ("JP", "JP_pred"),
  )
)

* Accuracy: 53.43%
* Accuracy: 48.73%
* Accuracy: 53.49%
* Accuracy: 50.27%


In [None]:
# Macro-average: the unweighted mean of the four per-classifier accuracies.
accuracy_total = ie_accuracy + jp_accuracy + ns_accuracy + tf_accuracy
macroavg_accuracy = accuracy_total / 4
macroavg_percent = macroavg_accuracy * 100.0
print("* Macroaverage Accuracy: %.2f%%" % macroavg_percent)

* Macroaverage Accuracy: 51.48%


In [None]:
# Number of rows each classifier predicted correctly. Summing the boolean
# comparison counts the True entries directly, so the result does not depend
# on missing values in any other column or on positional indexing into a
# label-indexed Series.
ie_correct = (df_ie_jp_ns_tf_pred["IE"] == df_ie_jp_ns_tf_pred["IE_pred"]).sum()
ns_correct = (df_ie_jp_ns_tf_pred["NS"] == df_ie_jp_ns_tf_pred["NS_pred"]).sum()
tf_correct = (df_ie_jp_ns_tf_pred["TF"] == df_ie_jp_ns_tf_pred["TF_pred"]).sum()
jp_correct = (df_ie_jp_ns_tf_pred["JP"] == df_ie_jp_ns_tf_pred["JP_pred"]).sum()

In [None]:
# Micro-average: pool the correct predictions of all four classifiers and
# divide by the total number of predictions made (four per row).
total_correct = ie_correct + ns_correct + tf_correct + jp_correct
total_predictions = 4*len(df_ie_jp_ns_tf_pred)
microavg_accuracy = total_correct / total_predictions
print("* Microaverage Accuracy: %.2f%%" % (microavg_accuracy * 100.0))

* Microaverage Accuracy: 51.48%


In [None]:
df_results = df_ie_jp_ns_tf_pred

# A row is a full match only when all four classifiers agree with their
# true-label columns.
all_four_correct = (
  (df_results["IE"] == df_results["IE_pred"])
  & (df_results["NS"] == df_results["NS_pred"])
  & (df_results["TF"] == df_results["TF_pred"])
  & (df_results["JP"] == df_results["JP_pred"])
)

# Sum the mask instead of counting non-null "text" values: a fully matched row
# whose text is missing must still count, because the denominator below
# includes every row.
full_match = all_four_correct.sum()
print("* Full Match Accuracy: %.2f%%" % (full_match/len(df_ie_jp_ns_tf_pred) * 100.0))

* Full Match Accuracy: 6.36%
