# EmotionX
Best model for Friends dataset <br>
Author: Andrew Nguyen <br>
Date: 9/6/2019 

Summary: <br>
Using the target utterance only (utterance2) <br>
-> feature selection using TF-IDF (vocabulary ranked on utterance1 + utterance2 tokens) <br>
-> one hot encoding <br>
-> linearSVM

In [9]:
# OPTIONAL: Load the "autoreload" extension so that code changes are picked up without restarting the kernel
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Adding project root and custom functions
from os import listdir, getcwd
from os.path import isfile, join
import sys
from pathlib import Path

def find_root_dir():
    """Return the nearest ancestor directory of the CWD that contains config.py.

    The search starts at the parent of the current working directory (the
    working directory itself is not checked) and walks upward one level at a
    time.

    Returns:
        pathlib.Path of the first directory holding a regular file named
        "config.py".

    Raises:
        FileNotFoundError: if the filesystem root is reached without finding
            config.py.
    """
    start = Path(getcwd()).parent
    path = start

    while not isfile(join(path, "config.py")):
        # At the filesystem root, .parent returns the root itself; without
        # this check the loop would never terminate.
        if path.parent == path:
            raise FileNotFoundError(
                "config.py not found in any directory from {} upward".format(start)
            )
        path = path.parent

    return path

# Put the project root on sys.path so the `config` and `src` imports that
# follow can be resolved
ROOT_DIR = str(find_root_dir())
sys.path.append(ROOT_DIR)

from config import get_project_root
# custom functs
from src.features import build_features
from src.visualization.visualize import plot_confusion_matrix

In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import pickle
import copy

import random
from collections import Counter

from nltk import ngrams
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import Binarizer
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, classification_report, balanced_accuracy_score
from sklearn.utils.multiclass import unique_labels

## Prepare data

In [None]:
ROOT_PATH

In [None]:
ROOT_PATH = get_project_root()
TRAIN_PATH = ROOT_PATH / "data/raw/EmotionX2018/friends.train.json"
DEV_PATH = ROOT_PATH / "data/raw/EmotionX2018/friends.dev.json"
TEST_PATH = ROOT_PATH / "data/raw/EmotionX2018/friends.test.json"

df_train = build_features.to_df(TRAIN_PATH)
df_dev = build_features.to_df(DEV_PATH)
df_test = build_features.to_df(TEST_PATH)

In [None]:
# Tag every row with the split it came from so the merged frame can be
# separated again later
for frame, split_name in ((df_train, "train"), (df_dev, "dev"), (df_test, "test")):
    frame["split"] = split_name

In [None]:
# Stack the three splits into a single frame with a fresh 0..n-1 index
frames = [df_train, df_dev, df_test]
df = pd.concat(frames, ignore_index=True)
df

In [None]:
# Simplify the label set: keep only rows whose target emotion is one of
# these four classes
kept_emotions = ["neutral", "joy", "sadness", "anger"]
emotion_mask = df["emotion2"].isin(kept_emotions)
df = df[emotion_mask]

In [None]:
# Renumber rows 0..n-1 after the filter; later cells index rows of df and of
# the feature matrix by these integer labels
df = df.reset_index(drop=True)

## Preprocess

In [None]:
# Missing context utterances (previous / next turn) become empty strings so
# they can be tokenized like any other text
for context_col in ("utterance1", "utterance3"):
    df[context_col] = df[context_col].fillna("")

In [None]:
# Tokenize each utterance, drop stop words, and prefix every token with its
# utterance position ("1_", "2_", "3_") so the same word counts as a distinct
# feature in each position
tokenizer = TweetTokenizer(reduce_len=True, preserve_case=False)

# Built once: constructing this inside preappend re-read the NLTK stop-word
# corpus for every single utterance
STOP_WORDS = frozenset(stopwords.words('english'))


def preappend(sent, num):
    """Return the non-stop-word tokens of `sent`, each prefixed with `num`.

    sent: raw utterance text.
    num: string prepended to every kept token (e.g. "1_").
    """
    return [num + w for w in tokenizer.tokenize(sent) if w not in STOP_WORDS]


df.utterance1 = df.utterance1.apply(lambda sent: preappend(sent, "1_"))
df.utterance2 = df.utterance2.apply(lambda sent: preappend(sent, "2_"))
df.utterance3 = df.utterance3.apply(lambda sent: preappend(sent, "3_"))

In [None]:
# Join each row's utterance-1 tokens with its utterance-2 tokens into one list
temp = [first + second for first, second in zip(df["utterance1"], df["utterance2"])]
df["utterance12"] = temp

## TFIDF 

In [None]:
# Inspect the tokenized frame before building the vocabulary
df

In [None]:
# Vocabulary = every distinct prefixed token seen in the combined
# utterance1 + utterance2 token lists
ll = df.utterance12.values.tolist()
shared_vocab = {token for tokens in ll for token in tokens}

In [None]:
# Count the number of occurrences of each token within the corpus of each
# emotion class. Rows are emotions, columns are vocabulary tokens.
emotions = ["anger", "joy", "neutral", "sadness"]

# Tally per class with Counters instead of incrementing DataFrame cells one by
# one: counts_df[word][labl] += 1 is chained assignment, which pandas does not
# guarantee to write back into the frame (and which is very slow).
token_counts = {emotion: Counter() for emotion in emotions}
for tokens, labl in zip(df['utterance12'], df['emotion2']):
    # A label outside `emotions` raises KeyError, as the original update did
    token_counts[labl].update(tokens)

# Column order follows the iteration order of shared_vocab; later cells pair
# columns with tokens by iterating shared_vocab again.
vocab_columns = list(shared_vocab)
counts_df = pd.DataFrame(
    [[token_counts[emotion][word] for word in vocab_columns] for emotion in emotions],
    index=emotions,
    columns=vocab_columns,
)

In [None]:
# Treat each emotion class (one row of counts_df) as a single document and
# weight every vocabulary token within it
tfidf = TfidfTransformer()

# Densify so the per-token columns can be iterated directly
X_tfidf = tfidf.fit_transform(counts_df.values).toarray()

In [None]:
# For every token, score how strongly it favours one emotion: the gap between
# its highest and second-highest per-class TF-IDF weight. Also record which
# emotion had the highest weight.
# Labels come from counts_df's row index so they always match the rows of
# X_tfidf (the previous hard-coded list spelled "anger" as "angry").
emotion_labels = list(counts_df.index)

word_score = []
word_emote = []
for column in X_tfidf.T:
    # Ascending by weight: the last two entries are runner-up and winner
    ranked = sorted(zip(emotion_labels, column), key=lambda pair: pair[1])
    (_, runner_up_weight), (best_label, best_weight) = ranked[-2:]
    word_score.append(best_weight - runner_up_weight)
    word_emote.append(best_label)

In [None]:
# (token, score, winning emotion) triples; relies on shared_vocab iterating in
# the same order as the counts_df columns that were built from it
rank = list(zip(shared_vocab, word_score, word_emote))

In [None]:
# Largest score first, i.e. tokens with the widest top-two weight gap lead
rank.sort(key=lambda entry: entry[1], reverse=True)

In [None]:
# Tokens only, still ordered by descending score
vocab = [token for token, _score, _emotion in rank]

## Feature transform (onehot)

In [None]:
# Model input: the already-tokenized target utterance only
X = df.utterance2


def do_nothing(tokens):
    """Identity tokenizer: the utterances are already lists of tokens."""
    return tokens


# Keep the better-ranked half of the vocabulary as features
# (presumably fixed sizes tried earlier: [1000, 2500, 5000, 10000])
half = len(vocab) // 2
curvocab = vocab[:half]

# token_pattern=None: the pattern is never used when a custom tokenizer is
# supplied, and leaving the default makes scikit-learn emit a warning
freq = CountVectorizer(
    tokenizer=do_nothing,
    vocabulary=curvocab,
    preprocessor=None,
    lowercase=False,
    token_pattern=None,
)
X = freq.fit_transform(X)

# Collapse token counts to 0/1 presence indicators (dense array)
onehot = Binarizer()
X = onehot.fit_transform(X.toarray())

In [None]:
# Size of the full ranked vocabulary
len(vocab)

In [None]:
# Number of tokens in the vectorizer's vocabulary (built from curvocab, the
# first half of vocab)
len(freq.vocabulary_)

##  Train test split

In [None]:
# X is used as built in the feature-transform cell
# Labels and split tags come from df, in the same row order as X
y = df.emotion2
split = df.split

In [None]:
# Row labels of each partition: train + dev are used for fitting, test is
# held out. df was re-indexed 0..n-1, so these labels are also row positions
# in the dense matrix X.
trainIdx = split[split.isin(["train", "dev"])].index.tolist()
testIdx = split[split == "test"].index.tolist()

# One vectorised selection per partition instead of a Python-level loop that
# copied the matrix row by row
X_train = X[trainIdx]
X_test = X[testIdx]

# Explicit label-based lookup; kept as plain lists like before
y_train = y.loc[trainIdx].tolist()
y_test = y.loc[testIdx].tolist()

## Fit on train + dev data and eval on test data

In [None]:
# Linear SVM with library-default hyperparameters and a fixed random_state
clf = LinearSVC(random_state=0)

In [None]:
# Train on the train + dev rows only
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows
y_pred = clf.predict(X_test)

## Eval predictions

In [None]:
# np.set_printoptions(precision=2)

# Confusion matrix of test labels vs. predictions, drawn with the project's
# plot_confusion_matrix helper (normalize not set, so the helper's default applies)
plot_confusion_matrix(y_test, y_pred,
                      title='Confusion matrix, without normalization')
plt.show()

In [None]:
# Plot normalized confusion matrix for the same test predictions
# (normalize=True is handled inside the project's plotting helper)
plot_confusion_matrix(y_test, y_pred, normalize=True,
                      title='Normalized confusion matrix')

In [None]:
# Text summary of the test-split predictions from sklearn's classification_report
print(classification_report(y_test, y_pred))

## Micro F1

In [None]:
# Micro-averaged F1 on the test split, printed to four decimal places
micro_f1 = f1_score(y_test, y_pred, average='micro')
print(format(micro_f1, ".4f"))

## Fit and predict for eval

In [None]:
# Final model for the submission: a fresh classifier fit on every row of X,
# which at this point covers the train, dev and test splits together
clf = LinearSVC(random_state=0)
clf.fit(X,y)

In [None]:
# Load the evaluation file with the same loader used for the training splits
ROOT_PATH = get_project_root()
EVAL_PATH = ROOT_PATH / "data/raw/eval/friends_eval.json"
df_eval = build_features.to_df(EVAL_PATH)

def transform(df):
    """Tokenize the utterance columns of `df` like the training frame.

    Fills missing utterance1 values with "", replaces utterance1 and
    utterance2 with lists of lower-cased, stop-word-free tokens prefixed
    "1_" / "2_", and adds an "utterance12" column holding the two lists
    concatenated row by row.

    NOTE: `df` is modified in place; the returned frame is the same object.
    """
    # make nan utterances -> empty strings
    df.utterance1 = df.utterance1.fillna("")

    tokenizer = TweetTokenizer(reduce_len=True, preserve_case=False)
    # Built once per call instead of re-reading the NLTK stop-word corpus for
    # every utterance
    stop_words = frozenset(stopwords.words('english'))

    def preappend(sent, num):
        """Return the non-stop-word tokens of `sent`, each prefixed with `num`."""
        return [num + w for w in tokenizer.tokenize(sent) if w not in stop_words]

    df.utterance1 = df.utterance1.apply(lambda sent: preappend(sent, "1_"))
    df.utterance2 = df.utterance2.apply(lambda sent: preappend(sent, "2_"))

    # append utt1 and utt2 token lists together
    df["utterance12"] = [
        first + second for first, second in zip(df.utterance1, df.utterance2)
    ]

    return df

# transform() modifies and returns its argument, so df_evall is the same
# object as df_eval
df_evall = transform(df_eval)

In [None]:
def x(df, vocab):
    """Encode the target utterances of `df` as binary bag-of-token features.

    df: frame whose utterance2 column holds lists of tokens.
    vocab: list of tokens; only the first half is used as the feature
        vocabulary, one column per token in that order.

    Returns the dense 0/1 array produced by Binarizer, one row per utterance.
    """

    def do_nothing(tokens):
        """Identity tokenizer: the utterances are already lists of tokens."""
        return tokens

    # (presumably fixed sizes tried earlier: [1000, 2500, 5000, 10000])
    half = len(vocab) // 2
    curvocab = vocab[:half]

    # token_pattern=None: the pattern is never used when a custom tokenizer is
    # supplied, and leaving the default makes scikit-learn emit a warning
    freq = CountVectorizer(
        tokenizer=do_nothing,
        vocabulary=curvocab,
        preprocessor=None,
        lowercase=False,
        token_pattern=None,
    )
    counts = freq.fit_transform(df.utterance2)

    # Collapse token counts to 0/1 presence indicators
    onehot = Binarizer()
    return onehot.fit_transform(counts.toarray())

# NOTE: rebinds the global X (until now the training feature matrix) to the
# evaluation features
X = x(df_evall, vocab)

In [None]:
# Label the evaluation rows with the classifier refit on all of the data
y_pred = clf.predict(X)

In [None]:
# generate output file: reload the raw evaluation JSON so the predictions can
# be written back into its original nested structure
EVAL_PATH = ROOT_PATH / "data/raw/eval/friends_eval.json"

# Explicit UTF-8 so decoding does not depend on the platform default encoding.
# (The former `if file:` guard was always true for a path and is dropped.)
with open(EVAL_PATH, 'r', encoding='utf-8') as f:
    datastore = json.load(f)

In [None]:
# Write the predictions back into the nested JSON structure (datastore is
# indexed as datastore[dialogue][utterance]), consuming y_pred in that order.
n_utterances = sum(len(dialogue) for dialogue in datastore)
if n_utterances != len(y_pred):
    # Fail before touching datastore: a count mismatch means the labels would
    # be attached to the wrong utterances
    raise ValueError(
        "prediction count ({}) does not match utterance count ({})".format(
            len(y_pred), n_utterances
        )
    )

predictions = iter(y_pred)
for dialogue in datastore:
    for utterance in dialogue:
        utterance["emotion"] = next(predictions)

In [None]:
# Submitter details that accompany the predictions in the output file
mydets = {
    "name": "Andrew Nguyen",
    "email": "andrew.nguyen03@adelaide.edu.au"
}

# Output layout: [submitter details, labelled dialogues]
out = [mydets, datastore]

In [None]:
# Write the submission JSON under data/processed
OUT_PATH = ROOT_PATH / "data/processed/friends.submission.json"

# Make sure the output directory exists, e.g. on a fresh checkout
# (assumes ROOT_PATH is a pathlib.Path, as the `/` joins suggest)
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# Explicit UTF-8 so the file does not depend on the platform default encoding.
# (The former `if filename:` guard was always true for a path and is dropped.)
with open(OUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(out, f)

In [None]:
# Sanity check: compare the row count here with the number of predictions
df_eval.shape

In [None]:
# Sanity check: number of predictions made for the evaluation set
len(y_pred)