In [6]:
import json
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [7]:
# Load the dataset. The context manager closes the file handle as soon as
# the JSON has been parsed (a bare open() call leaves it open for the whole
# kernel session).
with open('goemotions.json') as data_file:
    data = np.array(json.load(data_file))

# Split the records by column position: 0 -> text, 1 -> emotion label,
# 2 -> sentiment label.
sentences = data[:, 0]
y_emotions = data[:, 1]
y_sentiments = data[:, 2]

In [8]:
# Bag-of-words features (2.1): `cv` learns the word-level vocabulary from
# the sentences and `X` holds the resulting document/term count matrix.
cv = CountVectorizer(analyzer="word")
X = cv.fit_transform(sentences)

In [9]:
# Report how many distinct tokens the vectorizer learned (2.1).
vocabulary_size = len(cv.vocabulary_)
print(vocabulary_size)

30449


In [10]:
# 80/20 train/test split (2.2). The features and both label arrays are split
# in one call so their rows stay aligned; shuffle=False keeps the original
# row order, so the same rows land in each set on every run.
(
    X_training,
    X_testing,
    y_training_emotions,
    y_testing_emotions,
    y_training_sentiments,
    y_testing_sentiments,
) = train_test_split(
    X,
    y_emotions,
    y_sentiments,
    train_size=0.8,
    test_size=0.2,
    shuffle=False,
)

In [11]:
# Baseline Multinomial Naive Bayes models with default hyper-parameters (2.3.1)
from sklearn.naive_bayes import MultinomialNB

# fit() returns the fitted estimator, so construction and training can be
# chained; one model per target (emotion labels / sentiment labels).
mnb_classifer_emotions = MultinomialNB().fit(X_training, y_training_emotions)
mnb_classifer_sentiments = MultinomialNB().fit(X_training, y_training_sentiments)

# Trailing expression so the cell still echoes the fitted sentiment model.
mnb_classifer_sentiments

In [7]:
# Spot-check the baseline Naive Bayes models on the first three test rows,
# printing each prediction next to the label it should have produced.
# NOTE(review): the author flagged that these predictions look poor; three
# rows are only a smoke test, not an evaluation.
for i in range(3):
    test_row = X_testing[i]
    nb_emotion = mnb_classifer_emotions.predict(test_row)
    nb_sentiment = mnb_classifer_sentiments.predict(test_row)

    print("Output emotion from NB: ", nb_emotion)
    print("Expected emotion: ", y_testing_emotions[i])
    print("Output sentiment from NB: ", nb_sentiment)
    print("Expected sentiment: ", y_testing_sentiments[i])
    print()

Output emotion from NB:  ['neutral']
Expected emotion:  approval
Output sentiment from NB:  ['negative']
Expected sentiment:  positive

Output emotion from NB:  ['neutral']
Expected emotion:  surprise
Output sentiment from NB:  ['positive']
Expected sentiment:  ambiguous

Output emotion from NB:  ['neutral']
Expected emotion:  annoyance
Output sentiment from NB:  ['ambiguous']
Expected sentiment:  negative



In [8]:
# Base Decision Tree (2.3.2)
from sklearn import tree

# DecisionTreeClassifier permutes the candidate features at every split, so
# two fits on the same data can yield different trees unless the seed is
# pinned. A fixed random_state makes the cell reproducible on re-run.
DT_RANDOM_STATE = 0

dt_classifer_emotions = tree.DecisionTreeClassifier(random_state=DT_RANDOM_STATE)
dt_classifer_emotions.fit(X_training, y_training_emotions)

dt_classifer_sentiments = tree.DecisionTreeClassifier(random_state=DT_RANDOM_STATE)
dt_classifer_sentiments.fit(X_training, y_training_sentiments)

In [9]:
# Spot-check the baseline Decision Tree models on the first three test rows,
# printing each prediction next to the expected label.
for i in range(3):
    test_row = X_testing[i]
    dt_emotion = dt_classifer_emotions.predict(test_row)
    dt_sentiment = dt_classifer_sentiments.predict(test_row)

    print("Output emotion from DT: ", dt_emotion)
    print("Expected emotion: ", y_testing_emotions[i])
    print("Output sentiment from DT: ", dt_sentiment)
    print("Expected sentiment: ", y_testing_sentiments[i])
    print()

Output emotion from DT:  ['disgust']
Expected emotion:  approval
Output sentiment from DT:  ['negative']
Expected sentiment:  positive

Output emotion from DT:  ['excitement']
Expected emotion:  surprise
Output sentiment from DT:  ['positive']
Expected sentiment:  ambiguous

Output emotion from DT:  ['annoyance']
Expected emotion:  annoyance
Output sentiment from DT:  ['negative']
Expected sentiment:  negative



In [10]:
# TODO: section 2.3.3 is not implemented yet

In [None]:
# Creating NB classifiers with optimized alpha hyper params (2.3.4)
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Candidate smoothing values explored by the grid search.
# NOTE(review): alpha=0 disables smoothing; depending on the scikit-learn
# version it is clipped to a tiny positive value with a warning — confirm
# that 0 is really wanted in the grid.
param_grid = {"alpha": [0.5, 0, 2]}

# GridSearchCV clones the estimator it is given, so the already-fitted
# baseline models are only used as templates and are not modified.
# Each search is built from, and fitted on, the estimator and labels of its
# own task: emotions with emotions, sentiments with sentiments.
top_mnb_classifer_emotions = GridSearchCV(mnb_classifer_emotions, param_grid)
top_mnb_classifer_emotions.fit(X_training, y_training_emotions)

top_mnb_classifer_sentiments = GridSearchCV(mnb_classifer_sentiments, param_grid)
top_mnb_classifer_sentiments.fit(X_training, y_training_sentiments)



In [19]:
# Show the winning estimator of each grid search (emotions first, then
# sentiments), one per line.
for grid_search in (top_mnb_classifer_emotions, top_mnb_classifer_sentiments):
    print(grid_search.best_estimator_)

MultinomialNB(alpha=0.5)


In [None]:
# Spot-check the grid-searched Naive Bayes models on the first three test
# rows, printing each prediction next to the expected label.
# NOTE(review): the author flagged that these predictions look poor; three
# rows are only a smoke test, not an evaluation.
for i in range(3):
    test_row = X_testing[i]
    tuned_emotion = top_mnb_classifer_emotions.predict(test_row)
    tuned_sentiment = top_mnb_classifer_sentiments.predict(test_row)

    print("Output emotion from NB: ", tuned_emotion)
    print("Expected emotion: ", y_testing_emotions[i])
    print("Output sentiment from NB: ", tuned_sentiment)
    print("Expected sentiment: ", y_testing_sentiments[i])
    print()