In [1]:
import json
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
data_file = open('goemotions.json')
data = np.array(json.load(data_file))

sentences = data[:, 0]
y_emotions = data[:, 1]
y_sentiments = data[:, 2]

# 2.1 Collecting the words and their frequencies

In [3]:
cv = CountVectorizer(analyzer='word')
X = cv.fit_transform(sentences)

In [None]:
# size of the vocabulary
print(len(cv.vocabulary_))

# 2.2 Splitting the data in training and test sets

In [5]:
X_training, X_testing, y_training_emotions, y_testing_emotions, y_training_sentiments, y_testing_sentiments = train_test_split(X, y_emotions, y_sentiments, test_size=0.2, train_size=0.8, shuffle=False)

# 2.3.1 Base Multinomial Naive Bayes classifier

In [6]:
from sklearn.naive_bayes import MultinomialNB

mnb_classifer_emotions = MultinomialNB()
mnb_classifer_emotions.fit(X_training, y_training_emotions)

mnb_classifer_sentiments = MultinomialNB()
mnb_classifer_sentiments.fit(X_training, y_training_sentiments)

# 2.3.2 Base Decision Tree Classifier

In [8]:
from sklearn import tree
dt_classifer_emotions = tree.DecisionTreeClassifier()
dt_classifer_emotions.fit(X_training, y_training_emotions)

dt_classifer_sentiments = tree.DecisionTreeClassifier()
dt_classifer_sentiments.fit(X_training, y_training_sentiments)

# 2.3.3 Base Multi-Layered-Perceptron

In [44]:
from sklearn.neural_network import MLPClassifier

mlp_classifer_emotions = MLPClassifier(max_iter=1)
mlp_classifer_emotions.fit(X_training, y_training_emotions)

mlp_classifer_sentiments = MLPClassifier(max_iter=1)
mlp_classifer_sentiments.fit(X_training, y_training_sentiments)



# 2.3.4 Optimized Naive Bayes Classifier

In [12]:
# Creating NB classifiers with optimized alpha hyper params (2.3.4)
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {"alpha": [0.5, 0, 2]}

mnb_classifer_emotions_new = MultinomialNB()
top_mnb_classifer_emotions = GridSearchCV(mnb_classifer_emotions_new, param_grid)
top_mnb_classifer_emotions.fit(X_training, y_training_emotions)

mnb_classifer_sentiments_new = MultinomialNB()
top_mnb_classifer_sentiments = GridSearchCV(mnb_classifer_sentiments_new, param_grid)
top_mnb_classifer_sentiments.fit(X_training, y_training_emotions)



# 2.3.5 Optimized Decision Tree Classifier

In [38]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV

paramter_DT = {"criterion":["gini","entropy"],"max_depth":[5,10] , "min_samples_split":[2,5,10]}

dt_classifer_emotions_new = tree.DecisionTreeClassifier()
top_dt_classifer_emotions = GridSearchCV(dt_classifer_emotions_new, paramter_DT)
top_dt_classifer_emotions.fit(X_training,y_training_emotions)

dt_classifer_sentiments_new = tree.DecisionTreeClassifier()
top_dt_classifer_sentiments = GridSearchCV(dt_classifer_sentiments_new, paramter_DT)
top_dt_classifer_sentiments.fit(X_training,y_training_sentiments)

# 2.3.5 Optimized Multi-Layer Perceptron

In [None]:
from sklearn.model_selection import GridSearchCV

mlp_parameters = {
    'solver': ["adam", "sgd"],
    'hidden_layer_sizes' : [(10,5),(15,10)],
    'activation' : ["sigmoid", "relu", "tanh", "identity"]
}

top_mlp_classifier_emotions = GridSearchCV(estimator=MLPClassifier(max_iter=2), param_grid=mlp_parameters)
top_mlp_classifier_emotions.fit(X_training, y_training_emotions)

top_mlp_classifier_sentiments = GridSearchCV(estimator=MLPClassifier(max_iter=2), param_grid=mlp_parameters)
top_mlp_classifier_sentiments.fit(X_training, y_training_sentiments)



20 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/home/adrien/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/adrien/.local/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 762, in fit
    return self._fit(X, y, incremental=False)
  File "/home/adrien/.local/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 385, in _fit
    self._validate_hyperparameters()
  File "/home/adrien/.local/lib/python3.8/site-packages/sklearn/neural_network/

# 2.4 Performance Metrics

In [27]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## Base-MNB (2.3.1)

In [29]:
# Emotions
mnb_classifer_emotions_results = mnb_classifer_emotions.predict(X_testing)
print(confusion_matrix(y_testing_emotions, mnb_classifer_emotions_results))
print(classification_report(y_testing_emotions, mnb_classifer_emotions_results))

[[ 929   18    0   12   30    3    1    3    0    2   10    1    0    7
     0   44    0   13   42    0  949   10    0    4    0    0    1   10]
 [  43  360    4   16   18    0    1    1    2    1    7    0    0    4
     0    6    0   14    8    0  704    3    0    2    0    0    0    4]
 [  15    5  113   76    6    4    2    5    0    6   25   12    0    1
     0    6    0    1    1    0  714    2    0    2    0    1    1    1]
 [  25   25   51  119   35    5    6    7    0   10   27   11    0    1
     0   18    0    1    2    0 1326    3    0    2    0    1    8    6]
 [ 124   15    6   29  220    4    7    4    3    4   31    3    0    2
     1   15    0    4   14    0 1715   12    0   12    0    1    4    1]
 [  18    1    0    5   25   40    1    4    0    4    8    0    0    1
     0   18    0    4    6    0  557   17    0    2    0    2    3    0]
 [   9    4    2   10   15    0   51   23    0    1   13    0    0    0
     0    7    0    1    0    0  782    1    0    9    0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

    admiration       0.47      0.44      0.46      2089
     amusement       0.53      0.30      0.38      1198
         anger       0.32      0.11      0.17       999
     annoyance       0.19      0.07      0.10      1689
      approval       0.25      0.10      0.14      2231
        caring       0.27      0.06      0.09       716
     confusion       0.31      0.05      0.09       934
     curiosity       0.42      0.10      0.17      1181
        desire       0.31      0.03      0.06       450
disappointment       0.23      0.03      0.06       950
   disapproval       0.25      0.08      0.12      1585
       disgust       0.45      0.09      0.15       622
 embarrassment       0.10      0.00      0.01       283
    excitement       0.29      0.04      0.08       600
          fear       0.52      0.04      0.07       364
     gratitude       0.75      0.67      0.71      1409
         grief       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
# Sentiments
mnb_classifer_sentiments_results = mnb_classifer_sentiments.predict(X_testing)
print(confusion_matrix(y_testing_sentiments, mnb_classifer_emotions_results))
print(classification_report(y_testing_sentiments, mnb_classifer_sentiments_results))

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [  91    0   28   19   47   67    5   74  163    2    6   42    2    0
     7    1   23    0   12    8    0 3076    9    0    0   47    0    3
    64]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0

## Base-DT (2.3.2)

In [34]:
# Emotions
dt_classifer_emotions_results = dt_classifer_emotions.predict(X_testing)
print(confusion_matrix(y_testing_emotions, dt_classifer_emotions_results))
print(classification_report(y_testing_emotions, dt_classifer_emotions_results))

[[1175   46   12   54  116   16   11   18   16   14   24    6    3   48
     3   63    2   54   74    1  256   37    8   15    2    0    1   14]
 [  67  685   18   43   52    4   14   10    3    6   16    9    1   18
     3    5    0   50    8    1  148    8    1   11    0    5    3    9]
 [  32   24  351  142   35   13   27   18    3   31   42   33    7    5
     2    1    1    3    4    1  191    4    0    8    2    4    9    6]
 [  47   79  205  396   97   30   45   44   14   77   93   47   16   15
     3   13    1    9    7    4  391    6    0   21    1    4   14   10]
 [ 207   43   54  132  602   66   44   37   35   45   83   14   14   27
     9   21    1   40   34    8  620   37    5   28    5    2    7   11]
 [  45   16   18   45   81  190    9   12   10   13   20    2    2    4
     4   15    2    7   13    8  156   25    0    5    2    4    8    0]
 [  17   24   27   50   57    9  278  132    2   17   32    6    1    4
     1    4    2    2    4    3  240    1    0    8    0  

In [35]:
# Sentiments
dt_classifer_sentiments_results = dt_classifer_sentiments.predict(X_testing)
print(confusion_matrix(y_testing_sentiments, dt_classifer_emotions_results))
print(classification_report(y_testing_sentiments, dt_classifer_sentiments_results))

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [ 156    0  107  112  218  270   37  529  574   21   81  130   24   14
    61   11   25    7   32   17    0   12 1016   18    0    2  110    1
     9   13  189]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0    0    0  

## Base-MLP (2.3.3)

In [None]:
# Emotions
mlp_classifer_emotions_results = mlp_classifer_emotions.predict(X_testing)
print(confusion_matrix(y_testing_emotions, mlp_classifer_emotions_results))
print(classification_report(y_testing_emotions, mlp_classifer_emotions_results))

In [None]:
# Sentiments
mlp_classifer_sentiments_results = mlp_classifer_sentiments.predict(X_testing)
print(confusion_matrix(y_testing_sentiments, mlp_classifer_emotions_results))
print(classification_report(y_testing_sentiments, mlp_classifer_sentiments_results))

## Top-MNB (2.3.4)

In [36]:
# Emotions
top_mnb_classifer_emotions_results = top_mnb_classifer_emotions.predict(X_testing)
print(confusion_matrix(y_testing_emotions, top_mnb_classifer_emotions_results))
print(classification_report(y_testing_emotions, top_mnb_classifer_emotions_results))

[[1068   26    2   23   45    6    3   10    3    3   13    2    0   16
     2   53    0   32   51    0  678   23    1    8    0    0    3   18]
 [  44  528   11   25   27    1    3    6    2    3   14    2    0   12
     0    7    0   27   10    0  451    8    0   10    0    0    1    6]
 [  17   11  195   97   11    7    5   11    0   12   39   22    0    2
     0    6    0    2    4    0  540    5    0    4    0    1    3    5]
 [  38   43   97  186   60   11   13   17    2   26   70   26    0    8
     2   22    0    4    7    0 1003    8    0   13    0    1   17   15]
 [ 151   27   14   47  298   24   18   20    9   13   59    7    1    9
     5   33    0   16   23    1 1365   38    0   31    1    3   12    6]
 [  22    4    5   17   25  108    3    7    2    6   11    0    0    1
     0   24    0    7    8    0  422   31    0    4    0    2    7    0]
 [  13   13    6   14   17    0  122   60    0    3   27    2    0    1
     0    9    0    1    2    0  624    1    0   10    0  

In [37]:
# Sentiments
top_mnb_classifer_sentiments_results = top_mnb_classifer_sentiments.predict(X_testing)
print(confusion_matrix(y_testing_sentiments, top_mnb_classifer_emotions_results))
print(classification_report(y_testing_sentiments, top_mnb_classifer_sentiments_results))

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [ 122    0   54   35   81  104   22  186  307    6   21   73    6    1
    20    3   30    0   17   21    0    1 2400   18    0    0  107    0
     0    8  153]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0    0    0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

    admiration       0.00      0.00      0.00         0
     ambiguous       0.00      0.00      0.00      3796
     amusement       0.00      0.00      0.00         0
         anger       0.00      0.00      0.00         0
     annoyance       0.00      0.00      0.00         0
      approval       0.00      0.00      0.00         0
        caring       0.00      0.00      0.00         0
     confusion       0.00      0.00      0.00         0
     curiosity       0.00      0.00      0.00         0
        desire       0.00      0.00      0.00         0
disappointment       0.00      0.00      0.00         0
   disapproval       0.00      0.00      0.00         0
       disgust       0.00      0.00      0.00         0
 embarrassment       0.00      0.00      0.00         0
    excitement       0.00      0.00      0.00         0
          fear       0.00      0.00      0.00         0
     gratitude       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Top-DT (2.3.5)

In [39]:
# Emotions
top_dt_classifer_emotions_results = top_dt_classifer_emotions.predict(X_testing)
print(confusion_matrix(y_testing_emotions, top_dt_classifer_emotions_results))
print(classification_report(y_testing_emotions, top_dt_classifer_emotions_results))

[[  680    12     0     1     0     0     0     0     0     0     0     0
      0     0     0    11     0     0    59     0  1303    20     0     0
      0     3     0     0]
 [   31   378     0     1     0     0     0     0     0     0     0     0
      0     0     0     4     0     0    16     0   757     5     0     0
      1     5     0     0]
 [   17     4     1     1     0     0     0     0     0     0     1     0
      0     0     0     3     0     0     6     0   955     4     0     0
      0     7     0     0]
 [   25    27     0     2     0     0     0     0     0     0     1     1
      0     0     0    10     0     0     9     0  1599     6     0     0
      1     8     0     0]
 [  113    15     0     0     0     0     0     1     0     0     0     1
      0     0     0     8     0     0    31     0  2029    25     0     0
      0     8     0     0]
 [   35     7     0     0     0     2     0     0     0     0     0     0
      0     0     0     3     0     0    18     0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

    admiration       0.43      0.33      0.37      2089
     amusement       0.52      0.32      0.39      1198
         anger       0.50      0.00      0.00       999
     annoyance       0.18      0.00      0.00      1689
      approval       0.00      0.00      0.00      2231
        caring       0.25      0.00      0.01       716
     confusion       0.00      0.00      0.00       934
     curiosity       0.12      0.00      0.00      1181
        desire       0.00      0.00      0.00       450
disappointment       0.00      0.00      0.00       950
   disapproval       0.00      0.00      0.00      1585
       disgust       0.25      0.00      0.00       622
 embarrassment       0.00      0.00      0.00       283
    excitement       0.00      0.00      0.00       600
          fear       0.00      0.00      0.00       364
     gratitude       0.89      0.72      0.79      1409
         grief       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
# Sentiments
top_dt_classifer_sentiments_results = top_dt_classifer_sentiments.predict(X_testing)
print(confusion_matrix(y_testing_sentiments, top_dt_classifer_emotions_results))
print(classification_report(y_testing_sentiments, top_dt_classifer_sentiments_results))

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [   78     0    49     0     0     0     0     0     1     0     0     0
     15     0    24     0  3606    10     0     0    13]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0  

## Top-MLP (2.3.3)

In [None]:
# Emotions
top_mlp_classifer_emotions_results = top_mlp_classifer_emotions.predict(X_testing)
print(confusion_matrix(y_testing_emotions, top_mlp_classifer_emotions_results))
print(classification_report(y_testing_emotions, top_mlp_classifer_emotions_results))

In [None]:
# Sentiments
top_mlp_classifer_sentiments_results = top_mlp_classifer_sentiments.predict(X_testing)
print(confusion_matrix(y_testing_sentiments, top_mlp_classifer_emotions_results))
print(classification_report(y_testing_sentiments, top_mlp_classifer_sentiments_results))