### Importing the libraries

In [None]:
import json
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Load the dataset from disk. The context manager guarantees the file handle
# is closed once the JSON has been parsed (or if parsing raises).
with open('goemotions.json') as data_file:
    data = json.load(data_file)

In [None]:
from gensim import downloader
from gensim.models import Word2Vec
from nltk.tokenize import WordPunctTokenizer

### Generate the tokenized words

In [None]:
#Converting Json array to Numpy Array
npData = np.array(data)

In [None]:
#Initializing the tokenizer
tokenizer = WordPunctTokenizer()

In [None]:
#Tokenize all the sentences in the dataset
sentences_tokenized = []
total_n_tokens = 0
for sentence in npData[:,0]:
    tokens = tokenizer.tokenize(sentence)
    sentences_tokenized.append(tokens)
    total_n_tokens+=len(tokens)
print(total_n_tokens)
print(len(sentences_tokenized))

In [None]:
#Checks if any sentences got lost
print(len(sentences_tokenized) == len(npData[:,0]))

In [None]:
#Put the tokens the entry way
tokenized_npData = []
for index in range(0,len(npData[:,0])):
    entry = [sentences_tokenized[index], npData[index,1], npData[index,2]]
    tokenized_npData.append(entry)

In [None]:
#Set it as numpy array, choosing the dtype to be an object
tokenized_npData = np.array(tokenized_npData,dtype=object)

## Using the Word Embedder Word2Vec

### Preparing the data

In [None]:
#Loading the pretrained model
word_embedder = downloader.load("word2vec-google-news-300")

In [None]:
#Variable counts the number of tokens of which their embeddings were generated
n_of_training_tokens_with_embeddings = 0

In [None]:
#Printing the embeddings for a post
word_embed_list = []
n_of_words = 0

for word in tokenized_npData[0][0]:
    try:
        word_embedded = word_embedder[word]
        word_embed_list.append(word_embedded)
        n_of_words = n_of_words + 1
        n_of_training_tokens_with_embeddings += 1
    except KeyError:
        print("Key error found for ", word)
            
if (len(word_embed_list) > 0):
    #Assumed to be of the same size
    sentence_embedded = np.zeros(len(word_embed_list[0]))
    
    #Compute the avg of the values of the tokens inside of the sentence
    for index in range(0,len(sentence_embedded)):
        for word_embedded in word_embed_list:
            sentence_embedded[index] += word_embedded[index]
        sentence_embedded[index] /= n_of_words
            
    entry = [sentence_embedded,tokenized_npData[0][1],tokenized_npData[0][2]]
    print(entry)

In [None]:
# Embed every post as the mean of the word2vec vectors of its tokens.
# Posts for which no token has an embedding are left out of the result.
data_sentences_embedded = []
n_of_training_tokens_with_embeddings = 0
length_of_array = len(tokenized_npData[:,0])

for j in range(0,length_of_array):
    # Collect the vectors of the tokens that exist in the model's vocabulary
    word_embedded_list = []
    for word in tokenized_npData[j,0]:
        try:
            word_embedded_list.append(word_embedder[word])
        except KeyError:
            # NOTE(review): this prints one line per out-of-vocabulary token over the
            # whole dataset; consider counting instead if the output gets too large
            print("Key error found for ", word)

    n_of_words = len(word_embedded_list)
    n_of_training_tokens_with_embeddings += n_of_words

    if n_of_words > 0:
        # One vectorised float64 mean over the stacked word vectors instead of
        # nested Python loops over every dimension and every word
        sentence_embedded = np.asarray(word_embedded_list, dtype=np.float64).mean(axis=0)

        entry = [sentence_embedded,tokenized_npData[j,1],tokenized_npData[j,2]]
        data_sentences_embedded.append(entry)

print("Number of tokens embedded: ", n_of_training_tokens_with_embeddings)
print("Percentage of tokens that are embedded: ", (n_of_training_tokens_with_embeddings/total_n_tokens * 100), "%")

In [None]:
npdata_sentences_embedded = np.array(data_sentences_embedded, dtype=object)
print("Number of entries with sentences embedded: ", len(npdata_sentences_embedded[:,0]))

In [None]:
print("Percentage of entries with sentences embedded: ", len(npdata_sentences_embedded[:,0])/len(tokenized_npData[:,0]))

### Training the models

In [None]:
#Importing the libraries for modelling
import joblib
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
#Preparing the data
labEncoder = preprocessing.LabelEncoder()
npDataEmotions = labEncoder.fit_transform(npdata_sentences_embedded[:,1])
npDataSentiments = labEncoder.fit_transform(npdata_sentences_embedded[:,2])

In [None]:
#Splitting the data
train_tokens, test_tokens = train_test_split(npdata_sentences_embedded[:,0], test_size=0.2, train_size=0.8, shuffle=False)
train_emotions, test_emotions = train_test_split(npDataEmotions, test_size=0.2, train_size=0.8, shuffle=False)
train_sentiments, test_sentiments = train_test_split(npDataSentiments, test_size=0.2, train_size=0.8, shuffle=False)

#### Base MLP for Emotions

In [None]:
print(type(train_tokens))
print(type(train_emotions))

In [None]:
#Checks for the size of array
test_token = train_tokens[0]
for i in range(1,len(train_tokens)):
    if(len(test_token)!=len(train_tokens[i])):
        print(i," NOT THE SAME: ", len(train_tokens[i])-len(test_token))

In [None]:
fixed_train_tokens = []
for instance in train_tokens:
    fixed_train_tokens.append(np.array(instance,dtype=float))

In [None]:
fixed_train_tokens = np.array(fixed_train_tokens)

In [None]:
fixed_test_tokens = []
for instance in test_tokens:
    fixed_test_tokens.append(np.array(instance, dtype=float))

In [None]:
fixed_test_tokens = np.array(fixed_test_tokens)

In [None]:
#Train the emotion CLF
emotions_clf = MLPClassifier()
emotions_clf.fit(fixed_train_tokens, train_emotions)

In [None]:
filename1 = "emotions_Base_MLP_model_word2vec.sav"
joblib.dump(emotions_clf,open(filename1,'wb'))

#### Base MLP for Sentiments

In [None]:
sentiments_clf = MLPClassifier()
sentiments_clf.fit(fixed_train_tokens, train_sentiments)

In [None]:
filename2 = "sentiments_Base_MLP_model_word2vec.sav"
joblib.dump(sentiments_clf,open(filename2,'wb'))

#### Top MLP for Emotions

In [None]:
top_mlp_emotions_param = {
    'solver': ["adam", "sgd"],
    'hidden_layer_sizes' : [(10,5),(15,10)],
    'activation' : ["relu", "tanh", "identity"]
}
top_mlp_emotions = GridSearchCV(estimator=MLPClassifier(max_iter=2), param_grid=top_mlp_emotions_param)

In [None]:
top_mlp_emotions.fit(fixed_train_tokens, train_emotions)

In [None]:
filename3 = "emotions_Top_MLP_model_word2vec.sav"
joblib.dump(top_mlp_emotions,open(filename3,'wb'))

#### Top MLP for Sentiments

In [None]:
top_mlp_sentiments_param = {
    'solver': ["adam", "sgd"],
    'hidden_layer_sizes' : [(10,5),(15,10)],
    'activation' : ["relu", "tanh", "identity"]
}
top_mlp_sentiments = GridSearchCV(estimator=MLPClassifier(max_iter=2), param_grid=top_mlp_sentiments_param)

In [None]:
top_mlp_sentiments.fit(fixed_train_tokens, train_sentiments)

In [None]:
filename4 = "sentiments_Top_MLP_model_word2vec.sav"
joblib.dump(top_mlp_emotions,open(filename4,'wb'))

### Evaluating the models

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
emotions_base_MLP_results = emotions_clf.predict(fixed_test_tokens)

In [None]:
print(confusion_matrix(test_emotions,emotions_base_MLP_results))

In [None]:
print(classification_report(test_emotions,emotions_base_MLP_results))

In [None]:
sentiments_base_MLP_results=sentiments_clf.predict(fixed_test_tokens)

In [None]:
print(confusion_matrix(test_sentiments,sentiments_base_MLP_results))

In [None]:
print(classification_report(test_sentiments,sentiments_base_MLP_results))

In [None]:
emotions_top_MLP_results = top_mlp_emotions.predict(fixed_test_tokens)

In [None]:
print(confusion_matrix(test_emotions,emotions_top_MLP_results))

In [None]:
print(classification_report(test_emotions,emotions_top_MLP_results))

In [None]:
sentiments_top_MLP_results = top_mlp_sentiments.predict(fixed_test_tokens)

In [None]:
print(confusion_matrix(test_sentiments,sentiments_top_MLP_results))

In [None]:
print(classification_report(test_sentiments,sentiments_top_MLP_results))

## Using the word embedding model: GloVe Wikipedia 2014 Gigaword 5th Ed.

In [None]:
word_embbeder_gigaword = downloader.load("glove-wiki-gigaword-50")

In [None]:
#Printing the embeddings for all the posts
data_sentences_embedded_gigaword = []
n_of_training_tokens_with_embeddings = 0
length_of_array = len(tokenized_npData[:,0])

for j in range(0,length_of_array):
    #Put words embedded in a list
    word_embedded_list = []
    n_of_words = 0
    for word in tokenized_npData[j,0]:
        try:
            #Generate the embbeds
            word_embedded = word_embbeder_gigaword[word]
            word_embedded_list.append(word_embedded)
            n_of_words = n_of_words + 1
            n_of_training_tokens_with_embeddings += 1
        except KeyError:
            print("Key error found for ", word)
            
   
    if (len(word_embedded_list) > 0):
         #Assumed all to be of the same size
        sentence_embedded = np.zeros(len(word_embedded_list[0]))
    
        #Compute the avg of the values of the tokens inside of the sentence
        for index in range(0,len(sentence_embedded)):
            for word_embedded in word_embedded_list:
                sentence_embedded[index] += word_embedded[index]
            sentence_embedded[index] /= n_of_words
            
        entry = [sentence_embedded,tokenized_npData[j,1],tokenized_npData[j,2]]
        data_sentences_embedded_gigaword.append(entry)

In [None]:
print("Number of tokens embedded: ", n_of_training_tokens_with_embeddings)
print("Percentage of tokens that are embedded: ", (n_of_training_tokens_with_embeddings/total_n_tokens * 100), "%")

In [None]:
npdata_sentences_embedded_gigaword = np.array(data_sentences_embedded_gigaword, dtype=object)
print("Number of entries with sentences embedded: ", len(npdata_sentences_embedded_gigaword[:,0]))
print("Percentage of entries with sentences embedded: ", len(npdata_sentences_embedded_gigaword[:,0])/len(tokenized_npData[:,0]))

### Training the models

In [None]:
#Preparing the data
labEncoder = preprocessing.LabelEncoder()
gigaword_npDataEmotions = labEncoder.fit_transform(npdata_sentences_embedded_gigaword[:,1])
gigaword_npDataSentiments = labEncoder.fit_transform(npdata_sentences_embedded_gigaword[:,2])

In [None]:
#Splitting the data
gigaword_train_tokens, gigaword_test_tokens = train_test_split(npdata_sentences_embedded_gigaword[:,0], test_size=0.2, train_size=0.8, shuffle=False)
gigaword_train_emotions, gigaword_test_emotions = train_test_split(gigaword_npDataEmotions, test_size=0.2, train_size=0.8, shuffle=False)
gigaword_train_sentiments, gigaword_test_sentiments = train_test_split(gigaword_npDataSentiments, test_size=0.2, train_size=0.8, shuffle=False)

In [None]:
fixed_gigaword_train_tokens = []
for instance in gigaword_train_tokens:
    fixed_gigaword_train_tokens.append(np.array(instance,dtype=float))
fixed_gigaword_train_tokens = np.array(fixed_gigaword_train_tokens)

In [None]:
fixed_gigaword_test_tokens = []
for instance in gigaword_test_tokens:
    fixed_gigaword_test_tokens.append(np.array(instance,dtype=float))
fixed_gigaword_test_tokens = np.array(fixed_gigaword_test_tokens)

#### Emotions Base MLP

In [None]:
gigaword_emotions_clf = MLPClassifier()
gigaword_emotions_clf.fit(fixed_gigaword_train_tokens,gigaword_train_emotions)

In [None]:
filename5 = "emotions_Base_MLP_model_gigaword.sav"
joblib.dump(gigaword_emotions_clf,open(filename5,'wb'))

#### Sentiments Base MLP

In [None]:
gigaword_sentiments_clf = MLPClassifier()
gigaword_sentiments_clf.fit(fixed_gigaword_train_tokens,gigaword_train_sentiments)

In [None]:
filename6 = "sentiments_Base_MLP_model_gigaword.sav"
joblib.dump(gigaword_sentiments_clf,open(filename6,'wb'))

### Evaluating the models

In [None]:
gigaword_base_mlp_emotions_results = gigaword_emotions_clf.predict(fixed_gigaword_test_tokens)
gigaword_base_mlp_sentiments_results = gigaword_sentiments_clf.predict(fixed_gigaword_test_tokens)

In [None]:
print(confusion_matrix(gigaword_test_emotions,gigaword_base_mlp_emotions_results))

In [None]:
print(classification_report(gigaword_test_emotions,gigaword_base_mlp_emotions_results))

In [None]:
print(confusion_matrix(gigaword_test_sentiments,gigaword_base_mlp_sentiments_results))

In [None]:
print(classification_report(gigaword_test_sentiments,gigaword_base_mlp_sentiments_results))

## Using the word embedding model: FastText Wikipedia News from October 2017

### Preparing the data

In [None]:
# `info` was never assigned anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel. Fetch the downloader's metadata for the
# fastText model used in this section before printing it.
info = downloader.info(name="fasttext-wiki-news-subwords-300")
print(json.dumps(info, indent=4))

In [None]:
word_embedder_fwiki = downloader.load("fasttext-wiki-news-subwords-300")

In [None]:
#Printing the embeddings for all the posts
data_sentences_embedded_fwiki = []
n_of_training_tokens_with_embeddings = 0
length_of_array = len(tokenized_npData[:,0])

for j in range(0,length_of_array):
    #Put words embedded in a list
    word_embedded_list = []
    n_of_words = 0
    for word in tokenized_npData[j,0]:
        try:
            #Generate the embbeds
            word_embedded = word_embedder_fwiki[word]
            word_embedded_list.append(word_embedded)
            n_of_words = n_of_words + 1
            n_of_training_tokens_with_embeddings += 1
        except KeyError:
            print("Key error found for ", word)
            
   
    if (len(word_embedded_list) > 0):
         #Assumed all to be of the same size
        sentence_embedded = np.zeros(len(word_embedded_list[0]))
    
        #Compute the avg of the values of the tokens inside of the sentence
        for index in range(0,len(sentence_embedded)):
            for word_embedded in word_embedded_list:
                sentence_embedded[index] += word_embedded[index]
            sentence_embedded[index] /= n_of_words
            
        entry = [sentence_embedded,tokenized_npData[j,1],tokenized_npData[j,2]]
        data_sentences_embedded_fwiki.append(entry)

In [None]:
print("Number of tokens embedded: ", n_of_training_tokens_with_embeddings)
print("Percentage of tokens that are embedded: ", (n_of_training_tokens_with_embeddings/total_n_tokens * 100), "%")

In [None]:
npdata_sentences_embedded_fwiki = np.array(data_sentences_embedded_fwiki, dtype=object)
print("Number of entries with sentences embedded: ", len(npdata_sentences_embedded_fwiki[:,0]))
print("Percentage of entries with sentences embedded: ", len(npdata_sentences_embedded_fwiki[:,0])/len(tokenized_npData[:,0]))

### Training the models

In [None]:
#Preparing the data
labEncoder = preprocessing.LabelEncoder()
fwiki_npDataEmotions = labEncoder.fit_transform(npdata_sentences_embedded_fwiki[:,1])
fwiki_npDataSentiments = labEncoder.fit_transform(npdata_sentences_embedded_fwiki[:,2])

In [None]:
#Splitting the data
fwiki_train_tokens, fwiki_test_tokens = train_test_split(npdata_sentences_embedded_fwiki[:,0], test_size=0.2, train_size=0.8, shuffle=False)
fwiki_train_emotions, fwiki_test_emotions = train_test_split(fwiki_npDataEmotions, test_size=0.2, train_size=0.8, shuffle=False)
fwiki_train_sentiments, fwiki_test_sentiments = train_test_split(fwiki_npDataSentiments, test_size=0.2, train_size=0.8, shuffle=False)

In [None]:
fixed_fwiki_train_tokens = []
for instance in fwiki_train_tokens:
    fixed_fwiki_train_tokens.append(np.array(instance,dtype=float))
fixed_fwiki_train_tokens = np.array(fixed_fwiki_train_tokens)

In [None]:
fixed_fwiki_test_tokens = []
for instance in fwiki_test_tokens:
    fixed_fwiki_test_tokens.append(np.array(instance,dtype=float))
fixed_fwiki_test_tokens = np.array(fixed_fwiki_test_tokens)

#### Base MLP Emotions

In [None]:
fwiki_emotions_clf = MLPClassifier()
fwiki_emotions_clf.fit(fixed_fwiki_train_tokens,fwiki_train_emotions)

In [None]:
filename7 = "emotions_Base_MLP_model_fwiki.sav"
joblib.dump(fwiki_emotions_clf,open(filename7,'wb'))

#### Base MLP Sentiments

In [None]:
fwiki_sentiments_clf = MLPClassifier()
fwiki_sentiments_clf.fit(fixed_fwiki_train_tokens,fwiki_train_sentiments)

In [None]:
filename8 = "sentiments_Base_MLP_model_fwiki.sav"
joblib.dump(fwiki_sentiments_clf,open(filename8,'wb'))

### Evaluating the models

In [None]:
fwiki_base_mlp_emotions_results = fwiki_emotions_clf.predict(fixed_fwiki_test_tokens)
fwiki_base_mlp_sentiments_results = fwiki_sentiments_clf.predict(fixed_fwiki_test_tokens)

In [None]:
print(confusion_matrix(fwiki_test_emotions,fwiki_base_mlp_emotions_results))

In [None]:
print(classification_report(fwiki_test_emotions,fwiki_base_mlp_emotions_results))

In [None]:
print(confusion_matrix(fwiki_test_sentiments,fwiki_base_mlp_sentiments_results))

In [None]:
print(classification_report(fwiki_test_sentiments,fwiki_base_mlp_sentiments_results))