In [None]:
import pandas as pd
import numpy as np
import tensorflow
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dense, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from aitk.utils import gallery, array_to_image
from aitk.networks import Network

In [None]:
def loadFile(filename, nrows=70000):
    """Read the first rows of a CSV file into a NumPy array.

    Parameters
    ----------
    filename : str
        Path of the CSV file; handed unchanged to ``pandas.read_csv``.
    nrows : int or None, optional
        Maximum number of data rows to read. Defaults to 70000, the value
        that used to be hard-coded; ``None`` reads the whole file.

    Returns
    -------
    numpy.ndarray or list
        ``DataFrame.to_numpy()`` of the rows that were read, or an empty
        list when the file could not be loaded.
    """
    try:
        df = pd.read_csv(filename, nrows=nrows)
        return df.to_numpy()
    except Exception as err:
        # Best-effort contract is kept (report, then return an empty list),
        # but only ordinary errors are caught so that KeyboardInterrupt and
        # SystemExit still propagate, and the reason is shown to the user.
        print("Invalid File!")
        print(f"Could not load {filename!r}: {type(err).__name__}: {err}")
        return []

In [None]:
#This section of the code will prepare the dataset

#maps numerical labels into lexical labels
label_map = ["sadness", "joy", "love", "anger", "fear", "surprise"]

#Hyperparameters
#size of the tokenizer vocabulary and of the embedding table
vocab_size = 5000
#length of the vector each word is turned into by the embedding layer
embd_len = 32
#maximum input size; every input is padded/truncated to this many tokens
max_words = 100
#number of rows used for training, and number of rows after them used for testing
train_size = 60000
test_size = 10000

#replace with path to text.csv file
filename = "~/Desktop/text.csv"
#calls loadFile with filename; each row is expected to hold index, tweet, label
loadedFile = loadFile(filename)

#loadFile reports a failed read by returning an empty list; stop here instead of
#letting an empty or too-small dataset flow silently into tokenizing and training
if len(loadedFile) <= train_size:
    raise ValueError(
        f"Loaded {len(loadedFile)} rows from {filename!r}; more than {train_size} "
        "rows are needed to build both a training and a test set."
    )

#get tweets (second column) from loaded file
rawText = [tweet[1] for tweet in loadedFile]

#code to turn text into a unique number (token)
#NOTE: per the Keras docs, num_words keeps only the (vocab_size - 1) most frequent words
allTextTokenizer = Tokenizer(num_words = vocab_size)
allTextTokenizer.fit_on_texts(rawText)
tokenSequences = allTextTokenizer.texts_to_sequences(rawText)

#dictionary mapping words to tokens (contains every word seen, not just the kept ones)
wordDict = allTextTokenizer.word_index

#pad inputs so they are all the same size
allText = sequence.pad_sequences(tokenSequences, maxlen = max_words)

#get labels (third column) from loaded file
allLabels = [tweet[2] for tweet in loadedFile]

#turn each label into a one-hot vector with one slot per emotion:
#(1, 0, 0, 0, 0, 0) = sadness for example
allLabels_category = to_categorical(allLabels, len(label_map))

#separate data into training and testing sets
text_train = np.array(allText[:train_size])
label_train = np.array(allLabels_category[:train_size])
text_test = np.array(allText[train_size:train_size + test_size])
label_test = np.array(allLabels_category[train_size:train_size + test_size])

In [None]:
#First model: a GRU classifier (the training cell below fits it on 10000 inputs)

#The layers are listed in the order the data flows through them
gru_model = Sequential(
    [
        #Embedding: every token of the input list becomes a vector of length embd_len
        Embedding(vocab_size, embd_len, input_length=max_words),
        #GRU with 128 units and hyperbolic tangent activation
        GRU(128, activation='tanh', return_sequences=False),
        #Fully connected layer with six softmax outputs, each representing an emotion
        Dense(6, activation='softmax'),
    ],
    name="GRU_Model",
)

#Show the layer-by-layer summary
gru_model.summary()

In [None]:
#Compile and train the first model

#Adam optimizer, categorical cross-entropy loss, accuracy as the reported metric
gru_model.compile(optimizer='adam',
                  loss="categorical_crossentropy",
                  metrics=['accuracy'])

#Wrap the compiled Keras model in an aitk Network, which drives the training
net = Network(gru_model)

#The first model is trained on the first 10000 training examples only
first_model_text = text_train[:10000]
first_model_labels = label_train[:10000]

#Train for 5 epochs, validating against the test split
history = net.fit(
    first_model_text,
    first_model_labels,
    batch_size=256,
    epochs=5,
    verbose=1,
    validation_data=(text_test, label_test),
)

In [None]:
#Evaluate the first model on the test split and print the result
gru_model_score = gru_model.evaluate(text_test, label_test, verbose=0)
print("GRU model 1 Score---> ", gru_model_score)

In [None]:
#Second model: same GRU architecture as the first, trained later on all 60000 inputs

gru_model2 = Sequential(
    [
        #The embedding layer is named so its weights can be looked up with get_layer
        Embedding(vocab_size, embd_len, input_length=max_words, name="embedding"),
        #GRU with 128 units and hyperbolic tangent activation
        GRU(128, activation='tanh', return_sequences=False),
        #Six softmax outputs, one per emotion
        Dense(6, activation='softmax'),
    ],
    name="GRU_Model2",
)

gru_model2.summary()

In [None]:
#Compile and train the second model on the full training split

gru_model2.compile(optimizer='adam',
                   loss="categorical_crossentropy",
                   metrics=['accuracy'])

#aitk wrapper around the compiled model; also used later for predictions
net2 = Network(gru_model2)

#Validation uses the held-out test split
second_model_validation = (text_test, label_test)

history = net2.fit(
    text_train,
    label_train,
    batch_size=256,
    epochs=5,
    verbose=1,
    validation_data=second_model_validation,
)

In [None]:
#Evaluate the second model on the test split and print the result
gru_model2_score = gru_model2.evaluate(text_test, label_test, verbose=0)
print("GRU model 2 Score---> ", gru_model2_score)

In [None]:
#Score the second model on the test tweets paired with labels that do not belong to them:
#the first 10000 training labels stand in for the real test labels
#(presumably a baseline to compare the real score against)
mismatched_labels = label_train[0:10000]
gru_model2_mismatched_score = gru_model2.evaluate(text_test, mismatched_labels, verbose=0)
print("GRU model 2 Score---> ", gru_model2_mismatched_score)

In [None]:
#Third model: a Simple RNN classifier, trained later on all 60000 inputs

rnn_model = Sequential(
    [
        #Embedding: every token becomes a vector of length embd_len
        Embedding(vocab_size, embd_len, input_length=max_words),
        #Simple RNN with 128 units and hyperbolic tangent activation
        SimpleRNN(128, activation='tanh', return_sequences=False),
        #Six softmax outputs, one per emotion
        Dense(6, activation='softmax'),
    ],
    name="RNN_Model",
)

rnn_model.summary()

In [None]:
#Compile and train the third model on the full training split

rnn_model.compile(optimizer='adam',
                  loss="categorical_crossentropy",
                  metrics=['accuracy'])

#aitk wrapper around the compiled Simple RNN model
net3 = Network(rnn_model)

#Validation uses the held-out test split
rnn_validation = (text_test, label_test)

history = net3.fit(
    text_train,
    label_train,
    batch_size=256,
    epochs=5,
    verbose=1,
    validation_data=rnn_validation,
)

In [None]:
#Evaluate the third (Simple RNN) model on the test split and print the result
rnn_model_score = rnn_model.evaluate(text_test, label_test, verbose=0)
print("RNN model Score---> ", rnn_model_score)

In [None]:
#This section of the code looks at the embeddings of tokens in the second model

import numpy as np
import matplotlib.pyplot as plt

def convertToColorMap(float_array, title=None):
    """Draw a one-row 'Greens' colormap of a vector, for comparing embeddings.

    The values are min-max rescaled to the range [0, 1] before plotting.

    Parameters
    ----------
    float_array : array-like of float
        The vector to display (for example one row of an embedding matrix).
    title : str or None, optional
        Text shown above the plot; no title is set when ``None``.

    Raises
    ------
    ValueError
        If ``float_array`` is empty.
    """
    values = np.asarray(float_array, dtype=float)
    if values.size == 0:
        raise ValueError("convertToColorMap needs at least one value")

    # Compute the extremes once instead of once per element.
    low = values.min()
    span = values.max() - low
    if span == 0:
        # Every value is identical: avoid 0/0 (NaN) and plot a flat map.
        normalize = np.zeros_like(values)
    else:
        normalize = (values - low) / span

    fig, ax = plt.subplots()
    # imshow needs 2-D data, so the vector is wrapped as a single row.
    ax.imshow([normalize], cmap='Greens', aspect='auto')
    ax.set_yticks([])
    if title is not None:
        ax.set_title(title)
    plt.show()

#gets the embedding matrix of the second model: one row per token index
weights = gru_model2.get_layer('embedding').get_weights()[0]

#words whose embeddings are displayed; to see a different embedding, add the word to this list
words_to_show = ["unhappy", "depressed", "sad", "sadness", "happy", "happiness"]

#displays the colormap of each word
for word in words_to_show:
    token = wordDict.get(word)
    #wordDict holds every word the tokenizer saw, but the embedding matrix only has
    #len(weights) rows, so rarer (or unseen) words have no embedding to show
    if token is None or token >= len(weights):
        print(f"'{word}' is not among the {len(weights)} embedded tokens; skipping")
        continue
    convertToColorMap(weights[token])

In [None]:
#This section and all sections below are used to generate results from the model

from numpy import argmax

#run the test data through the second model (via its aitk wrapper)
outputs = net2.predict(text_test)

#predicted class per test example: index of the largest output value
answers = list(map(argmax, outputs))

#correct class per test example: index of the 1 in each one-hot label
targets = list(map(argmax, label_test))

In [None]:
#indices of the test examples whose predicted class differs from the correct class
incorrect = [idx for idx, predicted in enumerate(answers) if predicted != targets[idx]]
#number of incorrect answers
len(incorrect)

In [None]:
#creates a confusion matrix to see where the model is likely to mess up
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
%matplotlib widget

#fix the class order to 0..len(label_map)-1 so the matrix always has one row and
#column per entry of label_map, even if an emotion never occurs in targets or answers
class_ids = list(range(len(label_map)))
cm = confusion_matrix(targets, answers, labels=class_ids)
cm_plt = ConfusionMatrixDisplay(cm, display_labels=label_map)
cm_plt.plot()