# Project Unsupervised learning on your own

A work on text classification by Yann Faussurier.

This notebook contains only the final model.

In [None]:
%matplotlib inline
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

Because our dataset is split across three files, we load each of them into a pandas DataFrame and concatenate them to get a single DataFrame for our data:

In [None]:
# The dataset is stored as three CSV files; read each one and stack them
# into a single frame.
shard_paths = ("goemotions_1.csv", "goemotions_2.csv", "goemotions_3.csv")
df1, df2, df3 = (pd.read_csv(shard) for shard in shard_paths)
df = pd.concat([df1, df2, df3])

We will now define our hyperparameters. Based on the example that was given with the sarcasm dataset, we set vocab_size to 1000 and embedding_dim to 16.

After some trial and error, we set max_length to 80 because it worked better.

We also set training_size to 190000 instead of 200000 in order to have a bigger test set: with 211225 rows in total, this leaves about 20,000 samples for testing.

In [None]:
# --- Tokenizer settings ---
vocab_size = 1_000   # passed to the Tokenizer as num_words
oov_tok = "<OOV>"    # token substituted for out-of-vocabulary words

# --- Sequence shaping (used by pad_sequences) ---
max_length = 80
padding_type = trunc_type = 'post'

# --- Embedding size and train/test split point ---
embedding_dim = 16
training_size = 190_000

Now we need to define the data that we will feed into our neural network: the targets are the 27 emotion labels plus `neutral` (28 columns in total), and the input is the text of the Reddit comment/post.

In [None]:
# Keep only the comment text and the label columns, then collapse rows that
# share the same text into one row.
#
# - as_index=False keeps 'text' as a regular column; with the default
#   (as_index=True) it becomes the index and a later df['text'] lookup raises
#   KeyError.
# - max() instead of sum(): presumably the same text appears in several rows
#   (one per annotation — TODO confirm against the CSVs). Summing would then
#   give label values above 1, which are not valid targets for a sigmoid
#   output trained with binary cross-entropy. max() keeps every label that was
#   set in at least one row, as a 0/1 value.
df = df[['text',
 'admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']].groupby('text', as_index=False).max()

In [None]:
# Model input: the comment text.
sentences = df['text']

# Model targets: one column per emotion category, plus 'neutral'.
label_columns = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]
labels = df[label_columns]

We can now divide our data into a training set and a test set using the parameters that we defined above.

In [None]:
# Split the data into a training part and a held-out test part.
#
# Two safeguards on top of a plain positional slice:
#
# 1. Shuffle first (fixed seed, so the split is reproducible). The frame comes
#    out of a groupby on 'text', which sorts rows by key by default; slicing it
#    unshuffled would put only the texts that sort last into the test set.
# 2. Cap the training size at 90% of the available rows. training_size was
#    chosen against the raw row count, but grouping by text can leave far
#    fewer rows; without the cap the test slice could be empty. When the data
#    really has more than training_size / 0.9 rows, the cap has no effect.
SPLIT_SEED = 42
n_samples = len(sentences)
shuffle_order = np.random.RandomState(SPLIT_SEED).permutation(n_samples)
n_train = min(training_size, int(0.9 * n_samples))

# Positional (.iloc) indexing keeps sentences and labels aligned row by row.
train_rows, test_rows = shuffle_order[:n_train], shuffle_order[n_train:]
training_sentences = sentences.iloc[train_rows]
testing_sentences = sentences.iloc[test_rows]
training_labels = labels.iloc[train_rows]
testing_labels = labels.iloc[test_rows]

In a similar way as we did with the sarcasm dataset, we will use the tokenizer to transform the vocabulary into tokens, fitting it on the training sentences of our data.
Once we have done that, we use the pad_sequences function to make each of our sentences the same length.

In [None]:
# Build the vocabulary from the training sentences only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Padding/truncation options shared by the training and testing sets.
pad_kwargs = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Convert each sentence into a sequence of integer token ids ...
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# ... and pad/truncate every sequence to max_length.
training_padded = pad_sequences(training_sequences, **pad_kwargs)
testing_padded = pad_sequences(testing_sequences, **pad_kwargs)

In [None]:
# Convert inputs and targets to NumPy arrays before handing them to Keras.
(training_padded, training_labels,
 testing_padded, testing_labels) = (
    np.array(part)
    for part in (training_padded, training_labels, testing_padded, testing_labels)
)

# Model Training

Because the fit will take a long time, we create a callback that stops the training if the loss on the validation set no longer improves from one epoch to the next.

In [None]:
# Stop training once the validation loss has failed to improve for 2 epochs in
# a row. The monitored quantity is 'val_loss' (not the training 'loss'), so the
# stopping decision is based on the held-out data passed as validation_data.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

model = tf.keras.Sequential()
# Maps each token id (< vocab_size) to a dense vector of size embedding_dim.
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# First recurrent layer returns the full sequence so that a second LSTM can be
# stacked on top of it.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(75)))
# One sigmoid unit per label column (27 emotions + 'neutral' = 28); each label
# is predicted independently.
model.add(tf.keras.layers.Dense(28, activation='sigmoid'))

# Binary cross-entropy is applied per output unit, matching the sigmoid outputs.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=[tf.keras.metrics.Precision()])


In [None]:
# Upper bound on the number of epochs; the early-stopping callback may end
# training sooner.
num_epochs = 10

history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    callbacks=[callback],
    verbose=2,
)



In [None]:
def plot_graphs_bis(history, string):
  """Plot one recorded metric per epoch, plus its validation curve if present.

  Args:
    history: object exposing a ``history`` dict that maps metric names to
      per-epoch lists of values (e.g. the value returned by ``model.fit``).
    string: key of the metric to plot, e.g. ``"loss"``.

  Raises:
    KeyError: if ``string`` is not a key of ``history.history``.
  """
  curves = [string]
  # Only draw the validation curve when it was actually recorded, so the
  # legend never names a line that is not on the figure.
  val_key = 'val_' + string
  if val_key in history.history:
    curves.append(val_key)

  for key in curves:
    plt.plot(history.history[key])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend(curves)
  plt.show()
  
# Keras appends a numeric suffix to auto-generated metric names ("precision",
# "precision_1", ...) depending on how many were created in the current kernel,
# so look the key up instead of hard-coding it. Validation keys start with
# "val_" and are therefore not matched. Falls back to the previous literal.
precision_key = next(
    (key for key in history.history if key.startswith("precision")),
    "precision_2",
)
plot_graphs_bis(history, precision_key)
plot_graphs_bis(history, "loss")