# Code to train the model

## 1. Data Processing

In [11]:
#import require library for the data processing
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords

from scipy.stats import itemfreq
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,HashingVectorizer
from sklearn.metrics import confusion_matrix

pd.options.mode.chained_assignment = None

from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import pandas as pd
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1.1 Import dataset

In [12]:
#make connection to the google drive
#from google.colab import drive
#drive.mount('/content/drive/')

In [13]:
#read data from the drive
#data = pd.read_csv('/content/drive/My Drive/emotion_detector/emotion.csv')
data = pd.read_csv('tweets_data.csv')
del data['Unnamed: 0']
#del data['Unnamed: 0.1']
display(data.head())

#Checking the values count for sentiment
print("-"*80)
print ('\t\t\tChecking the value count for sentiment')
print("-"*80)
display(data.sentiment.value_counts())

Unnamed: 0,content,sentiment
0,Layin n bed with a headache ughhhh...waitin o...,sadness
1,Funeral ceremony...gloomy friday...,sadness
2,Re-pinging @ghostridah14: why didn't you go to...,worry
3,"I should be sleep, but im not! thinking about ...",sadness
4,Hmmm. http://www.djhero.com/ is down,worry


--------------------------------------------------------------------------------
			Checking the value count for sentiment
--------------------------------------------------------------------------------


sadness      8995
worry        8459
joy          8240
surprise     6036
happiness    5209
love         3842
Name: sentiment, dtype: int64

In [15]:
#del data['Unnamed: 0']
#del data['tweet_id']

In [16]:
#describe data
print("\n" + "-"*50 + "\n")
print ("\t\t describe data by object")
print("\n" + "-"*50 + "\n")
print(data.describe(include = [np.object]))


--------------------------------------------------

		 describe data by object

--------------------------------------------------

                                                  content sentiment
count                                               40781     40781
unique                                              40522         6
top     RT, follow @unitednude and WIN one of the 5 sp...   sadness
freq                                                   35      8995


## 1.2 Data cleaning

In [17]:
#Code remove extra words and link from the dataset
data['content']=data['content'].str.replace('[^A-Za-z0-9\s]+', '')
data['content']=data['content'].str.replace('http\S+|www.\S+', '', case=False)

In [18]:
display(data.head())

Unnamed: 0,content,sentiment
0,Layin n bed with a headache ughhhhwaitin on y...,sadness
1,Funeral ceremonygloomy friday,sadness
2,Repinging ghostridah14 why didnt you go to pro...,worry
3,I should be sleep but im not thinking about an...,sadness
4,Hmmm is down,worry


In [19]:
#Code convert all dataset into small case
for x in data:
    if data[x].dtype == object:
        #remove extra whitespace
        data[x] = data[x].str.strip()
        #convert into lowercase
        data[x] = data[x].str.lower()
        

In [20]:
#Code to check duplicate and null value and remove it
print("\n" + "-"*60 + "\n")
print ("\t\t Tweets dataset for emotion")
print("\n" + "-"*60 + "\n")
print (" Tweets data size " + str(data.size) + "  ||  Shape: " + str(data.shape) + "\n")
display(data.head())
display(data.tail())

#Checking the Duplicate data
print("-"*80)
print ('\t\t\tChecking the Duplicate data')
print("-"*80)

#display the data size before dropping duplicate
print ("Data size with duplicate value: " + str(data.size) + "  ||  Shape: " + str(data.shape) + "\n")
print ('Is there any Duplicate value? ') 
print (str(data.duplicated().any()) + "\n") #check the duplicate value

#Checking the null values
print("-"*80)
print ('\t\t\tChecking the null values')
print("-"*80)

#check null values
print (data.isnull().sum())
print("-"*80)

display(data.sentiment.value_counts())


------------------------------------------------------------

		 Tweets dataset for emotion

------------------------------------------------------------

 Tweets data size 81562  ||  Shape: (40781, 2)



Unnamed: 0,content,sentiment
0,layin n bed with a headache ughhhhwaitin on y...,sadness
1,funeral ceremonygloomy friday,sadness
2,repinging ghostridah14 why didnt you go to pro...,worry
3,i should be sleep but im not thinking about an...,sadness
4,hmmm is down,worry


Unnamed: 0,content,sentiment
40776,about to have a movie night with my booboo jel...,sadness
40777,thebodyshopuk knowing my dissertation will be ...,joy
40778,hospital tomorrow morning strapped with wires ...,joy
40779,work is soooo slow ready to have a great saturday,joy
40780,you realize that by choosing joy every single ...,joy


--------------------------------------------------------------------------------
			Checking the Duplicate data
--------------------------------------------------------------------------------
Data size with duplicate value: 81562  ||  Shape: (40781, 2)

Is there any Duplicate value? 
True

--------------------------------------------------------------------------------
			Checking the null values
--------------------------------------------------------------------------------
content      0
sentiment    0
dtype: int64
--------------------------------------------------------------------------------


sadness      8995
worry        8459
joy          8240
surprise     6036
happiness    5209
love         3842
Name: sentiment, dtype: int64

In [21]:
display(data.head())

Unnamed: 0,content,sentiment
0,layin n bed with a headache ughhhhwaitin on y...,sadness
1,funeral ceremonygloomy friday,sadness
2,repinging ghostridah14 why didnt you go to pro...,worry
3,i should be sleep but im not thinking about an...,sadness
4,hmmm is down,worry


In [22]:
# Use English stemmer.
stemmer = SnowballStemmer("english")

# Sentences to be stemmed.
#data = ["programers program with programing languages", "my code is working so there must be a bug in the optimizer"] 
    
# Create the Pandas dataFrame.
#df = pd.DataFrame(data, columns = ['unstemmed']) 

# Split the sentences to lists of words.
data['unstemmed'] = data['content'].str.split()

data['stemmed'] = data['unstemmed'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.
#df = df.drop(columns=['unstemmed']) # Get rid of the unstemmed column.

In [23]:
display(data.head())

Unnamed: 0,content,sentiment,unstemmed,stemmed
0,layin n bed with a headache ughhhhwaitin on y...,sadness,"[layin, n, bed, with, a, headache, ughhhhwaiti...","[layin, n, bed, with, a, headach, ughhhhwaitin..."
1,funeral ceremonygloomy friday,sadness,"[funeral, ceremonygloomy, friday]","[funer, ceremonygloomi, friday]"
2,repinging ghostridah14 why didnt you go to pro...,worry,"[repinging, ghostridah14, why, didnt, you, go,...","[reping, ghostridah14, whi, didnt, you, go, to..."
3,i should be sleep but im not thinking about an...,sadness,"[i, should, be, sleep, but, im, not, thinking,...","[i, should, be, sleep, but, im, not, think, ab..."
4,hmmm is down,worry,"[hmmm, is, down]","[hmmm, is, down]"


In [24]:
#code to remove stop words from dataset
stop = stopwords.words('english')

data['stemmed'].apply(lambda x: [item for item in x if item not in stop])
data['tweet_without_stopwords'] = data['stemmed'].apply(lambda x: ' '.join([word for word in x if word not in (stop)]))

In [25]:
data.head()

Unnamed: 0,content,sentiment,unstemmed,stemmed,tweet_without_stopwords
0,layin n bed with a headache ughhhhwaitin on y...,sadness,"[layin, n, bed, with, a, headache, ughhhhwaiti...","[layin, n, bed, with, a, headach, ughhhhwaitin...",layin n bed headach ughhhhwaitin call
1,funeral ceremonygloomy friday,sadness,"[funeral, ceremonygloomy, friday]","[funer, ceremonygloomi, friday]",funer ceremonygloomi friday
2,repinging ghostridah14 why didnt you go to pro...,worry,"[repinging, ghostridah14, why, didnt, you, go,...","[reping, ghostridah14, whi, didnt, you, go, to...",reping ghostridah14 whi didnt go prom bc bf di...
3,i should be sleep but im not thinking about an...,sadness,"[i, should, be, sleep, but, im, not, thinking,...","[i, should, be, sleep, but, im, not, think, ab...",sleep im think old friend want hes marri damn ...
4,hmmm is down,worry,"[hmmm, is, down]","[hmmm, is, down]",hmmm


In [26]:
#delete extra column
del data['content']
del data['unstemmed']
del data['stemmed']

In [27]:
data.head()

Unnamed: 0,sentiment,tweet_without_stopwords
0,sadness,layin n bed headach ughhhhwaitin call
1,sadness,funer ceremonygloomi friday
2,worry,reping ghostridah14 whi didnt go prom bc bf di...
3,sadness,sleep im think old friend want hes marri damn ...
4,worry,hmmm


# 2. Model training

In [18]:
#import require library for the data modelling
import numpy as np
import pandas as pd
import pandas as pd
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')
from nltk.corpus import stopwords

import string

import re
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
# tokenize the string and convert into matrix
tokenizer = Tokenizer(num_words=2000, split=" ")
tokenizer.fit_on_texts(data['tweet_without_stopwords'].values)

X= tokenizer.texts_to_sequences(data['tweet_without_stopwords'].values)
X = pad_sequences(X)


In [20]:
# one hot encoding the labels
Y = pd.get_dummies(data['sentiment']).values
#  divide into training and testing data
import sklearn
X_train,X_test,Y_train,Y_test = sklearn.model_selection.train_test_split(X,Y,random_state=1)

In [21]:
# prepare the RNN model
model = Sequential()
model.add(Embedding(2000, 256, input_length=X_train.shape[1]))
model.add(Dropout(0.3))
model.add(LSTM(256, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.3))
model.add(LSTM(256, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(6, activation='softmax'))
# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 29, 256)           512000    
_________________________________________________________________
dropout (Dropout)            (None, 29, 256)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 29, 256)           525312    
_________________________________________________________________
dropout_1 (Dropout)          (None, 29, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense (Dense)                (None, 6)                 1542      
Total params: 1,564,166
Trainable params: 1,564,166
Non-trainable params: 0
______________________________________________

In [23]:
# train the model on training data
batch_size = 50
epochs = 40

model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, verbose=1)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7f8a5020beb8>

In [24]:
#save model on drive
model.save('/content/drive/My Drive/emotion_detector/model_2/model2.h5')