# 31009 - Final Project - RNNs Model
### Ada, Rohit, Dylan

In [1]:
import numpy as np  
import pandas as pd 
import re   
import nltk  
from nltk.corpus import stopwords           
from nltk.stem.porter import PorterStemmer
from collections import Counter  
import seaborn as sns 
import matplotlib.pyplot as plt     
from IPython.core.display import display, HTML  
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer  
from tensorflow.keras.preprocessing.sequence import pad_sequences   
from tqdm import tqdm  
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.layers import Embedding,SimpleRNN
from sklearn.model_selection import train_test_split
from keras import optimizers,initializers

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
# Read the training CSV and keep its `target` column as the label vector.
train = pd.read_csv("Cleaned_Train.csv")
train_y = train["target"]


In [3]:
# Preview the first rows to sanity-check the loaded columns.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this earthquake Ma...,1
1,4,,,Forest fire near La Ronge Sask Canada,1
2,5,,,All residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfire evacuation order...,1
4,7,,,Just got sent this photo from Ruby Alaska as s...,1


In [4]:
# Fit a Keras tokenizer on the text column to build the word -> integer-id vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train["text"])
word_index = tokenizer.word_index
# One extra slot on top of the vocabulary size: the embedding matrix built from
# this count needs a row for index 0 as well as one row per word id.
num_words = len(word_index) + 1

In [5]:
# Report how many distinct words the tokenizer indexed.
print('Number of unique words:',len(word_index))

Number of unique words: 17440


In [6]:
# Convert each text into its sequence of integer word ids.
training_sequences = tokenizer.texts_to_sequences(train["text"])

# Pad at the front (and truncate from the front) so every sequence has length 50.
training_padded = pad_sequences(
    training_sequences,
    maxlen=50,
    padding="pre",
    truncating="pre",
)

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    training_padded,
    train_y,
    test_size=0.25,
    random_state=0,
)

In [7]:
# Load the GloVe 6B 300-d vectors into a word -> vector lookup.
embedding_dict = {}
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <v300>", separated by whitespace.
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = vectors

embedding_dim = 300
# Row i holds the GloVe vector of the word whose tokenizer id is i.
# Rows for words missing from GloVe (and the unused row 0) stay all-zero.
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in tqdm(word_index.items()):
    # Guard against ids outside the matrix (relevant if num_words is ever capped).
    if i < num_words:
        embedding_vector = embedding_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

embedding_matrix.shape

100%|██████████| 17440/17440 [00:00<00:00, 530408.69it/s]


(17441, 300)

## RNN without LSTM Layer 

In [8]:
# Baseline network: frozen pretrained embeddings -> dropout -> SimpleRNN -> sigmoid output.
model = Sequential([
    # Embedding weights come from embedding_matrix and are not updated during training.
    Embedding(
        input_dim=num_words,
        output_dim=300,
        embeddings_initializer=initializers.Constant(embedding_matrix),
        input_length=50,
        trainable=False,
    ),
    Dropout(0.2),
    SimpleRNN(units=64, activation="sigmoid"),
    # Single sigmoid unit for the binary target.
    Dense(units=1, activation="sigmoid"),
])
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 300)           5232300   
_________________________________________________________________
dropout (Dropout)            (None, 50, 300)           0         
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 64)                23360     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 5,255,725
Trainable params: 23,425
Non-trainable params: 5,232,300
_________________________________________________________________


In [9]:
# Fit the model for 10 epochs (25% of the training split is held out for validation),
# then evaluate on the held-out test split; evaluate returns [loss, accuracy].
model_1_fit = model.fit(
    X_train,
    Y_train,
    batch_size=10,
    epochs=10,
    validation_split=0.25,
)
model.evaluate(X_test, Y_test, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.4686012864112854, 0.7951680421829224]

In [10]:
# Save the SimpleRNN model to disk in two parts:
# the architecture as JSON ...
model_json = model.to_json()
with open("rnnmodel.json", "w") as json_file:
    json_file.write(model_json)
# ... and the weights, serialized to HDF5.
model.save_weights("rnnmodel.h5")
print("Saved model to disk")

Saved model to disk


## RNN with LSTM Layer 

In [11]:
# LSTM variant: same frozen embedding front-end, with an LSTM layer as the recurrent unit.
model2 = Sequential([
    # Embedding weights come from embedding_matrix and are not updated during training.
    Embedding(
        input_dim=num_words,
        output_dim=300,
        embeddings_initializer=initializers.Constant(embedding_matrix),
        input_length=50,
        trainable=False,
    ),
    Dropout(0.2),
    # Dropout is applied to both the layer inputs and the recurrent state.
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    # Single sigmoid unit for the binary target.
    Dense(units=1, activation="sigmoid"),
])

model2.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['accuracy'],
)
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 300)           5232300   
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 300)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                93440     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 5,325,805
Trainable params: 93,505
Non-trainable params: 5,232,300
_________________________________________________________________


In [12]:
# Fit the LSTM model for 10 epochs (25% of the training split is held out for validation),
# then evaluate on the held-out test split; evaluate returns [loss, accuracy].
model_2_fit = model2.fit(
    X_train,
    Y_train,
    batch_size=10,
    epochs=10,
    validation_split=0.25,
)
model2.evaluate(X_test, Y_test, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.5597984194755554, 0.7909663915634155]

In [13]:
# Save the LSTM model (model2) to disk in two parts:
# the architecture as JSON ...
model2_json = model2.to_json()
with open("rnn2model.json", "w") as json_file:
    json_file.write(model2_json)
# ... and the weights, serialized to HDF5.
# The weights must come from model2: saving `model` here would store the
# SimpleRNN weights under the LSTM model's filename.
model2.save_weights("rnn2model.h5")
print("Saved model to disk")

Saved model to disk
