# Recurrent Neural Network
Ref: [Tutorial#1](https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/)
[Tutorial#2](http://adventuresinmachinelearning.com/keras-lstm-tutorial/)

### Data Directory

In [1]:
train_direct = '../training_data/train.pkl'
test_direct = '../testing_data/test.pkl'
model01_direct = "./models/RNN_model_v1._.hdf5"
model02_direct = "./models/RNN_model_v2.2.hdf5"

In [2]:
import numpy as np
import pandas as pd
import collections as c
from sklearn import preprocessing
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import LSTM, Dropout, Dense
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras import initializers

Using TensorFlow backend.


## Load train and test data

In [3]:
train = pd.read_pickle(train_direct)
train['text'] = train.msg.map(lambda x: x.get_text())  

test = pd.read_pickle(test_direct)
test = test.dropna(subset=['msgID'])
test['text'] = test.msg.map(lambda x: x.get_text())  

##### Get an entire vocabulary of all words in training and testing posts

In [4]:
train['tokens'] = train['text'].map(word_tokenize)
test['tokens'] = test['text'].map(word_tokenize)
posts_list = list(train['tokens'])
posts_list.extend(list(test['tokens']))
vocab =list()
for p  in posts_list: vocab.extend(p)
len(vocab)

9217168

##### Remove the posts without label

In [5]:
# Construct the labeled data
train_labeled = train.dropna(subset=['label'])
test_labeled = test.dropna(subset=['label'])

##### Convert words to integers

In [6]:
def build_vocab(data):
    # a list of all words! for us will be all posts regardless of their label
    counter = c.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))

    return word_to_id

def file_to_word_ids(data, word_to_id):
    # data:  this will be a labeled post
    return [word_to_id[word] for word in data if word in word_to_id]

In [7]:
word_to_id = build_vocab(vocab)
train_labeled['integers'] = train_labeled['tokens'].map(lambda l: file_to_word_ids(l, word_to_id))
test_labeled['integers'] = test_labeled['tokens'].map(lambda l: file_to_word_ids(l, word_to_id))
max(len(l) for l in train_labeled['integers']),max(len(l) for l in test_labeled['integers'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(1616, 776)

## Training and testing data

In [8]:
# Hence we chose 770 words to consider in each post
max_post_len = 770
X_train = sequence.pad_sequences(train_labeled['integers'], maxlen = max_post_len)
X_test = sequence.pad_sequences(test_labeled['integers'], maxlen = max_post_len)
X_train.shape, X_test.shape

((1188, 770), (400, 770))

In [14]:
le = preprocessing.LabelEncoder()
ohe = preprocessing.OneHotEncoder()
le.fit(train_labeled['label'])
y = le.transform(train_labeled['label']).reshape(-1, 1)    # Your predictions are labeles
ohe.fit(y)
y_train = ohe.transform(y)

le.fit(test_labeled['label'])
y = le.transform(test_labeled['label']).reshape(-1, 1)    # Your predictions are labeles
ohe.fit(y)
y_test = ohe.transform(y)

y_train.shape, y_test.shape

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


((1188, 4), (400, 4))

## Define RNN model version 1
Single LSTM layer

##### Define model's parameters

In [None]:
# sigmoid activation, adam optimizer!
embedding_len = 50    # Length of embedding for each word
top_words_len = len(vocab)    
batch_len = 8    # number of posts used to space out weight updates
num_epoch = 3
LSTM_hidden_size = 100
use_dropout=True
drop = 0.2

##### Fit model

In [16]:
model = Sequential()
model.add(Embedding(top_words_len, embedding_len, input_length=max_post_len))
model.add(LSTM(LSTM_hidden_size))
if use_dropout: model.add(Dropout(drop))
model.add(Dense(y_train.shape[1], activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=num_epoch, batch_size=batch_len)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 770, 50)           6762700   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 404       
Total params: 6,823,504
Trainable params: 6,823,504
Non-trainable params: 0
_________________________________________________________________
None
Train on 1188 samples, validate on 400 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f7fd96f66d8>

##### Saving the model

In [20]:
model.save(model01_direct)

<br><br><br><br><br><br>

## Evaluation

In [24]:
model = load_model(model01_direct)

In [25]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 60.50%


## Define RNN model version 2
Add a layer of CNN to version 1

##### Define model's parameters

In [None]:
# sigmoid activation, adam optimizer!
embedding_len = 50    # Length of embedding for each word
top_words_len = len(vocab)    
batch_len = 8    # number of posts used to space out weight updates
num_epoch = 3
LSTM_hidden_size = 100
use_dropout=True
drop = 0.2

##### Fit model

In [54]:
from keras.layers import Conv1D, MaxPooling1D
model = Sequential()
model.add(Embedding(top_words_len, embedding_len,input_length=max_post_len))
model.add(Conv1D(filters=32, kernel_size=3,padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(LSTM_hidden_size))
if use_dropout: model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1],
kernel_initializer = initializers.RandomNormal(mean=0.2, stddev=0.4, seed=None),
                bias_initializer='zeros',
                activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=num_epoch, batch_size=batch_len)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 770, 50)           6762700   
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 770, 32)           4832      
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 385, 32)           0         
_________________________________________________________________
lstm_14 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dropout_14 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 4)                 404       
Total params: 6,821,136
Trainable params: 6,821,136
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x7f7ed2abdb38>

##### Saving the model

In [None]:
model.save(model02_direct)

<br><br><br><br><br><br>

## Evaluation

In [56]:
model = load_model(model02_direct)

In [57]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 65.00%
