In [1]:
import importlib
import pandas as pd
import numpy as np
import nltk
import gensim 
import logging
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import utils
importlib.reload(utils)

import keras
from keras import preprocessing
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, SimpleRNN, LSTM, CuDNNLSTM, Dropout

%matplotlib inline  

Using TensorFlow backend.


In [2]:
data = pd.read_csv("data/labeledTrainData.tsv", delimiter="\t", quoting=3)
train_data, test_data = train_test_split(data, test_size=0.25)

In [3]:
# Clean the text
train_data['review_cleaned'] = train_data.review.apply(lambda x: utils.normalize_text(x))
test_data['review_cleaned'] = test_data.review.apply(lambda x: utils.normalize_text(x))

unlabel_data = pd.read_csv("data/testData.tsv", header=0, delimiter="\t", quoting=3 )
unlabel_data['review_cleaned'] = unlabel_data.review.apply(lambda x: utils.normalize_text(x))

In [4]:
def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(acc) + 1)
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.figure()
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()
    
def get_total_words(text_list):
    text_set = set()
    for text in text_list:
        text_set |= set(text.split())
    return len(text_set)

def get_max_size_of_sentence(text_list):
    max_size = 0
    for text in text_list:
        max_size = max(max_size, len(text))
    return max_size

In [5]:
## Find the total of unique words
total_words = get_total_words(list(train_data['review_cleaned']) + 
                list(test_data['review_cleaned']) +
                list(unlabel_data['review_cleaned']))

## Find max size of a setence
max_size = get_max_size_of_sentence(list(train_data['review_cleaned']) + 
                list(test_data['review_cleaned']) +
                list(unlabel_data['review_cleaned']))

In [6]:
tokenizer = preprocessing.text.Tokenizer(total_words)

tokenizer.fit_on_texts(list(train_data['review_cleaned']) + 
                       list(test_data['review_cleaned']) +
                       list(unlabel_data['review_cleaned'])
                      )

In [7]:
train_one_hot_index = tokenizer.texts_to_sequences(list(train_data['review_cleaned']))
train_one_hot_index = preprocessing.sequence.pad_sequences(train_one_hot_index, max_size)

test_one_hot_index = tokenizer.texts_to_sequences(list(test_data['review_cleaned']))
test_one_hot_index = preprocessing.sequence.pad_sequences(test_one_hot_index, max_size)

unlabel_data_one_hot_index = tokenizer.texts_to_sequences(list(unlabel_data['review_cleaned']))
unlabel_data_one_hot_index = preprocessing.sequence.pad_sequences(unlabel_data_one_hot_index, max_size)

In [8]:
x_train = train_one_hot_index
y_train = train_data.sentiment

x_test = test_one_hot_index
y_test  = test_data.sentiment

In [22]:
def create_simple_rnn():
    model = Sequential()
    model.add(Embedding(total_words, 100, input_length=max_size))
    model.add(SimpleRNN(32))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [25]:
simple_rnn = create_simple_rnn()
simple_rnn.summary()
simple_rnn.compile(optimizer="rmsprop", loss='binary_crossentropy', metrics=['acc'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 9434, 100)         10124600  
_________________________________________________________________
simple_rnn_11 (SimpleRNN)    (None, 32)                4256      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 10,128,889
Trainable params: 10,128,889
Non-trainable params: 0
_________________________________________________________________


In [26]:
history = simple_rnn.fit(x_train, y_train, epochs=1, validation_data=(x_test, y_test))

Train on 18750 samples, validate on 6250 samples
Epoch 1/1


In [14]:
def create_lstm(dense_layers=None):
    model = Sequential()
    model.add(Embedding(total_words, 100, input_length=max_size))
    model.add(LSTM(32, return_sequences=False))
    model.add(Dropout(0.2))
    if dense_layers:
        for i in range(dense_layers):
            model.add(Dense(16, activation="relu"))
            model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    return model

lstm = create_lstm(dense_layers=1)
lstm.summary()
lstm.compile(optimizer="rmsprop", loss='binary_crossentropy', metrics=['acc'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 9434, 100)         10124600  
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                17024     
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dropout_3 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 10,142,169
Trainable params: 10,142,169
Non-trainable params: 0
________________________________________________________________

In [15]:
history = lstm.fit(x_train, y_train, epochs=1, validation_data=(x_test, y_test))

Train on 18750 samples, validate on 6250 samples
Epoch 1/1


## Predict

In [30]:
prediction = [x[0] for x in simple_rnn.predict_classes(unlabel_data_one_hot_index)]
utils.save_predict(unlabel_data, 
                   prediction, 
                   "rnn_1-epoch.csv")

In [16]:
prediction = [x[0] for x in lstm.predict_classes(unlabel_data_one_hot_index)]
utils.save_predict(unlabel_data, 
                   prediction, 
                   "lstm_dense-1_1-epoch.csv")