In [24]:
import importlib
import pandas as pd
import numpy as np
import nltk
import gensim 
import logging
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import utils
importlib.reload(utils)

import keras
from keras import preprocessing
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, Conv1D, MaxPool1D, GlobalMaxPool1D, AveragePooling1D, GlobalAveragePooling1D

In [25]:
data = pd.read_csv("data/labeledTrainData.tsv", delimiter="\t", quoting=3)
train_data, test_data = train_test_split(data, test_size=0.25)

In [26]:
# Clean the text
train_data['review_cleaned'] = train_data.review.apply(lambda x: utils.normalize_text(x))
test_data['review_cleaned'] = test_data.review.apply(lambda x: utils.normalize_text(x))

unlabel_data = pd.read_csv("data/testData.tsv", header=0, delimiter="\t", quoting=3 )
unlabel_data['review_cleaned'] = unlabel_data.review.apply(lambda x: utils.normalize_text(x))

In [27]:
def get_total_words(text_list):
    text_set = set()
    for text in text_list:
        text_set |= set(text.split())
    return len(text_set)

def get_max_size_of_sentence(text_list):
    max_size = 0
    for text in text_list:
        max_size = max(max_size, len(text))
    return max_size

In [28]:
## Find the total of unique words
total_words = get_total_words(list(train_data['review_cleaned']) + 
                list(test_data['review_cleaned']) +
                list(unlabel_data['review_cleaned']))

## Find max size of a setence
max_size = get_max_size_of_sentence(list(train_data['review_cleaned']) + 
                list(test_data['review_cleaned']) +
                list(unlabel_data['review_cleaned']))

In [29]:
tokenizer = preprocessing.text.Tokenizer(total_words)

tokenizer.fit_on_texts(list(train_data['review_cleaned']) + 
                       list(test_data['review_cleaned']) +
                       list(unlabel_data['review_cleaned'])
                      )

In [30]:
train_one_hot_index = tokenizer.texts_to_sequences(list(train_data['review_cleaned']))
train_one_hot_index = preprocessing.sequence.pad_sequences(train_one_hot_index, max_size)

test_one_hot_index = tokenizer.texts_to_sequences(list(test_data['review_cleaned']))
test_one_hot_index = preprocessing.sequence.pad_sequences(test_one_hot_index, max_size)

unlabel_data_one_hot_index = tokenizer.texts_to_sequences(list(unlabel_data['review_cleaned']))
unlabel_data_one_hot_index = preprocessing.sequence.pad_sequences(unlabel_data_one_hot_index, max_size)

In [31]:
x_train = train_one_hot_index
y_train = train_data.sentiment

x_test = test_one_hot_index
y_test  = test_data.sentiment

In [45]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_size))

model.add(Conv1D(32, 6, activation="relu"))
model.add(MaxPool1D(3))
model.add(Conv1D(32, 6, activation="relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(1, activation="sigmoid"))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 9434, 100)         10124600  
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 9429, 32)          19232     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 3143, 32)          0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 3138, 32)          6176      
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 32)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total params: 10,150,041
Trainable params: 10,150,041
Non-trainable params: 0
________________________________________________________________

In [46]:
model.compile(optimizer="rmsprop", loss='binary_crossentropy', metrics=['acc'])

In [47]:
model.fit(x_train, y_train, epochs=2, validation_data=(x_test, y_test))

Train on 18750 samples, validate on 6250 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fcd1c4f6da0>

## Predict

In [None]:
prediction = [x[0] for x in model.predict_classes(unlabel_data_one_hot_index)]
utils.save_predict(unlabel_data, 
                   prediction, 
                   "cnn_max-2-epoch.csv")