In [1]:
import importlib
import pandas as pd
import numpy as np
import nltk
import gensim 
import logging
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import utils
importlib.reload(utils)

import keras
from keras import preprocessing
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

Using TensorFlow backend.


In [2]:
# Fixed seed so the train/validation split (and every downstream metric) is
# reproducible after a kernel restart.
RANDOM_SEED = 42

# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews are kept as-is.
data = pd.read_csv("data/labeledTrainData.tsv", delimiter="\t", quoting=3)
train_data, test_data = train_test_split(data, test_size=0.25, random_state=RANDOM_SEED)

# New columns are added to both splits later on; work on explicit copies so
# those assignments never target a view of `data` (SettingWithCopyWarning).
train_data = train_data.copy()
test_data = test_data.copy()

In [3]:
# Normalize the raw review text of both labeled splits
train_data['review_cleaned'] = train_data['review'].apply(utils.normalize_text)
test_data['review_cleaned'] = test_data['review'].apply(utils.normalize_text)

# Load the unlabeled reviews and normalize them the same way
unlabel_data = pd.read_csv("data/testData.tsv", header=0, delimiter="\t", quoting=3)
unlabel_data['review_cleaned'] = unlabel_data['review'].apply(utils.normalize_text)

In [4]:
def get_total_words(text_list):
    """Return the number of distinct whitespace-separated words in ``text_list``.

    Args:
        text_list: iterable of strings.

    Returns:
        int: size of the vocabulary formed by splitting every string on
        whitespace; 0 when there are no words at all.
    """
    vocabulary = {word for document in text_list for word in document.split()}
    return len(vocabulary)

def get_max_size_of_sentence(text_list):
    """Return the length, in words, of the longest entry in ``text_list``.

    String entries are split on whitespace before being measured, matching
    the tokenization used by ``get_total_words``; calling ``len`` on the raw
    string would count characters instead of words. Entries that are already
    token sequences (lists/tuples) are measured with ``len`` directly.

    Args:
        text_list: iterable of strings or of pre-tokenized sequences.

    Returns:
        int: the largest word count found, or 0 when ``text_list`` is empty.
    """
    return max(
        (
            len(text.split()) if isinstance(text, str) else len(text)
            for text in text_list
        ),
        default=0,
    )

In [5]:
## Count the unique words across the train, test and unlabeled reviews
total_words = get_total_words([
    *train_data['review_cleaned'],
    *test_data['review_cleaned'],
    *unlabel_data['review_cleaned'],
])

## Find the max size of a sentence across the same three review sets
max_size = get_max_size_of_sentence([
    *train_data['review_cleaned'],
    *test_data['review_cleaned'],
    *unlabel_data['review_cleaned'],
])

In [6]:
# Rebind total_words to a fixed vocabulary cap; this replaces the value
# computed from the corpus in the previous cell.
total_words = 50000
tokenizer = preprocessing.text.Tokenizer(num_words=total_words)

# Build the word index from every cleaned review (train, test and unlabeled)
tokenizer.fit_on_texts([
    *train_data['review_cleaned'],
    *test_data['review_cleaned'],
    *unlabel_data['review_cleaned'],
])

In [7]:
def vectorize_sequences(sequences, dimension=total_words):
    """Encode integer sequences as rows of a 0/1 indicator matrix.

    Args:
        sequences: sized collection of integer index sequences.
        dimension: number of columns of the result; defaults to the value
            ``total_words`` had when this function was defined.

    Returns:
        numpy.ndarray of shape ``(len(sequences), dimension)`` where entry
        ``[i, j]`` is 1.0 if index ``j`` occurs in ``sequences[i]`` and 0.0
        otherwise.
    """
    encoded = np.zeros((len(sequences), dimension))
    # Each ``row`` is a view into ``encoded``, so writing to it fills the matrix.
    for row, token_ids in zip(encoded, sequences):
        row[token_ids] = 1.
    return encoded
# Turn each review set into token-id sequences, then into indicator matrices
train_one_hot_index = vectorize_sequences(
    tokenizer.texts_to_sequences(train_data['review_cleaned'].tolist())
)
test_one_hot_index = vectorize_sequences(
    tokenizer.texts_to_sequences(test_data['review_cleaned'].tolist())
)
unlabel_data_one_hot_index = vectorize_sequences(
    tokenizer.texts_to_sequences(unlabel_data['review_cleaned'].tolist())
)


In [8]:
# Features are the indicator matrices; targets are the sentiment column
x_train, y_train = train_one_hot_index, train_data['sentiment']
x_test, y_test = test_one_hot_index, test_data['sentiment']

In [22]:
# Fully connected classifier: three ReLU hidden layers feeding a single
# sigmoid output unit; the input width is the vocabulary cap (total_words).
model = Sequential([
    Dense(32, activation="relu", input_shape=(total_words,)),
    Dense(16, activation="relu"),
    Dense(16, activation="relu"),
    Dense(1, activation="sigmoid"),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 32)                1600032   
_________________________________________________________________
dense_13 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_14 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 17        
Total params: 1,600,849
Trainable params: 1,600,849
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Binary cross-entropy matches the single sigmoid output; track accuracy
model.compile(
    loss='binary_crossentropy',
    optimizer="rmsprop",
    metrics=['acc'],
)

In [24]:
# Train for a single epoch, evaluating on the held-out split after it
model.fit(
    x_train,
    y_train,
    epochs=1,
    validation_data=(x_test, y_test),
)

Train on 18750 samples, validate on 6250 samples
Epoch 1/1


<keras.callbacks.History at 0x7f59b8a03860>

## Predict

In [25]:
# `Sequential.predict_classes` is deprecated and absent from newer Keras /
# TensorFlow releases. For a single sigmoid output it is equivalent to
# thresholding the predicted probability at 0.5, which works on every version.
predicted_classes = (model.predict(unlabel_data_one_hot_index) > 0.5).astype("int32")

# The model output has one column per sample; keep that single class label.
prediction = [row[0] for row in predicted_classes]
utils.save_predict(unlabel_data,
                   prediction,
                   "mlp_50k-words-3-layers_rmsprop_1-epoch.csv")