# essential imports

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Packages for modeling
import tensorflow as tf
from keras import models
from keras import Model
from keras import layers
from keras import regularizers
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Input, Dropout, BatchNormalization
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.
  return f(*args, **kwds)


# reading train data

In [2]:
data = []
j = 0
with open("train.ft.txt","r") as f:
    for i in f:
        if j == 30000: #for computation reasons
            break
        data.append(i)
        j+=1

# initializing hyperparameters

In [3]:
NB_WORDS = 50000  # Parameter indicating the number of words we'll put in the dictionary
NB_START_EPOCHS = 26  # Number of epochs we usually start to train with
BATCH_SIZE = 512  #

# extracting labels from sentences

In [4]:
def get_labels(data) :
    labels = []
    for i in data :
        if(((i.split()[0]).replace("__label__",""))=='1'):
            labels.append([1,0])
        else :
            labels.append([0,1])
    return labels

In [5]:
labels = get_labels(data)

# removing stopwords and "__label__"

In [6]:
def remove_stopwords(input_text):
    stopwords_list = stopwords.words('english')
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = ["n't", "not", "no"]
    words = input_text.split() 
    clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
    return " ".join(clean_words) 

In [7]:
def remove_label(data):
    d = []
    for i in data :
        d.append((i.replace("__label__",""))[2:].replace("\n",""))
    return d

In [8]:
data = remove_label(data)

In [9]:
data[10]

"The Worst!: A complete waste of time. Typographical errors, poor grammar, and a totally pathetic plot add up to absolutely nothing. I'm embarrassed for this author and very disappointed I actually paid for this book."

In [10]:
def cleaning_sentences(data):    
    for i in range(len(data)):
        data[i] = remove_stopwords(data[i])

In [11]:
cleaning_sentences(data)

In [12]:
data[0]

'Stuning even non-gamer: This sound track beautiful! It paints senery mind well would recomend even people hate vid. game music! played game Chrono Cross games ever played best music! It backs away crude keyboarding takes fresher step grate guitars soulful orchestras. It would impress anyone cares listen! ^_^'

In [13]:
data = np.array(data)
labels = np.array(labels)

# reading word embeddings

In [15]:
def read_embeddings(filename = 'glove.6B.300d.txt' ,dimension = 300) :
    embeddings_index = {}
    with open(filename) as f:
        for line in f:
            values = line.split()
            word = values[0]

            coefs = [float(i) for i in values[1:]]
            embeddings_index[word] = coefs

    embeddings_index['<PAD>'] = [0] * dimension
    embeddings_index['<UNK>'] = [1] * dimension
    return embeddings_index
    
embeddings_index = read_embeddings()

# tokenizing and removing punctuation

In [16]:
def tokenizing(data):
    tokenizer = TreebankWordTokenizer()
    training = []
    for i in range(len(data)):
        training.append(tokenizer.tokenize(data[i]))
    for i in range(len(training)):
        training[i] = [x.lower() for x in training[i]]
    return training

training = tokenizing(data)

In [18]:
def punct(training) :
    punct = ['!','"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', 
             '[', '/', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n']
    train_X =[]
    for i in range(len(training)):
        sentence = []
        for j in range(len(training[i])):
            if training[i][j] in punct:
                pass
            else:
                sentence.append(training[i][j])
        train_X.append(sentence)
    return train_X

train_X = punct(training)

# converting words to embedding vectors

In [19]:
def word2embed(train_X , MAX_SEQ = 120) :
    #MAX_SEQ = 120
    for s in range(len(train_X)):
        n = MAX_SEQ - len(train_X[s])
        if n < 0:
            train_X[s] = train_X[s][:MAX_SEQ]
        else:
            for i in range(n):
                train_X[s].append('<PAD>')
        for v in range(len(train_X[s])):
            if train_X[s][v] not in embeddings_index:
                train_X[s][v] = embeddings_index['<UNK>']
            else:
                train_X[s][v] = embeddings_index[train_X[s][v]]
word2embed(train_X , 120)

In [20]:
#train_X = train_X[:20000]

In [21]:
train_X = np.array(train_X)
train_X.shape

(30000, 120, 300)

# LSTM model

In [23]:
MAX_SEQ = 120
input_layer = Input( batch_shape = (None, MAX_SEQ, 300))
lstm_layer = Bidirectional(LSTM(units=MAX_SEQ, dropout = 0.25, recurrent_dropout=0.25))(input_layer)
x = Dropout(0.25)(lstm_layer)
merged = Dense(units=20, activation='relu')(x)
merged = Dropout(0.25)(merged)
merged = BatchNormalization()(merged)
output_layer = Dense(2, activation="softmax")(merged)

model = Model(inputs=input_layer, outputs=output_layer)
model.summary()

UnboundLocalError: local variable 'self' referenced before assignment

In [None]:
model.compile(optimizer='adam'
              , loss='binary_crossentropy'
              , metrics=['accuracy'])

# adding checkpoint for weights and setting validation set

In [None]:
checkpoint = ModelCheckpoint('weight_sentiment_amazon.{epoch:02d}.hdf5')
callbacks_list = [checkpoint]

In [None]:
X_valid = train_X[:BATCH_SIZE]
Y_valid = labels[:BATCH_SIZE]
train_X = train_X[BATCH_SIZE:]
y_train_oh = labels[BATCH_SIZE:]

In [None]:
train_X = np.array(train_X)
X_valid = np.array(X_valid)
Y_valid = np.array(Y_valid)
y_train_oh = np.array(y_train_oh)


In [None]:
train_X[0]
#X_valid.shape


In [None]:
history = model.fit(train_X, y_train_oh, validation_data=(X_valid, Y_valid), callbacks=callbacks_list, epochs=NB_START_EPOCHS
                       , batch_size=BATCH_SIZE)

In [None]:
max(history.history['val_acc'])

# loading final weights on model

In [None]:
model.load_weights("weight_sentiment_amazon.26.hdf5")

# converting sentences to embedding vectors for prediction

In [None]:
def prep_data(inp):
    punct = ['!','"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', 
         '[', '/', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n']
    #neg = negate_sequence(inp)
    for i in punct:
        inp.replace(i, '')
    tokenizer = TreebankWordTokenizer()
    inp = tokenizer.tokenize(inp)
    MAX_SEQ = 120
    n = MAX_SEQ - len(inp)
    if n < 0:
        inp = inp[:MAX_SEQ]
    else:
        for i in range(n):
            inp.append('<PAD>')
    for v in range(len(inp)):
        if inp[v] not in embeddings_index:
            inp[v] = embeddings_index['<UNK>']
        else:
            inp[v] = embeddings_index[inp[v]]
    return np.reshape(np.array(inp) , (1, MAX_SEQ, 300))

In [None]:
sent = "the food was cold but I liked the taste."

In [None]:
model.predict(prep_data(sent))

# loading test set

In [None]:
data_test = []
with open("test.ft.txt","r") as f:
    for i in f:
        data_test.append(i)

# test set preprocess and labels extraction

In [None]:
test_labels = get_labels(data_test)

In [None]:
data_test = remove_label(data_test)

In [None]:
cleaning_sentences(data_test)

In [None]:
data_test = np.array(data_test)
test_labels = np.array(test_labels)

In [None]:
testing = tokenizing(data_test)

In [None]:
test_X = punct(testing)

In [None]:
testing_X = test_X

In [None]:
test_X = test_X[:8000]

In [None]:
word2embed(test_X , 120)

# evaluating the model on test set

In [None]:
model.evaluate(np.array(test_X),np.array(test_labels[:8000]))