In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import pandas as pd
import numpy as np
import tensorflow as tf
import re
from gensim.models import Word2Vec

In [2]:
# Read the tab-separated test reviews; the first row is the header and
# quoting mode 1 is passed straight through to the CSV parser.
test = pd.read_csv(
    "dataset/testData.tsv",
    sep="\t",
    header=0,
    quoting=1,
)

In [3]:
def split_sentence(sentence):
    """Turn a raw review string into a list of lowercase tokens.

    Short angle-bracket spans (1 to 8 characters between ``<`` and ``>``,
    e.g. ``<br />``) are removed, every character that is not an ASCII
    letter or digit is replaced by a space, and the result is lowercased
    and split on whitespace.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The tokens, in order of appearance; empty if nothing remains.
    """
    tag_pattern = re.compile(r"\x3C.{1,8}\x3E")
    non_alnum_pattern = re.compile("[^a-zA-Z0-9]")

    without_tags = tag_pattern.sub("", sentence)
    spaced = non_alnum_pattern.sub(" ", without_tags)
    tokens = spaced.lower().split()
    return tokens

In [4]:
# Tokenize every review of the test set, keeping the row order.
sentences = [split_sentence(text) for text in test["review"].values]

In [5]:
# Load the Word2Vec model stored at "imdb_word2Vec". Its vocabulary is used
# below to map tokens to integer ids and its vectors form the embedding table.
model = Word2Vec.load("imdb_word2Vec")

###### Build ID matrix

In [6]:
# Every review is truncated / zero-padded to this many tokens.
optimal_len = 250
num_examples = len(sentences)

# Resolve the vocabulary once, outside the loops. If this attribute is missing
# (e.g. a gensim release that no longer exposes `wv.vocab`) the cell now fails
# loudly here instead of silently producing an all-zero matrix.
vocab = model.wv.vocab

# id_matrix[i, j] holds the vocabulary index of the j-th token of review i.
# Out-of-vocabulary tokens and padding positions keep id 0 (the original cell
# noted that 99999 was used for unknown words before).
id_matrix = np.zeros((num_examples, optimal_len), dtype='int32')
for i, tokens in enumerate(sentences):
    # Only the first `optimal_len` tokens fit into a row.
    for j, token in enumerate(tokens[:optimal_len]):
        entry = vocab.get(token)
        if entry is not None:
            id_matrix[i, j] = entry.index

In [7]:
def batches(num, size=None):
    """Return the half-open row range ``(start, end)`` of batch ``num``.

    Parameters
    ----------
    num : int
        Zero-based batch number.
    size : int, optional
        Number of rows per batch. When omitted, the notebook-level
        ``batch_size`` is used, so existing ``batches(num)`` calls behave
        as before.

    Returns
    -------
    tuple of (int, int)
        ``(num * size, num * size + size)``.
    """
    if size is None:
        # Fall back to the global defined in the hyper-parameter cell.
        size = batch_size
    start = num * size
    return start, start + size

In [8]:
# Hyper-parameters shared by the graph-building cells and the inference loop.
batch_size = 25  # rows per batch; also the fixed first dimension of the placeholders
num_dims = model.wv.vectors.shape[1]  # second dimension of the Word2Vec vector matrix
lstm_units = 64  # number of units passed to BasicLSTMCell
num_classes = 2  # number of output columns of the final projection

In [9]:
# Clear the default graph so that re-running the graph cells starts from scratch.
tf.reset_default_graph()

# lookup: `idx` receives a [batch_size, optimal_len] block of token ids and
# `X_embedd` gathers the matching rows of the Word2Vec vector matrix.
# NOTE(review): the variable is unnamed, so TensorFlow assigns its name by
# creation order; the checkpoint restored later presumably relies on that
# order — keep the statement order of the graph cells unchanged.
idx = tf.placeholder(tf.int32, [batch_size, optimal_len], name="idx")
vectors = tf.Variable(tf.constant(model.wv.vectors))
X_embedd = tf.nn.embedding_lookup(vectors, idx)

In [10]:
# Input of the LSTM: one batch of already-embedded reviews, fed at run time
# with the values evaluated from `X_embedd`.
X = tf.placeholder(tf.float32, [batch_size, optimal_len, num_dims],name="X")

In [11]:
# Recurrent layer: a single LSTM cell unrolled over the token axis of X.
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(lstm_units)
# This notebook only runs inference, so dropout must be disabled: with a keep
# probability below 1.0 the outputs are randomly masked and the predictions
# become noisy and non-reproducible. The wrapper itself is kept so the graph
# structure stays the same as before.
lstm_cell = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_cell, output_keep_prob=1.0)
# `value` holds the cell output for every time step; the final state is unused.
value, _ = tf.nn.dynamic_rnn(lstm_cell, X, dtype=tf.float32)

In [12]:
# Output projection from the LSTM units to the class scores.
# NOTE(review): both variables are unnamed, so their auto-generated names
# depend on creation order; the restored checkpoint presumably relies on it —
# do not reorder these statements.
weight = tf.Variable(tf.truncated_normal([lstm_units, num_classes]))
bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))
# Swap the first two axes so that indexing axis 0 selects a time step.
value = tf.transpose(value, [1, 0, 2])
# Keep only the output of the last time step for every example in the batch.
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

# True where the highest score is in column 0.
# NOTE(review): presumably column 0 is the positive class — confirm against
# the label encoding used when the checkpoint was trained.
y_pred = tf.equal(tf.argmax(prediction,1), 0)

In [None]:
# One predicted label per test row; filled batch by batch below.
num_test = test.shape[0]
Y_pred = np.zeros(num_test, dtype=np.int32)

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    # Overwrite the freshly initialised variables with the trained weights.
    saver = tf.train.Saver()
    saver.restore(sess, "models/pretrained_lstm.ckpt-50")

    # Ceiling division: a trailing partial batch is processed too, instead of
    # leaving its rows at the default value 0.
    num_batches = -(-num_test // batch_size)

    for batch in range(num_batches):

        # Row range of this batch, clipped to the size of the test set.
        start, end = batches(batch)
        end = min(end, num_test)
        num_valid = end - start

        # Current batch of token ids.
        X_batch = id_matrix[start:end, :]
        if num_valid < batch_size:
            # The placeholders have a fixed batch dimension, so pad the last
            # batch with all-zero rows; their predictions are discarded below.
            padding = np.zeros((batch_size - num_valid, optimal_len), dtype=id_matrix.dtype)
            X_batch = np.vstack([X_batch, padding])

        # First evaluate the embedding lookup, then feed the vectors to the LSTM.
        X_value = sess.run(X_embedd, {idx: X_batch})

        # Boolean predictions are stored as 0/1 in the int32 result array.
        Y_pred[start:end] = sess.run(y_pred, {X: X_value})[:num_valid]

        # Report the number of rows actually predicted so far, every 100
        # batches and once more after the final batch.
        if (batch + 1) % 100 == 0 or batch == num_batches - 1:
            print("Predicted {} labels".format(end))

INFO:tensorflow:Restoring parameters from models/pretrained_lstm.ckpt-50
Predicted 2500 labels
Predicted 5000 labels
Predicted 7500 labels
Predicted 10000 labels
Predicted 12500 labels
Predicted 15000 labels
Predicted 17500 labels


###### Save predictions

In [None]:
import csv

# Attach the predictions to the test rows as a "sentiment" column, drop the
# review text, and write the remaining columns to disk.
pred_df = pd.DataFrame({"sentiment": Y_pred})
result = pd.concat([test, pred_df], axis=1, sort=False)
final = result.drop(columns=["review"])
final.to_csv(
    "result_2.csv",
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)