In [6]:
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras import Sequential
from tensorflow.keras.preprocessing import sequence
import numpy as np
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
import bz2
import random

In [16]:
# --- Dataset sizing -------------------------------------------------------
num_data_points = 10_000          # number of reviews the input/label arrays are sized for
train_set_proportion = 0.9        # fraction of those reviews placed in the training split
train_size = int(num_data_points * train_set_proportion)  # index where train ends and test begins

# --- Text encoding --------------------------------------------------------
vocab_size = 6_000                # how many of the most frequent tokens get an integer id
max_length = 500                  # every encoded review is padded/truncated to this many ids

# --- Model / training -----------------------------------------------------
embedding_size = 128              # intended width of the word-embedding vectors
batch_size = 1024                 # samples per gradient step
num_epochs = 10                   # full passes over the training data

In [15]:
# Regex tokenizer built with the pattern \w+ (runs of word characters);
# it is used by the tokenisation cell to split each review into tokens.
reTokenizer = RegexpTokenizer(r'\w+')
# WordNet lemmatizer instance. NOTE(review): not referenced by any of the
# cells visible in this notebook - confirm whether lemmatisation was intended.
lemmatizer = WordNetLemmatizer()
def labelsandtexts(file):
    """Read a bz2-compressed, line-oriented review file into labels and texts.

    Each non-blank line is decoded as UTF-8. The character at index 9 is
    parsed as an integer and shifted down by one to form the label, and the
    remainder of the line (from index 10, whitespace-stripped) is the review
    text. This presumably matches the fastText ``__label__N <text>`` layout -
    verify against the data file.

    Parameters
    ----------
    file : str or path-like
        Path of the ``.bz2`` file to read.

    Returns
    -------
    tuple[numpy.ndarray, list[str]]
        The integer labels as an array and the review texts as a list, in
        file order.

    Raises
    ------
    ValueError
        If the character at index 9 of a non-blank line is not a digit.
    IndexError
        If a non-blank line is shorter than 10 characters.
    """
    labels = []
    reviews = []
    # The context manager guarantees the compressed stream is closed even if
    # a malformed line raises part-way through the file.
    with bz2.BZ2File(file) as stream:
        for raw_line in stream:
            x = raw_line.decode("utf-8")
            if not x.strip():
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no label and would otherwise fail on x[9].
                continue
            labels.append(int(x[9]) - 1)
            reviews.append(x[10:].strip())
    return np.array(labels), reviews
# Load the corpus, then keep only the first num_data_points examples: the
# array-building cell allocates exactly num_data_points slots, so a larger
# file would otherwise overflow it (and tokenising the surplus is wasted work).
labels, reviews = labelsandtexts('train.ft.txt.bz2')
labels, reviews = labels[:num_data_points], reviews[:num_data_points]

In [None]:
# Tokenise every review with reTokenizer and lower-case the tokens.
# `reviews` is updated in place (each raw string is replaced by its token
# list) and `all_words` collects every token of the corpus in order.
all_words = []
for idx, raw_text in enumerate(reviews):
    lowered = [token.lower() for token in reTokenizer.tokenize(raw_text)]
    all_words.extend(lowered)
    reviews[idx] = lowered

In [None]:
# Reduce the token list to (word, count) pairs for the vocab_size most
# frequent tokens. most_common() returns fewer pairs when the corpus has
# fewer distinct tokens than vocab_size.
all_words = FreqDist(all_words).most_common(vocab_size)

# Assign ids 1..len(all_words) by descending frequency. Id 0 is left unused
# so it can serve as the padding value (pad_sequences pads with 0 by default).
# Iterating the pairs directly, rather than indexing range(vocab_size),
# avoids an IndexError on small corpora.
word2int = {word: idx for idx, (word, _count) in enumerate(all_words, start=1)}
int2word = {idx: word for word, idx in word2int.items()}
dict_as_list = list(word2int)

In [None]:
def review2intlist(rev_text):
    """Translate a tokenised review into its list of vocabulary ids.

    Tokens are looked up in the module-level ``word2int`` mapping; tokens
    that are not in the mapping are dropped, so the result may be shorter
    than the input.

    Parameters
    ----------
    rev_text : iterable of str
        The tokens of one review, in order.

    Returns
    -------
    list
        The ``word2int`` values of the known tokens, in their original order.
    """
    return [word2int[token] for token in rev_text if token in word2int]


In [None]:
# Encode each tokenised review as an integer array, then pad/truncate all of
# them to max_length so they stack into a single 2-D array.
X = sequence.pad_sequences(
    [np.asarray(review2intlist(tokens), dtype=int) for tokens in reviews],
    maxlen=max_length,
)

In [None]:
# Model inputs: one row of max_length token ids per review, as float32.
# Slicing caps the data at num_data_points without assuming that X has exactly
# that many rows: a larger corpus no longer raises IndexError, and a smaller
# one no longer leaves all-zero placeholder rows that would be trained on as
# real samples.
LSTM_inputs = np.asarray(X[:num_data_points], dtype=np.float32)

# Model targets: the matching labels, stored as float64.
LSTM_outputs = np.asarray(labels[:num_data_points], dtype=np.float64)

# Every input row must have exactly one label.
assert len(LSTM_inputs) == len(LSTM_outputs), "inputs and labels are misaligned"


In [None]:
# Positional hold-out split: the first train_size examples are used for
# training and everything after them for testing.
x_train, x_test = LSTM_inputs[:train_size], LSTM_inputs[train_size:]
y_train, y_test = LSTM_outputs[:train_size], LSTM_outputs[train_size:]

In [None]:
# Binary sentiment classifier: token ids -> embeddings -> LSTM -> probability.
lstm_units = 64  # size of the LSTM's final hidden state fed to the classifier

model = Sequential()
# input_dim is vocab_size + 1 because word ids start at 1 and 0 is padding.
# The width comes from the config cell's embedding_size instead of a
# hard-coded literal, so that setting actually takes effect.
model.add(Embedding(input_dim=vocab_size + 1, output_dim=embedding_size, input_length=max_length))
# Collapse the (max_length, embedding_size) sequence into a single vector.
# Without a sequence-reducing layer, Dense would be applied per timestep and
# emit (batch, max_length, 1) predictions that cannot be matched against the
# one-label-per-review targets.
model.add(LSTM(lstm_units))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# validation_split holds out the last 20% of the training data for per-epoch validation.
model.fit(x_train, y_train, validation_split=0.2, batch_size=batch_size, epochs=num_epochs, verbose=2)


In [None]:
# Score the trained model on the held-out test split. evaluate() returns the
# loss followed by the compiled metrics (here: accuracy); only the accuracy
# is printed.
loss, accuracy = model.evaluate(x=x_test, y=y_test, verbose=0)
print(accuracy)