In [1]:
import os
from gensim import models
import numpy as np
import pandas as pd
from tqdm import tqdm
# Register tqdm's progress_apply on pandas objects (used later when cleaning the reviews)
tqdm.pandas()

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup.
# Each line of the file is a token followed by its whitespace-separated coefficients.
embeddings_index = {}
# The context manager closes the file even if parsing fails part-way, and the
# explicit encoding keeps the read independent of the platform's default codec.
with open('../embeddings/glove.6B/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [3]:
from numpy import zeros
import keras
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [4]:
# Load the IMDB reviews CSV, decoding it as ISO-8859-1
train_df = pd.read_csv(
    '../Data/imdb_master.csv',
    encoding="ISO-8859-1",
)

In [5]:
# Drop the unused columns, keep only the labelled reviews (label != 'unsup')
# and encode the sentiment label as 1 (pos) / 0 (neg).
# Written as a single chain so the label column is never assigned on a
# boolean-filtered frame, which is what triggers pandas' SettingWithCopyWarning.
train_df = (
    train_df
    .drop(["Unnamed: 0", "file", "type"], axis=1)
    .loc[lambda df: df["label"] != 'unsup']
    .assign(label=lambda df: df["label"].map({"pos": 1, "neg": 0}))
)

In [6]:
# Cleaning and Pre Processing text
import re

def clean_numbers(text):
    """Mask runs of digits in ``text`` with ``#`` placeholders.

    Substitutions run from the longest pattern to the shortest: five or more
    consecutive digits become ``#####``, then four become ``####``, three
    become ``###`` and two become ``##``. An isolated single digit is kept.

    Returns the masked string.
    """
    digit_masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    # Order matters: longer runs must be masked before the shorter patterns see them.
    for pattern, mask in digit_masks:
        text = re.sub(pattern, mask, text)
    return text

def clean_text(text):
    """Normalise a raw review into lowercase text with punctuation removed.

    Steps, in order:
      1. coerce the input to ``str`` so a non-string value (e.g. a NaN from a
         missing cell) does not raise ``TypeError`` in the regex step;
      2. mask digit runs via ``clean_numbers``;
      3. turn ``/``, ``-`` and ``'`` into spaces;
      4. surround ``&`` with spaces;
      5. delete the remaining punctuation, including curly quotes;
      6. lowercase the result.

    Returns the cleaned string.
    """
    # Coerce first: re.sub (inside clean_numbers) only accepts str input.
    text = clean_numbers(str(text))

    # Separators become spaces so the words on either side stay distinct tokens.
    for punct in "/-'":
        text = text.replace(punct, ' ')
    # Keep '&' but isolate it as its own token.
    text = text.replace('&', ' & ')
    # Everything else is removed outright ('#' is not listed, so digit masks survive).
    for punct in '?!.,"$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        text = text.replace(punct, '')

    return text.lower()

In [7]:
# Add the cleaned review text and its whitespace-token count as new columns
train_df["processed_data"] = train_df["review"].progress_apply(clean_text)
train_df["length"] = train_df["processed_data"].progress_apply(
    lambda cleaned: len(cleaned.split())
)

100%|██████████| 50000/50000 [00:09<00:00, 5397.00it/s]
100%|██████████| 50000/50000 [00:00<00:00, 53167.12it/s]


In [8]:
# Pull the targets and the cleaned review text out of the frame as arrays
labels = train_df["label"].values
docs = train_df["processed_data"].values

In [9]:
# Drop the DataFrame now that docs/labels have been extracted from it, then
# run the garbage collector so the memory can be reclaimed.
import gc
del train_df
gc.collect()

28

In [10]:
# Fit a Keras Tokenizer on the cleaned reviews to build the word -> index map.
t = Tokenizer()
t.fit_on_texts(docs)
# +1 on top of the number of distinct words — presumably because word_index
# starts at 1 and index 0 is reserved for padding; confirm against the Keras docs.
vocab_size = len(t.word_index) + 1

In [11]:
# Convert each review into its sequence of integer word indices
encoded_docs = t.texts_to_sequences(docs)

In [12]:
# Bring every encoded review to exactly max_length tokens; shorter reviews are
# padded at the end (padding='post').
# NOTE(review): `truncating` is left at its default, so longer reviews are cut
# according to that default — confirm which end is dropped.
max_length = 150
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [13]:
# Build the embedding weight matrix: row i holds the GloVe vector of the word
# with tokenizer index i; words missing from GloVe keep their all-zero row.
embedding_matrix = zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: leave the row as zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [14]:
# Hold out 20% of the padded reviews as a test set; random_state is fixed so
# the split is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    padded_docs, labels, test_size=0.2, random_state=42)

In [15]:
# Two stacked LSTM layers on top of frozen GloVe embeddings, ending in a single
# sigmoid unit for the binary sentiment prediction.
# NOTE(review): both LSTMs override the layer's default activation with 'relu' —
# confirm this is intentional.
model = keras.Sequential([
    # Embedding weights come from the pre-built matrix and are not trained.
    keras.layers.Embedding(vocab_size, 50, weights=[embedding_matrix],
                           input_length=max_length, trainable=False),
    # The first LSTM returns its full output sequence so the second LSTM can consume it.
    keras.layers.LSTM(128, activation='relu', return_sequences=True),
    keras.layers.LSTM(64, activation='relu', return_sequences=False),
    keras.layers.Dense(1, activation='sigmoid'),
])

In [16]:
# Binary cross-entropy to go with the single sigmoid output; Adam optimizer; report accuracy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
# Print the layer-by-layer architecture and parameter counts.
# NOTE(review): the saved output reports lstm_1 with 1280 units, while the model
# cell defines LSTM(128) — the output looks stale; re-run this cell after a fresh
# kernel start.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 50)           7018250   
_________________________________________________________________
lstm_1 (LSTM)                (None, 150, 1280)         6814720   
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                344320    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 14,177,355
Trainable params: 7,159,105
Non-trainable params: 7,018,250
_________________________________________________________________


In [None]:
# Train for 10 epochs, evaluating on the held-out split after each epoch.
# NOTE(review): EarlyStopping is imported earlier in the notebook but no callback
# is passed here, and the test split doubles as validation data — confirm both
# are intended.
model.fit(X_train,y_train, epochs=10, validation_data=(X_test,y_test))

Train on 40000 samples, validate on 10000 samples
Epoch 1/10
   96/40000 [..............................] - ETA: 3:24:44 - loss: 0.6981 - acc: 0.4792