In [231]:
import numpy as np
import pandas as pd
import os, re
import pickle
from keras import models, regularizers, layers, optimizers, losses, metrics
from keras.models import Sequential
from keras.layers import Dense, embeddings, Flatten
from keras.utils import np_utils, to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.datasets import imdb

In [16]:
def load_directory_data(directory):
  data = {}
  data["sentence"] = []
  data["sentiment"] = []
  for file_path in os.listdir(directory):
    with open(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
      data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
  return pd.DataFrame.from_dict(data)

In [17]:
def load_dataset(directory):
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)


In [18]:
train_df = load_dataset(os.path.join('data', 
                                       "aclImdb", "train"))
test_df = load_dataset(os.path.join('data', 
                                      "aclImdb", "test"))

In [20]:
print("train_data ", train_df.shape)
print("train_labels ", test_df.shape)

print(train_df)
# print("_"*100)
# print("test_data ", X_test.shape)
# print("test_labels ", y_test.shape)
# print("_"*100)
# print("Maximum value of a word index ")
# print(max([max(sequence) for sequence in X_train]))
# print("Maximum length num words of review in train ")
# print(max([len(sequence) for sequence in train_data]))

train_data  (25000, 3)
train_labels  (25000, 3)
                                                sentence sentiment  polarity
0      Joe Don Baker is one of a handful of actors wh...         8         1
1      Interesting story about a soldier in a war who...         4         0
2      Not only did they get the characters all wrong...         1         0
3      When a friend gave me a boxed set of "12 Amazi...         3         0
4      The 1997 low-key indie dramedy Henry Fool woul...         7         1
5      THE SECRET OF KELLS may be the most exquisite ...        10         1
6      Uneven Bollywood drama. Karisma Kapoor is exce...         7         1
7      I went into this movie hoping for the best. I ...         2         0
8      While I totally disagree with one reviewer who...         3         0
9      Worry not, Disney fans--this special edition D...         9         1
10     Lisa is a hotel manager or owner and she gets ...         8         1
11     what can i say?, ms E

In [207]:
input_dim = 150
train_sentence = train_df['sentence'].tolist()
train_sentence = [' '.join(x.split()[:input_dim]) for x in train_sentence]
train_sentence = np.array(train_sentence, dtype=object)
train_polarity = train_df['polarity'].tolist()

In [208]:
test_sentence = test_df['sentence'].tolist()
test_sentence = [' '.join(x.split()[:input_dim]) for x in test_sentence]
test_sentence = np.array(test_sentence, dtype=object)
test_polarity = test_df['polarity'].tolist()

In [209]:
tok = Tokenizer(num_words=1000,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)
tok.fit_on_texts(train_sentence)

In [210]:
train_matrix = tok.texts_to_sequences(train_sentence)
train_matrix = pad_sequences(train_matrix, maxlen=150, padding='post', truncating='post', value=0)
# train_matrix = np.array(train_matrix, dtype=object)

test_matrix = tok.texts_to_sequences(test_sentence)
test_matrix = pad_sequences(test_matrix, maxlen=150, padding='post', truncating='post', value=0)
# test_matrix = np.array(test_matrix, dtype=object)

In [211]:
# train_matrix = train_matrix[:,:,np.newaxis]
# test_matrix = test_matrix[:,:,np.newaxis]
# train_matrix.shape
train_polarity = np.array(train_polarity)
test_polarity = np.array(test_polarity)

In [225]:

model = Sequential()
model.add(embeddings.Embedding(1000, 32, input_length=150))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_48 (Embedding)     (None, 150, 32)           32000     
_________________________________________________________________
flatten_19 (Flatten)         (None, 4800)              0         
_________________________________________________________________
dense_51 (Dense)             (None, 250)               1200250   
_________________________________________________________________
dense_52 (Dense)             (None, 1)                 251       
Total params: 1,232,501
Trainable params: 1,232,501
Non-trainable params: 0
_________________________________________________________________
None


In [234]:
history = model.fit(train_matrix, train_polarity, validation_data=(test_matrix, test_polarity), epochs=40, batch_size=128, verbose=2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/40
 - 5s - loss: 0.2586 - acc: 0.8952 - val_loss: 0.4826 - val_acc: 0.7987
Epoch 2/40
 - 5s - loss: 0.1612 - acc: 0.9409 - val_loss: 0.6114 - val_acc: 0.7860
Epoch 3/40
 - 5s - loss: 0.0798 - acc: 0.9746 - val_loss: 0.7790 - val_acc: 0.7752
Epoch 4/40
 - 6s - loss: 0.0348 - acc: 0.9902 - val_loss: 1.0896 - val_acc: 0.7712
Epoch 5/40
 - 5s - loss: 0.0173 - acc: 0.9955 - val_loss: 1.3058 - val_acc: 0.7668
Epoch 6/40
 - 6s - loss: 0.0098 - acc: 0.9974 - val_loss: 1.4436 - val_acc: 0.7724
Epoch 7/40
 - 5s - loss: 0.0068 - acc: 0.9980 - val_loss: 1.5578 - val_acc: 0.7670
Epoch 8/40
 - 6s - loss: 0.0054 - acc: 0.9986 - val_loss: 1.6428 - val_acc: 0.7696
Epoch 9/40
 - 5s - loss: 0.0041 - acc: 0.9988 - val_loss: 1.7846 - val_acc: 0.7675
Epoch 10/40
 - 6s - loss: 0.0032 - acc: 0.9993 - val_loss: 1.9270 - val_acc: 0.7642
Epoch 11/40
 - 5s - loss: 0.0034 - acc: 0.9990 - val_loss: 1.9166 - val_acc: 0.7674
Epoch 12/40
 - 5s - loss: 0.0030 - 

In [227]:
model.evaluate(test_matrix,test_polarity)



[0.41619171557426454, 0.81024]

In [229]:
# save model
model.save('models/dl_arch.h5')

In [233]:
# saving tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tok, handle, protocol=pickle.HIGHEST_PROTOCOL)