In [1]:
import keras
import pandas as pd
from keras import Sequential
from keras.layers import LSTM,Dense,Dropout, Embedding, Conv1D, MaxPooling1D, Flatten
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

%matplotlib inline

Using TensorFlow backend.


In [2]:
def load_imdb(path):
    """Load the IMDB comments file into a DataFrame with cleaned review text.

    The file is read as JSON Lines: every non-blank line is parsed with
    ``json.loads`` and becomes one row. The ``content`` column is then
    lower-cased, stripped of HTML markup, and every run of non-word
    characters is collapsed into a single space.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded JSON Lines file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed record; ``content`` holds the normalised text.

    Raises
    ------
    KeyError
        If the parsed records have no ``content`` field.
    """
    # Imported locally because the notebook's import cell does not provide them.
    from bs4 import BeautifulSoup
    import re, json

    non_word_run = re.compile(r"[\W]+")

    def preprocess(text):
        """Lower-case, drop HTML tags, and squash non-word runs to one space."""
        plain = BeautifulSoup(text.lower(), "html5lib").text
        return non_word_run.sub(" ", plain)

    # Only parsing happens while the file is open; blank lines (for example a
    # trailing newline) are skipped instead of crashing json.loads.
    with open(path, "r", encoding="utf8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    comments = pd.DataFrame(records)
    comments["content"] = comments["content"].apply(preprocess)
    return comments
        
# Read and clean the full review set, then preview its first five rows.
comments = load_imdb(path="/data/imdb-comments.json")
comments.head(n=5)

Unnamed: 0,content,label,name,sentiment
0,i went and saw this movie last night after bei...,test,0_10.txt,pos
1,actor turned director bill paxton follows up h...,test,10000_7.txt,pos
2,as a recreational golfer with some knowledge o...,test,10001_9.txt,pos
3,i saw this film in a sneak preview and it is d...,test,10002_8.txt,pos
4,bill paxton has taken the true story of the 19...,test,10003_8.txt,pos


In [3]:
# Spot-check ten random reviews; the fixed random_state keeps the displayed
# rows identical across Restart & Run All.
comments.sample(10, random_state=1)

Unnamed: 0,content,label,name,sentiment
26829,i can give you four reasons to see this movie ...,train,11647_8.txt,pos
47053,polyester was the very first john water s film...,train,7349_1.txt,neg
28994,i really enjoyed the first episode and am look...,train,2345_9.txt,pos
41945,i hate to even waste the time it takes to writ...,train,2751_1.txt,neg
33300,throughly enjoy all the musical numbers each t...,train,6220_8.txt,pos
40100,transylvania 6 5000 is an insignificant but o...,train,12340_4.txt,neg
8860,flowers if it s one thing you ll take away fro...,test,6725_8.txt,pos
47551,this review is based on the dubbed shock o ram...,train,7798_1.txt,neg
31818,this has just been broadcast on bbc and i am a...,train,4888_8.txt,pos
11625,wow this was a great movie i just got it from ...,test,9213_10.txt,pos


In [4]:
# Binary target: 1 where the sentiment column reads "pos", 0 otherwise.
y = np.where(comments["sentiment"].eq("pos"), 1, 0)
# Boolean mask selecting the rows whose label column is "train".
is_training = comments["label"].eq("train")

In [63]:
# Shared size constant. Later cells pass it to Tokenizer(num_words=...),
# to pad_sequences(maxlen=...) and to the Embedding layer (as both input_dim
# and input_length), so vocabulary cap and sequence length are tied together.
maxlen = 1250

In [None]:
# Fit a word-level tokenizer on every review and encode each review as a list
# of integer word ids. Note that maxlen is reused here as the vocabulary cap
# (num_words), not as a sequence length.
tokenizer = Tokenizer(num_words=maxlen)
tokenizer.fit_on_texts(comments["content"])
doc_terms = tokenizer.texts_to_sequences(comments["content"])
# One encoded sequence per review.
len(doc_terms)

In [None]:
# Cleaned text of the review stored under index label 0.
comments["content"][0]

In [None]:
# Integer-encoded form of the first review, as returned by texts_to_sequences.
print(doc_terms[0])

In [None]:
# Peek at the first 500 characters of the tokenizer's word_docs mapping
# (presumably word -> number of documents containing it; see Keras Tokenizer docs).
str(tokenizer.word_docs)[:500]

In [None]:
# Invert the tokenizer's word -> id table so ids can be decoded back to words.
words_by_index = {index: word for word, index in tokenizer.word_index.items()}

In [None]:
# Decode the first encoded review back into words to see what the encoding kept.
print([words_by_index[term_id] for term_id in doc_terms[0]])

In [None]:
# Number of distinct words the tokenizer indexed while fitting.
# NOTE(review): word_index appears not to be truncated by num_words, so this
# can exceed the vocabulary actually used for encoding — confirm against Keras docs.
vocab_size = len(tokenizer.word_index)
vocab_size

In [None]:
# Kernel-density estimate of how many token ids each encoded review contains.
# The figure is labelled so it stands alone; the trailing ';' hides the repr.
length_ax = pd.Series([len(r) for r in doc_terms]).plot.kde()
length_ax.set(
    title="Distribution of encoded review length",
    xlabel="tokens per review",
    ylabel="density",
);

In [None]:

# Bring every encoded review to exactly maxlen ids. The defaults are spelled
# out: shorter reviews are zero-padded at the front, longer ones lose their
# leading ids.
doc_terms_padded = pad_sequences(
    doc_terms,
    maxlen=maxlen,
    padding="pre",
    truncating="pre",
)
doc_terms_padded[0]

In [None]:
# Partition features and targets with the is_training mask; its negation
# selects the remaining rows as the test set.
x_train, y_train = doc_terms_padded[is_training], y[is_training]
x_test, y_test = doc_terms_padded[~is_training], y[~is_training]

In [None]:
# Seed TensorFlow and NumPy before any layer is created.
tf.set_random_seed(1)
np.random.seed(1)

# Hold out 10% of the *training* reviews for validation. The indices are
# shuffled with a dedicated RandomState so the held-out slice does not depend
# on row order and the global NumPy seed above is left untouched.
split_rng = np.random.RandomState(1)
shuffled_idx = split_rng.permutation(len(x_train))
n_val = max(1, len(x_train) // 10)
val_idx, fit_idx = shuffled_idx[:n_val], shuffled_idx[n_val:]

# Embedding -> two Conv1D/MaxPooling1D stages -> LSTM -> dense sigmoid head.
model = Sequential()
# input_dim is maxlen because the tokenizer was created with num_words=maxlen;
# input_length is maxlen because the sequences were padded to that length.
model.add(Embedding(maxlen, 10, input_length=maxlen))
model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='tanh'))
model.add(MaxPooling1D(pool_size=4))
model.add(Conv1D(128, 3, activation='tanh'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(128))
model.add(Dense(400, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# epochs=100 is only an upper bound: training stops once validation loss has
# not improved for three consecutive epochs.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
model.fit(
    x_train[fit_idx],
    y_train[fit_idx],
    validation_data=(x_train[val_idx], y_train[val_idx]),
    batch_size=64,
    epochs=100,
    callbacks=[early_stopping],
)

# The test set is scored exactly once, after training, so it never influences
# when training stops. Displays [loss, accuracy].
model.evaluate(x_test, y_test, batch_size=64)
