In [84]:
import urllib.request
import os
import tarfile

In [85]:
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath = "data/aclImdb_v1.tar.gz"

In [86]:
if not os.path.isfile(filepath):
    result = urllib.request.urlretrieve(url, filepath)
    print("downloaded:", result)

In [87]:
if not os.path.exists("data/aclImdb"):
    tfile = tarfile.open("data/aclImdb_v1.tar.gz", "r:gz")
    result = tfile.extractall("data/")

In [88]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

In [89]:
import re
def rm_tags(text):
    re_tag = re.compile(r"<[^>]+>")
    return re_tag.sub("", text)

In [90]:
import os
def read_files(filetype):
    path = "data/aclImdb/"
    file_list = []
    
    positive_path = path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path + f]
        
    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]
        
    print("read", filetype, "files:", len(file_list))
    
    all_labels = [1] * 12500 + [0] * 12500
    
    all_texts = []
    for fi in file_list:
        with open(fi, encoding="utf-8") as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
            
    return all_labels, all_texts

In [91]:
y_train, train_text = read_files("train")

read train files: 25000


In [92]:
y_test, test_text = read_files("test")

read test files: 25000


In [93]:
train_text[0]

"Before viewing, please make sure you have seen Night of the Living Dead... This might well be THE best 7 minute parody I have ever seen! Absurd, crappy 'special effects' (the rope, the rope!!!), and maneating slices of bread... what more do you need???(Do not watch this movie while eating bread... you might get scared!)"

In [94]:
y_train[0]

1

In [95]:
train_text[12501]

'Bestselling writer George Plimpton(Alan Alda)takes on an assignment for Sports Illustrated. He is to go incognito to the Detroit Lions training camp and try out for a position as third string Quarterback. He is quickly found out by the team members featuring Alex Karras and Mike Lucci. The entire team finds it amusing to cause stumbling blocks in the writer\'s determination to Quarterback for a series in a real game.This movie is Alda\'s debut and also helped Karras leave the gridiron for acting. Besides the 1968 Detroit Lions, the cast also includes "Sugar Ray" Robinson, Roy Schieder and Lauren Hutton.Alex March directs this story based on Plimton\'s book.'

In [96]:
y_train[12501]

0

In [97]:
token = Tokenizer(num_words=3800)
token.fit_on_texts(train_text)

In [98]:
print(token.document_count)

25000


In [99]:
print(token.word_index)



In [100]:
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

In [101]:
print(train_text[0])

Before viewing, please make sure you have seen Night of the Living Dead... This might well be THE best 7 minute parody I have ever seen! Absurd, crappy 'special effects' (the rope, the rope!!!), and maneating slices of bread... what more do you need???(Do not watch this movie while eating bread... you might get scared!)


In [102]:
print(x_train_seq[0])

[154, 823, 588, 93, 248, 21, 24, 106, 310, 4, 1, 576, 346, 10, 234, 69, 25, 1, 114, 689, 780, 2115, 9, 24, 122, 106, 1750, 2128, 1, 1, 2, 4, 47, 49, 78, 21, 355, 78, 20, 102, 10, 16, 133, 1882, 21, 234, 74, 1758]


In [103]:
x_train = sequence.pad_sequences(x_train_seq, maxlen=380)
x_test = sequence.pad_sequences(x_test_seq, maxlen=380)

In [104]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding

In [70]:
model = Sequential()

In [105]:
model.add(Embedding(input_dim=3800, output_dim=32, input_length=380))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(units=256, activation="relu"))
model.add(Dropout(0.35))
model.add(Dense(units=1, activation="sigmoid"))

In [106]:
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 380, 32)           121600    
_________________________________________________________________
dropout_3 (Dropout)          (None, 380, 32)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 12160)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               3113216   
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 3,235,073
Trainable params: 3,235,073
Non-trainable params: 0
____________________________________________

In [107]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [108]:
train_history = model.fit(x_train, y_train, validation_split=0.2, epochs=10, batch_size=100, verbose=1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [109]:
loss, accuracy = model.evaluate(x_test, y_test)



In [110]:
accuracy

0.8525199890136719

### RNN

In [111]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN

In [112]:
model = Sequential()

In [113]:
model.add(Embedding(input_dim=3800, output_dim=32, input_length=380))
model.add(Dropout(0.35))
model.add(SimpleRNN(units=16))
model.add(Dense(units=256, activation="relu"))
model.add(Dropout(0.35))
model.add(Dense(units=1, activation="sigmoid"))

In [114]:
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 380, 32)           121600    
_________________________________________________________________
dropout_5 (Dropout)          (None, 380, 32)           0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_5 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 257       
Total params: 126,993
Trainable params: 126,993
Non-trainable params: 0
________________________________________________

In [115]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [116]:
train_history = model.fit(x_train, y_train, validation_split=0.2, epochs=10, batch_size=100, verbose=1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### LSTM

In [117]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

In [118]:
model = Sequential()

In [119]:
model.add(Embedding(input_dim=3800, output_dim=32, input_length=380))
model.add(Dropout(0.2))
model.add(LSTM(units=32))
model.add(Dense(units=256, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(units=1, activation="sigmoid"))

In [120]:
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 380, 32)           121600    
_________________________________________________________________
dropout_7 (Dropout)          (None, 380, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_7 (Dense)              (None, 256)               8448      
_________________________________________________________________
dropout_8 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 257       
Total params: 138,625
Trainable params: 138,625
Non-trainable params: 0
________________________________________________

In [121]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [122]:
train_history = model.fit(x_train, y_train, validation_split=0.2, epochs=10, batch_size=100, verbose=1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [123]:
loss, accuracy = model.evaluate(x_test, y_test)



In [124]:
accuracy

0.850600004196167