In [10]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder

In [11]:
from tensorflow.python.client import device_lib

In [12]:
# Directory holding the CSV inputs, relative to the notebook's working directory.
DATA_PATH = './data/imdb/'

# Reviews and labels are stored in two separate CSV files; resolve both paths
# first, then read each file into its own DataFrame.
reviews_csv = os.path.join(DATA_PATH, 'reviews.csv')
labels_csv = os.path.join(DATA_PATH, 'labels.csv')
data = pd.read_csv(reviews_csv)
labels = pd.read_csv(labels_csv)

In [13]:
# Guard: the reviews frame and the labels frame must have the same
# (rows, columns) shape before they are sliced in parallel.
frames_aligned = labels.shape == data.shape
assert frames_aligned

In [14]:
# Extract the text column and the target column as plain arrays.
# NOTE(review): the model is trained with binary_crossentropy, so this assumes
# labels['class'] is already numeric 0/1. LabelEncoder is imported but never
# applied — confirm the labels do not need encoding.
reviews = data['review'].values
classes = labels['class'].values

# Positional split: the first TRAIN_ROWS rows are used for training and every
# remaining row for testing.
TRAIN_ROWS = 35000
train_reviews, test_reviews = reviews[:TRAIN_ROWS], reviews[TRAIN_ROWS:]
y_train, y_test = classes[:TRAIN_ROWS], classes[TRAIN_ROWS:]

In [15]:
# Tokenizer configured with '<UNK>' as its out-of-vocabulary token.
t = Tokenizer(oov_token='<UNK>')
# Build the vocabulary from the training reviews only (the test reviews are
# not passed to fit_on_texts).
t.fit_on_texts(train_reviews)
# Register an explicit '<PAD>' entry at index 0 in the word index.
t.word_index.update({'<PAD>': 0})

In [24]:
# Convert each review into its sequence of integer word indices using the
# fitted tokenizer: training corpus first, then the test corpus.
train_sequences, test_sequences = (
    t.texts_to_sequences(corpus) for corpus in (train_reviews, test_reviews)
)

In [25]:
# Report how many entries the word index holds and how many documents the
# tokenizer was fitted on.
vocab_entries = len(t.word_index)
fitted_docs = t.document_count
print("Vocabulary size={}".format(vocab_entries))
print("Number of Documents={}".format(fitted_docs))

Vocabulary size=136471
Number of Documents=35000


In [35]:
# Fixed token length for every review; passed as `maxlen` when padding and as
# `input_length` to the Embedding layer.
MAX_SEQUENCE_LENGTH = 1_000

In [36]:
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries so the data forms
# rectangular (n_reviews, MAX_SEQUENCE_LENGTH) arrays.
X_train, X_test = (
    sequence.pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    for seqs in (train_sequences, test_sequences)
)
# Last expression of the cell: display both shapes.
(X_train.shape, X_test.shape)

((35000, 1000), (15000, 1000))

In [43]:
# Hyper-parameters: embedding width, number of training epochs, and the
# mini-batch size used for the baseline fit.
EMBED_SIZE, EPOCHS, BATCH_SIZE = 100, 2, 128

# Embedding input dimension: the number of entries in the tokenizer's word
# index (which includes the '<PAD>' entry added after fitting).
VOCAB_SIZE = len(t.word_index)

In [44]:
# Baseline binary classifier: embedding -> a single Conv1D -> flatten -> one
# output unit. The deeper Conv1D/MaxPooling1D stack is exercised in the
# batch-size/optimizer search cell further down.
model = Sequential([
    # Maps each of the MAX_SEQUENCE_LENGTH token indices to an EMBED_SIZE vector.
    Embedding(VOCAB_SIZE, EMBED_SIZE, input_length=MAX_SEQUENCE_LENGTH),
    Conv1D(filters=128, kernel_size=4, activation='relu'),
    Flatten(),
    # Sigmoid, not softmax: softmax normalises across the units of the layer,
    # so with a single unit it always emits 1.0 and the network cannot learn.
    # Sigmoid yields the per-sample probability binary_crossentropy expects.
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 1000, 100)         13647100  
                                                                 
 conv1d_9 (Conv1D)           (None, 997, 128)          51328     
                                                                 
 flatten_5 (Flatten)         (None, 127616)            0         
                                                                 
 dense_7 (Dense)             (None, 1)                 127617    
                                                                 
Total params: 13,826,045
Trainable params: 13,826,045
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Fit the model
# Train the baseline model. 10% of the training data is set aside for
# validation; progress is printed per epoch (verbose=1).
fit_kwargs = dict(
    validation_split=0.1,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_kwargs)

Epoch 1/2

KeyboardInterrupt: 

In [None]:
def _build_deep_cnn(optimizer):
    """Build and compile the three-stage Conv1D/MaxPooling1D binary classifier.

    Args:
        optimizer: Keras optimizer identifier (string name or optimizer
            instance) forwarded to ``model.compile``.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output unit.
    """
    net = Sequential([
        Embedding(VOCAB_SIZE, EMBED_SIZE, input_length=MAX_SEQUENCE_LENGTH),
        Conv1D(filters=128, kernel_size=4, padding='same', activation='relu'),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(256, activation='relu'),
        # Sigmoid, not softmax: a one-unit softmax always outputs 1.0, which
        # makes binary_crossentropy training degenerate.
        Dense(1, activation='sigmoid'),
    ])
    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net


# Search space: every batch size is combined with every optimizer.
batch_sizes = [64, 128, 256, 512]
optimizers = ['adam', 'RMSProp', 'SGD']

# Results keyed by (batch_size, optimizer). Each value is a record holding the
# Keras History object together with the settings that produced it.
# (The previous `histories += {...}` raised TypeError: dicts do not support +=.)
histories = {}

for batch in batch_sizes:
    for optimizer in optimizers:
        # Drop state left over from the previous model before building the
        # next one, so the 12 models do not accumulate in memory.
        tf.keras.backend.clear_session()
        model = _build_deep_cnn(optimizer)
        histories[(batch, optimizer)] = {
            'history': model.fit(
                X_train, y_train,
                validation_split=0.1,
                epochs=EPOCHS,
                batch_size=batch,
                verbose=1,
            ),
            'batch': batch,
            'optimizer': optimizer,
        }