In [0]:
import numpy as np
import string
import collections
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras import preprocessing
from sklearn.model_selection import train_test_split

from google.colab import drive
drive.mount('/content/drive/')

baseFilepath = '/content/drive/My Drive/School/COMP551/Assignment4/'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Sample LSTM on Keras' IMDB Dataset

In [0]:
# LSTM and CNN with Dropout for sequence classification in the IMDB dataset
# fix random seed for reproducibility
np.random.seed(7)

# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# truncate and pad input sequences
max_review_length = 500
X_train = preprocessing.sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = preprocessing.sequence.pad_sequences(X_test, maxlen=max_review_length)

# create the model
embedding_vecor_length = 32
model_imdb = Sequential()
model_imdb.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model_imdb.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model_imdb.add(MaxPooling1D(pool_size=2))
model_imdb.add(Dropout(0.2))
model_imdb.add(LSTM(100))
model_imdb.add(Dropout(0.2))
model_imdb.add(Dense(1, activation='sigmoid'))
model_imdb.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_imdb.summary())

# train the model
model_imdb.fit(X_train, y_train, epochs=3, batch_size=64)

# Final evaluation of the model
scores = model_imdb.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 250, 32)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_6 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                

#  Data Preparation

In [0]:
# LSTM and CNN with Dropout for sequence classification of the project dataset
pos_examples = open(baseFilepath + 'rt-polarity.pos', encoding='ISO-8859-1').readlines()
neg_examples = open(baseFilepath + 'rt-polarity.neg', encoding='ISO-8859-1').readlines()

vocab = collections.defaultdict(int)
translator = str.maketrans('', '', string.punctuation)

# Tokenize all strings
token_pos = list(map(lambda ex: word_tokenize(ex.translate(translator)), pos_examples))
token_neg = list(map(lambda ex: word_tokenize(ex.translate(translator)), neg_examples))

# Get vocabulary size and max sentence length
max_sentence_length = 0
for ex_p, ex_n in zip(token_pos, token_neg):
    max_sentence_length = max(max_sentence_length, len(ex_p), len(ex_n))
    for word in ex_p:
        vocab[word] += 1
    for word in ex_n:
        vocab[word] += 1
        
vocab_size = len(vocab)
print("Vocabulary size: " + str(vocab_size))
print("Max sentence length: " + str(max_sentence_length))

# One-Hot encode and pad.
encoded_pos = [preprocessing.text.one_hot(ex, vocab_size) for ex in pos_examples]
padded_pos = preprocessing.sequence.pad_sequences(encoded_pos, maxlen=max_sentence_length, padding='post')
encoded_neg = [preprocessing.text.one_hot(ex, vocab_size) for ex in neg_examples]
padded_neg = preprocessing.sequence.pad_sequences(encoded_neg, maxlen=max_sentence_length, padding='post')

# Create train-test split
X = np.concatenate((padded_pos, padded_neg))
y = np.concatenate((np.ones(padded_pos.shape[0]), np.zeros(padded_neg.shape[0])))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Vocabulary size: 20490
Max sentence length: 51


# Barebones LSTM

In [0]:
# create the model
embedding_vector_length = 128
model_reproduce_base = Sequential()
model_reproduce_base.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_sentence_length, embeddings_initializer='random_uniform'))
model_reproduce_base.add(LSTM(100))
model_reproduce_base.add(Dense(1, activation='sigmoid'))
model_reproduce_base.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_reproduce_base.summary())

# train the model
model_reproduce_base.fit(X_train, y_train, epochs=3, batch_size=64)

# Final evaluation of the model
scores = model_reproduce_base.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 51, 32)            655680    
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total params: 708,981
Trainable params: 708,981
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 69.76%


# LSTM with Dropout

In [0]:
# create the model
embedding_vector_length = 128
model_reproduce_mid = Sequential()
model_reproduce_mid.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_sentence_length, embeddings_initializer='random_uniform'))
model_reproduce_mid.add(Dropout(0.2))
model_reproduce_mid.add(LSTM(100))
model_reproduce_mid.add(Dropout(0.2))
model_reproduce_mid.add(Dense(1, activation='sigmoid'))
model_reproduce_mid.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_reproduce_mid.summary())

# train the model
model_reproduce_mid.fit(X_train, y_train, epochs=3, batch_size=64)

# Final evaluation of the model
scores = model_reproduce_mid.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 51, 32)            655680    
_________________________________________________________________
dropout_9 (Dropout)          (None, 51, 32)            0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_10 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 101       
Total params: 708,981
Trainable params: 708,981
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 49.84%


# LSTM with CNN and Dropout

In [0]:
# create the model
embedding_vector_length = 128
model_reproduce_complex = Sequential()
model_reproduce_complex.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_sentence_length, embeddings_initializer='random_uniform'))
model_reproduce_complex.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model_reproduce_complex.add(MaxPooling1D(pool_size=2))
model_reproduce_complex.add(Dropout(0.2))
model_reproduce_complex.add(LSTM(100))
model_reproduce_complex.add(Dropout(0.2))
model_reproduce_complex.add(Dense(1, activation='sigmoid'))
model_reproduce_complex.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_reproduce_complex.summary())

# train the model
model_reproduce_complex.fit(X_train, y_train, epochs=3, batch_size=64)

# Final evaluation of the model
scores = model_reproduce_complex.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 51, 32)            655680    
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 51, 32)            3104      
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 25, 32)            0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 25, 32)            0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_12 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                