In [11]:
import numpy as np
import string
import collections
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import gensim
import pickle

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras import preprocessing
from sklearn.model_selection import train_test_split

from google.colab import drive
drive.mount('/content/drive/')

baseFilepath = '/content/drive/My Drive/School/COMP551/Assignment4/'
epochNum = 3
batchSize = 64

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Sample LSTM on Keras' IMDB Dataset

In [0]:
# LSTM and CNN with Dropout for sequence classification in the IMDB dataset
# fix random seed for reproducibility
np.random.seed(7)

# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# truncate and pad input sequences
max_review_length = 500
X_train = preprocessing.sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = preprocessing.sequence.pad_sequences(X_test, maxlen=max_review_length)

# create the model
embedding_vecor_length = 32
model_imdb = Sequential()
model_imdb.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model_imdb.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model_imdb.add(MaxPooling1D(pool_size=2))
model_imdb.add(Dropout(0.2))
model_imdb.add(LSTM(100))
model_imdb.add(Dropout(0.2))
model_imdb.add(Dense(1, activation='sigmoid'))
model_imdb.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_imdb.summary())

# train the model
model_imdb.fit(X_train, y_train, epochs=epochNum, batch_size=batchSize)

# Final evaluation of the model
scores = model_imdb.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

#  Data Preparation (Non-Word2Vec)

In [4]:
# file load
pos_examples = open(baseFilepath + 'rt-polarity.pos', encoding='ISO-8859-1').readlines()
neg_examples = open(baseFilepath + 'rt-polarity.neg', encoding='ISO-8859-1').readlines()

vocab = collections.defaultdict(int)
translator = str.maketrans('', '', string.punctuation)

# Tokenize all strings
token_pos = list(map(lambda ex: word_tokenize(ex.translate(translator)), pos_examples))
token_neg = list(map(lambda ex: word_tokenize(ex.translate(translator)), neg_examples))

# Get vocabulary size and max sentence length
max_sentence_length = 0
for ex_p, ex_n in zip(token_pos, token_neg):
    max_sentence_length = max(max_sentence_length, len(ex_p), len(ex_n))
    for word in ex_p:
        vocab[word] += 1
    for word in ex_n:
        vocab[word] += 1
        
vocab_size = len(vocab)
print("Vocabulary size: " + str(vocab_size))
print("Max sentence length: " + str(max_sentence_length))

# One-Hot encode and pad.
encoded_pos = [preprocessing.text.one_hot(ex, vocab_size) for ex in pos_examples]
padded_pos = preprocessing.sequence.pad_sequences(encoded_pos, maxlen=max_sentence_length, padding='post')
encoded_neg = [preprocessing.text.one_hot(ex, vocab_size) for ex in neg_examples]
padded_neg = preprocessing.sequence.pad_sequences(encoded_neg, maxlen=max_sentence_length, padding='post')

# Create train-test split
X = np.concatenate((padded_pos, padded_neg))
y = np.concatenate((np.ones(padded_pos.shape[0]), np.zeros(padded_neg.shape[0])))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Vocabulary size: 20490
Max sentence length: 51


# Barebones LSTM

In [15]:
# create the model
embedding_vector_length = 128
model_reproduce_base = Sequential()
model_reproduce_base.add(Embedding(vocab_size, embedding_vector_length, input_length=max_sentence_length, embeddings_initializer='random_uniform'))
model_reproduce_base.add(LSTM(100))
model_reproduce_base.add(Dense(1, activation='relu'))
model_reproduce_base.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_reproduce_base.summary())

# train the model
model_reproduce_base.fit(X_train, y_train, epochs=epochNum, batch_size=batchSize)

# Final evaluation of the model
scores = model_reproduce_base.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

# 75.34

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 51, 128)           2622720   
_________________________________________________________________
lstm_10 (LSTM)               (None, 100)               91600     
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 101       
Total params: 2,714,421
Trainable params: 2,714,421
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 74.82%


# LSTM with Dropout

In [0]:
# create the model
embedding_vector_length = 128
model_reproduce_mid = Sequential()
model_reproduce_mid.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_sentence_length, embeddings_initializer='random_uniform'))
model_reproduce_mid.add(Dropout(0.2))
model_reproduce_mid.add(LSTM(100))
model_reproduce_mid.add(Dropout(0.2))
model_reproduce_mid.add(Dense(1, activation='sigmoid'))
model_reproduce_mid.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_reproduce_mid.summary())

# train the model
model_reproduce_mid.fit(X_train, y_train, epochs=epochNum, batch_size=batchSize)

# Final evaluation of the model
scores = model_reproduce_mid.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

# LSTM with CNN and Dropout

In [0]:
# create the model
embedding_vector_length = 128
model_reproduce_complex = Sequential()
model_reproduce_complex.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_sentence_length, embeddings_initializer='random_uniform'))
model_reproduce_complex.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model_reproduce_complex.add(MaxPooling1D(pool_size=2))
model_reproduce_complex.add(Dropout(0.2))
model_reproduce_complex.add(LSTM(100))
model_reproduce_complex.add(Dropout(0.2))
model_reproduce_complex.add(Dense(1, activation='sigmoid'))
model_reproduce_complex.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_reproduce_complex.summary())

# train the model
model_reproduce_complex.fit(X_train, y_train, epochs=epochNum, batch_size=batchSize)

# Final evaluation of the model
scores = model_reproduce_complex.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

# Data Preparation (Word2Vec Embeddings)
Note that *mr.p* pickle file was generated in OriginalCode.ipynb from the same data.

In [0]:
def get_idx_from_sent(sent, word_idx_map, max_l=51, k=300, filter_h=5):
    """
    Transforms sentence into a list of indices. Pad with zeroes.
    """
    x = []
    pad = filter_h - 1
    for i in range(pad):
        x.append(0)
    words = sent.split()
    for word in words:
        if word in word_idx_map:
            x.append(word_idx_map[word])
    while len(x) < max_l+2*pad:
        x.append(0)
    return x

def make_idx_data_cv(revs, word_idx_map, cv, max_l=51, k=300, filter_h=5):
    """
    Transforms sentences into a 2-d matrix.
    """
    train, test = [], []
    for rev in revs:
        sent = get_idx_from_sent(rev["text"], word_idx_map, max_l, k, filter_h)   
        sent.append(rev["y"])
        if rev["split"]==cv:            
            test.append(sent)        
        else:
            train.append(sent)
    train = np.array(train,dtype="int")
    test = np.array(test,dtype="int")
    return [train, test]

x = pickle.load(open(baseFilepath + "OriginalCode/mr.p","rb"), encoding='latin1')
revs, W, W2, word_idx_map, vocabNew = x[0], x[1], x[2], x[3], x[4]
kFoldData = []
for i in range(10):
  kFoldData.append(make_idx_data_cv(revs, word_idx_map, i, max_l=56,k=300, filter_h=5))

# for simplification, only use the first fold
X_train, y_train, X_test, y_test = [], [], [], []
for i in range(kFoldData[0][0].shape[0]):
  X_train.append(kFoldData[0][0][i][:-1])
  y_train.append(kFoldData[0][0][i][-1])
for i in range(kFoldData[0][1].shape[0]):
  X_test.append(kFoldData[0][1][i][:-1])
  y_test.append(kFoldData[0][1][i][-1])
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)
X_test = np.asarray(X_test)
y_test = np.asarray(y_test)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Barebones LSTM (on Word2Vec-encoded data)

In [71]:
# create the model
embedding_vector_length = X_train.shape[1]
model_reproduce_base = Sequential()
model_reproduce_base.add(Embedding(len(vocabNew), embedding_vector_length, input_length=64, embeddings_initializer='random_uniform'))
model_reproduce_base.add(LSTM(100))
model_reproduce_base.add(Dense(1, activation='sigmoid'))
model_reproduce_base.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_reproduce_base.summary())

# train the model
model_reproduce_base.fit(X_train, y_train, epochs=25, batch_size=50)

# Final evaluation of the model
scores = model_reproduce_base.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

(9591, 64) (9591,) (1071, 64) (1071,)
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 64, 64)            1200896   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 1,266,997
Trainable params: 1,266,997
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Accuracy: 48.55%


# LSTM with CNN and Dropout (on Word2Vec data)

In [75]:
# create the model
embedding_vector_length = X_train.shape[1]
model_reproduce_complex = Sequential()
model_reproduce_complex.add(Embedding(len(vocabNew), embedding_vector_length, input_length=64, embeddings_initializer='random_uniform'))
model_reproduce_complex.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model_reproduce_complex.add(MaxPooling1D(pool_size=2))
model_reproduce_complex.add(Dropout(0.2))
model_reproduce_complex.add(LSTM(100))
model_reproduce_complex.add(Dropout(0.2))
model_reproduce_complex.add(Dense(1, activation='sigmoid'))
model_reproduce_complex.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_reproduce_complex.summary())

# train the model
model_reproduce_complex.fit(X_train, y_train, epochs=3, batch_size=batchSize)

# Final evaluation of the model
scores = model_reproduce_complex.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

# 75.91%

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 64, 64)            1200896   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 64, 32)            6176      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 32, 32)            0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 32, 32)            0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_6 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                