In [1]:
import os
import re
from tqdm import tqdm
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, SimpleRNN
from keras.layers import Conv1D, GlobalMaxPooling1D
from nltk.tokenize import TreebankWordTokenizer
from random import shuffle

In [2]:
# Directories of the train and test splits inside the aclImdb dataset folder.
train_filepath = 'aclImdb/train/'
# NOTE(review): not referenced again in the visible cells -- confirm it is still needed.
test_filepath = 'aclImdb/test/'

In [3]:
# # Vocabulary: all words used, ordered from most to least frequent
# with open('aclImdb/imdb.vocab') as f:
#     vocab = [word.rstrip() for word in f]
#     # Keep only most frequent 5000 words rather than all 90000
#     # Just saving memory - the long tail occurs too few times
#     # for the model to learn anything anyway
#     vocab = vocab[:5000]
#     print('%d words in vocabulary' % (len(vocab),))

In [4]:
# files = []
# dirs = []
# for (dir_path, dir_names, file_names) in os.walk('aclImdb'):
#     dirs.extend(dir_names)
#     files.extend(file_names)
# print(dirs)
# print(len(files))

In [5]:
# dir_path = 'aclImdb/train/pos'

# files = []
# folders = []

# # Iterate directory
# for path in os.listdir(dir_path):
#     # check if current path is a file
#     if os.path.isfile(os.path.join(dir_path, path)):
#         files.append(path)
#     else:
#         folders.append(path)
# print(files)
# print(folders)

In [6]:
# !    wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !    tar xfz aclImdb_v1.tar.gz

In [7]:
def pre_process_data(filepath, paths=('aclImdb/train/', 'aclImdb/test/')):
    """Load labelled reviews from disk and return them in random order.

    For every directory in ``paths`` the regular files directly inside its
    ``pos`` and ``neg`` sub-directories are read in full.

    Args:
        filepath: Accepted for backward compatibility but not used; the
            directories that are read come from ``paths``.
            NOTE(review): callers pass a single split directory here, yet
            every directory in ``paths`` is loaded -- confirm this is the
            intended behaviour.
        paths: Dataset directories to read. The default is the pair of
            directories (train and test) this function has always loaded.

    Returns:
        A list of ``(label, text)`` tuples, with label 1 for files under
        ``pos`` and 0 for files under ``neg``, shuffled with
        ``random.shuffle``.
    """
    labelled_subdirs = (('pos', 1), ('neg', 0))
    dataset = []

    for split_dir in paths:
        for subdir, label in labelled_subdirs:
            review_dir = os.path.join(split_dir, subdir)
            for filename in os.listdir(review_dir):
                review_path = os.path.join(review_dir, filename)
                # Skip anything that is not a regular file (e.g. nested folders).
                if not os.path.isfile(review_path):
                    continue
                # Decode explicitly instead of relying on the platform's
                # default encoding (assumes the review files are UTF-8 --
                # TODO confirm).
                with open(review_path, 'r', encoding='utf-8') as f:
                    dataset.append((label, f.read()))

    shuffle(dataset)

    return dataset

# Build the shuffled list of (label, text) samples.
# NOTE(review): pre_process_data does not use the path passed here; it reads
# its own list of dataset directories (train and test), so `dataset` holds
# both splits -- confirm that is intended.
dataset = pre_process_data(train_filepath)

In [8]:
# ! wget https://nlp.stanford.edu/data/glove.6B.zip
# ! unzip glove.6B.zip

--2022-07-13 05:56:03--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-07-13 05:56:04--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2022-07-13 05:58:46 (5.08 MB/s) - ‘glove.6B.zip.1’ saved [862182613/862182613]

Archive:  glove.6B.zip
replace glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace glove.6B.100d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace glove.6B.200d.txt? [y

In [9]:
# Build a dictionary that maps every word in the pre-trained GloVe file to its
# embedding vector (a float32 numpy array).
EMBEDDING_DIM = 50

GLOVE_DIR = "."
word_vectors = {}
# `with` closes the file even if a line fails to parse. The encoding is given
# explicitly so reading does not depend on the platform default (assumes the
# GloVe text file is UTF-8 -- TODO confirm).
with open(os.path.join(GLOVE_DIR, 'glove.6B.%dd.txt' % EMBEDDING_DIM),
          encoding='utf-8') as f:
    for line in tqdm(f):
        # Each line is "<word> <coef_1> ... <coef_n>", separated by whitespace.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_vectors[word] = coefs

print('Found %s word vectors.' % len(word_vectors))

400000it [00:05, 77230.89it/s]

Found 400000 word vectors.





In [10]:
def tokenize_and_vectorize(dataset):
    """Convert the text of every sample into a list of word vectors.

    Args:
        dataset: Iterable of samples whose element at index 1 is the text
            to tokenize (index 0 is not read here).

    Returns:
        A list with one entry per sample, in dataset order. Each entry is
        the list of ``word_vectors`` values for the sample's tokens, in
        token order; tokens that are not keys of the module-level
        ``word_vectors`` mapping are skipped.
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        # Tokens without an embedding are dropped on purpose.
        # NOTE(review): the lookup is case-sensitive; if the embedding
        # vocabulary is lowercase-only (glove.6B is documented as uncased --
        # confirm), capitalised tokens are silently discarded here.
        sample_vecs = [word_vectors[token] for token in tokens
                       if token in word_vectors]
        vectorized_data.append(sample_vecs)

    return vectorized_data

In [11]:
def collect_expected(dataset):
    """Return the label (element 0) of every sample, in dataset order."""
    return [sample[0] for sample in dataset]

In [12]:
# Turn the reviews into lists of word vectors and pull out the matching
# labels. Both helpers walk `dataset` in order and emit one entry per sample,
# so index i of `vectorized_data` pairs with index i of `expected`.
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

In [13]:
# Cut points for a 60 % / 20 % / 20 % train / validation / test split.
split_point_train, split_point_val = (
    int(len(vectorized_data) * fraction) for fraction in (.6, .8)
)

# Features and labels are sliced at the same indices so every x keeps its y.
x_train, y_train = vectorized_data[:split_point_train], expected[:split_point_train]
x_val, y_val = (vectorized_data[split_point_train:split_point_val],
                expected[split_point_train:split_point_val])
x_test, y_test = vectorized_data[split_point_val:], expected[split_point_val:]

In [14]:
# Training hyper-parameters.
# Number of word vectors kept per review (target length for pad_trunc).
maxlen = 400
batch_size = 8
# Reuse the dimensionality the GloVe vectors were loaded with, so the reshape
# and the model's input_shape cannot drift out of sync with the embeddings.
embedding_dims = EMBEDDING_DIM
epochs = 20

In [15]:
def pad_trunc(data, maxlen):
    """Return a copy of ``data`` in which every sample has ``maxlen`` vectors.

    Samples longer than ``maxlen`` are cut after their first ``maxlen``
    vectors; shorter samples are extended at the end with all-zero vectors.
    The input samples themselves are never modified.

    Args:
        data: Sequence of samples, each a list of word vectors. All vectors
            are expected to have the same length.
        maxlen: Number of vectors every returned sample must contain.

    Returns:
        A new list holding one new list per sample. All padding entries
        refer to a single shared zero vector (a list of ``0.0`` floats).

    Raises:
        ValueError: If a sample needs padding but no sample contains a
            vector from which the vector length can be taken.
    """
    # Take the vector length from the first sample that has any vectors, so an
    # empty first sample (or empty data) no longer breaks the lookup.
    vector_len = next(
        (len(sample[0]) for sample in data if len(sample) > 0), None)
    zero_vector = None if vector_len is None else [0.0] * vector_len

    new_data = []
    for sample in data:
        # Slicing copies the list, so padding below never touches the input.
        fitted = list(sample[:maxlen])
        missing = maxlen - len(fitted)
        if missing > 0:
            if zero_vector is None:
                raise ValueError(
                    'cannot pad: no sample contains a vector to take the '
                    'vector length from')
            fitted.extend([zero_vector] * missing)
        new_data.append(fitted)
    return new_data

In [16]:
# Bring every sample in each split to exactly `maxlen` vectors.
x_train = pad_trunc(x_train, maxlen)
x_val = pad_trunc(x_val, maxlen)
x_test = pad_trunc(x_test, maxlen)

# Convert to arrays shaped (num_samples, maxlen, embedding_dims), which is the
# per-sample input_shape the model is built with.
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)

# x_test is padded above but only converted to an array later, right before
# prediction.
x_val = np.reshape(x_val, (len(x_val), maxlen, embedding_dims))
y_val = np.array(y_val)

In [17]:
# Number of units in the recurrent layer.
num_neurons = 25
model = Sequential()
# return_sequences=True makes the RNN emit its output at every time step,
# i.e. (maxlen, num_neurons) per sample, instead of only the last step.
model.add(SimpleRNN(
    num_neurons, return_sequences=True,
    input_shape=(maxlen, embedding_dims)))
model.add(Dropout(.2))
# Flatten the per-time-step outputs into one vector for the dense classifier.
model.add(Flatten())
# Single sigmoid unit: one value per review for the binary label.
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 400, 25)           1900      
                                                                 
 dropout (Dropout)           (None, 400, 25)           0         
                                                                 
 flatten (Flatten)           (None, 10000)             0         
                                                                 
 dense (Dense)               (None, 1)                 10001     
                                                                 
Total params: 11,901
Trainable params: 11,901
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train on the training split and report metrics on the validation split.
# The returned History object is the cell's last expression and is shown as
# the cell output.
model.fit(x_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa13372e850>

In [None]:
# Drop the references to the training and validation arrays (presumably to
# free memory before the test split is converted to an array).
x_train = y_train = None
x_val = y_val = None

In [None]:
# Convert the already padded test split to an array shaped
# (num_samples, maxlen, embedding_dims), and its labels to an array.
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

In [None]:
# Run the trained model over the test split and keep its raw predictions.
print("Evaluate on test data")
predictions = model.predict(x_test)

# Give the labels one row per prediction with a single column, so they can be
# stacked with the predictions in one array.
y_test = np.reshape(y_test, (len(predictions), 1))

# Colab-only: mount Google Drive to persist the results.
from google.colab import drive
drive.mount('/drive')

# Predictions and labels are stored together in one .npy file whose name
# encodes the hyper-parameters of this run.
filename = f"/drive/My Drive/science/ml={maxlen}__n={num_neurons}.npy"
with open(filename, "wb") as f:
    np.save(f, np.array([predictions, y_test]))

drive.flush_and_unmount()

In [None]:
# Imported here as well so this cell does not depend on another cell having
# brought `drive` into scope.
from google.colab import drive

drive.mount('/drive')

# Architecture: serialised to JSON, named after the run's hyper-parameters.
model_structure = model.to_json()
filename = f"/drive/My Drive/science/rnn_model_ml={maxlen}__n={num_neurons}.json"

with open(filename, "w") as json_file:
    json_file.write(model_structure)

# Weights: stored on Drive next to the architecture file and tagged with the
# same hyper-parameters, so the two files can be paired when reloading.
weights_filename = f"/drive/My Drive/science/rnn_weights_ml={maxlen}__n={num_neurons}.h5"
model.save_weights(weights_filename)
drive.flush_and_unmount()