# DIT Natural Language Processing lesson 2025

## Recurrent neural networks

In this lesson, we build a recurrent neural network (RNN) to treat text as a sequence of words.

In [None]:
! pip install gensim

In [None]:
# Importing the dependencies
import glob
import numpy as np
import os
import tarfile

from gensim.models.keyedvectors import KeyedVectors

from keras.models import Sequential
from keras.layers import Dense, Dropout, Input, Flatten, SimpleRNN

from nltk.tokenize import TreebankWordTokenizer
from random import shuffle
from tqdm.auto import tqdm  # to show a smart progress meter
from urllib import request

In [None]:
# Remote location and local file name of the corpus archive
PATH_TO_CORPUS = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
CORPUS_FILE_NAME = "aclImdb_v1.tar.gz"

# Remote location and local file name of the pre-trained word vectors
PATH_TO_GOOGLENEWS_VECTORS ="https://www.dropbox.com/s/965dir4dje0hfi4/GoogleNews-vectors-negative300.bin.gz?dl=1"
GOOGLE_VECTORS = "GoogleNews-vectors-negative300.bin.gz"

# Folder (relative to the working directory) read by pre_process_data;
# it is expected to exist once the corpus archive has been extracted
CORPUS_PATH = "aclImdb/train"

def download_file(url_to_file, path_to_file):
  """
  Download `url_to_file` into `path_to_file` unless a local copy exists.

  The data is first written to `<path_to_file>.part` and only renamed to
  its final name once the transfer has finished. This way an interrupted
  download never leaves a truncated file that a later run would mistake
  for a complete local copy.

  Parameters
  ----------
  url_to_file : str
      URL of the resource to fetch.
  path_to_file : str or os.PathLike
      Local destination of the downloaded file.

  Raises
  ------
  Whatever `urllib.request.urlretrieve` raises (e.g. `URLError`); the
  partial file is removed before the exception is propagated.
  """
  if os.path.isfile(path_to_file):
    print("A local copy of the file exists already:", path_to_file, "\nDoing nothing")
    return

  partial_path = os.fspath(path_to_file) + ".part"
  try:
    request.urlretrieve(url_to_file, partial_path)
    # Atomic rename: the final name only ever points to a complete file
    os.replace(partial_path, path_to_file)
  except BaseException:
    # Also covers KeyboardInterrupt (interrupting the cell in a notebook)
    if os.path.exists(partial_path):
      os.remove(partial_path)
    raise

In [None]:
# Downloading the embeddings

download_file(PATH_TO_GOOGLENEWS_VECTORS, GOOGLE_VECTORS)

# Downloading and untaring the corpus

download_file(PATH_TO_CORPUS, CORPUS_FILE_NAME)
with tarfile.open(CORPUS_FILE_NAME) as f:
  # The archive comes from the network: when the running Python supports
  # extraction filters, use the "data" filter so that members cannot escape
  # the destination folder (absolute paths, "..", unsafe links).
  if hasattr(tarfile, "data_filter"):
    f.extractall(path=".", filter="data")
  else:
    f.extractall(path=".")

# # Add the paths to the corpus. It should end in aclImdb/train
# CORPUS_PATH = "aclImdb/train"
# # Add the path to the embeddings. It should end in GoogleNews-vectors-negative300.bin.gz
# GOOGLE_VECTORS = "GoogleNews-vectors-negative300.bin.gz"

**Note**: I am using the same functions as in the previous session.
I could simply store them all in a .py file and import them, as
we do with the libraries.

In [None]:
# Loading the embeddings
# The file is read in binary word2vec format; `limit` is passed to the
# loader to cap the number of vectors read (see the gensim documentation)
word_vectors = KeyedVectors.load_word2vec_format(GOOGLE_VECTORS,
    binary=True, limit=400000)

In [None]:
# Data preprocessor
def pre_process_data(filepath):
    """
    Load pos and neg examples from separate dirs then shuffle them
    together.

    Parameters
    ----------
    filepath : str
        Folder containing a `pos` and a `neg` sub-folder with one `.txt`
        file per review.

    Returns
    -------
    list of (int, str)
        One `(label, text)` pair per file, with label 1 for `pos` and 0
        for `neg`, in random order.
    """
    dataset = []
    # Positive examples are collected first, then negative ones; the final
    # shuffle mixes both classes
    for label, subfolder in ((1, 'pos'), (0, 'neg')):
        pattern = os.path.join(filepath, subfolder, '*.txt')
        for filename in glob.glob(pattern):
            # Explicit encoding: the reviews are UTF-8 text and must not be
            # decoded with whatever the platform default happens to be
            with open(filename, 'r', encoding='utf-8') as f:
                dataset.append((label, f.read()))
    shuffle(dataset)
    return dataset


In [None]:
# Tokenizing and vectorizing all the instances
def tokenize_and_vectorize(dataset):
    """
    Convert the text of every sample into a list of word vectors.

    Each sample is expected to hold its text at index 1. The text is split
    with a `TreebankWordTokenizer` and every token is looked up in the
    module-level `word_vectors`; tokens whose lookup raises `KeyError` are
    skipped. Returns one list of vectors per sample, in the input order.
    """
    treebank = TreebankWordTokenizer()

    def embed(text):
        """Return the vectors of the tokens of `text` that can be looked up."""
        vectors = []
        for word in treebank.tokenize(text):
            try:
                vectors.append(word_vectors[word])
            except KeyError:
                # No vector available for this token: leave it out
                continue
        return vectors

    return [embed(sample[1]) for sample in tqdm(dataset)]

In [None]:
# Extracting the expected output for all the instances
def collect_expected(dataset):
  """
  Return the list with the first element (the target value) of every
  sample in the dataset, in the same order.
  """
  targets = []
  for sample in dataset:
    targets.append(sample[0])
  return targets

In [None]:
# Data preparation
# `dataset` is a shuffled list of (label, text) pairs; the vectors and the
# labels are derived from the same list, so they stay aligned by position
dataset = pre_process_data(CORPUS_PATH)
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

# Define training and validation data (even if it's called test here)
# First 80% of the (already shuffled) instances for training, the rest for validation
split_point = int(len(vectorized_data) * .8)

x_train = vectorized_data[:split_point]
y_train = expected[:split_point]

x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

In [None]:
# Network parameters
maxlen = 400            # number of time steps per instance after padding/truncating
batch_size = 32         # instances per batch produced by the collator
embedding_dims = 300    # size of each word vector (must match the loaded embeddings)
epochs = 2              # number of passes over the training data
num_neurons = 50        # number of units in the SimpleRNN layer

In [None]:
# Padding and truncating the input is not strictly necessary for RNNs; we
# do it anyway because of the recursion strategy we are applying here and
# to have a fair comparison against the CNN (same input length)
class Collator:
  """
  Turn variable-length lists of word vectors into fixed-size batches.

  Parameters
  ----------
  maxlen : int
      Number of time steps of every instance after padding/truncating.
  batch_size : int
      Number of instances per batch yielded by `collate`.
  embedding_dims : int, optional
      Size of each word vector. When omitted, it is inferred from the
      first non-empty instance of each batch.
  """
  def __init__(self,
                maxlen,
                batch_size,
                embedding_dims = None,
                ) -> None:
    self.maxlen = maxlen
    self.batch_size = batch_size
    self.embedding_dims = embedding_dims

  def _vector_size(self, x):
    """
    Return the size of the word vectors: the configured `embedding_dims`
    or, if not set, the length of the first vector found in `x`.
    """
    configured = getattr(self, "embedding_dims", None)
    if configured is not None:
      return configured
    for tokens in x:
      if len(tokens) > 0:
        return len(tokens[0])
    raise ValueError(
      "Cannot infer the vector size: every instance is empty. "
      "Pass embedding_dims when creating the Collator."
    )

  def padding_and_truncating(self, x, y):
    """
    Add zeros at the end of the representation for short instances,
    truncate longer ones to the maxlen

    Instances without any vector (e.g. all their tokens were out of
    vocabulary) are kept as all-zero rows.

    Returns a tuple `(X, Y)`: `X` is an array of shape
    `(len(x), maxlen, vector size)` and `Y` is `y` as an array.

    Raises `ValueError` if the vector size is neither configured nor
    inferable because all the instances are empty.
    """
    # Do not rely on x[0][0]: the first instance of a batch may be empty
    vec_dim = self._vector_size(x)
    N = len(x)
    X = np.zeros((N, self.maxlen, vec_dim))  # preallocate a np array
    Y = np.array(y)

    for i, tokens in enumerate(x):
      length = min(len(tokens), self.maxlen)
      if length > 0:
        X[i, :length] = np.asarray(tokens[:length])  # fill the np array
    return X, Y

  def collate(self, X, Y, N, epochs = 1):
    """
    This method is used to feed batches into the `model.fit` method

    It yields `(x_batch, y_batch)` pairs covering the first `N` instances
    in consecutive slices of `batch_size` (the last slice of a pass may be
    smaller), and repeats the whole pass `epochs` times.
    """
    # The outer loop is here because the inner `for` needs to run once per
    # epoch: each time it starts, the batches are produced again from the
    # beginning, so the generator is replenished for the new epoch
    for _ in range(epochs):
      for start in range(0, N, self.batch_size):
        end = start + self.batch_size
        x_batch, y_batch = self.padding_and_truncating(
          X[start:end],
          Y[start:end]
          )
        yield x_batch, y_batch

In [None]:
# One collator shared by training, validation and prediction, so all the
# inputs are padded/truncated to the same length
collator = Collator(maxlen = maxlen, batch_size = batch_size)

In [None]:
# Initializing the (empty) network
# Layers are appended one by one with model.add in the next cells
model = Sequential()

In [None]:
# Adding one recurrent layer

# In previous versions of keras (and in the book), the input
# shape was defined as an argument to SimpleRNN. That way
# still works, but adding an Input instead is advised
model.add(Input([maxlen, embedding_dims]))
# return_sequences=True: the layer is asked to output its state at every
# time step instead of only the last one
model.add(SimpleRNN(
    num_neurons,
    return_sequences=True,
    # input_shape=(maxlen, embedding_dims)
    )
  )

Homework: experiment with return_sequences=False and compare the results

**Back to the slides**

In [None]:
# Adding a dropout layer (remember why?)
# 0.2 is the dropout rate passed to the layer
model.add(Dropout(.2))

# Adding a flattening layer
# It turns the per-time-step outputs of the RNN into one vector per instance
model.add(Flatten())

# Adding the classifier
# A single sigmoid unit: the output is one value between 0 and 1 per instance
model.add(Dense(1, activation='sigmoid'))

**Flatten?** back to the slides

In [None]:
# Compiling the network
# Positional arguments: optimizer, then loss
model.compile('rmsprop',
              'binary_crossentropy',
              metrics=['accuracy'])
model.summary()

# 37,551 parameters!
# With the settings above: SimpleRNN 50 * (300 + 50 + 1) = 17,550
# and Dense 400 * 50 + 1 = 20,001

In [None]:
# Training the network

# model.fit(x_train, y_train,
#     batch_size=batch_size,
#     epochs=epochs,
#     validation_data=(x_test, y_test))

# The collator yields ceil(N / batch_size) batches per pass (the last one
# may be smaller). The number of steps must match that figure: with floor
# division the trailing partial batch is skipped and, since the generator
# keeps going, every later epoch would start one batch off.
train_steps = int(np.ceil(len(x_train) / batch_size))
val_steps = int(np.ceil(len(x_test) / batch_size))

model.fit(
    collator.collate(x_train, y_train, N = len(x_train), epochs=epochs),
    steps_per_epoch=train_steps,
    validation_data=collator.collate(x_test, y_test, N = len(x_test), epochs=epochs),
    validation_steps=val_steps,
    epochs=epochs,
    )

**Back to the slides**

In [None]:
# Building a bigger network
# Same architecture as before, only the number of recurrent units changes
num_neurons = 100
model = Sequential()
# Declare the input with an Input layer, as in the first network, instead
# of the legacy `input_shape` argument of SimpleRNN
model.add(Input(shape=(maxlen, embedding_dims)))
model.add(SimpleRNN(
    num_neurons,
    return_sequences=True,
    )
  )
model.add(Dropout(.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Training
# model.fit(x_train,
#     y_train,
#     batch_size=batch_size,
#     epochs=epochs,
#     validation_data=(x_test, y_test)
#      )

# The collator yields ceil(N / batch_size) batches per pass (the last one
# may be smaller). The number of steps must match that figure: with floor
# division the trailing partial batch is skipped and, since the generator
# keeps going, every later epoch would start one batch off.
train_steps = int(np.ceil(len(x_train) / batch_size))
val_steps = int(np.ceil(len(x_test) / batch_size))

model.fit(
    collator.collate(x_train, y_train, N = len(x_train), epochs=epochs),
    steps_per_epoch=train_steps,
    validation_data=collator.collate(x_test, y_test, N = len(x_test), epochs=epochs),
    validation_steps=val_steps,
    epochs=epochs,
    )

The improvement is tiny*$\rightarrow$ perhaps the network is too complex.

\* (depending on the random initialisation, it could even be worse!)

Homework: try with 25 neurons

In [None]:
# Saving the network
# The architecture (as JSON) and the weights are stored in separate files
model_structure = model.to_json()
with open("simplernn_model2.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("simplernn2.weights.h5")

Predicting on new instances

In [None]:
# Notice we have both positive and negative words here
sample_negative = """I hate that the dismal weather had me down for so long,
when will it break! Ugh, when does happiness return? The sun is
blinding and the puffy clouds are too thin. I can't wait for the weekend."""
# Super positive sample
sample_positive = """I love that incredible weather!
I feel like this happiness will last forever!
The sun is super nice and the breeze is soothing.
I could stay like this forever.
"""

# The first value is a "fake" class (this is the expected input)
# tokenize_and_vectorize reads the text at index 1 and collect_expected the
# label at index 0, so new instances need the same (label, text) layout
data_dummy = [(0, sample_negative), (0, sample_positive)]
x_dummy = tokenize_and_vectorize(data_dummy)
y_dummy = collect_expected(data_dummy)


# The instances are padded/truncated with the same collator used for
# training before being passed to the model
x_dummy_padded, y_dummy = collator.padding_and_truncating(x_dummy, y_dummy)
preds = model.predict(x_dummy_padded)
print(preds)

In [None]:
# Get the class by thresholding the predictions at 0.5
# (the output layer is a sigmoid, so these are values in [0, 1], not logits)
(preds > 0.5).astype("int32")

**End of the notebook**