# DIT Natural Language Processing Lesson 2025

## CNNs on text

This notebook diverges from the version originally published in *NLP in Action*. The reason is that the original implementation requires far too many resources and, as a result, crashes on the freely available Colab servers.

In [None]:
! pip install gensim

In [None]:
import glob
import numpy as np
import os
import os.path
import tarfile

from gensim.models.keyedvectors import KeyedVectors

from keras.preprocessing import sequence   # necessary for padding
from keras.models import Sequential        # Base Keras NN model
from keras.layers import Conv1D, GlobalMaxPooling1D # Convolution layer and pooling
from keras.layers import Dense, Dropout, Activation # The objects for each layer
from keras.layers import Input

from nltk.tokenize import TreebankWordTokenizer
from psutil import virtual_memory
from random import shuffle
from urllib import request
from tqdm.auto import tqdm  # to show a smart progress meter


In [None]:
# Check how much memory this runtime has.
# NOTE: the figure comes from `virtual_memory().total`, i.e. it is the total
# memory of the runtime, expressed in units of 1e9 bytes.

ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off used here to tell a high-RAM runtime from a regular one
if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

For this exercise, we first download Stanford's [Large Movie Review Dataset](https://ai.stanford.edu/%7eamaas/data/sentiment/aclImdb_v1.tar.gz). More information on the corpus is available in [Learning Word Vectors for Sentiment Analysis](https://ai.stanford.edu/%7eamaas/papers/wvSent_acl2011.pdf). We need pre-trained embeddings as well.

In [None]:
# URL of the movie review corpus archive and the local name it is saved under
PATH_TO_CORPUS = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
CORPUS_FILE_NAME = "aclImdb_v1.tar.gz"

# URL of the pre-trained GoogleNews word2vec vectors and their local file name
PATH_TO_GOOGLENEWS_VECTORS ="https://www.dropbox.com/s/965dir4dje0hfi4/GoogleNews-vectors-negative300.bin.gz?dl=1"
GOOGLE_VECTORS = "GoogleNews-vectors-negative300.bin.gz"

# Folder (relative to the working directory) handed to pre_process_data below
CORPUS_PATH = "aclImdb/train"

def download_file(url_to_file, path_to_file):
    """
    Download a file, unless a local copy exists already.

    The data is first written to ``<path_to_file>.part`` and only renamed to
    ``path_to_file`` once the transfer has finished. This way an interrupted
    download never leaves behind a truncated file that a later call would
    mistake for a complete local copy.

    Args:
        url_to_file: URL of the file to fetch.
        path_to_file: local path under which the file is stored.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the URL cannot be
        retrieved; in that case no file is left at ``path_to_file``.
    """
    if os.path.isfile(path_to_file):
        print("A local copy of the file exists already:", path_to_file, "\nDoing nothing")
        return

    partial_path = os.fspath(path_to_file) + ".part"
    try:
        request.urlretrieve(url_to_file, partial_path)
        # Atomic rename: the final name only ever points to a complete file
        os.replace(partial_path, path_to_file)
    finally:
        # Still present only if the download (or the rename) did not complete
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [None]:
# Downloading the embeddings
# (download_file does nothing if GOOGLE_VECTORS exists locally already)

download_file(PATH_TO_GOOGLENEWS_VECTORS, GOOGLE_VECTORS)

In [None]:
# Downloading and untarring the corpus

download_file(PATH_TO_CORPUS, CORPUS_FILE_NAME)
with tarfile.open(CORPUS_FILE_NAME) as f:
  # The archive comes from the network, so it is extracted with the "data"
  # filter (which refuses, e.g., absolute paths and members that would end up
  # outside the destination) whenever the running Python version provides
  # extraction filters. Older versions fall back to the plain extraction.
  if hasattr(tarfile, "data_filter"):
    f.extractall(path=".", filter="data")
  else:
    f.extractall(path=".")

In [None]:
# A method to read and shuffle all instances (one per file).
# Positive (negative) instances are in the pos (neg) folder

def pre_process_data(filepath):
    """
    Read all the instances under ``filepath`` and return them shuffled.

    Positive instances are read from ``<filepath>/pos`` and negative ones from
    ``<filepath>/neg``; every ``*.txt`` file is one instance.

    Args:
        filepath: folder that contains the ``pos`` and ``neg`` subfolders.

    Returns:
        A shuffled list of ``(label, text)`` tuples, where the label is 1 for
        a positive instance and 0 for a negative one. The list is empty if no
        matching file is found.
    """
    labelled_folders = ((1, 'pos'), (0, 'neg'))
    dataset = []

    for label, folder in labelled_folders:
        # glob.glob returns a list of path names that match the pattern
        for filename in glob.glob(os.path.join(filepath, folder, '*.txt')):
            # Decode explicitly as UTF-8, so that reading the corpus does not
            # depend on the default encoding of the platform
            with open(filename, 'r', encoding='utf-8') as f:
                dataset.append((label, f.read()))

    shuffle(dataset)
    return dataset

In [None]:
# Preprocessing the data: a shuffled list of (label, text) tuples
dataset = pre_process_data(CORPUS_PATH)
dataset[0]  # show the first instance

In [None]:
# Loading the word2vec embeddings
# (`limit` caps the number of word vectors that are read from the file)

word_vectors = KeyedVectors.load_word2vec_format(GOOGLE_VECTORS,
    binary=True, limit=400000)

# If you want to use less memory (e.g., you are just playing around), you could
# reduce the size of the vocabulary
# (e.g., to 200000 or to 100000)
# word_vectors = KeyedVectors.load_word2vec_format(GOOGLE_VECTORS,
#     binary=True, limit=200000)

In [None]:
# Method to tokenise and vectorise the data

def tokenize_and_vectorize(dataset):
    """
    Turn the text of every instance into a matrix of word vectors.

    The text of each sample (``sample[1]``) is tokenised with the Treebank
    tokeniser and every token is looked up in the global ``word_vectors``;
    tokens that are not in its vocabulary are skipped.

    Args:
        dataset: iterable of ``(label, text)`` tuples.

    Returns:
        A list with one array per sample, in the same order as ``dataset``.
        Each array has one row per token found in ``word_vectors``. A sample
        without any known token results in an array with zero rows, so that
        the output stays aligned with the labels.
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    # expected = [] this line appears in the book, but it's not necessary here!
    for sample in tqdm(dataset):
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass  # No matching token in the Google w2v vocab
        if sample_vecs:
            np_vec = np.vstack(sample_vecs)
        else:
            # np.vstack raises a ValueError on an empty list, so a text with
            # no token in the vocabulary gets an explicit (0, dim) matrix
            np_vec = np.zeros((0, word_vectors.vector_size), dtype=np.float32)
        vectorized_data.append(np_vec)
    return vectorized_data

In [None]:
# Method to get the target labels
def collect_expected(dataset):
    """ Return the target value (first element) of every sample, in order """
    return [instance[0] for instance in dataset]

In [None]:
# Vectorising the dataset and extracting the gold standard
# x: one matrix of word vectors per instance; y: the label of each instance
x = tokenize_and_vectorize(dataset)
y = collect_expected(dataset)

In [None]:
# Creating training and validation partitions
# (to play with a subset only, uncomment the lines below and set n_samples to
# the number of instances to keep)
# n_samples = 0
# if n_samples > 0:
#     x = x[:n_samples]
#     y = y[:n_samples]

# The first 80% of the instances go to training, the remaining 20% to the
# "test" partition, which is passed as validation data when fitting the model
split_point = int(len(x)*.8)

# Original alternative
x_train = x[:split_point] # there's a typo in this line, if copying from the book
y_train = y[:split_point]
x_test = x[split_point:]
y_test = y[split_point:]

In [None]:
# Network parameters

maxlen = 400          # maximum length (in tokens) of the text (why?)
batch_size = 32       # number of samples before backpropagating
embedding_dims = 300  # Same as Google's
filters = 250         # number of convolutional filters (!)
kernel_size = 3       # remember: filter=kernel (we have a scalar this time)
hidden_dims = 250     # number of neurons in the hidden dense layer
epochs = 2            # number of training epochs

# If you want to try to run with less memory, an alternative is to reduce
# the maximum length of the input and the size of the network
# maxlen = 100          # maximum length (in tokens) of the text (why?)
# batch_size = 32       # number of samples before backpropagating
# embedding_dims = 300  # Same as Google's
# filters = 120
# kernel_size = 3       # remember: filter=kernel (we have a scalar this time)
# hidden_dims = 150     # number of neurons in the hidden dense layer
# epochs = 2            # number of training epochs

**Back to the slides** to see what "padding" means for text

## Padding

In [None]:
class Collator:
    """
    Turn variable-length vectorised texts into fixed-length batches.

    Args:
        maxlen: number of token vectors kept per instance; shorter instances
            are padded with zeros and longer ones are truncated.
        batch_size: number of instances per batch.
        vec_dim: optional size of the token vectors. It is only used as a
            fallback when the size cannot be read from the data, i.e., when
            a batch contains no token vector at all.
    """

    def __init__(self,
                 maxlen,
                 batch_size,
                 vec_dim=None,
                 ) -> None:
        self.maxlen = maxlen
        self.batch_size = batch_size
        self.vec_dim = vec_dim

    def _infer_vec_dim(self, x):
        """ Return the size of the token vectors in x """
        for tokens in x:
            if len(tokens) > 0:
                return len(tokens[0])
            shape = getattr(tokens, 'shape', ())
            if len(shape) == 2:
                # An empty (0, dim) array still tells the vector size
                return shape[1]
        # Nothing to look at: rely on the value given at construction time
        return self.vec_dim if self.vec_dim is not None else 0

    def padding_and_truncating(self, x, y):
        """
        Add zeros at the end of the representation for short instances,
        truncate longer ones to the maxlen

        Args:
            x: sequence of instances, each one a sequence of token vectors
                (possibly empty).
            y: sequence with the label of each instance.

        Returns:
            A tuple (X, Y): X is an array of shape (len(x), maxlen, vec_dim)
            and Y is ``np.array(y)``.
        """
        # The vector size is taken from the first instance that reveals it,
        # because the first instance of a batch may have no token at all
        vec_dim = self._infer_vec_dim(x)
        N = len(x)
        X = np.zeros((N, self.maxlen, vec_dim))  # preallocate a np array
        Y = np.array(y)

        for i, tokens in enumerate(x):
            length = min(len(tokens), self.maxlen)
            if length > 0:
                X[i, :length] = np.asarray(tokens[:length])  # fill the np array
        return X, Y

    def collate(self, X, Y, N, epochs = 1):
        """
        This method is used to feed batches into the `model.fit` method

        Args:
            X: sequence of (unpadded) instances.
            Y: sequence with the label of each instance.
            N: number of instances to iterate over.
            epochs: number of passes over the first N instances.

        Yields:
            Tuples (x_batch, y_batch), as returned by `padding_and_truncating`,
            with up to batch_size instances each (the last batch of a pass is
            shorter if N is not a multiple of batch_size).
        """
        # The outer loop is here because the inner one needs to be run
        # `epochs` times: a generator cannot be rewound, so the batches are
        # produced again for every new epoch
        for _ in range(epochs):
            for start in range(0, N, self.batch_size):
                end = start + self.batch_size
                x_batch, y_batch = self.padding_and_truncating(
                    X[start:end],
                    Y[start:end]
                    )
                yield x_batch, y_batch



In [None]:
# The collator pads/truncates every instance to maxlen and serves the batches
collator = Collator(maxlen = maxlen, batch_size = batch_size)

Since we're using Keras (which is based on TensorFlow), we need to pad every batch of the dataset to the same length; otherwise, Keras will rebuild the whole computation graph for the backpropagation for each batch, making training very (very) slow. If, instead, every sequence in every batch is brought to the same `maxlen`, training goes very fast.

## Building the network

Adding the convolutional layer

In [None]:
print('Building model...')
model = Sequential()   # The standard NN model
# Every instance is a (maxlen, embedding_dims) matrix: one vector per token
model.add(
    Input(
        shape=(maxlen, embedding_dims)
        )
    )
model.add(Conv1D(      # Adding a convolutional layer
        filters,
        kernel_size,
        padding='valid',   # in this example the output is going to be slightly smaller
        activation='relu',
        strides=1,         # the shift
        )
    )
# Formulation: max (0, dot(filter, 3-gram))
# (relu applied to the product of a filter and a window of kernel_size tokens)

In [None]:
# Adding the max pooling
# Alternatives
# - GlobalMaxPooling1D() (the max for the entire filter's output)
# - MaxPooling1D(n)  (the max for a specific area of n; default n=2)
# - AveragePooling1D(n)  (the average for a specific area of n)

model.add(GlobalMaxPooling1D())

**back to the slides to see what pooling (and dropout) is**

In [None]:
# Adding the hidden (fully connected) layer with hidden_dims neurons,
# followed by dropout (20% of the data will be "cancelled") and the
# relu activation
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

In [None]:
# Adding the classification layer: one single neuron followed by a sigmoid
# sigmoid range: [0,1]
model.add(Dense(1))
model.add(Activation('sigmoid'))

**back to the slides**

Now we compile the network, using binary cross entropy as the loss function and Adam as the optimiser. Visit [this website for further details on this loss](https://towardsdatascience.com/understanding-binary-cross-entropy-log-loss-a-visual-explanation-a3ac6025181a/) and [this optimiser](https://optimization.cbe.cornell.edu/index.php?title=Adam).

In [None]:
# Compiling the CNN
# loss: binary cross entropy; optimiser: Adam; accuracy is reported as well

model.compile(loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
        )

In [None]:
# Here we use the collator, which is in charge of iterating over the dataset
# for each batch, rather than just loading the whole dataset.
# Both generators are created with epochs=epochs, so that each of them goes
# over its partition once per training epoch.
# NOTE(review): the number of steps is computed with a floor division, whereas
# the collator also yields a last (shorter) batch when the size of a partition
# is not a multiple of batch_size. In that case, the batches consumed per epoch
# would not line up with full passes over the partition -- confirm that this
# is acceptable.
model.fit(
    collator.collate(x_train, y_train, N = len(x_train), epochs=epochs),
    steps_per_epoch=len(x_train) // batch_size,
    validation_data=collator.collate(x_test, y_test, N = len(x_test), epochs=epochs),
    validation_steps=len(x_test) // batch_size,
    epochs=epochs,
)

In [None]:
# Saving the model: the architecture goes to a json file and the weights
# go to a separate h5 file
model_structure = model.to_json()
with open("cnn_model.json", "w") as json_file:
    json_file.write(model_structure)  # saves just the architecture
model.save_weights("cnn.weights.h5")  # saves the weights
# You can run fit many times on the same model (it will continue)

In [None]:
# (Re)loading the model (not necessary here, but anyway)
# The architecture is rebuilt from the json file first; the weights are
# loaded into it afterwards
from keras.models import model_from_json
with open("cnn_model.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights('cnn.weights.h5')

In [None]:
# Predicting a new instance

# Notice we have both positive and negative words here
sample_negative = """I hate that the dismal weather had me down for so long,
when will it break! Ugh, when does happiness return? The sun is
blinding and the puffy clouds are too thin. I can't wait for the weekend."""
# Super positive sample
sample_positive = """I love that incredible weather!
I feel like this happiness will last forever!
The sun is super nice and the breeze is soothing.
I could stay like this forever.
"""

# The first value is a "fake" class (this is the expected input):
# tokenize_and_vectorize and collect_expected work on (label, text) tuples
data_dummy = [(0, sample_negative), (0, sample_positive)]
x_dummy = tokenize_and_vectorize(data_dummy)
y_dummy = collect_expected(data_dummy)

In [None]:
# Get the model predictions. The network ends with a sigmoid activation, so
# these are values in [0, 1] rather than logits (raw, pre-activation outputs)
x_dummy_padded, y_dummy = collator.padding_and_truncating(x_dummy, y_dummy)
preds = model.predict(x_dummy_padded)
print(preds)

In [None]:
# Get the class by thresholding the predictions at 0.5
# (1 if the prediction is above 0.5; 0 otherwise)
(preds > 0.5).astype("int32")

**End of the notebook**