<a href="https://colab.research.google.com/github/fgs2/f20aa-2024/blob/main/cw2/transformers/groundUpTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# F20AA Applied Text Analytics: Coursework 2 - Custom Transformer Notebook
#### Deadline: 11:59pm, Monday 1st April 2024 via Canvas group space

#### Members:
- Francis Sandrino (fgs2)
- Jai Varsani (jv81)
- Ahmed Moussa Abdelfattah (asa30)
- Aamir Nazir (mn2025)

### What is this?
The purpose of this notebook is to serve as a form of parallelization with different Google Colab accounts to speed up experimentation. This notebook will have minimal documentation, only to aid the group members in understanding the code. The proper documentation, results, and discussion for all processing notebooks is included in the [main file](../amazonCW.ipynb).

### What does this specific notebook deal with?
Creating a transformer from the ground up and experimenting with it.

### TODO: Experimental Design

In [1]:
# This is so I don't have to keep uploading on Colab.

import os
import requests
from requests.auth import HTTPBasicAuth

def downloadFileFromRepo(username, repository, branch, filepath, token):
    # Construct the URL to download the file from GitHub
    url = f"https://raw.githubusercontent.com/{username}/{repository}/{branch}/{filepath}"

    # Send a GET request to download the file
    response = requests.get(url, auth=HTTPBasicAuth(username, token))

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Extract the file name from the URL
        fileName = filepath.split('/')[-1]

        # Create the 'data' directory if it doesn't exist
        if not os.path.exists('data'):
            os.makedirs('data')

        # Define the file path within the 'data' directory
        localFilepath = os.path.join('data', fileName)

        # Write the file content to a local file
        with open(localFilepath, 'wb') as f:
            f.write(response.content)
        print(f"File '{fileName}' downloaded successfully.")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")

username = ""
repository = ""
branch = ""
path_to_file = ""
repoToken = ""
downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

path_to_file = "cw2/data/trainStemmed.csv"
downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

path_to_file = "cw2/data/testLemmatized.csv"
downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

path_to_file = "cw2/data/testStemmed.csv"
downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

path_to_file = "cw2/lemmaTokenizer.json"
downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

path_to_file = "cw2/stemTokenizer.json"
downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

path_to_file = "cw2/data/train.csv"
downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

path_to_file = "cw2/data/test.csv"
downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)


File 'trainLemmatized.csv' downloaded successfully.
File 'trainStemmed.csv' downloaded successfully.
File 'testLemmatized.csv' downloaded successfully.
File 'testStemmed.csv' downloaded successfully.
File 'lemmaTokenizer.json' downloaded successfully.
File 'stemTokenizer.json' downloaded successfully.
File 'train.csv' downloaded successfully.
File 'test.csv' downloaded successfully.


In [2]:
!pip install --upgrade pip
!pip install --upgrade torch
!pip install tensorflow
!pip install pyyaml h5py

import tensorflow as tf
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import torch.nn.functional as F

from nltk.tokenize import word_tokenize
from IPython.display import clear_output
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

seed = 50

[0m

In [3]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output, attn_output_weights = F.multi_head_attention_forward(
            query=x, key=x, value=x, embed_dim_to_check=x.size(-1),
            num_heads=self.self_attn.num_heads,
            in_proj_weight=self.self_attn.in_proj_weight,
            in_proj_bias=self.self_attn.in_proj_bias,
            bias_k=self.self_attn.bias_k,
            bias_v=self.self_attn.bias_v,
            add_zero_attn=self.self_attn.add_zero_attn,
            dropout_p=self.self_attn.dropout,
            out_proj_weight=self.self_attn.out_proj.weight,
            out_proj_bias=self.self_attn.out_proj.bias,
            training=True,  # Set training to True
            key_padding_mask=mask,
            need_weights=False,
            attn_mask=None,
            use_separate_proj_weight=False,
            q_proj_weight=None,
            k_proj_weight=None,
            v_proj_weight=None,
            static_k=None,
            static_v=None,
            average_attn_weights=False,
            is_causal=False)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, num_classes, dropout):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc = nn.Linear(d_model, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.dropout(self.positional_encoding(self.embedding(x)))

        enc_output = embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, None)

        # Global average pooling
        pooled_output = enc_output.mean(dim=1)

        logits = self.fc(pooled_output)
        return logits

In [4]:
# Instantiate the Transformer model
vocab_size = 77414  # Example vocabulary size
d_model = 256
num_heads = 8
num_layers = 6
d_ff = 1024
max_seq_length = 1885
num_classes = 5  # Example number of classes
dropout = 0.1

model = TransformerClassifier(vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, num_classes, dropout)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using {device} for processing")
model = model.to(device)

# Define the loss function
criterion = nn.CrossEntropyLoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0001)

Using cuda:0 for processing


In [5]:
dataset = pd.read_csv("data/trainLemmatized.csv")
training, testing = train_test_split(dataset, stratify = dataset['labels'], test_size=0.1, random_state=42)
training.to_csv("data/trainLemmatizedTr.csv", index = False)
testing.to_csv("data/trainLemmatizedTe.csv", index = False)

In [6]:
# # Empirical value
# MAXLENGTH = 1885

# # Loading stemmed tokenizer from the JSON file
# with open("data/stemTokenizer.json", "r") as json_file:
#     tokenizerJSON = json_file.read()
#     stemTokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizerJSON)
# stemVocabSize = len(stemTokenizer.word_index)
# print(f"Stemmed Tokenizer loaded successfully with {stemVocabSize} words.")

# Determines number of rows per batch to process on
trainBatchSize = 32

lemmatizedDataset = tf.data.experimental.make_csv_dataset("data/trainLemmatizedTr.csv",
                                                batch_size = trainBatchSize,
                                                select_columns = ["data", "labels"],
                                                label_name = "labels",
                                                num_epochs = 5,
                                                shuffle_seed = 43,
                                                shuffle = True)

# Loading tokenizers from the JSON files
with open("data/lemmaTokenizer.json", "r") as json_file:
    tokenizerJSON = json_file.read()
    lemmaTokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizerJSON)
lemmaVocabSize = len(lemmaTokenizer.word_index)
print(f"Lemmatized Tokenizer loaded successfully with {lemmaVocabSize} words.")

Lemmatized Tokenizer loaded successfully with 77413 words.


In [None]:
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader
!mkdir models

# Iterator to avoid loading the entire dataset
iterator = iter(lemmatizedDataset)

# To keep track of which batch we're operating on
TESTINTERVAL = 5000
accuracy = 0
progress = 0
updateAccuracy = 5000
maxAccuracy = 0
aveAccuracy = 0
batch_loss = 0
batch_accuracy = 0
totalLoss = 0
totalCorrect = 0
totalSamples = 0

dataset = pd.read_csv("data/trainLemmatizedTe.csv")
training = dataset['data'].tolist()
testing = dataset['labels'].tolist()

training = [str(item) for item in training]
training = lemmaTokenizer.texts_to_sequences(training)
training = tf.keras.preprocessing.sequence.pad_sequences(training, maxlen=max_seq_length, padding="post")

testing = [x - 1 for x in testing]
testing = tf.keras.utils.to_categorical(testing, num_classes = 5)

training = torch.tensor(training)
training = training.to(device)
testing = torch.tensor(testing)
testing = testing.to(device)
toTest = TensorDataset(training, testing)
dataset = DataLoader(toTest, batch_size=trainBatchSize, shuffle=False)

try:
  while True:
    # Admin stuff
    model.train()
    progress = progress + 1
    if progress % TESTINTERVAL == 0:
      updateAccuracy = progress
    clear_output(wait = True)
    print(f"Batch number: {progress}")
    print(f"Batch loss: {batch_loss}")
    print(f"Batch accuracy: {batch_accuracy}\n")
    print(f"Next test accuracy update at batch: {updateAccuracy}")
    print(f"Max accuracy: {maxAccuracy}")
    print(f"Latest accuracy: {accuracy}")
    print(f"Average accuracy: {aveAccuracy}")


    # Obtain batch of text as a list
    batch = next(iterator)
    current = batch[0]['data'].numpy().tolist()
    decoded = list(map((lambda x : x.decode()), current))

    # Keep track of labels of each batch
    currentLabels = batch[1].numpy().tolist()

    tokenizedDocs = lemmaTokenizer.texts_to_sequences(decoded)
    paddedData = tf.keras.preprocessing.sequence.pad_sequences(tokenizedDocs, maxlen=1885, padding="post")

    currentLabels = [x - 1 for x in currentLabels]
    currentLabels = tf.keras.utils.to_categorical(currentLabels, num_classes = 5)

    padded_data_tensor = torch.tensor(paddedData)
    current_labels_tensor = torch.tensor(currentLabels)
    train_dataset = TensorDataset(padded_data_tensor, current_labels_tensor)
    train_dataloader = DataLoader(train_dataset, batch_size=trainBatchSize, shuffle=True)

    totalLoss = 0
    totalCorrect = 0
    totalSamples = 0

    for batch_inputs, batch_labels in train_dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)
        optimizer.zero_grad()  # Clear gradients
        # print(batch_inputs)
        # print(f"Length of inputs: {len(batch_inputs)}")
        # print(batch_labels)
        # print(f"Length of labels: {len(batch_labels)}")
        outputs = model(batch_inputs)  # Forward pass
        # loss = criterion(outputs.view(-1, num_classes), batch_labels)
        loss = criterion(outputs, batch_labels)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update parameters
        totalLoss += loss.item() * batch_inputs.size(0)
        actual = batch_labels.argmax(dim = 1)
        _, predicted = torch.max(outputs, 1)
        totalCorrect += (predicted == actual).sum().item()
        totalSamples += batch_inputs.size(0)

    batch_loss = totalLoss / len(train_dataset)
    batch_accuracy = totalCorrect / totalSamples

    if progress % TESTINTERVAL == 0:
      val_loss = 0
      val_correct = 0
      val_samples = 0
      model.eval()
      with torch.no_grad():
        print()
        print("Saving model...")
        torch.save(model.state_dict(), f"models/groundUpTransformerB{progress}.pth")
        print(f"Model saved under: models/groundUpTransformerB{progress}.pth")
        for i, (batch_inputs, batch_labels) in enumerate(dataset):
            if i % 100 == 0:
              print(f"Testing batch {i} to {i + 100}")
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            outputs = model(batch_inputs)
            val_loss += criterion(outputs, batch_labels).item() * batch_inputs.size(0)
            actual = batch_labels.argmax(dim = 1)
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == actual).sum().item()
            val_samples += batch_inputs.size(0)
        accuracy = val_correct / val_samples
      if maxAccuracy < accuracy:
        maxAccuracy = accuracy
      aveAccuracy = (((aveAccuracy * (progress - 1)) + accuracy) / (updateAccuracy / TESTINTERVAL))
      updateAccuracy = updateAccuracy + TESTINTERVAL

except StopIteration:
  print("End of iterator reached.")

# # Split data into training and validation sets
# train_texts, val_texts, train_labels, val_labels = train_test_split(padded_texts, labels, test_size=0.2, random_state=42)

# batch_size = 64

# # Create TensorFlow Dataset
# train_dataset = tf.data.Dataset.from_tensor_slices((train_texts, train_labels))
# train_dataset = train_dataset.batch(batch_size)

# val_dataset = tf.data.Dataset.from_tensor_slices((val_texts, val_labels))
# val_dataset = val_dataset.batch(batch_size)

# testDataset = tf.data.Dataset.from_tensor_slices(val_texts)
# testDataset = testDataset.batch(batch_size)

Batch number: 9048
Batch loss: 0.3726446330547333
Batch accuracy: 0.9375

Next test accuracy update at batch: 10000
Max accuracy: 0.7634832370566323
Latest accuracy: 0.7634832370566323
Average accuracy: 0.7634832370566323


In [None]:
concatenated = np.concatenate(preds)
results = pd.DataFrame(concatenated)
results.rename(columns = {0 : 'overall'}, inplace = True)
results.insert(0, 'id', range(len(results)))
results

In [None]:
results.to_csv("SimpleCNN1.csv", index = False)

In [None]:
model.save('SimpleCNN1.keras')
# loaded_model = tf.keras.saving.load_model('insert-model-name.keras')