In [11]:
# Basic imports
import requests
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForMaskedLM
from transformers import AdamW
from collections import Counter
import math
import random
import numpy as np
import html
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
import string
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
#Daniel's Encoder
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention without masking.

    Queries, keys and values are all linear projections of the same input.

    Args:
        embed_size: Size of the last dimension of the input and of the output.
        heads: Number of attention heads. Must be positive and divide
            ``embed_size`` evenly.

    Raises:
        ValueError: If ``heads`` is not positive or does not divide
            ``embed_size``.
    """

    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        # Fail at construction time: otherwise the mismatch only surfaces in
        # forward() as an opaque ``view`` shape error.
        if heads <= 0 or embed_size % heads != 0:
            raise ValueError(
                f"embed_size ({embed_size}) must be divisible by a positive "
                f"number of heads (got heads={heads})"
            )
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        # Attribute names are part of the state_dict keys; keep them stable so
        # previously saved weights still load.
        self.W_q = nn.Linear(embed_size, embed_size, bias=False)
        self.W_k = nn.Linear(embed_size, embed_size, bias=False)
        self.W_v = nn.Linear(embed_size, embed_size, bias=False)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, x):
        """Apply self-attention.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_size)``.
        """
        batch_size, seq_len, _ = x.size()
        # Project, then split the feature axis into (heads, head_dim).
        Q = self.W_q(x).view(batch_size, seq_len, self.heads, self.head_dim)
        K = self.W_k(x).view(batch_size, seq_len, self.heads, self.head_dim)
        V = self.W_v(x).view(batch_size, seq_len, self.heads, self.head_dim)

        # Move heads in front of the sequence axis: (batch, heads, seq_len, head_dim).
        Q, K, V = Q.transpose(1, 2), K.transpose(1, 2), V.transpose(1, 2)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.head_dim).float())
        # Normalise over the key axis so each query's weights sum to 1.
        scores = F.softmax(scores, dim=-1)
        out = torch.matmul(scores, V)
        # Merge the heads back into a single feature axis.
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
        out = self.fc_out(out)
        return out

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding that is added to its input.

    Even feature columns hold ``sin(position * div_term)`` and odd columns
    hold ``cos(position * div_term)``.

    Args:
        embed_size: Size of the feature dimension (odd values are supported).
        max_len: Number of positions to precompute. Inputs longer than this
            are not supported.
    """

    def __init__(self, embed_size, max_len=512):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * -(math.log(10000.0) / embed_size))
        angles = position * div_term

        encoding = torch.zeros(max_len, embed_size)
        encoding[:, 0::2] = torch.sin(angles)
        # With an odd embed_size there is one fewer odd column than even
        # column, so trim the angle table to the number of cosine columns.
        encoding[:, 1::2] = torch.cos(angles[:, : embed_size // 2])

        # Register as a buffer so the table follows .to(device)/.cuda() with
        # the rest of the module. persistent=False keeps it out of the
        # state_dict, so checkpoints saved before this change still load.
        self.register_buffer('encoding', encoding.unsqueeze(0), persistent=False)

    def forward(self, x, use_positional_encoding=True):
        """Add the positional table to ``x``.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``embed_size``.
            use_positional_encoding: When False, ``x`` is returned unchanged.

        Returns:
            ``x`` plus the encoding for its first ``x.size(1)`` positions, or
            ``x`` itself when the encoding is disabled.
        """
        if use_positional_encoding:
            return x + self.encoding[:, :x.size(1)]
        else:
            return x

class TransformerBlock(nn.Module):
    """Encoder block: self-attention and a feed-forward net, each followed by add & norm.

    Args:
        embed_size: Feature size of the input and output.
        heads: Number of attention heads passed to ``SelfAttention``.
        forward_expansion: Multiplier giving the feed-forward hidden width
            (``forward_expansion * embed_size``).
        dropout: Dropout probability applied to the block output.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout=0.1):
        super().__init__()
        hidden_size = forward_expansion * embed_size

        # Sub-layer 1: self-attention followed by layer normalisation.
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)

        # Sub-layer 2: two-layer ReLU network followed by layer normalisation.
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )
        self.norm2 = nn.LayerNorm(embed_size)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run the block on ``x`` and return a tensor of the same shape."""
        # Residual connection around attention, then normalise.
        attended = self.norm1(x + self.attention(x))
        # Residual connection around the feed-forward net, then normalise.
        transformed = self.norm2(attended + self.feed_forward(attended))
        # Dropout is applied once, to the normalised block output.
        return self.dropout(transformed)

class TransformerMLMModel(nn.Module):
    """Transformer encoder stack: token embedding + positional encoding + N blocks.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_size: Embedding / hidden feature size.
        max_len: Number of positions precomputed by the positional encoding.
        num_heads: Attention heads per block.
        forward_expansion: Feed-forward width multiplier per block.
        num_layers: Number of ``TransformerBlock`` layers.
        dropout: Dropout probability handed to each block.
    """

    def __init__(self, vocab_size, embed_size, max_len, num_heads, forward_expansion, num_layers, dropout):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = PositionalEncoding(embed_size, max_len)

        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerBlock(embed_size, num_heads, forward_expansion, dropout))
        self.layers = nn.ModuleList(blocks)

        # Vocabulary projection and dropout are created (and therefore appear
        # in the state_dict / module tree) but forward() does not call them.
        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Encode token ids into hidden states.

        Args:
            x: Tensor of token ids, looked up in ``token_embedding``.
            mask: Accepted for call compatibility; not used.

        Returns:
            The hidden states produced by the last block (the ``fc_out``
            projection is not applied).
        """
        hidden = self.positional_encoding(self.token_embedding(x))
        for block in self.layers:
            hidden = block(hidden)
        return hidden

In [13]:
# Load the Gutenberg text file used as the fine-tuning corpus.
with open('/content/BGLLM_1.txt', "r", encoding="utf-8") as file:
    text = file.read()

# HTML-unescaped copy of the corpus.
data = html.unescape(text)
# The raw (not unescaped) text split on newline characters.
lines = text.split('\n')

# Sanity check: show the first line of the file.
print(lines[0])

The Project Gutenberg eBook of The Bird Book


In [14]:

#David's Transformer Class
class DecoderLayer(nn.Module):
    """One decoder layer: masked self-attention, encoder-decoder attention, feed-forward.

    Each of the three sub-layers is applied as
    ``x = x_in + dropout(sublayer(layer_norm(x_in)))``, i.e. normalisation
    happens before the sub-layer and the residual is added afterwards.

    Args:
        d_model: Feature size of the token representations.
        n_heads: Number of attention heads in both attention modules.
        d_ff: Hidden width of the position-wise feed-forward network.
        dropout: Dropout probability used throughout the layer.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout):
        super().__init__()
        # Attention over the decoder's own sequence.
        self.self_attention = MultiheadAttention(d_model, n_heads, dropout)
        # Attention from decoder positions to the encoder output ("memory").
        self.src_attention = MultiheadAttention(d_model, n_heads, dropout)
        self.feed_forward = PositionwiseFeedforward(d_model, d_ff, dropout)

        # One LayerNorm per sub-layer.
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.layer_norm3 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run the layer.

        Args:
            x: Decoder hidden states.
            memory: Encoder output used as keys and values for cross-attention.
            src_mask: Mask passed to the encoder-decoder attention (may be None).
            tgt_mask: Mask passed to the decoder self-attention.

        Returns:
            Updated decoder hidden states.
        """
        # Sub-layer 1: self-attention over the target sequence.
        # The attention weights returned alongside the output are not needed.
        normed = self.layer_norm1(x)
        attended, _ = self.self_attention(normed, normed, normed, tgt_mask)
        x = self.dropout(attended) + x

        # Sub-layer 2: queries from the decoder, keys/values from the encoder.
        normed = self.layer_norm2(x)
        attended, _ = self.src_attention(normed, memory, memory, src_mask)
        x = self.dropout(attended) + x

        # Sub-layer 3: position-wise feed-forward network.
        normed = self.layer_norm3(x)
        x = self.dropout(self.feed_forward(normed)) + x

        return x


class MultiheadAttention(nn.Module):
    """Multi-head scaled dot-product attention with an optional mask.

    Args:
        d_model: Feature size of queries, keys, values and the output.
        n_heads: Number of attention heads; must divide ``d_model``.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``n_heads``.
    """

    def __init__(self, d_model, n_heads, dropout):
        super(MultiheadAttention, self).__init__()
        # Each head works on an equal slice of the feature dimension.
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        self.d_head = d_model // n_heads
        self.n_heads = n_heads
        # Learned linear projections for queries, keys, values and the merged output.
        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, d_model)
        self.linear_v = nn.Linear(d_model, d_model)
        self.linear_out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        """Compute attention.

        Args:
            query: Tensor of shape ``(batch, q_len, d_model)``.
            key: Tensor of shape ``(batch, k_len, d_model)``.
            value: Tensor of shape ``(batch, k_len, d_model)``.
            mask: Optional tensor broadcastable to
                ``(batch, n_heads, q_len, k_len)``; positions where it equals
                0 are excluded from attention.

        Returns:
            Tuple ``(output, attention_weights)`` with shapes
            ``(batch, q_len, d_model)`` and ``(batch, n_heads, q_len, k_len)``.
        """
        batch_size = query.size(0)

        # Linear projections
        query = self.linear_q(query)
        key = self.linear_k(key)
        value = self.linear_v(value)

        # Split the feature axis into (n_heads, d_head); -1 infers the sequence
        # length. The transpose puts heads before the sequence axis:
        # (batch, n_heads, seq_len, d_head).
        query = query.view(batch_size, -1, self.n_heads, self.d_head).transpose(1, 2)
        key = key.view(batch_size, -1, self.n_heads, self.d_head).transpose(1, 2)
        value = value.view(batch_size, -1, self.n_heads, self.d_head).transpose(1, 2)

        # Scaled dot product of every query with every key:
        # (batch, n_heads, q_len, k_len).
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_head)

        if mask is not None:
            # Fill with the most negative finite value instead of -inf: masked
            # positions still get exactly zero weight, but a query row whose
            # keys are ALL masked yields a finite (uniform) distribution
            # rather than NaN, which would otherwise poison the loss.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        # The last axis indexes keys, so softmax over dim=-1 makes the weights
        # each query assigns to the keys sum to 1.
        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)

        # Weighted sum of the values.
        context = torch.matmul(attention_weights, value)

        # Merge heads: swap heads and sequence back, then flatten
        # (n_heads, d_head) into one feature axis. contiguous() is required
        # before view() because transpose() returns a non-contiguous tensor.
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * self.d_head)

        # Final output projection.
        output = self.linear_out(context)

        return output, attention_weights


# d_ff is the hidden width of the feed-forward network (d_model -> d_ff -> d_model)
class PositionwiseFeedforward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension of its input.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU activation.
    """

    def __init__(self, d_model, d_ff, dropout):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``linear2(dropout(relu(linear1(x))))``."""
        hidden = self.dropout(F.relu(self.linear1(x)))
        return self.linear2(hidden)


class Decoder(nn.Module):
    """Stack of ``DecoderLayer`` modules with a token embedding and an output projection.

    Args:
        vocab_size: Size of the embedding table and of the output distribution.
        d_model: Feature size of the token representations.
        n_layers: Number of decoder layers.
        n_heads: Attention heads per layer.
        d_ff: Feed-forward hidden width per layer.
        dropout: Dropout probability.
    """

    def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        stack = [DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        self.layers = nn.ModuleList(stack)
        # Projects hidden states back to one score per vocabulary entry.
        self.linear = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def forward(self, trg, memory, src_mask, trg_mask):
        """Decode target token ids against the encoder output.

        Args:
            trg: Tensor of target token ids (the teacher-forced decoder input).
            memory: Encoder output attended to by every layer.
            src_mask: Mask for the encoder-decoder attention (may be None).
            trg_mask: Mask for the decoder self-attention.

        Returns:
            Log-probabilities over the vocabulary along the last dimension.
        """
        # Scale the embeddings by sqrt(d_model) before dropout.
        scale = math.sqrt(self.d_model)
        hidden = self.dropout(self.embedding(trg) * scale)

        for layer in self.layers:
            hidden = layer(hidden, memory, src_mask, trg_mask)

        # log_softmax rather than softmax + log for numerical stability.
        logits = self.linear(hidden)
        return F.log_softmax(logits, dim=-1)


class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention without masking.

    NOTE: a class named ``SelfAttention`` is also defined in the encoder cell
    earlier in this notebook; once this cell runs, this definition is the one
    bound to the name.

    Args:
        embed_size: Size of the last dimension of the input and of the output.
        heads: Number of attention heads. Must be positive and divide
            ``embed_size`` evenly.

    Raises:
        ValueError: If ``heads`` is not positive or does not divide
            ``embed_size``.
    """

    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        # Fail at construction time: otherwise the mismatch only surfaces in
        # forward() as an opaque ``view`` shape error.
        if heads <= 0 or embed_size % heads != 0:
            raise ValueError(
                f"embed_size ({embed_size}) must be divisible by a positive "
                f"number of heads (got heads={heads})"
            )
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        # Attribute names are part of the state_dict keys; keep them stable so
        # previously saved weights still load.
        self.W_q = nn.Linear(embed_size, embed_size, bias=False)
        self.W_k = nn.Linear(embed_size, embed_size, bias=False)
        self.W_v = nn.Linear(embed_size, embed_size, bias=False)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, x):
        """Apply self-attention.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_size)``.
        """
        batch_size, seq_len, _ = x.size()
        # Project, then split the feature axis into (heads, head_dim).
        Q = self.W_q(x).view(batch_size, seq_len, self.heads, self.head_dim)
        K = self.W_k(x).view(batch_size, seq_len, self.heads, self.head_dim)
        V = self.W_v(x).view(batch_size, seq_len, self.heads, self.head_dim)

        # Move heads in front of the sequence axis: (batch, heads, seq_len, head_dim).
        Q, K, V = Q.transpose(1, 2), K.transpose(1, 2), V.transpose(1, 2)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.head_dim).float())
        # Normalise over the key axis so each query's weights sum to 1.
        scores = F.softmax(scores, dim=-1)
        out = torch.matmul(scores, V)
        # Merge the heads back into a single feature axis.
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
        out = self.fc_out(out)
        return out


class TransformerBlock(nn.Module):
    """Encoder block: self-attention and a feed-forward net, each followed by add & norm.

    NOTE: a class named ``TransformerBlock`` is also defined in the encoder
    cell earlier in this notebook; once this cell runs, this definition is the
    one bound to the name.

    Args:
        embed_size: Feature size of the input and output.
        heads: Number of attention heads passed to ``SelfAttention``.
        forward_expansion: Multiplier giving the feed-forward hidden width.
        dropout: Dropout probability applied to the block output.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout=0.1):
        super().__init__()
        inner_width = forward_expansion * embed_size

        # Attention sub-layer and its normalisation.
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)

        # Feed-forward sub-layer (expand, ReLU, project back) and its normalisation.
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, inner_width),
            nn.ReLU(),
            nn.Linear(inner_width, embed_size),
        )
        self.norm2 = nn.LayerNorm(embed_size)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run the block on ``x`` and return a tensor of the same shape."""
        # Add & norm around self-attention.
        after_attention = self.norm1(x + self.attention(x))
        # Add & norm around the feed-forward network.
        after_ffn = self.norm2(after_attention + self.feed_forward(after_attention))
        # Dropout on the final, normalised output.
        return self.dropout(after_ffn)


class Transformer(nn.Module):
    """Encoder-decoder model: a pretrained ``TransformerMLMModel`` encoder plus a ``Decoder``.

    Args:
        d_model: Feature size shared by encoder and decoder.
        heads: Number of attention heads.
        vocab_size: Vocabulary size for both embedding tables and the output.
        forward_expansion: Passed to the encoder as its feed-forward width
            multiplier and to the decoder as its ``d_ff`` argument.
        dropout: Dropout probability.
        n_layers: Number of layers in both encoder and decoder.
        max_len: Number of positions precomputed by the encoder's positional
            encoding (previously hard-coded to 10).
        encoder_weights_path: Path of the encoder checkpoint to load. Pass
            ``None`` to keep the randomly initialised encoder (e.g. in tests
            or when the checkpoint is unavailable).
    """

    # Default checkpoint location (Daniel's pretrained encoder on Google Drive).
    DEFAULT_ENCODER_WEIGHTS = '/content/drive/MyDrive/Dhar Aamina LLM Assignments/Copy of transformer_mlm_model.pth'

    def __init__(self, d_model, heads, vocab_size, forward_expansion, dropout=0.1, n_layers=1,
                 max_len=10, encoder_weights_path=DEFAULT_ENCODER_WEIGHTS):
        super(Transformer, self).__init__()
        self.encoder = TransformerMLMModel(vocab_size, d_model, max_len, heads, forward_expansion, n_layers, dropout)
        if encoder_weights_path is not None:
            # map_location='cpu' lets a checkpoint saved on a GPU load on a
            # CPU-only runtime; the module can be moved to a device afterwards.
            # NOTE: torch.load unpickles the file, so only load checkpoints
            # from a trusted source.
            state_dict = torch.load(encoder_weights_path, map_location='cpu')
            self.encoder.load_state_dict(state_dict)
        self.decoder = Decoder(vocab_size, d_model, n_layers, heads, forward_expansion, dropout)

    def forward(self, src_input, trg_input, trg_mask, scr_mask=None):
        """Encode the source and decode the target against it.

        Args:
            src_input: Input passed to the encoder.
            trg_input: Target token ids passed to the decoder.
            trg_mask: Mask for the decoder self-attention.
            scr_mask: Optional mask for the encoder-decoder attention
                (parameter name kept as-is for existing callers).

        Returns:
            The decoder's log-probabilities over the vocabulary.
        """
        memory = self.encoder(src_input)
        log_probs = self.decoder(trg_input, memory, scr_mask, trg_mask)
        return log_probs



In [15]:
# Data preprocessing: load the Gutenberg text file.
with open('/content/BGLLM_1.txt', "r", encoding="utf-8") as file:
    text = file.read()

# HTML-unescaped copy of the corpus.
data = html.unescape(text)
# The raw (not unescaped) text split on newline characters.
lines = text.split('\n')

# Sanity check: show the first line of the file.
print(lines[0])


nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from gensim.test.utils import common_texts
import regex as re

# %%
# To get the data, go to https://www.gutenberg.org/, open "Search and Browse", and pick the Animals category. I chose a book at random and downloaded it as a plain-text (.txt) file.
# Let me know if you need help with that; it wasn't obvious how to download it.

# Read the corpus (path is relative to the notebook's working directory).
with open('BGLLM_1.txt', "r", encoding="utf-8") as file:
    text = file.read()
# Decode HTML entities (e.g. "&amp;" -> "&") before any tokenisation.
data = html.unescape(text)
# Split the *unescaped* text into sentences. Previously the raw text was
# tokenised and the unescaped copy was discarded, so any HTML entity would
# have leaked into the vocabulary as a word fragment.
sentences = sent_tokenize(data)

# %%
def cleanUp(data):
    """Turn a piece of text into a list of lowercase word tokens.

    Steps, in order:
      1. Delete every character in ``string.punctuation`` (ASCII punctuation
         only), so a contraction such as "don't" becomes "dont".
      2. Split on runs of whitespace.
      3. Drop tokens that are purely numeric (``str.isnumeric``).
      4. Lowercase the remaining tokens.

    Stop words are NOT removed.

    Args:
        data: The text to tokenise.

    Returns:
        List of cleaned tokens; empty if ``data`` has no word tokens.
    """
    without_punctuation = data.translate(str.maketrans('', '', string.punctuation))
    # str.split() with no arguments splits on runs of whitespace and never
    # yields empty strings, which is what the whitespace tokenizer did.
    return [word.lower() for word in without_punctuation.split() if not word.isnumeric()]


def vocab_dictionary(list_of_list_of_words):
    """Build a token -> integer id mapping from tokenised sentences.

    Ids are assigned from 0 upwards following the sorted order of the
    distinct tokens, so the mapping is the same for the same set of tokens.

    Args:
        list_of_list_of_words: Iterable of token sequences.

    Returns:
        Dict mapping every distinct token to a unique integer id.
    """
    # Gather the distinct tokens across all sentences.
    distinct_tokens = set()
    for words in list_of_list_of_words:
        distinct_tokens.update(words)

    # Number the tokens in sorted order.
    ordered_tokens = sorted(distinct_tokens)
    return dict(zip(ordered_tokens, range(len(ordered_tokens))))


def get_keys_from_value(d, val):
    """Return every key of ``d`` whose value equals ``val``, in dict order.

    Args:
        d: Dictionary to search.
        val: Value to look for.

    Returns:
        List of matching keys (empty when nothing matches).
    """
    matches = []
    for key, value in d.items():
        if value == val:
            matches.append(key)
    return matches


def truncate_and_pad(list_of_lists_of_tokens, desired_length, pad_symbol, sos_symbol=-2):
    """Force every sentence to a fixed length and prepend a start-of-sentence token.

    Sentences longer than ``desired_length`` are truncated; shorter ones are
    right-padded with ``pad_symbol``. ``sos_symbol`` is then prepended, so each
    returned sentence has ``desired_length + 1`` entries. No end-of-sentence
    token is added.

    The input sentences are not modified (the previous implementation appended
    padding to the caller's lists in place).

    Args:
        list_of_lists_of_tokens: Iterable of token lists.
        desired_length: Number of tokens to keep per sentence before the
            start token is added.
        pad_symbol: Value used for right-padding.
        sos_symbol: Start-of-sentence value to prepend. Defaults to -2, the
            value that was previously hard-coded.

    Returns:
        New list of new lists, one per input sentence.
    """
    output = []
    for sentence in list_of_lists_of_tokens:
        # Slicing copies, so the caller's list is never mutated.
        body = list(sentence[:desired_length])
        # A non-positive repeat count yields an empty list, i.e. no padding.
        body.extend([pad_symbol] * (desired_length - len(body)))
        output.append([sos_symbol] + body)
    return output


# Tokenise every sentence and build the word -> id vocabulary.
tokenized = [cleanUp(i) for i in sentences]
vocab = vocab_dictionary(tokenized)
# Add the special tokens with negative ids so they cannot collide with word ids
# (word ids start at 0). The training cell later adds 3 to every id, which
# moves <eos>, <sos>, <pad> to 0, 1, 2 and shifts every word id up by 3.
vocab['<pad>'] = -1
vocab['<sos>'] = -2
vocab['<eos>'] = -3
# Map each sentence's words to their integer ids.
tokenized_int = list([vocab.get(word) for word in sentence] for sentence in tokenized)
# truncate_and_pad cuts/pads each sentence to 10 ids and prepends <sos> (-2),
# so every row of uniform_int_tokens has 11 entries. No <eos> is appended.
# NOTE(review): the vocab defines <eos> but nothing here inserts it — confirm
# whether that is intended.
uniform_int_tokens = truncate_and_pad(tokenized_int, 10, -1)
print(f'the length of each sentence is {len(uniform_int_tokens[0])}')
print(f'the number of sentences is {len(uniform_int_tokens)}')
# The first sentence is always the same and contains an odd token.
# NOTE(review): possibly a UTF-8 byte-order mark, since the file is opened
# with encoding "utf-8" rather than "utf-8-sig" — TODO confirm.
print(len(tokenized))
# Train a Word2Vec model to get a 100-dimensional vector for every word.
# NOTE(review): no seed is passed, so the vectors may differ between runs.
word_model = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=1)
# word_model.wv is the KeyedVectors object that maps each word in the corpus
# to its learned vector.
embeddings = word_model.wv
print(f'vocab size is {len(embeddings)}')

# Drop the first sentence (see the note above about its odd token).
# NOTE(review): uniform_int_tokens still contains the first sentence, so from
# here on row i of the encoder data corresponds to sentence i+1 while row i of
# the decoder data corresponds to sentence i — confirm this offset is intended.
tokenized = tokenized[1:]
tokenized = [np.array(i) for i in tokenized]
# Despite the name, int_sequences holds embedding vectors: one list of
# 100-dimensional vectors per sentence.
int_sequences = [[embeddings[word] for word in sentence if word in embeddings] for sentence in tokenized]
print(np.shape(int_sequences[0]))
# Pad/truncate every sentence to max_length vectors.
# padded_sentences is laid out as (position, embedding_dim, sentence).
max_length = 10
padded_sentences = np.zeros((10, 100, len(int_sequences)))
print(np.shape(padded_sentences[:, :, 0]))
for counter, sentence in enumerate(int_sequences):
    num_padding = max_length - np.shape(sentence)[0]
    if num_padding > 0:
        # A sentence with no word vectors is a 1-D empty array and cannot be
        # padded along two axes; skip it, leaving its slot as all zeros.
        if np.array(sentence).ndim == 1:
            continue
        # Append rows of zeros after the last word vector.
        padded_sentence = np.pad(sentence, ((0, num_padding), (0, 0)), mode='constant', constant_values=0)

    else:
        # Long sentence: keep only the first max_length word vectors.
        padded_sentence = np.array(sentence)[:max_length, :]

    padded_sentences[:, :, counter] = padded_sentence

# padded_sentences is already an ndarray; np.array() here just makes a copy.
print(np.shape(padded_sentences))
padded_sentences = np.array(padded_sentences)

# %%
# NOTE(review): [:-1, ::] slices the FIRST axis, which is the position axis of
# size 10, so this keeps positions 0-8 of every sentence (shape (9, 100, N))
# rather than dropping the last sentence — confirm which was intended.
encoder_input = padded_sentences[:-1, ::]
# Teacher forcing: the decoder input is each row without its last id, and the
# target is the same row shifted left by one (without the leading <sos>).
decoder_input = np.array(uniform_int_tokens)[:, :-1]
decoder_target = np.array(uniform_int_tokens)[:, 1:]
print(f'the shape of the decoder target is {np.shape(decoder_target)}')
print(f'the shape of the encoder input is {np.shape(encoder_input)}')
print(f'the shape of the decoder input is {np.shape(decoder_input)}')
# Persist the arrays for the training cell (np.save appends ".npy").
np.save('ADencoder_input', encoder_input)
np.save('ADdecoder_input', decoder_input)
np.save('ADdecoder_target', decoder_target)

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

class TransformerDataset(Dataset):
    """Dataset pairing encoder inputs, decoder inputs and output targets by index.

    Args:
        encoder_input: Indexable collection of encoder inputs.
        decoder_input: Indexable collection of decoder inputs.
        output_target: Indexable collection of training targets.
    """

    def __init__(self, encoder_input, decoder_input, output_target):
        self.encoder_input = encoder_input
        self.decoder_input = decoder_input
        self.output_target = output_target

    def __len__(self):
        """Number of samples, taken from the encoder inputs."""
        return len(self.encoder_input)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of its three parts."""
        sample = dict(
            encoder_input=self.encoder_input[idx],
            decoder_input=self.decoder_input[idx],
            output_target=self.output_target[idx],
        )
        return sample
def create_mask(size):
    """Build a causal (lower-triangular) boolean attention mask.

    Args:
        size: Sequence length.

    Returns:
        Bool tensor of shape ``(size, size)`` where entry ``[i, j]`` is True
        when ``j <= i`` (position i may attend to position j) and False
        otherwise.
    """
    # Keep the diagonal and everything below it.
    lower_triangle = torch.tril(torch.ones(size, size))
    return lower_triangle.to(torch.bool)

# --- Load the arrays written by the preprocessing cell ----------------------
# Token ids were saved with negative special ids (<pad>=-1, <sos>=-2, <eos>=-3);
# adding 3 makes every id non-negative so it can index an nn.Embedding.
# The last row is dropped ([:-1]), presumably so the decoder arrays have the
# same number of rows as the encoder array — TODO confirm.
trg_input = torch.tensor(np.load('ADdecoder_input.npy')[:-1, :] + 3).long()
np_input = np.load('/content/ADencoder_input.npy')
# Reorder the saved encoder array so that the sentence axis comes first.
src_input = torch.tensor(np_input, dtype=torch.float32).transpose(0, 2).transpose(1, 2)
decoder_target = np.load('/content/ADdecoder_target.npy')[:-1, :] + 3
target = torch.tensor(decoder_target, dtype=torch.long)
# NOTE(review): src_input holds float32 values, but Transformer.forward hands
# it to TransformerMLMModel, whose first step is an nn.Embedding lookup that
# requires integer token ids. Confirm the intended encoder input format before
# running this cell.

# Splitting the data
enc_inp_train, enc_inp_val, dec_inp_train, dec_inp_val, out_tar_train, out_tar_val = train_test_split(
    src_input, trg_input, target, test_size=0.2, random_state=42)
# Define batch size
batch_size = 32

# Create training and validation datasets
train_dataset = TransformerDataset(enc_inp_train, dec_inp_train, out_tar_train)
val_dataset = TransformerDataset(enc_inp_val, dec_inp_val, out_tar_val)

# Create DataLoaders (val_loader is built but not consumed in this cell).
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize model, optimizer, and loss function
d_model = 100
heads = 5
vocab_size = 7620
forward_expansion = 4

model = Transformer(d_model, heads, vocab_size, forward_expansion)

# Fine-tuning is meant to update only the decoder, so freeze the pretrained
# encoder. Previously every parameter, encoder included, was handed to the
# optimizer and therefore trained.
for param in model.encoder.parameters():
    param.requires_grad = False

seq_length = 10  # decoder input sequence length
trg_mask = create_mask(seq_length)
optimizer = torch.optim.Adam([p for p in model.parameters() if p.requires_grad], lr=0.001)
# The decoder already returns log-probabilities (log_softmax), so NLLLoss is
# the matching criterion; CrossEntropyLoss would apply log_softmax again.
# NOTE(review): padded target positions (id 2 after the +3 shift) are included
# in both the loss and the accuracy — consider ignore_index for the pad id.
criterion = nn.NLLLoss()

# Train on the new data; only the decoder parameters are updated.
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in tqdm(train_loader):
        encoder_inp = batch['encoder_input']
        decoder_inp = batch['decoder_input']
        targets = batch['output_target']

        optimizer.zero_grad()
        log_probs = model(encoder_inp, decoder_inp, trg_mask=trg_mask)
        # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for the loss.
        loss = criterion(log_probs.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Compute accuracy
        _, predicted = torch.max(log_probs, -1)  # Get the index of the max log-probability
        correct_predictions += (predicted.view(-1) == targets.view(-1)).sum().item()
        total_predictions += targets.numel()

    train_accuracy = correct_predictions / total_predictions
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}, Training Accuracy: {train_accuracy}")

30522


<All keys matched successfully>

In [18]:
# Create a fresh output layer to fine-tune.
# The decoder's output projection maps d_model features to vocab_size scores
# (Decoder.linear = nn.Linear(d_model, vocab_size)), so the replacement must
# take d_model inputs, not forward_expansion * d_model.
new_final_layer = nn.Linear(d_model, vocab_size)  # Adjust vocab_size accordingly

# Swap it in where the model actually uses it. Assigning to a new attribute
# such as model.final_layer would register a layer that forward() never calls,
# so training it would have no effect on the model's predictions.
model.decoder.linear = new_final_layer

# Freeze every parameter of the model...
for param in model.parameters():
    param.requires_grad = False

# ...then unfreeze only the new output layer.
for param in new_final_layer.parameters():
    param.requires_grad = True

shape of input_idstorch.Size([32, 76])
target shape is torch.Size([32, 75])


RuntimeError: Expected target size [32, 30522], got [32, 75]