In [11]:
# Basic imports
import requests
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForMaskedLM
from transformers import AdamW
from collections import Counter
import math
import random
import numpy as np
import html
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
import string
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
#Daniel's encoder class
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        self.W_q = nn.Linear(embed_size, embed_size, bias=False)
        self.W_k = nn.Linear(embed_size, embed_size, bias=False)
        self.W_v = nn.Linear(embed_size, embed_size, bias=False)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()
        Q = self.W_q(x).view(batch_size, seq_len, self.heads, self.head_dim)
        K = self.W_k(x).view(batch_size, seq_len, self.heads, self.head_dim)
        V = self.W_v(x).view(batch_size, seq_len, self.heads, self.head_dim)

        Q, K, V = Q.transpose(1, 2), K.transpose(1, 2), V.transpose(1, 2)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.head_dim).float())
        scores = F.softmax(scores, dim=-1)
        out = torch.matmul(scores, V)
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
        out = self.fc_out(out)
        return out

class PositionalEncoding(nn.Module):
    def __init__(self, embed_size, max_len=512):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, embed_size)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * -(math.log(10000.0) / embed_size))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0)

    def forward(self, x, use_positional_encoding=True):
        if use_positional_encoding:
            return x + self.encoding[:, :x.size(1)].detach()
        else:
            return x

class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout=0.1):
        super(TransformerBlock, self).__init__()
        # Self-attention layer
        self.attention = SelfAttention(embed_size, heads)
        # Normalization layer 1
        self.norm1 = nn.LayerNorm(embed_size)
        # Feedforward layers
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )
        # Normalization layer 2
        self.norm2 = nn.LayerNorm(embed_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Apply self-attention
        attention = self.attention(x)
        # Add & normalize (residual connection)
        x = self.norm1(x + attention)
        # Apply feedforward layers
        forward = self.feed_forward(x)
        # Add & normalize (residual connection)
        out = self.norm2(x + forward)
        out = self.dropout(out)
        return out

class TransformerMLMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, max_len, num_heads, forward_expansion, num_layers, dropout):
        super(TransformerMLMModel, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = PositionalEncoding(embed_size, max_len)

        self.layers = nn.ModuleList([
            TransformerBlock(embed_size, num_heads, forward_expansion, dropout)
            for _ in range(num_layers)
        ])

        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        x = self.token_embedding(x)
        x = self.positional_encoding(x)

        for layer in self.layers:
            x = layer(x)

        x = self.fc_out(x)
        return x

In [13]:
#use Ben Gutenberg dataset for training in fine tuning
with open('/content/BGLLM_1.txt', "r", encoding="utf-8") as file:
        text = file.read()
        data = html.unescape(text)
        lines = text.split('\n')
print(lines[0])

﻿The Project Gutenberg eBook of The Bird Book


In [14]:
#I will be using Daniel's preprocessing steps for consistenciey
def clean_text(text_list):
    cleaned_text = []
    for line in text_list:
        # Remove leading and trailing whitespaces
        line = line.strip()
        # Remove punctuation and weird symbols
        line = re.sub(r'[^a-zA-Z\s]', '', line)
        # Convert to lowercase
        line = line.lower()
        # If the line is not empty after cleaning, add it to the cleaned text
        if line:
            cleaned_text.append(line)
    return cleaned_text

#clean text and tokenize using BertTokenizer
cleaned_text = clean_text(lines)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenized_texts = tokenizer(cleaned_text, return_tensors="pt", padding=True, truncation=True)




In [15]:
# Define your EnglishTextDataset, which pads to the max length and converts to tokens to ids
class EnglishTextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tokens = self.tokenizer.tokenize(text)
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # Pad or truncate to max_len
        token_ids = token_ids[:self.max_len]
        padding_length = self.max_len - len(token_ids)
        token_ids += [0] * padding_length
        return torch.tensor(token_ids)

max_len = max(len(line) for line in cleaned_text)
dataset = EnglishTextDataset(cleaned_text, tokenizer, max_len)

In [24]:
#define variables to instantiate model
vocab_size = tokenizer.vocab_size
print(vocab_size)
d_model = 512
max_len = 100
num_heads = 8
forward_expansion = 4
num_layers = 6
drop_prob = 0.1
batch_size = 32
num_epochs = 2
learning_rate = 1e-4
mask_prob = 0.15

#create model class
model = TransformerMLMModel(vocab_size, d_model, max_len, num_heads, forward_expansion, num_layers, drop_prob)
#load parameters from Daniel's model
model.load_state_dict(torch.load('/content/drive/MyDrive/Dhar Aamina LLM Assignments/Copy of transformer_mlm_model.pth'))

30522


<All keys matched successfully>

In [18]:
#create new final layer to train
new_final_layer = nn.Linear(forward_expansion*d_model, vocab_size)  # Adjust vocabulary_size accordingly

# Replace the final layer of the model with the new one
model.final_layer = new_final_layer
#Freeze parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the parameters of the final layer
for param in model.final_layer.parameters():
    param.requires_grad = True

In [21]:
# Define your optimizer and loss function
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create your data loader
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

#create new training loop for next word prediction--I am getting some error here
for epoch in range(num_epochs):
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch.to(device)  # Send input to device if using GPU
        print(f'shape of input_ids{np.shape(input_ids)}')
        # Forward pass
        outputs = model(input_ids[:, :-1])  # Predict next token for each position except the last one
        targets = input_ids[:, 1:]  # Shifted target tokens
        print(f'target shape is {np.shape(targets)}')
        loss = criterion(outputs, targets)
        # Backward pass
        loss.backward()
        optimizer.step()

shape of input_idstorch.Size([32, 76])
target shape is torch.Size([32, 75])


RuntimeError: Expected target size [32, 30522], got [32, 75]