In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from neural_language_model import ft_embedding
from neural_language_model.preprocess_data import TextDataset

scratch_location = '/scratch/hmnshpl/anlp_data'
filename = 'Auguste_Maquet.txt'
filepath = os.path.join(scratch_location, filename)
glove_file_path = '/scratch/hmnshpl/anlp_data/glove.6B.300d.txt' 

In [2]:
import re

def clean_text(text):
    # Remove special characters and multiple spaces
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Read and clean the data
with open(filepath, 'r', encoding='utf-8') as file:
    corpus = file.read()

cleaned_corpus = clean_text(corpus)

In [3]:
from transformers import GPT2Tokenizer

# Initialize GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add a new [PAD] token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Verify that the pad_token is added
print(f"Padding token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")

# Tokenize the cleaned text
tokens = tokenizer(cleaned_corpus, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
input_ids = tokens['input_ids']



Padding token: [PAD], ID: 50257


In [30]:
class TransformerDecoderOnly(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_decoder_layers, dim_feedforward, dropout):
        super(TransformerDecoderOnly, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 512, d_model))
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.nhead = nhead

    def forward(self, input_ids, tgt_mask=None):
        # Embedding and positional encoding
        embedded = self.embedding(input_ids) + self.positional_encoding[:, :input_ids.size(1), :]
        
        # Generate the mask with the correct shape [Batch Size, Tgt Length, Tgt Length]
        tgt_mask = tgt_mask if tgt_mask is not None else self.generate_square_subsequent_mask(input_ids.size(0), input_ids.size(1)).to(input_ids.device)
        
        # Transformer decoder
        decoder_output = self.transformer_decoder(embedded, embedded, tgt_mask=tgt_mask)
        
        # Output layer
        output = self.fc_out(decoder_output)
        return output
    
    def generate_square_subsequent_mask(self, batch_size, size):
        # Generate a square mask for the sequence. The masked positions are filled with float('-inf').
        # Unmasked positions are filled with float(0.0).
        mask = torch.triu(torch.ones(size, size) * float('-inf'), diagonal=1)
        mask = mask.float().masked_fill(mask == 0, float(0.0))
        # Repeat mask for batch size
        mask = mask.unsqueeze(0).repeat(batch_size, 1, 1)  # Shape: [Batch Size, Tgt Length, Tgt Length]
        return mask

In [5]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, input_ids):
        self.input_ids = input_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx, :-1], self.input_ids[idx, 1:]  # Input and target

# Create the dataset and dataloader
dataset = TextDataset(input_ids)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [31]:
# Model parameters
d_model = 512
nhead = 8
num_decoder_layers = 6
dim_feedforward = 2048
dropout = 0.1
vocab_size = tokenizer.vocab_size

# Initialize the model, loss function, and optimizer
model = TransformerDecoderOnly(vocab_size, d_model, nhead, num_decoder_layers, dim_feedforward, dropout).to('cuda')
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

# # Training loop
# epochs = 5
# for epoch in range(epochs):
#     model.train()
#     total_loss = 0
#     for inputs, targets in dataloader:
#         inputs, targets = inputs.to('cuda'), targets.to('cuda')
#         optimizer.zero_grad()
#         output = model(inputs)
#         loss = criterion(output.view(-1, vocab_size), targets.view(-1))
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader)}")

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for inputs, targets in dataloader:
        inputs, targets = inputs.to('cuda'), targets.to('cuda')
        optimizer.zero_grad()
        
        # Generate the mask for the current batch
        tgt_mask = model.generate_square_subsequent_mask(inputs.size(0), inputs.size(1)).to(inputs.device)
        
        # Forward pass
        output = model(inputs, tgt_mask=tgt_mask)
        
        # Compute loss
        loss = criterion(output.view(-1, vocab_size), targets.view(-1))
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader)}")

RuntimeError: The shape of the 3D attn_mask is torch.Size([1, 511, 511]), but should be (4088, 1, 1).

In [15]:
# 4088//511