In [8]:
# %%
import os
import math
import numpy as np
import torch
import torch.nn as nn
from transformer import Transformer
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
from tqdm import tqdm

# %%
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device('mps')
# EXPERIMENT 1:
EMB_DIM = 128
N_LAYERS = 1
N_HEADS = 8
FORWARD_DIM = 512
DROPOUT = 0.05
LEARNING_RATE = 7e-4
BATCH_SIZE = 64
GRAD_CLIP = 1
MAX_LEN = 128 # ????

In [9]:
# Task 0: DataLoader and Preprocessing
class TasksData(Dataset):
    def __init__(self, data_dir, file, transform=None):
        self.data_dir = data_dir
        self.file = file
        text_file = os.path.join(data_dir, file)

        data_dict = {"src": [], "tgt": []}

        with open(text_file, 'r') as f:
            lines = f.readlines()
            for line in lines:
                src = line.split('OUT:')[0]
                src = src.split('IN:')[1].strip()
                tgt = line.split('OUT:')[1].strip()

                data_dict['src'].append(src)
                data_dict['tgt'].append(tgt)

        self.data = pd.DataFrame(data_dict)


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src = self.data['src'].iloc[idx] + ' <EOS>'
        tgt = '<SOS> ' + self.data['tgt'].iloc[idx] + ' <EOS>'
        return src, tgt

def create_vocab(dataset):
    vocab = set()

    for sample in dataset:
        vocab.update(sample.split())
    return vocab

# %%
# creating datasets
train_data = TasksData(data_dir='./data', file='tasks_train_simple.txt')
test_data = TasksData(data_dir='./data', file='tasks_test_simple.txt')

#creating source and target vocab
src_train_data = [src for src, tgt in train_data]
vocab_train_src = create_vocab(src_train_data)

tgt_train_data = [tgt for src, tgt in train_data]
vocab_train_tgt = create_vocab(tgt_train_data)

# we need to do word2idx to map the words to indexes. Bc the input for nn.Embedding has to be numbers
# since nn.Embdding has different weights in input andoutput embedding the same index will not be encoded to the same vector
word2idx_src = {w: idx + 1 for (idx, w) in enumerate(vocab_train_src)}
word2idx_src['<PAD>'] = 0

word2idx_tgt= {w: idx + 1 for (idx, w) in enumerate(vocab_train_tgt)}
word2idx_tgt['<PAD>'] = 0

# We need Vocabulary size without padding
# word2idx
# padding
#vocabulary and word2idx

def custom_collate_fn(batch):
    #input: batch of sentences
    # tokenize, word2idx, pad
    padded_src = pad_sequence([torch.tensor([word2idx_src[w] for w in src.split()]) for src, tgt in batch], batch_first=True, padding_value=0).to(device)
    padded_tgt = pad_sequence([torch.tensor([word2idx_tgt[w] for w in tgt.split()]) for src, tgt in batch], batch_first=True, padding_value=0).to(device)

    return padded_src, padded_tgt

# %%
# create dataloaders
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)

In [12]:
# define the model
model = Transformer(
    src_vocab_size=len(word2idx_src),
    tgt_vocab_size=len(word2idx_tgt),
    src_pad_idx=word2idx_src['<PAD>'],
    tgt_pad_idx=word2idx_tgt['<PAD>'],
    emb_dim=EMB_DIM,
    num_layers=N_LAYERS,
    num_heads=N_HEADS,
    forward_dim=FORWARD_DIM,
    dropout=DROPOUT,
    max_len=MAX_LEN,
).to(device)

## TRAINING

In [11]:
# model = Transformer(
#     src_vocab_size=len(word2idx_src),
#     tgt_vocab_size=len(word2idx_tgt),
#     src_pad_idx=word2idx_src['<PAD>'],
#     tgt_pad_idx=word2idx_tgt['<PAD>'],
#     emb_dim=EMB_DIM,
#     num_layers=N_LAYERS,
#     num_heads=N_HEADS,
#     forward_dim=FORWARD_DIM,
#     dropout=DROPOUT,
#     max_len=MAX_LEN,
# ).to(device)

optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=word2idx_tgt['<PAD>'])

# Training loop
num_epochs = 30
losses = []
accuraacy = []
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for step, (src, tgt) in (pbar := tqdm(enumerate(train_loader), total=len(train_loader))):
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()

        # output = model(src, tgt)
        output = model(src, tgt[:, :-1])
        output_dim = output.shape[-1]

        output = output.contiguous().view(-1, output_dim)
        tgt = tgt[:, 1:].contiguous().view(-1)

        loss = criterion(output, tgt)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        pbar.set_description(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss:.4f}')

        optimizer.step()
        epoch_loss += loss.item()
    avg_epoch_loss = epoch_loss / len(train_loader)
    losses.append(avg_epoch_loss)
checkpoint_path = f"transformer_exp1.pth"
torch.save(
    {'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, checkpoint_path)


  K_transposed = K.T.permute(3, 1, 0, 2)
Epoch [1/30], Loss: 0.6764: 100%|██████████| 262/262 [00:11<00:00, 22.04it/s]
Epoch [2/30], Loss: 0.5681: 100%|██████████| 262/262 [00:11<00:00, 23.24it/s]
Epoch [3/30], Loss: 0.5263: 100%|██████████| 262/262 [00:12<00:00, 21.61it/s]
Epoch [4/30], Loss: 0.4217: 100%|██████████| 262/262 [00:11<00:00, 22.38it/s]
Epoch [5/30], Loss: 0.4353: 100%|██████████| 262/262 [00:11<00:00, 22.92it/s]
Epoch [6/30], Loss: 0.3861: 100%|██████████| 262/262 [00:11<00:00, 22.12it/s]
Epoch [7/30], Loss: 0.2752: 100%|██████████| 262/262 [00:11<00:00, 23.44it/s]
Epoch [8/30], Loss: 0.2896: 100%|██████████| 262/262 [00:11<00:00, 21.89it/s]
Epoch [9/30], Loss: 0.3171:  23%|██▎       | 60/262 [00:02<00:07, 26.40it/s]

In [5]:
# Reverse the word-to-index mapping
idx2word_src = {idx: w for w, idx in word2idx_src.items()}
idx2word_tgt = {idx: w for w, idx in word2idx_tgt.items()}

def decode_indices(indices, idx2word):
    return ' '.join(idx2word[idx] for idx in indices if idx in idx2word and idx != word2idx_src['<PAD>'])


## TOKEN LEVEL ACC

In [5]:
# testing
checkpoint_path = f"transformer_exp1.pth"

model.eval()
with torch.no_grad():
    for src, tgt in test_loader:
        src, tgt = src.to(device), tgt.to(device)

        # output = model(src, tgt)  # Output predictions for target tokens
        output = model.forward(src)
        predictions = output.argmax(dim=-1)  # Get most probable token indices

        # Decode result
        src_indices = src[0].tolist()
        tgt_indices = tgt[0].tolist()
        pred_indices = predictions[0].tolist()
        decoded_src = decode_indices(src_indices, idx2word_src)
        decoded_tgt = decode_indices(tgt_indices, idx2word_tgt)
        decoded_pred = decode_indices(pred_indices, idx2word_tgt)

        print(f"Source    : {decoded_src}")
        print(f"Target    : {decoded_tgt}")
        print(f"Prediction: {decoded_pred}")

        # Exclude padding tokens from evaluation
        mask = (tgt != word2idx_tgt['<PAD>'])
        total_tokens += mask.sum().item()
        correct_tokens += ((predictions == tgt) & mask).sum().item()

accuracy = correct_tokens / total_tokens * 100
print(f"Token-Level Accuracy: {accuracy:.2f}%")

NameError: name 'checkpoint_path' is not defined

## TESTING

In [55]:
checkpoint_path = f"transformer_exp1.pth"
ckp = torch.load(checkpoint_path)
model.load_state_dict(ckp['model_state_dict'])
total_tokens = 0
correct_tokens = 0

def inference(input_seq: str):
    model.eval()
    pass

model.eval()

src, true_tgt = next(iter(test_loader))
print(src.shape)
src = src[0].unsqueeze(0).to(device)
true_tgt = true_tgt[0].unsqueeze(0).to(device)
# src, true_tgt = next(iter(test_loader))[0][0].unsqueeze(0).to(device)

print(src)
tgt = torch.tensor([[word2idx_tgt['<SOS>']]]).to(device)
# tgt = torch.tensor([[word2idx_tgt['<SOS>']]]).to(device)

# print(f'tgt {tgt.shape}')
# print(f'src {src.shape}')
iterations = 20
pred_sequence = [tgt.item()]
# print(pred_sequence)

for i in range(iterations):
    with torch.no_grad():
        # print(tgt)
        output = model.forward(src, tgt)
        predictions = nn.functional.softmax(output[:, -1, :], dim=-1)
        # print("argmax:", predictions.argmax(-1).shape)
        next_token = predictions.argmax(-1).item()

        if next_token == word2idx_tgt['<EOS>']:
            break

        next_token_tensor = torch.tensor([[next_token]]).to(device)
        tgt = torch.cat([tgt, next_token_tensor], dim=1)
        
        # pred_sequence.append(next_token)
        # tgt = torch.tensor(pred_sequence).unsqueeze(0).to(device)
        # print(tgt.shape)
        # print("tgt", tgt)

print(f'ground t: {true_tgt}')
print(f'predicted: {tgt}')


torch.Size([64, 10])
tensor([[ 1,  9,  7,  2,  3,  1,  9, 14,  8,  0]])
ground t: tensor([[5, 8, 8, 8, 8, 8, 8, 7, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])
predicted: tensor([[5, 7, 7, 7, 7, 7, 7, 8]])


  ckp = torch.load(checkpoint_path)
