# Dependencies

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import math
import numpy as np
import torch
import torch.nn as nn
from drive.MyDrive.ATNLP.crispy_fortnight_exp3.transformer import Transformer
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
from drive.MyDrive.ATNLP.crispy_fortnight_exp3.accuracy import sequence_level_accuracy, token_lvl_accuracy

# %%
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on {device}")

# Experiment 2 & 3 Hyperparameters
EMB_DIM = 128
N_LAYERS = 2
N_HEADS = 8
FORWARD_DIM = 256
DROPOUT = 0.15
LEARNING_RATE = 2e-4
GRAD_CLIP = 1
BATCH_SIZE = 16
MAX_LEN = 128
# Optimizer: AdamW

Running on cuda


# DataLoader

In [3]:
class TasksData(Dataset):
    def __init__(self, data_dir, file, transform=None):
        self.data_dir = data_dir
        self.file = file
        text_file = os.path.join(data_dir, file)

        data_dict = {"src": [], "tgt": []}

        with open(text_file, 'r') as f:
            lines = f.readlines()
            for line in lines:
                src = line.split('OUT:')[0]
                src = src.split('IN:')[1].strip()
                tgt = line.split('OUT:')[1].strip()

                data_dict['src'].append(src)
                data_dict['tgt'].append(tgt)

        self.data = pd.DataFrame(data_dict)


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src = self.data['src'].iloc[idx] + ' <EOS>'
        tgt = '<SOS> ' + self.data['tgt'].iloc[idx] + ' <EOS>'
        return src, tgt

def create_vocab(dataset):
    vocab = set()

    for sample in dataset:
        vocab.update(sample.split())
    return vocab

# Training, saving, etc

In [11]:
# composed_commands = [0] # [, 1, 2, 4, 8, 16, 32]
composed_commands = 1
rep_number = 2

exp_losses = []
exp_accuracies = []
exp_times = []
token_acc_results = []
seq_scc_results = []
num_epochs = 10

print(f"EXPERIMENT {composed_commands}")
train_data = TasksData(data_dir='/content/drive/MyDrive/ATNLP/data/Experiment-3/data/1_composed_command/train', file=f'tasks_train_addprim_complex_jump_num{composed_commands}_rep{rep_number}.txt')
test_data = TasksData(data_dir='/content/drive/MyDrive/ATNLP/data/Experiment-3/data/1_composed_command/test', file=f'tasks_test_addprim_complex_jump_num{composed_commands}_rep{rep_number}.txt')

# creating source and target vocab - and word2idx
src_train_data = [src for src, tgt in train_data]
vocab_train_src = create_vocab(src_train_data)
tgt_train_data = [tgt for src, tgt in train_data]
vocab_train_tgt = create_vocab(tgt_train_data)
word2idx_src = {w: idx + 1 for (idx, w) in enumerate(vocab_train_src)}
word2idx_src['<PAD>'] = 0
word2idx_tgt= {w: idx + 1 for (idx, w) in enumerate(vocab_train_tgt)}
word2idx_tgt['<PAD>'] = 0

# custom collate function
def custom_collate_fn(batch):
    padded_src = pad_sequence([torch.tensor([word2idx_src[w] for w in src.split()]) for src, tgt in batch], batch_first=True, padding_value=0).to(device)
    padded_tgt = pad_sequence([torch.tensor([word2idx_tgt[w] for w in tgt.split()]) for src, tgt in batch], batch_first=True, padding_value=0).to(device)
    return padded_src, padded_tgt

# create dataloaders
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)


# define the model
model = Transformer(
    src_vocab_size=len(word2idx_src),
    tgt_vocab_size=len(word2idx_tgt),
    src_pad_idx=word2idx_src['<PAD>'],
    tgt_pad_idx=word2idx_tgt['<PAD>'],
    emb_dim=EMB_DIM,
    num_layers=N_LAYERS,
    num_heads=N_HEADS,
    forward_dim=FORWARD_DIM,
    dropout=DROPOUT,
    max_len=MAX_LEN,
).to(device)


# define the optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=word2idx_tgt['<PAD>'])

losses = []
accuracies = []

# training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for step, (src, tgt) in (pbar := tqdm(enumerate(train_loader), total=len(train_loader))):
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()

        # inference
        output = model(src, tgt[:, :-1])
        output_dim = output.shape[-1]

        # some sexy reshaping
        output = output.contiguous().view(-1, output_dim)
        tgt = tgt[:, 1:].contiguous().view(-1)

        # loss calculation + backward pass
        loss = criterion(output, tgt)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        pbar.set_description(f'Composed {composed_commands} commands, Epoch [{epoch+1}/{num_epochs}], Loss: {loss:.4f}')

        # optimizer step and loss accumulation
        optimizer.step()
        epoch_loss += loss.item()

    # after one epoch
    avg_epoch_loss = epoch_loss / len(train_loader)
    losses.append(avg_epoch_loss)
    print(f'Composed {composed_commands} commands, Epoch [{epoch+1}/{num_epochs}], Loss: {avg_epoch_loss:.4f}')

# save the model
checkpoint_path = f"exp3_jump_num{composed_commands}_epoch{num_epochs}_v2.pth"
torch.save(
    {'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'losses': losses
}, checkpoint_path)



EXPERIMENT 1


Exp 1, Epoch [1/10], Loss: 0.4671: 100%|██████████| 917/917 [00:18<00:00, 48.34it/s]


Exp 1, Epoch [1/10], Loss: 0.7237


Exp 1, Epoch [2/10], Loss: 0.4454: 100%|██████████| 917/917 [00:18<00:00, 49.16it/s]


Exp 1, Epoch [2/10], Loss: 0.4230


Exp 1, Epoch [3/10], Loss: 0.2546: 100%|██████████| 917/917 [00:18<00:00, 49.63it/s]


Exp 1, Epoch [3/10], Loss: 0.3309


Exp 1, Epoch [4/10], Loss: 0.1396: 100%|██████████| 917/917 [00:18<00:00, 50.21it/s]


Exp 1, Epoch [4/10], Loss: 0.2103


Exp 1, Epoch [5/10], Loss: 0.1241: 100%|██████████| 917/917 [00:18<00:00, 49.97it/s]


Exp 1, Epoch [5/10], Loss: 0.1363


Exp 1, Epoch [6/10], Loss: 0.1796: 100%|██████████| 917/917 [00:18<00:00, 50.12it/s]


Exp 1, Epoch [6/10], Loss: 0.1014


Exp 1, Epoch [7/10], Loss: 0.0572: 100%|██████████| 917/917 [00:18<00:00, 49.97it/s]


Exp 1, Epoch [7/10], Loss: 0.0775


Exp 1, Epoch [8/10], Loss: 0.0639: 100%|██████████| 917/917 [00:18<00:00, 50.28it/s]


Exp 1, Epoch [8/10], Loss: 0.0627


Exp 1, Epoch [9/10], Loss: 0.0294: 100%|██████████| 917/917 [00:18<00:00, 49.33it/s]


Exp 1, Epoch [9/10], Loss: 0.0494


Exp 1, Epoch [10/10], Loss: 0.0294: 100%|██████████| 917/917 [00:18<00:00, 49.46it/s]


Exp 1, Epoch [10/10], Loss: 0.0417


In [12]:
token_tmp = []
seq_tmp = []
MAX_STEP = len(test_loader) # // 5

for step, (src_batch, tgt_batch) in (pbar := tqdm(enumerate(test_loader), total=len(test_loader))):
    if step > MAX_STEP:
        break
    for src, tgt in zip(src_batch, tgt_batch):
        src = src.unsqueeze(0).to(device)
        true_tgt = tgt.unsqueeze(0).to(device)
        tgt = torch.tensor([[word2idx_tgt['<SOS>']]]).to(device)

        iterations = MAX_LEN - 1
        pred_sequence = [tgt.item()]

        for i in range(iterations):
            with torch.no_grad():
                output = model.forward(src, tgt)
                predictions = nn.functional.softmax(output[:, -1, :], dim=-1)
                next_token = predictions.argmax(-1).item()

                pred_sequence.append(next_token)
                tgt = torch.tensor(pred_sequence).unsqueeze(0).to(device)

                # stop if end of sequence
                if next_token == word2idx_tgt['<EOS>']:
                    break

        token_acc = token_lvl_accuracy(word2idx_tgt, true_tgt, tgt)
        seq_acc = sequence_level_accuracy(true_tgt, tgt, word2idx_tgt)
        token_tmp.append(token_acc)
        seq_tmp.append(seq_acc)
        # print(f'ground t: {true_tgt}')
        # print(f'predicted: {tgt}')
        # print(m)
        # print()
    # l += 1

    pbar.set_description(f'Composed {composed_commands} commands{"\n"}Token Acc: {sum(token_tmp)/len(token_tmp):.4f}, Seq Acc: {sum(seq_tmp)/len(seq_tmp):.4f}')
print("")
print(sum(token_tmp)/len(token_tmp))
print(sum(seq_tmp)/len(seq_tmp))

token_acc_results.append(token_tmp)
seq_scc_results.append(seq_tmp)

Exp 1, Token Acc: 0.6203, Seq Acc: 0.0297: 100%|██████████| 482/482 [09:47<00:00,  1.22s/it]


0.6202677628171834
0.02972096041531473



