In [52]:


# !pip install -q torchdata==0.3.0 torchtext==0.12 spacy==3.2 altair GPUtil
# !python -m spacy download de_core_news_sm
# !python -m spacy download en_core_web_sm

In [53]:
# !pip uninstall -y wandb protobuf

# # 2. Install specific versions known to work together
# !pip install protobuf==3.20.0
# !pip install wandb==0.15.5

In [54]:
import os
from os.path import exists
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

import numpy as np
# Set to False to skip notebook execution (e.g. for debugging)
warnings.filterwarnings("ignore")
RUN_EXAMPLES = True


In [55]:
class EncoderDecoder(nn.Module):
    def __init__(self,encoder,decoder,src_embed,tgt_embed,generator):
        super(EncoderDecoder,self).__init__()
        self.encoder = encoder 
        self.decoder = decoder 
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed 
        self.generator = generator 
        
    def forward(self,src,tgt,src_mask,tgt_mask):
        return self.generator(self.decode(self.encode(src,src_mask), src_mask,tgt,tgt_mask))
        
    def encode(self,src,src_mask):
        return self.encoder(self.src_embed(src), src_mask)
        
    def decode(self,memory,src_mask,tgt,tgt_mask):
        return self.decoder(self.tgt_embed(tgt),memory,src_mask,tgt_mask)
      

In [56]:
class Generator(nn.Module):
    def __init__(self,model_size,vocab):
        super(Generator,self).__init__()
        self.proj = nn.Linear(model_size,vocab)
        
    def forward(self,x):
        return F.log_softmax(self.proj(x), dim=-1)
        

In [57]:
def clones (module,N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [58]:
class LayerNorm(nn.Module):
    def __init__(self,features,eps=1e-6):
        super().__init__()
        self.a = nn.Parameter(torch.ones(features))
        self.b = nn.Parameter(torch.zeros(features))
        self.eps=eps
        
    def forward(self,x):
        mean=x.mean(-1,keepdim=True)
        std=x.std(-1,keepdim=True)
        return self.a*(x-mean)/(std+self.eps)+self.b

In [59]:
class Encoder(nn.Module):
    def __init__(self,layer,N):
        super().__init__()
        self.layers = clones(layer,N)
        self.norm = LayerNorm(layer.size)
    def forward(self,x,mask):
        for layer in self.layers:
            x=layer(x,mask)
        return self.norm(x)
    

In [60]:
class SubLayerConection(nn.Module):
    def __init__(self,features,dropout):
        super().__init__()
        self.norm = LayerNorm(features)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x,sublayer):
        return x + self.dropout(sublayer(self.norm(x)))
        
        

In [61]:
class EncoderLayer(nn.Module):
    def __init__(self,size,self_attn,feed_forward,dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward 
        self.dropout = dropout 
        self.size = size 

        self.sublayersconections = clones(SubLayerConection(size,dropout),2)

    def forward(self,x,mask):
        x = self.sublayersconections[0](x,lambda x:self.self_attn(x,x,x,mask))
        x = self.sublayersconections[1](x,self.feed_forward)
        return x

In [62]:
class Decoder(nn.Module):
    def __init__(self,layer,N):
        super().__init__()
        self.layers = clones(layer,N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self,x,memory,src_mask,tgt_mask):
        for layer in self.layers:
            x = layer(x,memory,src_mask,tgt_mask)
        return self.norm(x)
            


In [63]:
def subsequent_mask(size):
    mask_size=(1,size,size)
    mask=np.triu(np.ones(mask_size),k=1).astype('uint8')
    mask = torch.from_numpy(mask==0)
    return mask

In [64]:
class DecoderLayer(nn.Module):
    def __init__(self,size,self_attn,cross_attn,feed_forward,dropout):
        super().__init__()
        self.self_attn = self_attn
        self.cross_attn = cross_attn
        self.feed_forward = feed_forward
        self.size = size
        self.dropout = dropout
        self.subs = clones(SubLayerConection(size,dropout),3)
    def forward(self,x,memory,src_mask,tgt_mask):
        m = memory
        x=self.subs[0](x,lambda x:self.self_attn(x,x,x,tgt_mask))
        x=self.subs[1](x,lambda x:self.cross_attn(x,m,m,src_mask))
        x=self.subs[2](x,self.feed_forward)
        return x

In [65]:
subsequent_mask(4)[0]

tensor([[ True, False, False, False],
        [ True,  True, False, False],
        [ True,  True,  True, False],
        [ True,  True,  True,  True]])

In [66]:
subsequent_mask(4)[0][0][1]

tensor(False)

In [67]:
  data =    pd.DataFrame([
                
                {
                    "Subsequent Mask": subsequent_mask(20)[0][x, y].item(),
                    "Window": y,
                    "Masking": x,
                }
            
            for y in range(20)
            for x in range(20)
                       
                         ]
                        )
  

In [68]:
data

Unnamed: 0,Subsequent Mask,Window,Masking
0,True,0,0
1,True,0,1
2,True,0,2
3,True,0,3
4,True,0,4
...,...,...,...
395,False,19,15
396,False,19,16
397,False,19,17
398,False,19,18


In [69]:
chart=alt.Chart(data).mark_rect().properties(height=250, width=250).encode(
x=alt.X("Window:O"),
y=alt.Y("Masking:O"),
color=alt.Color("Subsequent Mask:Q", scale=alt.Scale(scheme="viridis"))).interactive()
chart

In [70]:
def Attention(query,key,value,mask=None,dropout=None):
    size = query.size(-1)
    
    scores = torch.matmul(query,key.transpose(-2,-1))/math.sqrt(size)
    if mask is not None:
        mask = mask.unsqueeze(1) if mask.dim() == 3 else mask
        scores = scores.masked_fill(mask==0,float('-inf'))
    p_atten = scores.softmax(dim=-1)
    if dropout is not None:
        p_atten=dropout(p_atten)
    
    x = torch.matmul(p_atten,value)
        
    return x ,p_atten


In [71]:
class MultiHeadedAttention(nn.Module):
    def __init__(self,head,model_size,dropout=0.1):
        super().__init__()

        assert model_size%head==0
        self.dk = model_size//head
        self.linears = clones(nn.Linear(model_size,model_size),4)
        self.dropout = nn.Dropout(p=dropout)
        self.head = head
    def forward(self,query,key,value,mask=None):
        n_batches=query.size(0)
    
        if mask is not None:
            mask=mask.unsqueeze(1)
        
        query,key,value =[
            linear(x).view(n_batches,-1,self.head,self.dk).transpose(1,2)
        
            for linear,x in zip(self.linears,(query,key,value))
            ]
        x, attention_weights = Attention(query,key,value,mask=mask,dropout=self.dropout)

        x = x.transpose(1,2).contiguous().view(n_batches,-1,self.head*self.dk)
        del query
        del key 
        del value 
        return self.linears[-1](x)
    
    
    
    

In [72]:
class PositionalEncoding(nn.Module):
    def __init__(self,model_size,dropout,max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        pe = torch.zeros(max_len,model_size)
        pos = torch.arange(0,max_len).unsqueeze(1)
        i=torch.arange(0,model_size,2).float()

        div_term = torch.exp(-i*math.log(10000)/model_size)
        
        pe[:,0::2] = torch.sin(pos * div_term)
        pe[:,1::2] = torch.cos(pos * div_term)
        
        pe=pe.unsqueeze(0)
        self.register_buffer("pe",pe)
        
    def forward(self,x):
        
        return self.dropout(x+self.pe[:,:x.size(1),:])
        

In [73]:
def example_positional():
    pe = PositionalEncoding(20, 0)
    y = pe.forward(torch.zeros(1, 100, 20))

    data = pd.concat(
        [
            pd.DataFrame(
                {
                    "embedding": y[0, :, dim],
                    "dimension": dim,
                    "position": list(range(100)),
                }
            )
            for dim in [4, 5, 6, 7]
        ]
    )

    return (
        alt.Chart(data)
        .mark_line()
        .properties(width=800)
        .encode(x="position", y="embedding", color="dimension:N")
        .interactive()
    )


example_positional()

In [74]:
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

In [75]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

In [76]:
def make_model(src_vocab, tgt_vocab, N=6, 
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), 
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))
    
    # This was important from their code. 
    
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform(p)
    return model

In [77]:
# transformer = make_model(10,10)

In [78]:
saved_vocab = torch.load('/kaggle/input/vocabs/vocabularies.pth')
src_vocab = saved_vocab['src_vocab']
tgt_vocab = saved_vocab['tgt_vocab']

In [79]:
tgt_vocab.get_itos()[10]

','

In [80]:
# from torchtext.datasets import AG_NEWS

# # Download data
# train_iter = AG_NEWS(split='train')
# test_iter = AG_NEWS(split='test')


# for label, text in train_iter:
#     print("Label:", label)  # Category number
#     print("Text:", text)    # News text
#     break

In [81]:
from datasets import load_dataset

# Load German-English dataset
dataset = load_dataset("helsinki-nlp/tatoeba_mt", language_pair="deu-eng",trust_remote_code=True)

README.md:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

tatoeba_mt.py:   0%|          | 0.00/15.5k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

tatoeba-test.deu-eng.tsv:   0%|          | 0.00/1.60M [00:00<?, ?B/s]

tatoeba-dev.deu-eng.tsv:   0%|          | 0.00/25.1M [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [82]:
dataset

DatasetDict({
    test: Dataset({
        features: ['sourceLang', 'targetlang', 'sourceString', 'targetString'],
        num_rows: 17564
    })
    validation: Dataset({
        features: ['sourceLang', 'targetlang', 'sourceString', 'targetString'],
        num_rows: 289748
    })
})

In [83]:
# Check dataset structure
print(dataset)

# Look at a sample
for sample in dataset['test']:
    print("Source Language:", sample['sourceLang'])
    print("Target Language:", sample['targetlang'])
    print("Source String:", sample['sourceString'])
    print("Target String:", sample['targetString'])
    break

# Set up tokenizers
import spacy
de_tokenizer = spacy.load('de_core_news_sm')
en_tokenizer = spacy.load('en_core_web_sm')

# Test tokenization
source_text = sample['sourceString']
target_text = sample['targetString']

print("\nTokenized Source:", [token.text for token in de_tokenizer(source_text)])
print("Tokenized Target:", [token.text for token in en_tokenizer(target_text)])

DatasetDict({
    test: Dataset({
        features: ['sourceLang', 'targetlang', 'sourceString', 'targetString'],
        num_rows: 17564
    })
    validation: Dataset({
        features: ['sourceLang', 'targetlang', 'sourceString', 'targetString'],
        num_rows: 289748
    })
})
Source Language: deu
Target Language: eng
Source String: 1960 wurden die Kriegsschäden beseitigt, und das Schloss wurde zu einem Hotel umgebaut.
Target String: In 1960, the collateral had been removed and the castle was refurbished into a hotel.

Tokenized Source: ['1960', 'wurden', 'die', 'Kriegsschäden', 'beseitigt', ',', 'und', 'das', 'Schloss', 'wurde', 'zu', 'einem', 'Hotel', 'umgebaut', '.']
Tokenized Target: ['In', '1960', ',', 'the', 'collateral', 'had', 'been', 'removed', 'and', 'the', 'castle', 'was', 'refurbished', 'into', 'a', 'hotel', '.']


In [84]:
from torch.utils.data import Dataset

class VocabDataset(Dataset):
    def __init__(self, dataset, src_tokenizer, tgt_tokenizer):
        self.dataset = dataset
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        # Get sample
        sample = self.dataset[idx]
        
        # Get source and target texts
        src_text = sample['sourceString']
        tgt_text = sample['targetString']
        
        # Tokenize
        src_tokens = [token.text for token in self.src_tokenizer(src_text)]
        tgt_tokens = [token.text for token in self.tgt_tokenizer(tgt_text)]


        
        return {
            'src': src_tokens,
            'tgt': tgt_tokens
        }
subset_size = 50000
train_subset = dataset['validation'].select(range(5000))



# Create datasets
vocab_dataset  = VocabDataset(
    train_subset,  # Using validation as train for now
    de_tokenizer,
    en_tokenizer
)



# Test it
sample = vocab_dataset[0]
print("Source tokens:", sample['src'])
print("Target tokens:", sample['tgt'])

Source tokens: ['10', 'Jahre', 'sind', 'eine', 'lange', 'Zeit', 'zum', 'Warten', '.']
Target tokens: ['Ten', 'years', 'is', 'a', 'long', 'time', 'to', 'wait', '.']


In [85]:
len(vocab_dataset)

5000

In [86]:
# from torchtext.vocab import build_vocab_from_iterator
# from tqdm import tqdm
# # Token yield functions
# def yield_tokens_src(dataset):
#     for i in tqdm(range(len(dataset)), desc="Building source vocab"):
#         yield dataset[i]['src']

# def yield_tokens_tgt(dataset):
#     for i in tqdm(range(len(dataset)),desc = "Building target vocab"):
#         yield dataset[i]['tgt']

# # Special tokens
# special_tokens = ['<pad>', '<sos>', '<eos>', '<unk>']

# # Build vocabularies
# src_vocab = build_vocab_from_iterator(
#     yield_tokens_src(vocab_dataset),
#     min_freq=2,
#     specials=special_tokens,
#     special_first=True
# )

# tgt_vocab = build_vocab_from_iterator(
#     yield_tokens_tgt(vocab_dataset),
#     min_freq=2,
#     specials=special_tokens,
#     special_first=True
# )

# # Set UNK index
# src_vocab.set_default_index(src_vocab['<unk>'])
# tgt_vocab.set_default_index(tgt_vocab['<unk>'])

# # Test vocabularies
# print("Source vocab size:", len(src_vocab))
# print("Target vocab size:", len(tgt_vocab))

# # Test conversion
# sample = vocab_dataset[0]
# print("\nSource tokens:", sample['src'])
# print("Source indices:", [src_vocab[token] for token in sample['src']])


In [87]:
# # Save vocabularies
# import torch
# torch.save({
#     'src_vocab': src_vocab,
#     'tgt_vocab': tgt_vocab
# }, 'vocabularies.pth')

# Load vocabularies later
saved_vocab = torch.load('/kaggle/input/vocabs/vocabularies.pth')
src_vocab = saved_vocab['src_vocab']
tgt_vocab = saved_vocab['tgt_vocab']

In [88]:
saved_vocab

{'src_vocab': Vocab(), 'tgt_vocab': Vocab()}

In [89]:
# [tgt_vocab[token] for token in sample['tgt']]

In [90]:
from torch.utils.data import Dataset
from tqdm import tqdm
class TranslationDataset(Dataset):
    def __init__(self, dataset, src_tokenizer, tgt_tokenizer,src_vocab, tgt_vocab,max_len=512):
        self.dataset = dataset
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len=max_len
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        # Get sample
        sample = self.dataset[idx]
        
        # Get source and target texts
        src_text = sample['sourceString']
        tgt_text = sample['targetString']
        
        # Tokenize
        src_indices = [self.src_vocab[token.text] for token in self.src_tokenizer(src_text)]
        tgt_indices = [self.tgt_vocab[token.text] for token in self.tgt_tokenizer(tgt_text)]
        

        src_indices = src_indices[:self.max_len-1] + [self.src_vocab['<eos>']]
        tgt_indices = [self.tgt_vocab['<sos>']] + tgt_indices[:self.max_len-2] + [self.tgt_vocab['<eos>']]

        src_indices =src_indices +(self.max_len-len(src_indices)) * [self.src_vocab['<pad>']]
        tgt_indices = tgt_indices +(self.max_len-len(tgt_indices)) * [self.tgt_vocab['<pad>']]
        return {
            'src': torch.tensor(src_indices),
            'tgt': torch.tensor(tgt_indices)
        }

# Create datasets
train_dataset = TranslationDataset(
    dataset['validation'],  # Using validation as train for now
    de_tokenizer,
    en_tokenizer,
    src_vocab,
    tgt_vocab
)


# Test it
sample = train_dataset[0]
print("Source tokens:", sample['src'])
print("Target tokens:", sample['tgt'])

# Convert back to tokens to verify
print("\nSource tokens:", [src_vocab.get_itos()[idx.item()] for idx in sample['src']])
print("Target tokens:", [tgt_vocab.get_itos()[idx.item()] for idx in sample['tgt']])

Source tokens: tensor([1316,  210,   32,   25,  312,  125,   84, 6507,    4,    2,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,

In [91]:

train_loader = DataLoader(
    train_dataset,
    batch_size=24,
    shuffle=True
)

# Test the loader
# for batch in train_loader:
#     print("Source shape:", batch['src'].shape)  # Should be [batch_size, max_len]
#     print("Target shape:", batch['tgt'].shape)
#     break

# for batch in train_loader:
#     src_tokens=[[src_vocab.get_itos()[idx]for idx in seq]for seq in batch['src'][:2]]
#     print("\nFirst two source sequences:")
#     print(src_tokens)
#     tgt_tokens=[[tgt_vocab.get_itos()[idx]for idx in seq]for seq in batch['tgt'][:2]]
#     print("\nFirst two source sequences:")
#     print(tgt_tokens)
#     braek
    

In [92]:
# # 1. Move model to CPU first
# model = model.cpu()

# # 2. Delete the model
# del model

# # 3. Clear GPU cache
# torch.cuda.empty_cache()

# # 4. Additional cleanup
# import gc
# gc.collect()

# # Check GPU memory
# import GPUtil
# GPUtil.showUtilization()

In [93]:
# import GPUtil
# GPUtil.showUtilization()

# # Or more detailed info
# print(torch.cuda.memory_summary())

In [94]:
# # At the end of your training
# wandb.finish()

In [95]:

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [96]:
d_model=256
n_heads=8
N=6
model = make_model(
    src_vocab=len(src_vocab),
    tgt_vocab=len(tgt_vocab),
    N=N,
    d_model=d_model,  
    d_ff=1024,    
    h=n_heads,
    dropout=0.1
)

# 4. Move model to GPU and set up training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

n_epochs = 5   
learning_rate = 0.0001  

# 3. Loss and Optimizer
criterion = nn.NLLLoss(ignore_index=tgt_vocab['<pad>'])
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

# 4. Learning Rate Scheduler (optional but helpful)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=2, verbose=True
)




In [97]:
import wandb

In [98]:
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_key")

wandb.login(key=secret_value_0)




True

In [99]:


batch_size=28
accumulation_steps=4
import wandb
# wandb.login(key='5160da7d6632bf129ee82d18c94a11388dc6bcb6')
wandb.init(
    project="transformer-translation(subset_train)",
    config={
        "batch_size": batch_size,
        "accumulation_steps": accumulation_steps,
        "learning_rate": learning_rate,
        "d_model": d_model,
        "n_heads": n_heads,
        "n_layers": N,
    }
)



In [100]:
from torch.cuda.amp import autocast, GradScaler
accumulation_steps = 4
scaler = GradScaler()
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    global_step = 0
    
    for i, batch in enumerate(tqdm(dataloader)):
        try:
            # Move to device
            src = batch['src'].to(device)
            tgt = batch['tgt'].to(device)
            
            # Create masks
            src_mask = (src != src_vocab['<pad>']).unsqueeze(-2)
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            
            tgt_pad_mask = (tgt_input != tgt_vocab['<pad>']).unsqueeze(-2)
            tgt_sub_mask = subsequent_mask(tgt_input.size(-1)).to(device)
            tgt_mask = tgt_pad_mask & tgt_sub_mask
            
            # Forward pass with mixed precision
            with autocast():
                output = model(src, tgt_input, src_mask, tgt_mask)
                loss = criterion(
                    output.contiguous().view(-1, output.size(-1)),
                    tgt_output.contiguous().view(-1)
                )
                loss = loss / accumulation_steps

            # Log to wandb
            wandb.log({
                "batch_loss": loss.item() * accumulation_steps,
                "global_step": global_step
            })
            global_step += 1
            
            # Backward pass with scaled gradients
            scaler.scale(loss).backward()
            
            # Update weights every accumulation_steps
            if (i + 1) % accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
            
            total_loss += loss.item() * accumulation_steps
            
        except RuntimeError as e:
            print("Error in batch!")
            print(f"Source shape: {src.shape}")
            print(f"Target input shape: {tgt_input.shape}")
            print(f"Target output shape: {tgt_output.shape}")
            print(f"Output shape: {output.shape}")
            raise e
    
    # Handle remaining gradients
    if (i + 1) % accumulation_steps != 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
            
    return total_loss / len(dataloader)

In [101]:
# batch_size = 24 # or 16 if 32 is too much
# train_loader = DataLoader(
#     train_dataset,
#     batch_size=batch_size,
#     shuffle=True
# )

In [102]:
# best_loss = float('inf')
# for epoch in range(n_epochs):
#     loss = train_epoch(model, train_loader, optimizer, criterion, device)
    
#     # Log epoch metrics
#     wandb.log({
#         "epoch": epoch,
#         "epoch_loss": loss,
#     })
    
#     # Save checkpoint
#     if loss < best_loss:
#         best_loss = loss
#         torch.save({
#             'epoch': epoch,
#             'model_state_dict': model.state_dict(),
#             'optimizer_state_dict': optimizer.state_dict(),
#             'loss': loss,
#         }, 'best_model.pt')
#         # Save to wandb
#         wandb.save('best_model.pt')
    
#     print(f'Epoch: {epoch+1}, Loss: {loss:.4f}')

# # Finish wandb run
# wandb.finish()

In [103]:
checkpoint = torch.load('/kaggle/input/lastcheckpoint/best_model.pt')


In [104]:
checkpoint['loss']

2.2627141343519375

continue train on subset of the data

In [106]:

train_dataset_subset = TranslationDataset(
    train_subset, 
    de_tokenizer, 
    en_tokenizer, 
    src_vocab, 
    tgt_vocab
)

# New dataloader
train_loader_subset = DataLoader(
    train_dataset_subset,
    batch_size=28,
    shuffle=True
)

In [107]:

start_epoch = checkpoint['epoch']
start_epoch 
checkpoint['loss']

2.2627141343519375

In [108]:
checkpoint = torch.load('/kaggle/input/lastcheckpoint/best_model.pt')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch']

best_loss=checkpoint['loss']
best_loss


2.2627141343519375

In [58]:

# # Continue training
# for epoch in range(start_epoch+1, n_epochs):
#     loss = train_epoch(model, train_loader_subset, optimizer, criterion, device)
    
#     wandb.log({
#         "epoch": epoch,
#         "epoch_loss": loss,
#     })
#     torch.save({
#         'epoch': epoch,
#         'model_state_dict': model.state_dict(),
#         'optimizer_state_dict': optimizer.state_dict(),
#         'loss': loss,
#     }, f'checkpoint_epoch_{epoch+1}.pt')
#     wandb.save(f'checkpoint_epoch_{epoch+1}.pt')
#     # Save checkpoint
#     if loss < best_loss:
#         best_loss = loss
#         torch.save({
#             'epoch': epoch,
#             'model_state_dict': model.state_dict(),
#             'optimizer_state_dict': optimizer.state_dict(),
#             'loss': loss,
#         }, 'best_model.pt')
#         # Save to wandb
#         wandb.save('best_model.pt')
    
#     print(f'Epoch: {epoch+1}, Loss: {loss:.4f}')

# # Finish wandb run
# wandb.finish()

In [59]:

# !mkdir -p /kaggle/working/transformer_dataset


# !cp '/kaggle/input/modelcheckpoint2/best_model(1 epoch).pt' '/kaggle/working/transformer_dataset/best_model.pt'
# !cp /kaggle/input/vocabs/vocabularies.pth /kaggle/working/transformer_dataset/
# !cp /kaggle/working/best_model.pt /kaggle/working/transformer_dataset/

# metadata = {
#     "title": "transformer-checkpoints",
#     "id": "basem/transformer-checkpoints",  
#     "licenses": [{"name": "CC0-1.0"}]
# }

# import json
# with open('/kaggle/working/transformer_dataset/dataset-metadata.json', 'w') as f:
#     json.dump(metadata, f)

# # 4. Create dataset
# from kaggle.api.kaggle_api_extended import KaggleApi
# api = KaggleApi()
# api.authenticate()
# api.dataset_create_version(
#     folder='/kaggle/working/transformer_dataset',
#     version_notes='New checkpoint'
# )

SyntaxError: unmatched ')' (<ipython-input-59-70b8f9e5d692>, line 25)

In [109]:
test_subset = dataset['test'].select(range(7000))
test_subset

Dataset({
    features: ['sourceLang', 'targetlang', 'sourceString', 'targetString'],
    num_rows: 7000
})

In [110]:
test_dataset = TranslationDataset(
    train_subset,
    de_tokenizer,
    en_tokenizer,
    src_vocab,
    tgt_vocab
)
test_loader = DataLoader(
    test_dataset,
    batch_size=28,
    shuffle=True
)

In [111]:
len(test_dataset)

5000

In [112]:
tgt_vocab['<pad>']

0

In [115]:
from torchtext.data.metrics import bleu_score

def calculate_metrics(predictions, targets, pad_idx):
    # Accuracy

    
    # # Convert tensors to lists of tokens for BLEU
    # pred_tokens = []
    # target_tokens = []
    
    # for pred, tgt in zip(predictions, targets):
    #     # Remove padding and convert to list
    #     pred_clean = [str(p.item()) for p in pred if p.item() != pad_idx]
    #     tgt_clean = [str(t.item()) for t in tgt if t.item() != pad_idx]
        
    #     pred_tokens.append(pred_clean)
    #     target_tokens.append([tgt_clean])  # BLEU expects list of references
    
    # Calculate BLEU score
    bleu = bleu_score(predictions, targets)
    
    return  bleu

# Using in evaluation:
model.eval()
with torch.no_grad():
    max_len=512
    num_batches = len(test_loader)
    all_prediction=[]
    all_references=[]
    
    for batch in tqdm(test_loader):
        src = batch['src'].to(device)
        tgt = batch['tgt'].to(device)
        for b in range(src.size(0)): #b = index in batch
            single_src = src[b : b+1] # shape [1, src_len]
            
            single_src_mask = (single_src != src_vocab['<pad>']).unsqueeze(-2)
             
            
            
            
            # tgt_pad_mask = (tgt_input != tgt_vocab['<pad>']).unsqueeze(-2)
            # tgt_sub_mask = subsequent_mask(tgt_input.size(-1)).to(device)
            # tgt_mask = tgt_pad_mask & tgt_sub_mask
            
            memory = model.encode(single_src, single_src_mask)
            ys = torch.full((1, 1), tgt_vocab['<sos>'],
                          dtype=torch.long, device=device)
            
            for i in range(max_len):
                tgt_mask = subsequent_mask(ys.size(1)).to(device)
                out = model.decode(memory, single_src_mask, ys, tgt_mask)
                prob = model.generator(out[:, -1])
                _, next_word = torch.max(prob, dim=1)
                ys = torch.cat([ys, next_word.unsqueeze(0)], dim=1)
                
                if next_word.item() == src_vocab['<eos>']:
                    break
            pred_seq = ys[0].tolist()  # shape: [T], removing batch dimension
            
            # Remove <sos> if first
            if len(pred_seq) > 0 and pred_seq[0] == tgt_vocab['<sos>']:
                pred_seq = pred_seq[1:]
            # Remove <eos> if last
            if len(pred_seq) > 0 and pred_seq[-1] == tgt_vocab['<eos>']:
                pred_seq = pred_seq[:-1]
            
            # Convert IDs → string tokens for BLEU
            pred_tokens = [tgt_vocab.get_itos()[tid] for tid in pred_seq]
            
            # 4) Prepare reference
            ref_seq = tgt[b].tolist()  # shape: [tgt_len] 
            
            # Remove <sos> if first
            if len(ref_seq) > 0 and ref_seq[0] == tgt_vocab['<sos>']:
                ref_seq = ref_seq[1:]
            # Remove trailing <pad> or <eos>
            while len(ref_seq) > 0 and ref_seq[-1] in [tgt_vocab['<pad>'], tgt_vocab['<eos>']]:
                ref_seq.pop()
    
            ref_tokens = [tgt_vocab.get_itos()[tid] for tid in ref_seq]

            
            all_prediction.append(pred_tokens)
            all_references.append([ref_tokens])
bleu = calculate_metrics(all_prediction, all_references, pad_idx=tgt_vocab['<pad>'])
            
            
            
            
        

    
    
print(f"BLEU Score: {bleu:.2f}")
    
    # Log to wandb
wandb.log({
        
        "test_bleu": bleu
    })

100%|██████████| 179/179 [11:03<00:00,  3.71s/it]


BLEU Score: 0.16


NameError: name 'average_bleu' is not defined

In [116]:
bleu

0.15994003415107727

In [122]:
def translate(model, src_sentence, src_tokenizer, src_vocab, tgt_vocab, device, max_len=50):
    model.eval()
    with torch.no_grad():
        # Tokenize using your tokenizer
        src_tokens = [token.text for token in src_tokenizer(src_sentence)]
        
        # Convert to indices
        src_indices = [src_vocab[token] for token in src_tokens]
        src_tensor = torch.LongTensor([src_indices]).to(device)
        
        # Create mask
        src_mask = (src_tensor != src_vocab['<pad>']).unsqueeze(-2)
        
        # Encode
        memory = model.encode(src_tensor, src_mask)
        
        # Initialize with <sos>
        ys = torch.ones(1, 1).fill_(src_vocab['<sos>']).type_as(src_tensor)
        
        for i in range(max_len-1):
            tgt_mask = subsequent_mask(ys.size(1)).to(device)
            out = model.decode(memory, src_mask, ys, tgt_mask)
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            ys = torch.cat([ys, next_word.unsqueeze(0)], dim=1)
            
            if next_word.item() == src_vocab['<eos>']:
                break
                
        # Convert indices back to tokens
        translated_tokens = []
        for idx in ys[0].cpu().numpy():
            token = tgt_vocab.get_itos()[idx]
            if token in ['<sos>', '<eos>', '<pad>']:
                continue
            translated_tokens.append(token)
                
        return " ".join(translated_tokens)

# Test it
test_sentences = [
    "Wie geht es dir?",
    "Ich liebe dich",
    "Guten Morgen"
]

for sent in test_sentences:
    print(f"Source: {sent}")
    print(f"Translation: {translate(model, sent, de_tokenizer, src_vocab, tgt_vocab, device)}\n")

Source: Wie geht es dir?
Translation: What 's you going to do it ?

Source: Ich liebe dich
Translation: I love you love you .

Source: Guten Morgen
Translation: Your morning morning .



In [None]:
# def load_checkpoint(filepath, model, optimizer=None):
#     checkpoint = torch.load(filepath)
#     model.load_state_dict(checkpoint['model_state_dict'])
#     if optimizer is not None:
#         optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
#     return checkpoint['epoch'], checkpoint['loss']

# # Usage:
# epoch, loss = load_checkpoint('checkpoints/checkpoint_best.pt', model, optimizer)