In [1]:
import torch
import pickle
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm


%run 'Dataset_and_preprocessing.ipynb'

%run 'model.ipynb'

print("Source Vocabulary Size:", len(src_vocab))
print("Target Vocabulary Size:", len(tgt_vocab))


# Set random seed for PyTorch CPU operations
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE).to(device)

# Initialize the attention layer
attention_layer = BahdanauAttention(units)

# Initialize the decoder

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE).to(device)

# Optimizer
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.001)

# Custom Loss Function
def loss_function(real, pred):
    # Mask for non-zero tokens in the target
    mask = real.ne(0)
    loss = F.cross_entropy(pred, real, reduction='none')
    loss = loss * mask
    return loss.mean()

# Training Step Function
def train_step(inp, targ, enc_hidden):
    # Move data to the device
    inp, targ, enc_hidden = inp.to(device), targ.to(device), enc_hidden.to(device)

    loss = 0
    optimizer.zero_grad()

    current_batch_size = inp.size(0)
    enc_hidden = enc_hidden[:, :current_batch_size, :]

    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden

    sos_token_index = tgt_vocab['<sos>']
    dec_input = torch.full((current_batch_size, 1), sos_token_index, dtype=torch.long, device=inp.device)

    for t in range(1, targ.size(1)):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
        loss += loss_function(targ[:, t], predictions.squeeze(1))
        dec_input = targ[:, t].unsqueeze(1)

    batch_loss = loss / int(targ.size(1))
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

# Training Loop
EPOCHS = 40
for epoch in tqdm(range(EPOCHS)):
    total_loss = 0

    for inp, targ in train_loader:
        current_batch_size = inp.size(0)
        # Initialize hidden state with the correct current batch size
        enc_hidden = encoder.initialize_hidden_state(current_batch_size).to(device)

        
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

    if epoch % 4 == 0:
        print(f'Epoch {epoch + 1}/{EPOCHS}, Loss: {total_loss / len(train_loader)}')

        
        
#save the model 
torch.save(encoder.state_dict(), 'encoder.pth')
torch.save(decoder.state_dict(), 'decoder.pth')

Batch shapes: torch.Size([64, 19]) torch.Size([64, 17])
2072
2147
Device: cpu
2072
2147
Source Vocabulary Size: 2072
Target Vocabulary Size: 2147


  2%|█                                           | 1/40 [01:15<49:10, 75.66s/it]

Epoch 1/40, Loss: 2.197827808400418


 12%|█████▌                                      | 5/40 [06:04<42:12, 72.35s/it]

Epoch 5/40, Loss: 0.8366327285766602


 22%|█████████▉                                  | 9/40 [10:54<37:19, 72.24s/it]

Epoch 9/40, Loss: 0.07289392952906325


 32%|█████████████▉                             | 13/40 [15:46<32:41, 72.64s/it]

Epoch 13/40, Loss: 0.02037754793591956


 42%|██████████████████▎                        | 17/40 [20:33<27:41, 72.23s/it]

Epoch 17/40, Loss: 0.015598146314237346


 52%|██████████████████████▌                    | 21/40 [25:17<22:33, 71.21s/it]

Epoch 21/40, Loss: 0.014562862469477857


 62%|██████████████████████████▉                | 25/40 [30:03<17:54, 71.63s/it]

Epoch 25/40, Loss: 0.01465218636068575


 72%|███████████████████████████████▏           | 29/40 [34:56<13:23, 73.01s/it]

Epoch 29/40, Loss: 0.015152472338857168


 82%|███████████████████████████████████▍       | 33/40 [39:51<08:34, 73.56s/it]

Epoch 33/40, Loss: 0.016768292632905094


 92%|███████████████████████████████████████▊   | 37/40 [44:37<03:34, 71.64s/it]

Epoch 37/40, Loss: 0.02010948586456002


100%|███████████████████████████████████████████| 40/40 [48:14<00:00, 72.37s/it]
