# Turkish Diacritisation | YZV 405E NLP Term Project

Author: Bora Boyacıoğlu

Student ID: 150200310

## Step 2: Training

In [None]:
%pip install unidecode --quiet

In [None]:
import pickle as pkl
import time

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from utils.model import Encoder, Decoder, Seq2Seq
from utils.main_utils import build_vocab

In [44]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Reloading the Processed Data

In [None]:
# Load the train dataset.
train_dataset = pkl.load(open('data/train_data.pkl', 'rb'))

# Load the test dataset.
test_dataset = pkl.load(open('data/test_data.pkl', 'rb'))

In [45]:
# Build word to index and index to word mappings.
vocab = {}
vocab['w2i_und'], vocab['i2w_und'] = build_vocab(train_dataset.undiacritized)
vocab['w2i_d'], vocab['i2w_d'] = build_vocab(train_dataset.diacritized)

# Save the mappings for later use.
with open('data/vocab.pkl', 'wb') as f:
    pkl.dump(vocab, f)

### Defining the Training Loop

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

emb_dim = 64
hid_dim = 128
n_layers = 2
dropout = 0.3

encoder = Encoder(input_dim=len(vocab['w2i_und']), emb_dim=emb_dim, hid_dim=hid_dim, n_layers=n_layers, dropout=dropout)
decoder = Decoder(output_dim=len(vocab['w2i_d']), emb_dim=emb_dim, hid_dim=hid_dim, n_layers=n_layers, dropout=dropout)
model = Seq2Seq(encoder, decoder, device).to(device)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=vocab['v2i_d']['<pad>'])

In [None]:
time_init = None

def train(model, loader, optimizer, criterion, clip, device):
    model.train()
    epoch_loss = 0

    for i, (src, trg) in enumerate(loader):
        min, sec = divmod(time.time() - time_init, 60)
        print(f"Batch {i+1}/{len(loader)}: {(i+1)/(len(loader)):.4f}% Complete {int(min):02d}:{int(sec):02d}", end="\r")
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)
        
        # reshape to [batch size * target len, output dim]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(loader)

def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_batch_padded = pad_sequence(src_batch, padding_value=word2idx_src["<pad>"], batch_first=True)
    trg_batch_padded = pad_sequence(trg_batch, padding_value=word2idx_trg["<pad>"], batch_first=True)
    return src_batch_padded, trg_batch_padded

# Run the training process
num_epochs = 10
clip = 1

train_dataset.to_indices('und', word2idx_src)
train_dataset.to_indices('d', word2idx_trg)

loader = DataLoader(train_dataset, batch_size=18, shuffle=True, collate_fn=collate_fn)

time_init = time.time()
for epoch in range(num_epochs):
    train_loss = train(model, loader, optimizer, criterion, clip, device)
    print(f'Epoch: {epoch+1}, Train Loss: {train_loss:.4f}\n')


In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

emb_dim = 128
hid_dim = 512
n_layers = 2
dropout = 0.3

encoder = Encoder(input_dim=len(vocab['w2i_und']), emb_dim=emb_dim, hid_dim=hid_dim, n_layers=n_layers, dropout=dropout)
decoder = Decoder(output_dim=len(vocab['w2i_d']), emb_dim=emb_dim, hid_dim=hid_dim, n_layers=n_layers, dropout=dropout)
model = Seq2Seq(encoder, decoder, device).to(device)

# Load the model's state_dict
model.load_state_dict(torch.load('models/e3_l8.99.pth', map_location=device))
model.eval()

RuntimeError: Error(s) in loading state_dict for Seq2Seq:
	size mismatch for encoder.embedding.weight: copying a param with shape torch.Size([91466, 128]) from checkpoint, the shape in current model is torch.Size([91462, 128]).
	size mismatch for decoder.embedding.weight: copying a param with shape torch.Size([93861, 128]) from checkpoint, the shape in current model is torch.Size([93857, 128]).
	size mismatch for decoder.fc_out.weight: copying a param with shape torch.Size([93861, 512]) from checkpoint, the shape in current model is torch.Size([93857, 512]).
	size mismatch for decoder.fc_out.bias: copying a param with shape torch.Size([93861]) from checkpoint, the shape in current model is torch.Size([93857]).

In [None]:
def predict_sentence(model, input_sentence, word2idx_src, idx2word_trg, device):
    # Normalize and tokenize the input sentence
    input_tokens = input_sentence.lower().split()
    input_indices = [word2idx_src.get(token, word2idx_src['<unk>']) for token in input_tokens]
    input_tensor = torch.tensor(input_indices).unsqueeze(1).to(device)

    # Encoder forward pass
    with torch.no_grad():
        hidden, cell = model.encoder(input_tensor)

    # Start decoding from the <sos> token
    trg_indexes = [vocab['w2i_d']['<sos>']]
    outputs = []

    for i in range(50):  # Reasonable limit to sentence length
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
        
        with torch.no_grad():
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell)
        
        pred_token = output.argmax(1).item()
        if pred_token == vocab['w2i_d']['<eos>']:  # Stop decoding at <eos>
            break
        outputs.append(pred_token)
        trg_indexes.append(pred_token)

    # Convert output indices back to words
    translated_sentence = ' '.join([idx2word_trg[i] for i in outputs])
    return translated_sentence


In [42]:
input_sentence = "neden dogru ceviri yapmiyorsun?"
predicted_sentence = predict_sentence(model, input_sentence, vocab['w2i_und'], vocab['i2w_d'], device)
print("Predicted:", predicted_sentence)


KeyError: '<unk>'

In [46]:
vocab['w2i_und']['<unk>']

KeyError: '<unk>'