### Relevant Imports


In [4]:
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split

import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(log_dir = 'logs')
import re

from structure.transformer import Transformer
from structure.Dataset import English_Hindi_Dataset

from sub_modules.embedding import Language_Embedding
from sub_modules.masks import get_masks

### Initializations


In [5]:
# Read data
read_max = 7_00_000 ######

# basics
batch_size = 512
sequence_length = 100
d_model = 512
num_of_sentences = 3_00_000
# transfomer
num_encoder_decoder_layers = 6
num_heads = 8
hidden_layers = 2048

dropout_ff = 0.3
dropout_attn = 0.2


### Dataset


In [6]:
dataset = English_Hindi_Dataset('Dataset/train.en/train.en', 
                                    'Dataset/train.hi/train.hi',
                                    num_of_sentences = num_of_sentences,
                                    max_sequence_length = sequence_length,
                                    read_max = read_max)

en_vocab_size = len(set(dataset.en_vocab))
hi_vocab_size = len(set(dataset.hi_vocab))

assert len(dataset) == num_of_sentences, f"Dataset is of length: {len(dataset)} but required sample :{num_of_sentences}"


Total unique characters: English-> 97 Hindi-> 174
	Dataset Cleaned
	Dataset Tokenized and Pading is Done


### Embeddings


In [7]:
# embeddings
embeddings = Language_Embedding(en_vocab_size, hi_vocab_size, d_model)

### Model Initializations


In [8]:
# GPU for training
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f" Using: {device}")

model = Transformer(
    num_encoder_decoder_layers=num_encoder_decoder_layers,
    d_model=d_model,
    sequence_length=sequence_length,
    hidden_layers=hidden_layers,
    num_heads=num_heads,
    hi_voab_size=hi_vocab_size,
    dropout_ff=dropout_ff,
    dropout_attn=dropout_attn
).to(device)

 Using: cuda


### Load Model


In [9]:
model_save_path = "saved_models"  # Specify your directory to save models
os.makedirs(model_save_path, exist_ok=True)  # Create directory if it doesn't exist


def get_latest_model_checkpoint(model_save_path):
    model_files = os.listdir(model_save_path)
    model_epochs = [int(re.findall(r'model_epoch_(\d+).pt', file)[0]) for file in model_files if file.endswith('.pt')]
    
    if len(model_epochs)>0:
        latest_epoch = max(model_epochs)
        model_save_file = os.path.join(model_save_path, f"model_epoch_{latest_epoch}.pt")
        return latest_epoch, model_save_file
    else:
        return None, None
    
latest_epoch, model_save_file = get_latest_model_checkpoint(model_save_path)

if model_save_file:
    print(f"Loading model from {model_save_file}")
    model.load_state_dict(torch.load(model_save_file))
    current_epoch = latest_epoch + 1
else:
    print("No saved model found. Training from scratch.")
    current_epoch = 0

Loading model from saved_models\model_epoch_100.pt


  model.load_state_dict(torch.load(model_save_file))


In [11]:
### Dictionaries
import json

with open('dicts.json', 'r') as file:
    data = json.load(file)
    
data.keys()

en_vocab = data['en_vocab']
hi_vocab = data['hi_vocab']
en_to_index = data['en_to_index']
index_to_en = data['index_to_en']
hi_to_index = data['hi_to_index']
index_to_hi = data['index_to_hi']


##### Evaluation


In [12]:
def tokenize(sentence, language_to_index, start_token=False, end_token=False):
        sentence_indices = [language_to_index[token] for token in list(sentence)]
        
        if start_token:
            sentence_indices.insert(0, language_to_index['<START>'])
        if end_token:
            sentence_indices.append(language_to_index['<END>'])
            
        while len(sentence_indices) < 100:
            sentence_indices.append(language_to_index['<PADDING>'])
                
        return torch.tensor(sentence_indices) 

In [21]:
def get_masks(en_batch, hi_batch):
    max_sequence_length = 100  # For testing, set max length to match the batch size
    num_of_sentences = len(en_batch)
    
    nil = -1e9

    # Initialize masks
    decoder_self_attention_mask = torch.full([num_of_sentences, max_sequence_length, max_sequence_length], nil)
    decoder_self_attention_mask = torch.triu(decoder_self_attention_mask, diagonal=1)
    
    encoder_self_attention_mask = torch.full([num_of_sentences, max_sequence_length, max_sequence_length], 0.0)
    encoder_decoder_attention_mask = torch.full([num_of_sentences, max_sequence_length, max_sequence_length], 0.0)
    
    for index in range(num_of_sentences):
        # Boolean tensor check for padding tokens
        num_of_en_tokens = (en_batch[index] != en_to_index['<PADDING>']).sum().item()
        num_of_hi_tokens = (hi_batch[index] != hi_to_index['<PADDING>']).sum().item()

        # Update the masks based on number of valid tokens
        encoder_self_attention_mask[index, num_of_en_tokens:, :] = nil
        encoder_self_attention_mask[index, :, num_of_en_tokens:] = nil
        
        encoder_decoder_attention_mask[index, num_of_hi_tokens:, :] = nil
        encoder_decoder_attention_mask[index, :, num_of_en_tokens:] = nil
        
        # Optional: Modify decoder self-attention mask
        decoder_self_attention_mask[index, num_of_hi_tokens:, :] = nil
    
    return decoder_self_attention_mask, encoder_self_attention_mask, encoder_decoder_attention_mask


In [26]:
def translate(model, en_sentence):
    model.eval()
    en_sentence = (en_sentence,)
    hi_sentence = ("",)

    en_token = tokenize(en_sentence[0], en_to_index, start_token=False, end_token=False).unsqueeze(0).to(device)
    hi_token = tokenize(hi_sentence[0], hi_to_index, start_token=True, end_token=False).unsqueeze(0).to(device)

    for word_counter in range(dataset.max_sequence_length):
        print(f"Processing for {word_counter + 1} token")
    
        ds_mask, es_mask, edc_mask = get_masks( en_token, hi_token)
        ds_mask, es_mask, edc_mask = ds_mask.to(device), es_mask.to(device), edc_mask.to(device)
        
        en_embedded, hi_embedded = embeddings(en_token, hi_token)
        en_embedded, hi_embedded =  en_embedded.to(device), hi_embedded.to(device)
        
        predictions = model(en_embedded,
                            hi_embedded,
                            ds_mask, es_mask, edc_mask)
        next_token_prob_distribution = predictions[0][word_counter]
        next_token_index = torch.argmax(next_token_prob_distribution).item()
        next_token = index_to_hi[next_token_index]
        
        if next_token == '<END>':
            break
        hi_sentence = (hi_sentence[0] + next_token, )
        hi_token = tokenize(hi_sentence[0], hi_to_index, start_token=True, end_token=False).unsqueeze(0).to(device)
        print(f"\t\t\t Predicted till now: {hi_sentence[0]}")
    
    return hi_sentence[0]

In [27]:
line1 = "Hello, How are you?"
translate(model,line1 )

Processing for 1 token
			 Predicted till now: 
Processing for 2 token
			 Predicted till now: =


'='

In [None]:
d

In [27]:
print(f"en sentence : {en}")
print(f"actual translation : {hi}")
print(f"predicted translation : {translation}")

en sentence : In reply, Pakistan got off to a solid start.
actual translation : जिसके जवाब में पाक ने अच्छी शुरुआत की थी.
predicted translation : ----------------------------------------------------------------------------------------------------


In [28]:
line1 = "Hello, How are you?"
line2 = "This is a beautiful day to go out."
line3 = "India is situated on the right side of pakistan"
lines = [line1, line2, line3]

In [29]:
translations = []
for line in lines:
    translations.append(translate(line))

In [30]:
for index, (en, hi) in enumerate(zip(lines, translations)):
    print(f"{en} -> {hi}")

Hello, How are you? -> :
This is a beautiful day to go out. -> :
India is situated on the right side of pakistan -> :


#### Save dictionaries
