# *HindiGPT is a Transformer-based model built from scratch to deliver accurate and context-aware English-to-Hindi translations. It utilizes the encoder-decoder architecture and advanced attention mechanisms to capture complex linguistic patterns between English and Hindi.*

In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install datasets tokenizers



# Load the data and sample the training and validation subsets

In [2]:
import os
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from pathlib import Path
from datasets import load_dataset
from tqdm import tqdm

# Output directories for model checkpoints and the two trained tokenizers.
# exist_ok=True keeps this cell re-runnable (os.mkdir raises if the directory already exists).
for output_dir in ("./hindigpt", "./tokenizer_en", "./tokenizer_hi"):
    os.makedirs(output_dir, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataset = load_dataset("Helsinki-NLP/opus-100", "en-hi", split='train')
validation_dataset = load_dataset("Helsinki-NLP/opus-100", "en-hi", split='validation')

# Work on a random 100,000-pair training subset and a 1,000-pair validation subset.
# A seeded generator makes the sampled subsets identical on every run, so the
# tokenizers and the model are always fitted on the same sentences.
split_generator = torch.Generator().manual_seed(42)
raw_train_dataset, rt_to_skip = random_split(
    train_dataset, [100000, len(train_dataset) - 100000], generator=split_generator
)
raw_validation_dataset, vt_to_skip = random_split(
    validation_dataset, [1000, len(validation_dataset) - 1000], generator=split_generator
)

# These report the sizes of the full downloaded splits, not of the sampled subsets.
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(validation_dataset)}")

README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/65.2M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/247k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/534319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Train dataset size: 534319
Validation dataset size: 2000


# Create tokenizers

In [3]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

def get_ds_iterator(raw_train_dataset, lang):
    """Lazily yield the sentence stored under ``lang`` for every example.

    Each example is expected to be a mapping with a ``'translation'`` entry
    that is itself indexed by language code (e.g. ``"en"`` or ``"hi"``).
    """
    yield from (example['translation'][lang] for example in raw_train_dataset)

def _new_bpe_tokenizer_and_trainer():
    """Return an untrained BPE tokenizer (whitespace pre-tokenizer) and a matching trainer."""
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(min_frequency=2, special_tokens=["[PAD]","[UNK]","[CLS]", "[SEP]", "[MASK]"])
    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer, trainer


# Source tokenizer (English): train on the sampled training subset, then persist.
tokenizer_en, trainer_en = _new_bpe_tokenizer_and_trainer()
tokenizer_en.train_from_iterator(get_ds_iterator(raw_train_dataset, "en"), trainer=trainer_en)
tokenizer_en.save("./tokenizer_en/tokenizer_en.json")

# Target tokenizer (Hindi): same recipe, trained on the Hindi side.
tokenizer_hi, trainer_hi = _new_bpe_tokenizer_and_trainer()
tokenizer_hi.train_from_iterator(get_ds_iterator(raw_train_dataset, "hi"), trainer=trainer_hi)
tokenizer_hi.save("./tokenizer_hi/tokenizer_hi.json")

# Reload from disk so the objects used below are exactly what was saved.
tokenizer_en = Tokenizer.from_file("./tokenizer_en/tokenizer_en.json")
tokenizer_hi = Tokenizer.from_file("./tokenizer_hi/tokenizer_hi.json")

source_vocab_size = tokenizer_en.get_vocab_size()
target_vocab_size = tokenizer_hi.get_vocab_size()

# Longest tokenised sentence on each side of the training subset.
max_seq_len_source = 0
max_seq_len_target = 0

for data in raw_train_dataset:
    enc_ids = tokenizer_en.encode(data['translation']['en']).ids
    dec_ids = tokenizer_hi.encode(data['translation']['hi']).ids
    if len(enc_ids) > max_seq_len_source:
        max_seq_len_source = len(enc_ids)
    if len(dec_ids) > max_seq_len_target:
        max_seq_len_target = len(dec_ids)

print(f'max_seqlen_source: {max_seq_len_source}')
print(f'max_seqlen_target: {max_seq_len_target}')

# Fixed model sequence length; the dataset class truncates longer sentences to fit.
max_seq_len = 155

max_seqlen_source: 489
max_seqlen_target: 504


# Prepare dataset and dataloader

In [4]:
# Transform raw dataset to the encoded dataset that can be processed by the model
class EncodeDataset(Dataset):
    """Map raw translation pairs to fixed-length tensors for the model.

    Every item is tokenised with the module-level ``tokenizer_en`` /
    ``tokenizer_hi``, truncated so that it fits in ``max_seq_len`` together
    with its special tokens, and right-padded with ``[PAD]``.
    """

    def __init__(self, raw_dataset, max_seq_len):
        super().__init__()
        self.raw_dataset = raw_dataset
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.raw_dataset)

    def __getitem__(self, index):
        pair = self.raw_dataset[index]['translation']
        source_text = pair['en']
        target_text = pair['hi']

        # Special-token ids are looked up on the Hindi tokenizer and used for both sides.
        cls_id = tokenizer_hi.token_to_id("[CLS]")
        sep_id = tokenizer_hi.token_to_id("[SEP]")
        pad_id = tokenizer_hi.token_to_id("[PAD]")

        # Encoder side reserves two slots ([CLS] and [SEP]); decoder side reserves one.
        source_ids = tokenizer_en.encode(source_text).ids[:self.max_seq_len - 2]
        target_ids = tokenizer_hi.encode(target_text).ids[:self.max_seq_len - 1]

        source_pad = [pad_id] * (self.max_seq_len - 2 - len(source_ids))
        target_pad = [pad_id] * (self.max_seq_len - 1 - len(target_ids))

        # [CLS] source [SEP] pad...
        encoder_input = torch.tensor([cls_id] + source_ids + [sep_id] + source_pad, dtype=torch.int64)
        # [CLS] target pad...
        decoder_input = torch.tensor([cls_id] + target_ids + target_pad, dtype=torch.int64)
        # target [SEP] pad...  (decoder input shifted left by one position)
        target_label = torch.tensor(target_ids + [sep_id] + target_pad, dtype=torch.int64)

        # (1, 1, seq_len): 1 on real tokens, 0 on padding.
        encoder_mask = (encoder_input != pad_id)[None, None, :].int()
        # (1, seq_len, seq_len): padding mask combined with the causal mask.
        decoder_padding_mask = (decoder_input != pad_id)[None, None, :].int()
        decoder_mask = decoder_padding_mask & causal_mask(self.max_seq_len)

        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input,
            'target_label': target_label,
            'encoder_mask': encoder_mask,
            'decoder_mask': decoder_mask,
            'source_text': source_text,
            'target_text': target_text
        }

def causal_mask(size):
    """Return a ``(1, size, size)`` boolean mask that is True on and below the diagonal.

    Row ``i`` is True for columns ``0..i``, i.e. a position may look at itself
    and at earlier positions only.
    """
    return torch.tril(torch.ones(1, size, size, dtype=torch.bool))

# Wrap the sampled subsets so every item is tokenised, padded and masked on access.
train_ds = EncodeDataset(raw_dataset=raw_train_dataset, max_seq_len=max_seq_len)
val_ds = EncodeDataset(raw_dataset=raw_validation_dataset, max_seq_len=max_seq_len)

train_dataloader = DataLoader(dataset=train_ds, batch_size=32, shuffle=True)
# Validation is decoded one sentence at a time, hence batch_size=1.
val_dataloader = DataLoader(dataset=val_ds, batch_size=1, shuffle=True)

# Input embedding and positional encoding

In [5]:
import torch
import torch.nn as nn
import math

class EmbeddingLayer(nn.Module):
    """Token embedding whose output is scaled by ``sqrt(d_model)``."""

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, input):
        # Look up the vectors, then apply the sqrt(d_model) scaling factor.
        looked_up = self.embedding(input)
        return looked_up * math.sqrt(self.d_model)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    Args:
        d_model: size of the embedding vectors (even or odd).
        max_seq_len: number of positions for which encodings are pre-computed.
        dropout_rate: dropout probability applied to the summed output.
    """

    def __init__(self, d_model: int, max_seq_len: int, dropout_rate: float):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)
        pe = torch.zeros(max_seq_len, d_model)

        pos = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = pos * div_term  # (max_seq_len, ceil(d_model / 2))

        # Even columns take the sine, odd columns the cosine. For an odd d_model
        # there is one fewer odd column than even columns, so the cosine block is
        # trimmed to d_model // 2 columns (a no-op when d_model is even).
        pe[:, 0::2] = torch.sin(angles)
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a buffer: saved in the state dict and moved with the
        # module, but not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_seq_len, d_model)

    def forward(self, input_embdding):
        # Only the first input_embdding.shape[1] positions are added; the buffer
        # never requires grad, so no explicit requires_grad_(False) is needed.
        input_embdding = input_embdding + self.pe[:, :input_embdding.shape[1], :]
        return self.dropout(input_embdding)

# Multihead Attention

In [6]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with bias-free projections.

    Args:
        d_model: model width; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
        dropout_rate: dropout probability applied to the attention weights.
    """

    def __init__(self, d_model: int, num_heads: int, dropout_rate: float):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)
        self.num_heads = num_heads
        assert d_model % num_heads == 0, "d_model must be divisible by number of heads"

        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)
        self.W_o = nn.Linear(d_model, d_model, bias=False)

    def _split_heads(self, projected):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, d_k)."""
        return projected.view(projected.shape[0], projected.shape[1], self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, encoder_mask):
        """Attend from ``q`` over ``k``/``v``.

        ``encoder_mask`` (despite its name it is also used for the decoder
        mask) may be ``None``; otherwise positions where it equals 0 receive
        no attention. It must broadcast against the
        (batch, num_heads, q_len, k_len) score tensor.
        """
        query = self._split_heads(self.W_q(q))
        key = self._split_heads(self.W_k(k))
        value = self._split_heads(self.W_v(v))

        attention_score = (query @ key.transpose(-2, -1)) / math.sqrt(self.d_k)

        if encoder_mask is not None:
            # Use the most negative finite value rather than -inf: the softmax
            # still assigns zero weight to masked positions, but a row that is
            # masked everywhere yields uniform weights instead of NaN.
            blocked_value = torch.finfo(attention_score.dtype).min
            attention_score = attention_score.masked_fill(encoder_mask == 0, blocked_value)

        attention_score = self.dropout(attention_score.softmax(dim=-1))

        attention_output = attention_score @ value

        # Merge the heads back: (batch, num_heads, seq, d_k) -> (batch, seq, d_model).
        attention_output = attention_output.transpose(1, 2).contiguous().view(
            attention_output.shape[0], -1, self.num_heads * self.d_k
        )

        return self.W_o(attention_output)

# Feedforward Network, Layer Normalization and AddAndNorm

In [7]:
class FeedForward(nn.Module):
    """Two-layer position-wise network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model: int, d_ff: int, dropout_rate: float):
        super().__init__()

        self.dropout = nn.Dropout(dropout_rate)
        self.layer_1 = nn.Linear(d_model, d_ff)
        self.layer_2 = nn.Linear(d_ff, d_model)

    def forward(self, input):
        # Expand to d_ff, apply the non-linearity and dropout, project back to d_model.
        hidden = self.layer_1(input).relu()
        hidden = self.dropout(hidden)
        return self.layer_2(hidden)

class LayerNorm(nn.Module):
    """Normalise the last dimension, then apply a learned scale and shift.

    Args:
        eps: added to the standard deviation to avoid division by zero.
        d_model: size of the normalised (last) dimension. Defaults to 512,
            the width that was previously hard-coded, so ``LayerNorm()``
            behaves exactly as before.

    Note:
        This divides by ``std + eps`` using ``Tensor.std`` with its PyTorch
        defaults, which is not numerically identical to ``nn.LayerNorm``.
    """

    def __init__(self, eps: float = 1e-5, d_model: int = 512):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(d_model))   # learned scale
        self.beta = nn.Parameter(torch.zeros(d_model))   # learned shift

    def forward(self, input):
        mean = input.mean(dim = -1, keepdim=True)
        std = input.std(dim = -1, keepdim=True)
        return self.gamma * (input - mean)/(std + self.eps) + self.beta

class AddAndNorm(nn.Module):
    """Residual wrapper: ``input + dropout(sub_layer(layer_norm(input)))``."""

    def __init__(self, dropout_rate: float):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)
        self.layer_norm = LayerNorm()

    def forward(self, input, sub_layer):
        # The input is normalised before the sub-layer runs; the residual
        # connection adds back the un-normalised input.
        normalized = self.layer_norm(input)
        sub_layer_output = self.dropout(sub_layer(normalized))
        return input + sub_layer_output

# Encoder block and Encoder

In [8]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in an AddAndNorm wrapper."""

    def __init__(self, multihead_attention: MultiHeadAttention, feed_forward: FeedForward, dropout_rate: float) -> None:
        super().__init__()
        self.multihead_attention = multihead_attention
        self.feed_forward = feed_forward
        self.addnorm_1 = AddAndNorm(dropout_rate)
        self.addnorm_2 = AddAndNorm(dropout_rate)

    def forward(self, encoder_input, encoder_mask):
        def self_attention(normalized):
            # Query, key and value all come from the same (normalised) sequence.
            return self.multihead_attention(normalized, normalized, normalized, encoder_mask)

        attended = self.addnorm_1(encoder_input, self_attention)
        return self.addnorm_2(attended, self.feed_forward)

class Encoder(nn.Module):
    """Run the input through every encoder block in order, then a final LayerNorm."""

    def __init__(self, encoderblocklist: nn.ModuleList) -> None:
        super().__init__()
        self.encoderblocklist = encoderblocklist
        self.layer_norm = LayerNorm()

    def forward(self, encoder_input, encoder_mask):
        hidden = encoder_input
        for block in self.encoderblocklist:
            hidden = block(hidden, encoder_mask)
        return self.layer_norm(hidden)

# Decoder block, decoder, and projection layer

In [9]:
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention over the encoder output, feed-forward."""

    def __init__(self, masked_multihead_attention: MultiHeadAttention, cross_multihead_attention: MultiHeadAttention, feed_forward: FeedForward, dropout_rate: float) -> None:
        super().__init__()
        self.masked_multihead_attention = masked_multihead_attention
        self.cross_multihead_attention = cross_multihead_attention
        self.feed_forward = feed_forward
        self.addnorm_1 = AddAndNorm(dropout_rate)
        self.addnorm_2 = AddAndNorm(dropout_rate)
        self.addnorm_3 = AddAndNorm(dropout_rate)

    def forward(self, decoder_input, encoder_output, encoder_mask, decoder_mask):
        def masked_self_attention(normalized):
            # Decoder positions attend to decoder positions, restricted by decoder_mask.
            return self.masked_multihead_attention(normalized, normalized, normalized, decoder_mask)

        def cross_attention(normalized):
            # Queries come from the decoder; keys and values from the encoder output.
            return self.cross_multihead_attention(normalized, encoder_output, encoder_output, encoder_mask)

        hidden = self.addnorm_1(decoder_input, masked_self_attention)
        hidden = self.addnorm_2(hidden, cross_attention)
        return self.addnorm_3(hidden, self.feed_forward)

class Decoder(nn.Module):
    """Run the input through every decoder block in order, then a final LayerNorm."""

    def __init__(self, decoderblocklist: nn.ModuleList) -> None:
        super().__init__()
        self.decoderblocklist = decoderblocklist
        self.layer_norm = LayerNorm()

    def forward(self, decoder_input, encoder_output, encoder_mask, decoder_mask):
        hidden = decoder_input
        for block in self.decoderblocklist:
            hidden = block(hidden, encoder_output, encoder_mask, decoder_mask)
        return self.layer_norm(hidden)

class ProjectionLayer(nn.Module):
    """Linear map from model width to vocabulary size; returns raw scores (no softmax)."""

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.projection_layer = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, decoder_output):
        vocab_scores = self.projection_layer(decoder_output)
        return vocab_scores

# Create and build the Transformer

In [10]:
class Transformer(nn.Module):
    """Encoder-decoder model assembled from pre-built sub-modules.

    The three stages are exposed separately (``encode``, ``decode``,
    ``project``) so that decoding loops can encode the source once and call
    the decoder repeatedly.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, source_embed: EmbeddingLayer, target_embed: EmbeddingLayer, source_pos: PositionalEncoding, target_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()

        # Source side.
        self.source_embed = source_embed
        self.source_pos = source_pos
        self.encoder = encoder

        # Target side.
        self.target_embed = target_embed
        self.target_pos = target_pos
        self.decoder = decoder

        # Output head.
        self.projection_layer = projection_layer

    def encode(self, encoder_input, encoder_mask):
        """Embed the source ids, add positions, and run the encoder stack."""
        embedded = self.source_pos(self.source_embed(encoder_input))
        return self.encoder(embedded, encoder_mask)

    def decode(self, encoder_output, encoder_mask, decoder_input, decoder_mask):
        """Embed the target ids, add positions, and run the decoder stack."""
        embedded = self.target_pos(self.target_embed(decoder_input))
        return self.decoder(embedded, encoder_output, encoder_mask, decoder_mask)

    def project(self, decoder_output):
        """Map decoder states to vocabulary scores."""
        return self.projection_layer(decoder_output)

def build_model(source_vocab_size: int, target_vocab_size: int, source_seq_len: int, target_seq_len: int, d_model: int=512, num_blocks: int=6, num_heads: int=8, dropout_rate: float=0.1, d_ff: int=2048) -> Transformer:
    """Assemble a Transformer and Xavier-initialise its weight matrices.

    Args:
        source_vocab_size / target_vocab_size: embedding table sizes; the
            target size is also the width of the output projection.
        source_seq_len / target_seq_len: number of positions pre-computed by
            the respective positional encodings.
        d_model: model width.
        num_blocks: number of encoder blocks and of decoder blocks.
        num_heads: attention heads per attention module.
        dropout_rate: dropout probability used throughout.
        d_ff: hidden width of the feed-forward sub-layers.

    Returns:
        The initialised model (not yet moved to any device).
    """
    # Embeddings, then positional encodings.
    source_embed = EmbeddingLayer(d_model, source_vocab_size)
    target_embed = EmbeddingLayer(d_model, target_vocab_size)
    source_pos = PositionalEncoding(d_model, source_seq_len, dropout_rate)
    target_pos = PositionalEncoding(d_model, target_seq_len, dropout_rate)

    # Encoder stack: each block owns its own attention and feed-forward modules.
    encoder = Encoder(nn.ModuleList([
        EncoderBlock(
            MultiHeadAttention(d_model, num_heads, dropout_rate),
            FeedForward(d_model, d_ff, dropout_rate),
            dropout_rate,
        )
        for _ in range(num_blocks)
    ]))

    # Decoder stack: masked self-attention, cross-attention and feed-forward per block.
    decoder = Decoder(nn.ModuleList([
        DecoderBlock(
            MultiHeadAttention(d_model, num_heads, dropout_rate),
            MultiHeadAttention(d_model, num_heads, dropout_rate),
            FeedForward(d_model, d_ff, dropout_rate),
            dropout_rate,
        )
        for _ in range(num_blocks)
    ]))

    projection_layer = ProjectionLayer(d_model, target_vocab_size)

    model = Transformer(encoder, decoder, source_embed, target_embed, source_pos, target_pos, projection_layer)

    # Only tensors with more than one dimension (weight matrices, embedding
    # tables) are re-initialised; 1-D parameters keep their defaults.
    for param in model.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)

    return model

# The final model architecture

In [11]:
# Build the network with both sequence lengths set to max_seq_len, then move it to the selected device.
model = build_model(
    source_vocab_size=tokenizer_en.get_vocab_size(),
    target_vocab_size=tokenizer_hi.get_vocab_size(),
    source_seq_len=max_seq_len,
    target_seq_len=max_seq_len,
    d_model=512,
).to(device)
print(model)

Transformer(
  (source_embed): EmbeddingLayer(
    (embedding): Embedding(30000, 512)
  )
  (source_pos): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Encoder(
    (encoderblocklist): ModuleList(
      (0-5): 6 x EncoderBlock(
        (multihead_attention): MultiHeadAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (W_q): Linear(in_features=512, out_features=512, bias=False)
          (W_k): Linear(in_features=512, out_features=512, bias=False)
          (W_v): Linear(in_features=512, out_features=512, bias=False)
          (W_o): Linear(in_features=512, out_features=512, bias=False)
        )
        (feed_forward): FeedForward(
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_1): Linear(in_features=512, out_features=2048, bias=True)
          (layer_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (addnorm_1): AddAndNorm(
          (dropout): Dropout(p=0.1, inplace=False)
        

# Training and Validation of HindiGPT

In [12]:
def run_validation(model, validation_ds, tokenizer_en, tokenizer_hi, max_seq_len, device, print_msg, global_step):
    """Greedy-decode the first two validation batches and report them via ``print_msg``.

    The model is switched to eval mode and left there. Each batch must hold a
    single example. ``tokenizer_en`` and ``global_step`` are accepted but not
    used.
    """
    model.eval()

    with torch.no_grad():
        for count, batch in enumerate(validation_ds, start=1):
            encoder_input = batch["encoder_input"].to(device)
            encoder_mask = batch["encoder_mask"].to(device)

            cls_id = tokenizer_hi.token_to_id('[CLS]')
            sep_id = tokenizer_hi.token_to_id('[SEP]')

            # Encode the source once; the decoder is re-run for every new token.
            encoder_output = model.encode(encoder_input, encoder_mask)
            decoder_input = torch.full((1, 1), cls_id).type_as(encoder_input).to(device)

            # Extend the sequence until [SEP] is produced or the length limit is hit.
            while decoder_input.size(1) != max_seq_len:
                step_mask = causal_mask(decoder_input.size(1)).type_as(encoder_mask).to(device)

                out = model.decode(encoder_output, encoder_mask, decoder_input, step_mask)

                # Scores for the last position only; pick the highest-scoring token.
                prob = model.project(out[:, -1])
                next_word = prob.max(dim=1).indices

                appended = torch.full((1, 1), next_word.item()).type_as(encoder_input).to(device)
                decoder_input = torch.cat([decoder_input, appended], dim=1)

                if next_word == sep_id:
                    break

            model_out = decoder_input.squeeze(0)

            source_text = batch["source_text"][0]
            target_text = batch["target_text"][0]
            model_out_text = tokenizer_hi.decode(model_out.detach().cpu().numpy())

            print_msg('-'*55)
            print_msg(f'Source Text: {source_text}')
            print_msg(f'Target Text: {target_text}')
            print_msg(f'Predicted by HindiGPT: {model_out_text}')

            if count == 2:
                break

def train_model(preload_epoch=None):
    """Train the module-level ``model`` for up to 10 epochs, checkpointing after each.

    Uses the module-level ``model``, ``device``, ``train_dataloader``,
    ``val_dataloader``, ``tokenizer_en``, ``tokenizer_hi`` and ``max_seq_len``.

    Args:
        preload_epoch: if given, resume from ``./hindigpt/model_{preload_epoch}.pt``
            (model weights, optimizer state, epoch counter and global step).

    After every epoch the average loss is printed, two validation sentences
    are decoded, and a checkpoint is written to ``./hindigpt/model_{epoch}.pt``.
    """
    EPOCHS = 10
    initial_epoch = 0
    global_step = 0

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, eps=1e-9)

    if preload_epoch is not None:
        model_filename = f"./hindigpt/model_{preload_epoch}.pt"
        # map_location lets a checkpoint written on GPU be resumed on a CPU-only machine.
        state = torch.load(model_filename, map_location=device)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']

    # Labels are Hindi token ids, so the padding id to ignore comes from the Hindi tokenizer.
    target_vocab_size = tokenizer_hi.get_vocab_size()
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_hi.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    for epoch in range(initial_epoch, EPOCHS):
        # run_validation leaves the model in eval mode, so re-enable training mode each epoch.
        model.train()
        epoch_loss = 0
        batch_count = 0

        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:
            encoder_input = batch['encoder_input'].to(device) # (B, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)
            target_label = batch['target_label'].to(device) # (B, seq_len)

            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            projection_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # Flatten batch and time so every target position is one classification example.
            loss = loss_fn(projection_output.view(-1, target_vocab_size), target_label.view(-1))

            # Track the running loss for the epoch average.
            epoch_loss += loss.item()
            batch_count += 1

            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            global_step += 1

        # Log epoch loss (guarded so an empty dataloader does not divide by zero).
        avg_loss = epoch_loss / max(batch_count, 1)
        print(f"\nEpoch {epoch}: Average Loss = {avg_loss:.4f}")

        # Validation
        run_validation(model, val_dataloader, tokenizer_en, tokenizer_hi, max_seq_len, device, lambda msg: batch_iterator.write(msg), global_step)

        # Save the model at the end of every epoch
        model_filename = f"./hindigpt/model_{epoch}.pt"
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)

# Training the model

In [13]:
# Start from scratch: preload_epoch defaults to None, so no checkpoint is loaded.
train_model()

Processing Epoch 00: 100%|██████████| 3125/3125 [37:31<00:00,  1.39it/s, loss=5.169]



Epoch 0: Average Loss = 5.9986
-------------------------------------------------------
Source Text: Non-repudiation
Target Text: गैर-रेपुडियेशन
Predicted by HindiGPT: - ए - पट्टी
-------------------------------------------------------
Source Text: It is He who made the night for you, that you may rest in it, and the day to provide visibility. There are indeed signs in that for people who listen.
Target Text: वह वही (खुदाए क़ादिर तवाना) है जिसने तुम्हारे नफा के वास्ते रात को बनाया ताकि तुम इसमें चैन करो और दिन को (बनाया) कि उसकी रौशनी में देखो भालो उसमें शक़ नहीं जो लोग सुन लेते हैं उनके लिए इसमें (कुदरत की बहुतेरी निशानियाँ हैं)
Predicted by HindiGPT: और जो लोग ( ख़ुदा की ) ज़मीन में है और तुम पर ( भी ) तुम पर ( भी ) तुम पर ( भी ) तुम पर ( भी ) तुम लोगों को ( भी ) एक तरह ( और ) एक तरह ( और ) एक तरह ( और ) एक चीज़ से ( क़यामत की ) हिदायत व ज़मीन में ( भी ) है


Processing Epoch 01: 100%|██████████| 3125/3125 [37:43<00:00,  1.38it/s, loss=4.983]



Epoch 1: Average Loss = 4.9012
-------------------------------------------------------
Source Text: (ILIR'S BROTHER SPEAKING ALBANIAN ON VIDEO)
Target Text: (LLIR के भाई वीडियो पर अल्बानियाई बोल)
Predicted by HindiGPT: ( c ) 2003 , [ ] ]
-------------------------------------------------------
Source Text: Say, ‘Invoke those whom you claim [to be gods] besides Him. They have no power to remove your distress, nor to bring about any change [in your state].
Target Text: कह दो, "तुम उससे इतर जिनको भी पूज्य-प्रभु समझते हो उन्हें पुकार कर देखो। वे न तुमसे कोई कष्ट दूर करने का अधिकार रखते है और न उसे बदलने का।"
Predicted by HindiGPT: कहो , " क्या तुम अल्लाह के मार्ग में से हटकर जो कुछ वे करते हो , वे अपने ही को नहीं मानते , और न तुम अपने ही को कोई सहायक न पाओगे


Processing Epoch 02: 100%|██████████| 3125/3125 [37:37<00:00,  1.38it/s, loss=4.083]



Epoch 2: Average Loss = 4.2738
-------------------------------------------------------
Source Text: A line to be used as a separator
Target Text: विभाजक के रूप में इस्तेमाल करने के लिए एक लकीर
Predicted by HindiGPT: एक पंक्ति के रूप में एक पंक्ति के लिए पंक्ति
-------------------------------------------------------
Source Text: That I would see it coming...
Target Text: मैं यह आ रहा देखना होगा कि ...
Predicted by HindiGPT: मैं इसे देख रहा हूँ ...


Processing Epoch 03: 100%|██████████| 3125/3125 [37:40<00:00,  1.38it/s, loss=3.667]



Epoch 3: Average Loss = 3.7850
-------------------------------------------------------
Source Text: The Naushera SDM assured the disabled that their problems could be resolved.
Target Text: एसडीएम नौशहरा ने विकलांगों को आश्वासन दिया कि उनकी समस्याएं हल हो सकती हैं।
Predicted by HindiGPT: कोई आरंभीकरण प्रविष्टि नहीं है जो अपनी सेवा से उन्हें बदल सकते हैं .
-------------------------------------------------------
Source Text: But beware of men who think too much.
Target Text: लेकिन बहुत ज्यादा लगता है, जो पुरुषों से सावधान रहना.
Predicted by HindiGPT: लेकिन लोगों को सुना है जो बहुत बहुत से .


Processing Epoch 04: 100%|██████████| 3125/3125 [37:36<00:00,  1.39it/s, loss=3.610]



Epoch 4: Average Loss = 3.3860
-------------------------------------------------------
Source Text: Whether to show contacts that are offline in the contact list.
Target Text: क्या संपर्कों को दिखाना है जो संपर्क सूची में ऑफ़लाइन हैं.
Predicted by HindiGPT: क्या संपर्क सूची में संपर्क पूर्वावलोकन दिखाता है .
-------------------------------------------------------
Source Text: & Help
Target Text: मारबल डेस्कटॉप ग्लोब के बारे में (A)
Predicted by HindiGPT: मदद ( C )


Processing Epoch 05: 100%|██████████| 3125/3125 [37:38<00:00,  1.38it/s, loss=2.819]



Epoch 5: Average Loss = 3.0587
-------------------------------------------------------
Source Text: [ CAMERON CHUCKLES ]
Target Text: [कैमरून दूसरे से टकराए]
Predicted by HindiGPT: [ टीवी पर ]: पर ]:
-------------------------------------------------------
Source Text: - JOB DONE, EH?
Target Text: - काम किया, एह?
Predicted by HindiGPT: - किसी को , एह , एह


Processing Epoch 06: 100%|██████████| 3125/3125 [37:36<00:00,  1.39it/s, loss=2.662]



Epoch 6: Average Loss = 2.7936
-------------------------------------------------------
Source Text: Cartago
Target Text: कार्तागोcosta_ rica. kgm
Predicted by HindiGPT: कार puertorico . kgm
-------------------------------------------------------
Source Text: - What?
Target Text: - क्या?
Predicted by HindiGPT: - क्या ?


Processing Epoch 07: 100%|██████████| 3125/3125 [37:40<00:00,  1.38it/s, loss=2.570]



Epoch 7: Average Loss = 2.5818
-------------------------------------------------------
Source Text: Satan has got the better of them and has caused them to forget the remembrance of God. They have gone over to the side of the devil, and it is as the devil's partisans that they shall be the losers:
Target Text: उनपर शैतान ने पूरी तरह अपना प्रभाव जमा लिया है। अतः उसने अल्लाह की याद को उनसे भुला दिया। वे शैतान की पार्टीवाले हैं। सावधान रहो शैतान की पार्टीवाले ही घाटे में रहनेवाले हैं!
Predicted by HindiGPT: शैतान ने उनके वास्ते अच्छा कर दिखाया और उन्हें अल्लाह की याद भुला दी , ये शैतान की याद के मामले में वे शैतान का ख़याल डाल रहे है । निश्चय ही वे ज़ालिम होंगे
-------------------------------------------------------
Source Text: Robert Brogden, sr., our daddy, was sheriff for two decades before him.
Target Text: रॉबर्ट Brogden, सीनियर, हमारे पिताजी, उसे पहले दो दशकों के लिए था शेरिफ.
Predicted by HindiGPT: रॉबर्ट परिवहन कर्ता , हमारे पिता के लिए , हमारी नौकरी के लिए खुद की आवश्यकता थी .


Processing Epoch 08: 100%|██████████| 3125/3125 [37:40<00:00,  1.38it/s, loss=2.312]



Epoch 8: Average Loss = 2.4134
-------------------------------------------------------
Source Text: I will give you an answer in a day or two.
Target Text: मैं तुम्हें एक-दो दिन में जवाब दूँगा।
Predicted by HindiGPT: मैं तुम्हें दो दिन या दो दिन में एक जवाब देना होगा .
-------------------------------------------------------
Source Text: It's three years since Mama Bengta died. It's three years since Mama Bengta died.
Target Text: तीन साल हो गये मेरे मम्मा बैंग्टा को मरे.
Predicted by HindiGPT: यह तीन साल मर चुका है ... ... ... ... / मैं ... ... / मैं ... ... / मैं


Processing Epoch 09: 100%|██████████| 3125/3125 [37:38<00:00,  1.38it/s, loss=2.445]



Epoch 9: Average Loss = 2.2791
-------------------------------------------------------
Source Text: Come back when you've got the 10 grand, that's my minimum.
Target Text: आप 10 हज़ार मिल गया है जब वापस आओ, कि मेरी न्यूनतम है.
Predicted by HindiGPT: वापस आओ कि आप 10 , मेरे न्यूनतम अंक है कि .
-------------------------------------------------------
Source Text: - OH MAN.
Target Text: - अरे यार.
Predicted by HindiGPT: - ओह आदमी .


# Testing the HindiGPT model to translate new sentences

In [14]:
def hindigpt(user_input_text):
    """Translate an English sentence to Hindi with the epoch-9 checkpoint.

    The tokenizers and the checkpoint are loaded from disk on every call;
    ``build_model`` and ``max_seq_len`` are taken from the module scope.

    Args:
        user_input_text: text to translate; converted with ``str`` and stripped.
            Input longer than ``max_seq_len - 2`` tokens is truncated.

    Returns:
        The greedy-decoded Hindi text.
    """
    user_input_text = str(user_input_text).strip()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer_en = Tokenizer.from_file("./tokenizer_en/tokenizer_en.json")
    # Bound locally so the function does not rely on a global tokenizer_hi.
    tokenizer_hi = Tokenizer.from_file("./tokenizer_hi/tokenizer_hi.json")

    model = build_model(tokenizer_en.get_vocab_size(), tokenizer_hi.get_vocab_size(),max_seq_len, max_seq_len, d_model=512).to(device)

    checkpoint_number = 9
    model_filename = f"./hindigpt/model_{checkpoint_number}.pt"
    # map_location lets a checkpoint written on GPU be loaded on a CPU-only machine.
    state = torch.load(model_filename, map_location=device)
    model.load_state_dict(state['model_state_dict'])

    cls_en = tokenizer_en.token_to_id('[CLS]')
    sep_en = tokenizer_en.token_to_id('[SEP]')
    pad_en = tokenizer_en.token_to_id('[PAD]')
    cls_hi = tokenizer_hi.token_to_id('[CLS]')
    sep_hi = tokenizer_hi.token_to_id('[SEP]')

    model.eval()
    with torch.no_grad():
        # Truncate so that [CLS] + tokens + [SEP] never exceeds max_seq_len.
        source_ids = tokenizer_en.encode(user_input_text).ids[:max_seq_len - 2]
        num_padding = max_seq_len - len(source_ids) - 2

        # Same layout as the training data, with an explicit batch dimension: (1, max_seq_len).
        encoder_input = torch.tensor(
            [[cls_en] + source_ids + [sep_en] + [pad_en] * num_padding],
            dtype=torch.int64,
            device=device,
        )
        # (1, 1, 1, max_seq_len): 1 on real tokens, 0 on padding.
        source_mask = (encoder_input != pad_en).unsqueeze(1).unsqueeze(1).int()
        encoder_output = model.encode(encoder_input, source_mask)

        decoder_input = torch.full((1, 1), cls_hi, dtype=torch.int64, device=device)

        # Generating the translation token by token (greedy decoding).
        while decoder_input.size(1) < max_seq_len:
            current_len = decoder_input.size(1)
            # Causal mask: 1 on and below the diagonal, so each position sees itself
            # and earlier positions. The attention layer blocks positions where the
            # mask is 0, so the mask must be lower-triangular, not upper-triangular.
            decoder_mask = torch.tril(torch.ones((1, current_len, current_len), dtype=torch.int, device=device))
            out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

            # Scores for the last position only; pick the highest-scoring token.
            prob = model.project(out[:, -1])
            next_word = prob.argmax(dim=1)
            decoder_input = torch.cat([decoder_input, next_word.view(1, 1)], dim=1)

            if next_word.item() == sep_hi:
                break

    # Converting ids to text
    return tokenizer_hi.decode(decoder_input[0].tolist())


# Translation using HindiGPT
## TEST 1

In [3]:
# Translate a short greeting and show the input next to the model output.
user_input = "Good Morning"
transalated_text = hindigpt(user_input)

print(
    f"User input (in English): {user_input}",
    f"Translation (in Hindi): {transalated_text}",
    sep="\n",
)

## TEST 2

In [4]:
# Translate a short question and show the input next to the model output.
user_input = "How are you?"
transalated_text = hindigpt(user_input)

print(
    f"User input (in English): {user_input}",
    f"Translation (in Hindi): {transalated_text}",
    sep="\n",
)

## TEST 3

In [5]:
# Translate a short statement and show the input next to the model output.
user_input = "Hi, I am good."
transalated_text = hindigpt(user_input)

print(
    f"User input (in English): {user_input}",
    f"Translation (in Hindi): {transalated_text}",
    sep="\n",
)