In [None]:
import torch
import torch.nn as nn
import math

class InputEmbedding(nn.Module):
    """Look up token embeddings and scale them by sqrt(d_model)."""

    def __init__(self, d_model:int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x:torch.Tensor)->torch.Tensor:
        """Map token ids (Batch, seq) to scaled vectors (Batch, seq, dim)."""
        scale = self.d_model ** 0.5
        looked_up = self.embedding(x)
        return looked_up * scale

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    The table is precomputed once for ``seq_len`` positions and stored as the
    non-trainable buffer ``pe`` of shape (1, seq_len, d_model):

        pe[pos, 2i]     = sin(pos / 10000 ** (2i / d_model))
        pe[pos, 2i + 1] = cos(pos / 10000 ** (2i / d_model))

    Args:
        d_model: width of each encoding vector (even or odd).
        seq_len: number of positions to precompute.
        dropout: dropout probability applied to the summed output.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Table of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions: (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # Inverse frequencies 1 / 10000 ** (2i / d_model): ceil(d_model / 2) entries
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # Sine on the even columns
        pe[:, 0::2] = torch.sin(position * div_term)
        # Cosine on the odd columns. There are only d_model // 2 of them, so the
        # frequency vector is trimmed; without this an odd d_model fails with a
        # shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Leading batch dimension so the table broadcasts: (1, seq_len, d_model)
        pe = pe.unsqueeze(0)
        # A buffer follows .to(device) and is saved in state_dict but is never trained
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return dropout(x + pe[:, :x.shape[1]]) for x of shape (batch, seq, d_model)."""
        # Buffers never require grad, so no requires_grad_ call is needed here.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

class LayerNormalization(nn.Module):
    """Normalise over the last dimension with one learnable scale and one shift."""

    def __init__(self, eps = 1e-6):
        super().__init__()
        self.eps = eps
        # Single-element parameters, broadcast over every feature.
        self.alpha = nn.Parameter(torch.ones(1))   # multiplicative gain
        self.beta = nn.Parameter(torch.zeros(1))   # additive bias

    def forward(self, x):
        """Return alpha * (x - mean) / (std + eps) + beta along the last axis."""
        mu = x.mean(dim=-1, keepdim=True)
        sigma = x.std(dim=-1, keepdim=True)
        centred = x - mu
        return self.alpha * centred / (sigma + self.eps) + self.beta

class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model: int, d_ff:int, dropout:float):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)   # expands d_model -> d_ff
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)   # projects d_ff -> d_model

    def forward(self,x):
        """Transform x of shape (batch, seq_len, d_model); the shape is preserved."""
        hidden = self.linear_1(x)
        hidden = torch.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

class MultiHeadedAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: model width; must be divisible by ``h``.
        h: number of attention heads.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, d_model:int, h:int, dropout: float):
        super().__init__()
        self.d_model = d_model
        self.h = h
        self.dropout = nn.Dropout(dropout)
        assert d_model % h ==0 , "d_model is not divisible by h"
        self.d_k = d_model // h  # width of each head
        self.w_q = nn.Linear(d_model,d_model,bias=False)
        self.w_k = nn.Linear(d_model,d_model,bias=False)
        self.w_v = nn.Linear(d_model,d_model,bias=False)

        self.w_o = nn.Linear(d_model, d_model)

    @staticmethod
    def attention(query, key,value, mask, dropout:nn.Dropout) :
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Args:
            query, key, value: tensors whose last dimension is d_k.
            mask: optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` receive zero attention weight.
            dropout: optional dropout module applied to the attention weights.

        Returns:
            Tuple ``(context, attention_weights)``.
        """
        d_k = query.shape[-1]
        # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2,-1)) / math.sqrt(d_k)
        if mask is not None:
            # Use the dtype's most negative finite value instead of a literal -1e9:
            # the literal does not fit in float16 and makes masked_fill raise.
            # For float32 the softmax output is unchanged (masked weights are 0).
            fill_value = torch.finfo(attention_scores.dtype).min
            attention_scores = attention_scores.masked_fill(mask == 0, fill_value)
        attention_scores = attention_scores.softmax(dim=-1)
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        return (attention_scores @ value) , attention_scores

    def forward(self,q,k,v,mask):
        """Attend from q over k/v.

        q, k, v have shape (batch, seq_len, d_model); the result has shape
        (batch, q_seq_len, d_model).
        """
        batch_size = q.shape[0]

        query = self.w_q(q)     # (batch , seq_len, d_model)
        key = self.w_k(k)       # (batch , seq_len, d_model)
        value = self.w_v(v)     # (batch , seq_len, d_model)
        # (batch , seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(batch_size, query.shape[1], self.h, self.d_k).transpose(1,2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1,2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1,2)

        # The attention weights are not needed by the forward pass itself.
        x, _ = MultiHeadedAttentionBlock.attention(query, key, value, mask, self.dropout)

        # Merge the heads: (batch, h, seq_len, d_k) --> (batch, seq_len, h*d_k)
        x = x.transpose(1,2).contiguous().flatten(2)
        # Output projection: (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        return self.w_o(x)

class ResidualConnection(nn.Module):
    """Pre-norm skip connection: x + dropout(sublayer(norm(x)))."""

    def __init__(self, dropout: float):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self,x, sublayer):
        """Apply ``sublayer`` to the normalised input and add the result back onto x."""
        normalised = self.norm(x)
        branch = self.dropout(sublayer(normalised))
        return x + branch

class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual wrapper.

    NOTE(review): a further dropout is applied to the block output; this is kept
    exactly as in the original implementation.
    """

    def __init__(self, self_attention_block: MultiHeadedAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        wrappers = [ResidualConnection(dropout) for _ in range(2)]
        self.residual_connection = nn.ModuleList(wrappers)

    def forward(self,x, src_mask):
        """Run x through both sublayers under ``src_mask`` and apply the output dropout."""
        attention_residual = self.residual_connection[0]
        feed_forward_residual = self.residual_connection[1]

        def self_attend(normed):
            # Query, key and value all come from the same (normalised) input.
            return self.self_attention_block(normed, normed, normed, src_mask)

        x = attention_residual(x, self_attend)
        x = feed_forward_residual(x, self.feed_forward_block)
        return self.dropout(x)

class Encoder(nn.Module):
    """Stack of encoder layers followed by a final layer normalisation."""

    def __init__(self, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        """Feed x through every layer in order (same mask each time), then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Each of the three sublayers is wrapped in its own ``ResidualConnection``
    (and therefore has its own layer normalisation).

    NOTE: earlier versions routed the feed-forward sublayer through
    ``residual_connections[1]`` again, leaving ``residual_connections[2]``
    unused. Checkpoints trained that way still load (the parameter names are
    unchanged) but were optimised with the shared normalisation.
    """

    def __init__(self, self_attention_block: MultiHeadedAttentionBlock, cross_attention_block: MultiHeadedAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, target_mask):
        """Decode one layer.

        Args:
            x: decoder states, (batch, seq_len, d_model).
            encoder_output: encoder states, (batch, seq_len, d_model).
            src_mask: mask applied in cross-attention over the encoder states.
            target_mask: mask applied in decoder self-attention.
        """
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x,x,x,target_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        # The feed-forward sublayer uses the third residual connection, not the second.
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x

class Decoder(nn.Module):
    """Stack of decoder layers followed by a final layer normalisation."""

    def __init__(self, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, target_mask):
        """Feed x through every layer with the same encoder output and masks, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, target_mask)
        return self.norm(hidden)

class ProjectionLayer(nn.Module):
    """Linear map from model width to vocabulary size, returning log-probabilities."""

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map (batch, seq_len, d_model) to log-softmax scores (batch, seq_len, vocab_size)."""
        logits = self.proj(x)
        return logits.log_softmax(dim=-1)

class Transformer(nn.Module):
    """Container wiring embeddings, positional encodings, encoder, decoder and projection.

    There is no ``forward``; callers use ``encode``, ``decode`` and ``project``
    separately so the encoder output can be reused across decoding steps.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbedding, tgt_embed: InputEmbedding, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer):
        super().__init__()
        # Registration order is kept as-is: it fixes the order of parameters().
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed the source ids, add positions and run the encoder: (batch, seq_len, d_model)."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self,encoder_output, src_mask, tgt, target_mask ):
        """Embed the target ids, add positions and run the decoder: (batch, seq_len, d_model)."""
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, target_mask)

    def project(self, x):
        """Apply the projection layer to decoder states of shape (batch, seq_len, d_model)."""
        return self.projection_layer(x)

def build_transformer(src_vocab_size, tgt_vocab_size, src_seq_len, tgt_seq_len, d_model = 512, N=6, h= 8, dropout= 0.3, d_ff = 2048):
    """Assemble a Transformer with N encoder and N decoder layers.

    Every parameter with more than one dimension is re-initialised with Xavier
    uniform; one-dimensional parameters keep their construction-time values.
    Modules are created in the same order as before so random-number
    consumption is unchanged.
    """
    # Embeddings first, then positional encodings.
    src_embed = InputEmbedding(d_model, src_vocab_size)
    tgt_embed = InputEmbedding(d_model, tgt_vocab_size)
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # Encoder layers: arguments are evaluated left to right (attention, then feed-forward).
    encoder_blocks = [
        EncoderBlock(
            MultiHeadedAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # Decoder layers: self-attention, cross-attention, then feed-forward.
    decoder_blocks = [
        DecoderBlock(
            MultiHeadedAttentionBlock(d_model, h, dropout),
            MultiHeadedAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Xavier-initialise every matrix-shaped parameter.
    for weight in transformer.parameters():
        if weight.dim() <= 1:
            continue
        nn.init.xavier_uniform_(weight)

    return transformer


In [None]:
from pathlib import Path

def get_config():
    """Return a fresh dictionary holding the default training configuration."""
    config = {}
    # Optimisation settings
    config["batch_size"] = 16
    config["num_epochs"] = 20
    config["lr"] = 10**-4
    # Model / data dimensions
    config["seq_len"] = 150
    config["d_model"] = 512
    # "datasource": 'opus_books' (not used)
    config["lang_src"] = "en"
    config["lang_tgt"] = "hi"
    # Checkpointing, tokenizer files and logging
    config["model_folder"] = "weights"
    config["model_basename"] = "tmodel_"
    config["preload"] = None
    config["tokenizer_file"] = "tokenizer_{0}.json"
    config["experiment_name"] = "runs/tmodel"
    return config

def get_weights_file_path(config, epoch:str):
    """Return '<model_folder>/<model_basename><epoch>.pt' as a string path."""
    folder = config['model_folder']
    filename = "{}{}.pt".format(config['model_basename'], epoch)
    checkpoint = Path('.') / folder / filename
    return str(checkpoint)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class BilingualDataset(Dataset):
    """Wrap a translation dataset and yield fixed-length tensors for training.

    Each item of ``ds`` is indexed as ``item['translation'][lang]``. Sentences
    are tokenised, wrapped with special tokens and padded to ``seq_len``.

    NOTE(review): the [SOS]/[EOS]/[PAD] ids are taken from ``tokenizer_src`` and
    used for the target side as well, so both tokenizers are assumed to assign
    the same ids to these tokens - confirm if the tokenizers are built differently.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.seq_len = seq_len
        # One-element int64 tensors, built directly as integers rather than
        # going through a float32 torch.Tensor and casting back.
        self.sos_token = torch.tensor([tokenizer_src.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_src.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_src.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        """Number of sentence pairs in the wrapped dataset."""
        return len(self.ds)

    def __getitem__(self, idx):
        """Build the tensors for sentence pair ``idx``.

        Returns:
            dict with ``encoder_input``, ``decoder_input`` and ``label`` (each of
            length seq_len), ``encoder_mask`` (1, 1, seq_len), ``decoder_mask``
            (1, seq_len, seq_len) and the raw ``src_text`` / ``tgt_text``.

        Raises:
            ValueError: if either sentence does not fit into ``seq_len`` once
                its special tokens are added.
        """
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Transform the text into token ids
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder input carries [SOS] and [EOS]; the decoder input only [SOS]
        # and the label only [EOS], hence the different offsets.
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        if enc_num_padding_tokens<0 or dec_num_padding_tokens<0:
            raise ValueError("Sentence too long")

        enc_ids = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_ids = torch.tensor(dec_input_tokens, dtype=torch.int64)
        # torch.full builds the padding in one call instead of converting a
        # Python list of one-element tensors.
        pad_id = self.pad_token.item()
        enc_padding = torch.full((enc_num_padding_tokens,), pad_id, dtype=torch.int64)
        dec_padding = torch.full((dec_num_padding_tokens,), pad_id, dtype=torch.int64)

        # Encoder input: [SOS] tokens [EOS] [PAD]...
        encoder_input = torch.cat([self.sos_token, enc_ids, self.eos_token, enc_padding], dim=0)
        # Decoder input: [SOS] tokens [PAD]...
        decoder_input = torch.cat([self.sos_token, dec_ids, dec_padding], dim=0)
        # Label: tokens [EOS] [PAD]...
        label = torch.cat([dec_ids, self.eos_token, dec_padding], dim=0)

        # Double check the size of the tensors to make sure they are all seq_len long
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input, # (seq_len)
            "decoder_input": decoder_input, # (seq_len)
            # Padding positions are masked out of encoder attention: (1, 1, seq_len)
            "encoder_mask": (encoder_input!=self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            # Padding is masked out and each position may only see itself and
            # earlier positions: (1, seq_len, seq_len)
            "decoder_mask": (decoder_input!=self.pad_token).unsqueeze(0).unsqueeze(0).int() & casual_mask(decoder_input.size(0)),
            "label" : label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

def casual_mask(size):
    """Return a boolean mask of shape (1, size, size), True on and below the diagonal."""
    ones = torch.ones(1, size, size)
    strictly_upper = torch.triu(ones, diagonal=1).type(torch.int)
    # Entries above the diagonal are 1 here; everything else (== 0) is allowed.
    allowed = strictly_upper == 0
    return allowed

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

# from dataset import BilingualDataset, casual_mask
# from model import build_transformer
from tqdm import tqdm
import warnings

# importing necessary hugging face libraries
# !pip install datasets
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from pathlib import Path

# from config import get_weights_file_path, get_config

def get_all_sentences(ds,lang):
    """Lazily yield the ``lang`` side of every item's 'translation' entry in ``ds``."""
    yield from (record["translation"][lang] for record in ds)

def get_or_build_tokenizer(config, ds, lang:str):
    """Load the tokenizer for ``lang`` from disk, or train one on ``ds`` and save it.

    The file name comes from ``config['tokenizer_file']`` formatted with ``lang``.
    A new tokenizer is a whitespace-split word-level model with the special tokens
    [UNK], [PAD], [SOS], [EOS] and a minimum word frequency of 2.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Fast path: a tokenizer was already trained and saved.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(special_tokens = ["[UNK]","[PAD]","[SOS]","[EOS]"], min_frequency=2)
    sentences = get_all_sentences(ds, lang)
    tokenizer.train_from_iterator(sentences, trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Greedily decode one source sequence and return the generated token ids.

    The encoder runs once; the decoder is re-run on the growing prefix and the
    highest-scoring next token is appended until [EOS] is produced or the
    sequence reaches ``max_len`` tokens (including the leading [SOS]).

    NOTE(review): the [SOS]/[EOS] ids are read from ``tokenizer_src``;
    ``tokenizer_tgt`` is accepted but not used here.
    """
    sos_idx = tokenizer_src.token_to_id("[SOS]")
    eos_idx = tokenizer_src.token_to_id("[EOS]")

    # The encoder output is computed once and reused for every decoding step.
    encoder_output = model.encode(source, source_mask)

    # Start from a (1, 1) sequence holding only [SOS], with the source's dtype.
    decoder_input = torch.full((1, 1), sos_idx, dtype=source.dtype, device=device)

    while decoder_input.size(1) != max_len:
        # Mask so each position only attends to itself and earlier positions.
        decoder_mask = casual_mask(decoder_input.size(1)).type_as(source_mask).to(device)
        out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

        # Scores for the last position only; greedy choice = highest score.
        scores = model.project(out[:, -1])
        next_word = torch.max(scores, dim=1).indices
        token_id = next_word.item()

        next_token = torch.full((1, 1), token_id, dtype=source.dtype, device=device)
        decoder_input = torch.cat([decoder_input, next_token], dim=1)

        if token_id == eos_idx:
            break

    return decoder_input.squeeze(0)

def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_state, writer, num_examples = 2):
    """Decode the first ``num_examples`` validation batches and print the results.

    The model is switched to eval mode and decoding runs without gradients.
    Each batch must contain exactly one sentence. ``global_state`` and
    ``writer`` are accepted but not used.
    """
    model.eval()

    # Width of the separator line printed before each example.
    console_width = 80
    separator = '-' * console_width

    with torch.no_grad():
        for shown, batch in enumerate(validation_ds, start=1):
            encoder_input = batch["encoder_input"].to(device)
            encoder_mask = batch["encoder_mask"].to(device)

            assert encoder_input.size(0)==1 ,"Batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)
            predicted_ids = model_out.detach().cpu().numpy()

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = tokenizer_tgt.decode(predicted_ids)

            # Report through the supplied callback.
            for line in (
                separator,
                f"SOURCE: {source_text}",
                f"TARGET: {target_text}",
                f"PREDICTED: {model_out_text}",
            ):
                print_msg(line)

            if shown == num_examples:
                break


def get_ds(config):
    """Load the data, build tokenizers and return train/validation loaders.

    Uses the first 20000 rows of the 'train' split of "cfilt/iitb-english-hindi",
    split 90% / 10% at random. Also prints the longest tokenised source and
    target sentence.

    Returns:
        (train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt); the
        validation loader uses batch size 1.
    """
    ds_raw = load_dataset("cfilt/iitb-english-hindi", split='train')
    ds_raw = ds_raw.select(range(20000))

    # Build (or load) one tokenizer per language.
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, config["lang_src"])
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config["lang_tgt"])

    # 90% of the rows go to training, the remainder to validation.
    total_rows = len(ds_raw)
    train_ds_size = int(0.9 * total_rows)
    val_ds_size = total_rows - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    def as_bilingual(subset):
        # Both splits share tokenizers, languages and sequence length.
        return BilingualDataset(subset, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

    train_ds = as_bilingual(train_ds_raw)
    val_ds = as_bilingual(val_ds_raw)

    # Longest tokenised sentence on each side (a single pass over the data).
    token_counts = [
        (
            len(tokenizer_src.encode(row['translation'][config['lang_src']]).ids),
            len(tokenizer_tgt.encode(row['translation'][config['lang_tgt']]).ids),
        )
        for row in ds_raw
    ]
    max_len_src = max((src_n for src_n, _ in token_counts), default=0)
    max_len_tgt = max((tgt_n for _, tgt_n in token_counts), default=0)

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')

    train_dataloader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build the transformer for the given vocabulary sizes.

    ``config['seq_len']`` is used for both the source and target positional
    encodings. The model width is read from ``config['d_model']`` (it was
    previously ignored); configs without that key fall back to 512, the
    default of ``build_transformer``.
    """
    model = build_transformer(
        vocab_src_len,
        vocab_tgt_len,
        config['seq_len'],
        config['seq_len'],
        d_model=config.get('d_model', 512),
    )
    return model

def train_model(config):
    """Train the translation model described by ``config``.

    Creates the weights folder, builds data loaders and model, optionally
    resumes from the checkpoint named by ``config['preload']``, then trains
    for ``config['num_epochs']`` epochs. After every epoch a few validation
    examples are printed and a checkpoint (epoch, model state, optimizer
    state, global step) is saved. The loss is logged to TensorBoard per step.
    """
    # define the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'On Device: {device}')

    # make sure the weights folder exists
    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])

    optimizer = torch.optim.Adam(model.parameters(), lr = config['lr'], eps=1e-9)

    initial_epoch = 0
    global_step = 0
    if config['preload']:
        model_filename = get_weights_file_path(config, config['preload'])
        print(f"Preloading Model {model_filename}")
        # map_location lets a checkpoint saved on GPU be resumed on CPU (and vice versa).
        state = torch.load(model_filename, map_location=device)
        # Restore the model weights too; previously only the optimizer state was
        # loaded, so "resumed" runs silently restarted from fresh weights.
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']

    # Padding positions in the labels do not contribute to the loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id("[PAD]"), label_smoothing=0.1).to(device)
    print("Training the model: ")
    for epoch in range(initial_epoch, initial_epoch + config['num_epochs']):
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:
            # run_validation switches to eval mode, so re-enable training mode here.
            model.train()
            encoder_input = batch["encoder_input"].to(device) # (B, seq_len)
            decoder_input = batch["decoder_input"].to(device) # (B, seq_len)
            encoder_mask = batch["encoder_mask"].to(device)   # (B, 1, 1, seq_len)
            decoder_mask = batch["decoder_mask"].to(device)   # (B, 1, seq_len, seq_len)

            # run the tensors through the transformer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output,encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, target_vocab_size)

            label = batch["label"].to(device) # (B, seq_len)

            # (B,seq_len, tgt_vocab_size) --> (B*seq_len, tgt_vocab_size)
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Log the loss on tensorboard
            writer.add_scalar("train loss", loss.item(), global_step)
            writer.flush()

            # backpropagate the loss
            loss.backward()

            # update the weights
            optimizer.step()
            optimizer.zero_grad()

            global_step+=1

        # Run validation after epoch to see how the model is performing
        run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config["seq_len"], device, lambda msg: batch_iterator.write(msg) ,global_step, writer)

        # save the model at the end of every epoch
        model_filename = get_weights_file_path(config, f"{epoch:02d}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)

    # Release the TensorBoard event file once training is finished.
    writer.close()


# Script entry point: silence all warnings, then train with the default configuration.
if __name__ == "__main__":
    warnings.filterwarnings('ignore')
    config = get_config()
    train_model(config)

On Device: cuda
Max length of source sentence: 70
Max length of target sentence: 98
Training the model: 


Processing Epoch 00: 100%|██████████| 1125/1125 [04:58<00:00,  3.77it/s, loss=4.536]


--------------------------------------------------------------------------------
SOURCE: Show right margin
TARGET: दिखाएँ दायाँ
PREDICTED: प्रोजेक्ट फ़ाइल
--------------------------------------------------------------------------------
SOURCE: _ Select
TARGET: चुनें (_ S) 
PREDICTED: ( _ S )


Processing Epoch 01: 100%|██████████| 1125/1125 [04:58<00:00,  3.77it/s, loss=4.001]


--------------------------------------------------------------------------------
SOURCE: Use pkg - config to add library support from other packages
TARGET: उपयोग को जोड़ें समर्थन से अन्य संकुल
PREDICTED: को से फ़ाइल से . को को नहीं से नहीं को को फ़ाइल से .
--------------------------------------------------------------------------------
SOURCE: Selected accessible
TARGET: चुने गए एक्सेसेबेल
PREDICTED: चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित चयनित च

Processing Epoch 02: 100%|██████████| 1125/1125 [04:57<00:00,  3.78it/s, loss=2.671]


--------------------------------------------------------------------------------
SOURCE: Move a card or build of cards on to the empty slot
TARGET: एक पत्ता या पत्तों के समूह खाली खाँचा पर ले जाएँ
PREDICTED: ~ a को एक खाली तस्वीर खाँचा पर ले जाएँ .
--------------------------------------------------------------------------------
SOURCE: Command
TARGET: कमांड
PREDICTED: कमांड


Processing Epoch 03: 100%|██████████| 1125/1125 [04:58<00:00,  3.77it/s, loss=2.531]


--------------------------------------------------------------------------------
SOURCE: C compiler flags:
TARGET: सी फ्लैग्सः
PREDICTED: सी फ्लैग्सः
--------------------------------------------------------------------------------
SOURCE: % s has no text interface
TARGET: % s का कोई पाठ अंतराफलक नहीं है
PREDICTED: % s का कोई नहीं है से नहीं से . नहीं से . नहीं रिमोट रिमोट रिमोट से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं रिमोट रिमोट रिमोट रिमोट रिमोट से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से रिमोट से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से . नहीं से .


Processing Epoch 04: 100%|██████████| 1125/1125 [04:57<00:00,  3.78it/s, loss=2.271]


--------------------------------------------------------------------------------
SOURCE: Generate patches relative to:
TARGET: बनाएँ (G) कोः
PREDICTED: नया निर्देशिका कोः
--------------------------------------------------------------------------------
SOURCE: Place the four of spades next to the three of spades.
TARGET: हुकुम की दुक्का के बगल में हुकुम के तिक्की को रखें. 
PREDICTED: हुकुम की हुकुम के हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम की हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम की हुकुम हुकुम हुकुम हुकुम की हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम के हुकुम की हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम की हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम की हुकुम की हुकुम हुकुम हुकुम की हुकुम की हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम हुकुम की हुकुम की हु

Processing Epoch 05: 100%|██████████| 1125/1125 [04:57<00:00,  3.78it/s, loss=2.014]


--------------------------------------------------------------------------------
SOURCE: Skip
TARGET: छोड़ें
PREDICTED: छोड़ें
--------------------------------------------------------------------------------
SOURCE: Base Card: ~ a
TARGET: आधार पत्ताः ~ a
PREDICTED: आधार पत्ताः ~ a


Processing Epoch 06: 100%|██████████| 1125/1125 [04:57<00:00,  3.78it/s, loss=2.480]


--------------------------------------------------------------------------------
SOURCE: Ignore Files:
TARGET: लछ्य फाइल
PREDICTED: फ़ाइल निर्देशिका
--------------------------------------------------------------------------------
SOURCE: (error)
TARGET: (त्रुटि) 
PREDICTED: ( त्रुटि )


Processing Epoch 07: 100%|██████████| 1125/1125 [04:57<00:00,  3.78it/s, loss=1.834]


--------------------------------------------------------------------------------
SOURCE: The shortcut is already used by another component in Anjuta. Do you want to keep it anyway?
TARGET: शॉर्टकट है प्रयुक्त द्वारा इंच Anjuta को? 
PREDICTED: Anjuta है इंच प्रयोक्ता Anjuta है इंच Anjuta प्रयोक्ता Anjuta प्रयोक्ता Anjuta प्रयोक्ता को इंच Anjuta प्रयोक्ता Anjuta प्रयोक्ता इंच Anjuta प्रयोक्ता इंच Anjuta है इंच Anjuta इंच Anjuta प्रयोक्ता प्रयोक्ता प्रयोक्ता है इंच Anjuta इंच Anjuta इंच Anjuta इंच Anjuta इंच Anjuta इंच Anjuta इंच Anjuta इंच Anjuta इंच Anjuta इंच प्रयोक्ता इंच प्रयोक्ता प्रयोक्ता को प्रयोक्ता को इंच Anjuta प्रयोक्ता प्रयोक्ता इंच Anjuta प्रयोक्ता इंच Anjuta प्रयोक्ता इंच Anjuta इंच Anjuta इंच Anjuta इंच Anjuta इंच Anjuta इंच Anjuta प्रयोक्ता इंच Anjuta प्रयोक्ता Anjuta प्रयोक्ता Anjuta प्रयोक्ता इंच Anjuta प्रयोक्ता इंच Anjuta प्रयोक्ता प्रयोक्ता प्रयोक्ता इंच Anjuta प्रयोक्ता इंच प्रयोक्ता इंच प्रयोक्ता इंच Anjuta प्रयोक्ता इंच Anjuta प्रयोक्ता प्रयोक्ता प्रयोक्ता प्रयोक्

Processing Epoch 08: 100%|██████████| 1125/1125 [04:57<00:00,  3.78it/s, loss=1.771]


--------------------------------------------------------------------------------
SOURCE: Reshuffle cards
TARGET: पत्तों को फिर फेंटें
PREDICTED: फिर फेंटें फिर फेंटें
--------------------------------------------------------------------------------
SOURCE: Updated:% s
TARGET: अद्यतनीकृत से. 
PREDICTED: अद्यतनीकृत से .


Processing Epoch 09: 100%|██████████| 1125/1125 [04:57<00:00,  3.78it/s, loss=1.593]


--------------------------------------------------------------------------------
SOURCE: Perl Source File
TARGET: पर्ल स्रोत फ़ाइल
PREDICTED: पर्ल स्रोत फ़ाइल
--------------------------------------------------------------------------------
SOURCE: Main Menu
TARGET: मुख्य मेन्यू
PREDICTED: मुख्य खेलः


Processing Epoch 10: 100%|██████████| 1125/1125 [04:57<00:00,  3.79it/s, loss=1.655]


--------------------------------------------------------------------------------
SOURCE: Place the king of clubs next to the queen of clubs.
TARGET: चिड़ी की बेगम के बगल में चिड़ी के वादशाह को रखें. 
PREDICTED: चिड़ी की बेगम के बगल में चिड़ी के बेगम को रखें .
--------------------------------------------------------------------------------
SOURCE: Build
TARGET: बिल्ड
PREDICTED: बिल्ड ( _ B )


Processing Epoch 11: 100%|██████████| 1125/1125 [04:57<00:00,  3.79it/s, loss=1.503]


--------------------------------------------------------------------------------
SOURCE: Move ~ a onto an empty bottom slot.
TARGET: ~ a को एक खाली नीचे स्लॉट में ले जाएँ. 
PREDICTED: ~ a को एक खाली नीचे स्लॉट में ले जाएँ .
--------------------------------------------------------------------------------
SOURCE: _ Selection
TARGET: चयन (_ S) 
PREDICTED: चयन ( _ S )


Processing Epoch 12: 100%|██████████| 1125/1125 [04:57<00:00,  3.78it/s, loss=1.687]


--------------------------------------------------------------------------------
SOURCE: Allow temporary spots use
TARGET: अस्थायी स्पॉट उपयोग स्वीकारें
PREDICTED: अस्थायी स्पॉट उपयोग स्वीकारें
--------------------------------------------------------------------------------
SOURCE: NAME
TARGET: नाम
PREDICTED: नाम


Processing Epoch 13: 100%|██████████| 1125/1125 [04:57<00:00,  3.78it/s, loss=1.585]


--------------------------------------------------------------------------------
SOURCE: End of file
TARGET: फ़ाइल खोलें
PREDICTED: फ़ाइल का नाम
--------------------------------------------------------------------------------
SOURCE: Component for searching
TARGET: खोज रहा है घटक के लिए
PREDICTED: खोज रहा है घटक के लिए


Processing Epoch 14: 100%|██████████| 1125/1125 [04:57<00:00,  3.78it/s, loss=1.414]


--------------------------------------------------------------------------------
SOURCE: Makefile - based project
TARGET: मेकफाइल आधारित परियोजना
PREDICTED: मेकफाइल आधारित परियोजना
--------------------------------------------------------------------------------
SOURCE: Restart the game
TARGET: वर्तमान खेल पुनःप्रारंभ करें
PREDICTED: वर्तमान खेल पुनःप्रारंभ करें


Processing Epoch 15: 100%|██████████| 1125/1125 [04:56<00:00,  3.79it/s, loss=1.572]


--------------------------------------------------------------------------------
SOURCE: Project from Existing Sources
TARGET: परियोजना से सोर्सेज़
PREDICTED: परियोजना से सोर्सेज़
--------------------------------------------------------------------------------
SOURCE: Currently activated plugins
TARGET: अभी सक्रिय प्लगिन
PREDICTED: अभी सक्रिय प्लगिन


Processing Epoch 16: 100%|██████████| 1125/1125 [04:57<00:00,  3.78it/s, loss=1.428]


--------------------------------------------------------------------------------
SOURCE: Add a new file / directory to the CVS tree
TARGET: जोड़ें a नया फ़ाइल निर्देशिका को सीवीएस ट्री
PREDICTED: जोड़ें a नया निर्देशिका जोड़ें ट्री को सीवीएस
--------------------------------------------------------------------------------
SOURCE: Serial Line Connection
TARGET: सीरियल पंक्ति कनेक्शन
PREDICTED: सीरियल पंक्ति कनेक्शन


Processing Epoch 17: 100%|██████████| 1125/1125 [04:56<00:00,  3.79it/s, loss=1.396]


--------------------------------------------------------------------------------
SOURCE: ace of diamonds
TARGET: ईंट का इक्का
PREDICTED: ईंट का इक्का
--------------------------------------------------------------------------------
SOURCE: Field "% s" must contains only letters, digits or the following characters "# $:%% +,. = @ ^ _ `~". In addition you cannot have a leading dash. Please fix it.
TARGET: क्षेत्र से. रखता है अक्षर या अनुसरण कर रहा है अक्षर अन्दर a डैश कृपया. 
PREDICTED: क्षेत्र से . रखता है अक्षर या अनुसरण कर रहा है अक्षर अन्दर a डैश कृपया .


Processing Epoch 18: 100%|██████████| 1125/1125 [04:57<00:00,  3.79it/s, loss=1.343]


--------------------------------------------------------------------------------
SOURCE: Add License Information:
TARGET: लाइसेंस जानकारी जोड़ें
PREDICTED: लाइसेंस जानकारी जोड़ें
--------------------------------------------------------------------------------
SOURCE: _ Select
TARGET: चुनें (_ S) 
PREDICTED: चुनें ( _ S )


Processing Epoch 19: 100%|██████████| 1125/1125 [04:56<00:00,  3.79it/s, loss=1.504]


--------------------------------------------------------------------------------
SOURCE: Author Name:
TARGET: लेखक नामः
PREDICTED: लेखकः
--------------------------------------------------------------------------------
SOURCE: Remember this selection
TARGET: यह चयन याद रखें
PREDICTED: यह चयन याद रखें


In [None]:

import torch
import warnings
warnings.filterwarnings('ignore')

def infer_model(config, src_text: str, *, weights_path: str = "weights/tmodel_17.pt", max_len: int = 100) -> str:
    """Translate ``src_text`` with a trained checkpoint using greedy decoding.

    Args:
        config: Mapping providing at least ``'lang_src'``, ``'lang_tgt'`` and
            ``'seq_len'``; it is also forwarded to ``get_or_build_tokenizer``
            and ``get_model``.
        src_text: Sentence in the source language.
        weights_path: Checkpoint file to load. It must contain a
            ``'model_state_dict'`` entry.
        max_len: Maximum length handed to ``greedy_decode``.

    Returns:
        The decoded target-language text.

    Raises:
        ValueError: If the tokenized sentence plus the [SOS]/[EOS] markers
            does not fit into ``config['seq_len']`` positions.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Step 1: tokenize the source sentence.
    # NOTE(review): None is passed as the dataset, so this presumably relies on
    # tokenizers that were already built and saved during training - confirm.
    tokenizer_src = get_or_build_tokenizer(config, None, lang=config['lang_src'])
    tokenizer_tgt = get_or_build_tokenizer(config, None, lang=config['lang_tgt'])
    src_ids = tokenizer_src.encode(src_text).ids

    sos_id = tokenizer_src.token_to_id('[SOS]')
    eos_id = tokenizer_src.token_to_id('[EOS]')
    pad_id = tokenizer_src.token_to_id('[PAD]')

    # Two positions are reserved for [SOS] and [EOS]. A negative count would
    # silently produce an input longer than seq_len, so fail early instead.
    num_padding = config['seq_len'] - len(src_ids) - 2
    if num_padding < 0:
        raise ValueError(
            f"Sentence is too long: {len(src_ids)} tokens, "
            f"but at most {config['seq_len'] - 2} fit into seq_len={config['seq_len']}"
        )

    # Step 2: encoder input [SOS] tokens [EOS] [PAD]... with a batch dim: (1, seq_len)
    encoder_input = torch.tensor(
        [sos_id, *src_ids, eos_id, *([pad_id] * num_padding)],
        dtype=torch.int64,
    ).unsqueeze(0)
    # Padding mask, 1 for real tokens and 0 for [PAD]: (1, 1, 1, seq_len)
    encoder_input_mask = (encoder_input != pad_id).unsqueeze(0).unsqueeze(0).int()

    # Step 3: build the model and load the trained weights.
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
    state = torch.load(weights_path, map_location=device)
    model.load_state_dict(state['model_state_dict'])
    # A freshly constructed module is in training mode; switch to eval so
    # dropout is disabled and the translation is deterministic.
    model.eval()

    # Step 4: greedy decoding; no gradients are needed at inference time.
    with torch.no_grad():
        model_out = greedy_decode(
            model,
            encoder_input.to(device),
            encoder_input_mask.to(device),
            tokenizer_src,
            tokenizer_tgt,
            max_len,
            device,
        )

    # Step 5: convert the predicted token ids back to text.
    return tokenizer_tgt.decode(model_out.detach().cpu().tolist())


# Interactive driver: read one sentence from stdin, translate it, print the result.
if __name__=='__main__':
    # get_config is defined outside this cell; infer_model reads
    # 'lang_src', 'lang_tgt' and 'seq_len' from the returned config.
    config = get_config()
    # Prompt the user for the source-language sentence.
    src_text = input("Enter: ")
    # Run tokenization, model loading and greedy decoding.
    tgt_text = infer_model(config, src_text)
    print(f'Translation: {tgt_text}')



Enter:  Place the king of clubs next to the queen of clubs.
Translation: चिड़ी की बेगम के बगल में चिड़ी के वादशाह को रखें .


In [None]:
# Google Colab only: mount the user's Google Drive at /content/gdrive.
# The google.colab package is not available outside a Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive
