In [1]:
import pandas as pd
from transformers import AutoTokenizer
import torch 
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader
import pickle
from indicnlp import indic_tok

In [2]:
# Load the CSV file containing the parallel corpus
df = pd.read_csv("eng_Hindi_data_train.csv", header=None)

In [3]:
df.head()

Unnamed: 0,0,1
0,and deliver us by Thy mercy from the people of...,और अपनी रहमत से हमें इन काफ़िर लोगों (के नीचे)...
1,Transformed position of fourth point,चौथे बिन्दु का रूपांतरित स्थान
2,"Oh, woe to me; I wish I never took so - and - ...",हाए अफसोस काश मै फला शख्स को अपना दोस्त न बनाता
3,The PS file is to be translated into a PDF fil...,पीएस2पीडीएफ के इस्तेमाल से पीएस फ़ाइल को पीडीए...
4,Receiving LDAP search results...,LDAP खोज परिणाम पा रहा है...


In [4]:
input_sentences = df.iloc[:, 0].tolist()
output_sentences = df.iloc[:, 1].tolist()

In [5]:
with open("inp_sent.pkl", 'wb') as f: 
    pickle.dump(input_sentences, f)

In [6]:
with open('out_sent.pkl', 'wb') as f: 
    pickle.dump(output_sentences, f)

In [4]:
df.head()

Unnamed: 0,0,1
0,and deliver us by Thy mercy from the people of...,और अपनी रहमत से हमें इन काफ़िर लोगों (के नीचे)...
1,Transformed position of fourth point,चौथे बिन्दु का रूपांतरित स्थान
2,"Oh, woe to me; I wish I never took so - and - ...",हाए अफसोस काश मै फला शख्स को अपना दोस्त न बनाता
3,The PS file is to be translated into a PDF fil...,पीएस2पीडीएफ के इस्तेमाल से पीएस फ़ाइल को पीडीए...
4,Receiving LDAP search results...,LDAP खोज परिणाम पा रहा है...


In [11]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [26]:
# Tokenize the English and Hindi sentences
english_tokens = tokenizer(df.iloc[:, 0].tolist(), padding=True, truncation=True, return_tensors="pt")
hindi_tokens = tokenizer(df.iloc[:, 1].tolist(), padding=True, truncation=True, return_tensors="pt")

In [27]:
dataset = [(english_tokens["input_ids"][i], hindi_tokens["input_ids"][i]) for i in range(len(df))]

In [28]:
dataset[0][0].shape, dataset[0][1].shape

(torch.Size([512]), torch.Size([512]))

In [34]:
class TranslationDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        source, target = self.dataset[index]
        return {"source": source, "target": target}

In [36]:
my_dat = TranslationDataset(dataset)

In [45]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.dropout = dropout
        self.rnn = nn.GRU(input_dim, hidden_dim, n_layers, dropout=self.dropout)

    def forward(self, src):
        output, hidden = self.rnn(src)
        return output, hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.dropout = dropout
        self.rnn = nn.GRU(output_dim, hidden_dim, n_layers, dropout=self.dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden):
        input = input.unsqueeze(0)
        output, hidden = self.rnn(input, hidden)
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_output, hidden = self.encoder(src)
        input = trg[0,:]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs

# Define the training and evaluation functions
def train(model, optimizer, criterion, train_loader, device):
    model.train()
    epoch_loss = 0
    for batch_idx, temp in enumerate(train_loader):
        src = temp['source']
        trg = temp['target']
        src = src.to(device)
        trg = trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].contiguous().view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(train_loader)

def evaluate(model, criterion, val_loader, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch_idx, (src, trg) in enumerate(val_loader):
            src = src.to(device)
            trg = trg.to(device)
            output = model(src, trg, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].cont

In [37]:
loader = DataLoader(my_dat, batch_size=128, shuffle=True)

In [46]:
i = next(iter(loader))
print(type(i))
print(i)

<class 'dict'>
{'source': tensor([[  101, 10576, 10105,  ...,     0,     0,     0],
        [  101, 12845, 73657,  ...,     0,     0,     0],
        [  101, 11696, 12415,  ...,     0,     0,     0],
        ...,
        [  101, 70827, 98514,  ...,     0,     0,     0],
        [  101, 51418, 10160,  ...,     0,     0,     0],
        [  101,   168, 10243,  ...,     0,     0,     0]]), 'target': tensor([[   101, 101246,  34578,  ...,      0,      0,      0],
        [   101,  38076,  10532,  ...,      0,      0,      0],
        [   101,  21547,    851,  ...,      0,      0,      0],
        ...,
        [   101,    865,  13466,  ...,      0,      0,      0],
        [   101,  10977,    113,  ...,      0,      0,      0],
        [   101,    883,  27155,  ...,      0,      0,      0]])}


In [40]:
i['source'].shape

torch.Size([128, 512])

In [50]:
i['target'].shape, i['target'].dtype

(torch.Size([128, 512]), torch.int64)

: 

In [47]:
encoder = Encoder(512, 512, 8, 0.2)
decoder = Decoder(512, 512, 8, 0.2)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


In [43]:
model = Seq2Seq(encoder, decoder, device)
model

Seq2Seq(
  (encoder): Encoder(
    (rnn): GRU(512, 512, num_layers=8, dropout=0.2)
  )
  (decoder): Decoder(
    (rnn): GRU(512, 512, num_layers=8, dropout=0.2)
    (fc_out): Linear(in_features=512, out_features=512, bias=True)
  )
)

# trying out bert-embeddings

In [1]:
import torch 
from transformers import BertTokenizer, BertModel

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [4]:
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [21]:
input_sentences = ['Hello, how are you?']
output_sentences = ['नमस्ते, आप कैसे हैं?']

In [22]:
input_tokens = tokenizer(input_sentences, padding="max_length", truncation=True, max_length=256, return_tensors='pt')
print(input_tokens)

{'input_ids': tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,

In [23]:
input_ids = input_tokens['input_ids']
input_mask = input_tokens['attention_mask']

print(type(input_ids))
print(type(input_mask))

<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [24]:
print(input_ids.shape)
print(input_mask.shape)

torch.Size([1, 256])
torch.Size([1, 256])


In [27]:
with torch.no_grad():
    bert_outputs = bert_model(input_ids, input_mask)

In [28]:
bert_outputs[0].shape

torch.Size([1, 256, 768])

In [18]:
bert_outputs['last_hidden_state'].shape, bert_outputs['pooler_output'].shape

(torch.Size([2, 256, 768]), torch.Size([2, 768]))