In [1]:
!pip3 install indic-nlp-library



In [2]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import torch
from indicnlp.tokenize import indic_tokenize

In [3]:
data = pd.read_csv("/kaggle/input/english-hindi-dataset/Sentence pairs in English-Hindi - 2025-02-11.tsv",
                  sep="\t",header=None,names=["SrcSentID","SrcSent","DstSentID","DstSent"])

In [4]:
data.head()

Unnamed: 0,SrcSentID,SrcSent,DstSentID,DstSent
0,1282,Muiriel is 20 now.,485968,म्यूरियल अब बीस साल की हो गई है।
1,1282,Muiriel is 20 now.,2060319,म्यूरियल अब बीस साल की है।
2,1294,Education in this world disappoints me.,485564,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,1302,That won't happen.,2060320,वैसा नहीं होगा।
4,1308,I miss you.,2060321,मुझें तुम्हारी याद आ रही है।


In [5]:
data.drop(labels=[data.columns[0],data.columns[2]],axis=1,inplace=True)

In [6]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,Muiriel is 20 now.,म्यूरियल अब बीस साल की हो गई है।
1,Muiriel is 20 now.,म्यूरियल अब बीस साल की है।
2,Education in this world disappoints me.,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,That won't happen.,वैसा नहीं होगा।
4,I miss you.,मुझें तुम्हारी याद आ रही है।


In [7]:
src_sent_tokenizer = AutoTokenizer.from_pretrained("google-T5/T5-base")

In [8]:
data["SrcSent"] = data["SrcSent"].apply(lambda x: src_sent_tokenizer.tokenize(x))

In [9]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",म्यूरियल अब बीस साल की हो गई है।
1,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",म्यूरियल अब बीस साल की है।
2,"[▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...",मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,"[▁That, ▁won, ', t, ▁happen, .]",वैसा नहीं होगा।
4,"[▁I, ▁miss, ▁you, .]",मुझें तुम्हारी याद आ रही है।


In [10]:
data["DstSent"] = data["DstSent"].apply(lambda x: indic_tokenize.trivial_tokenize(x,lang="hi"))

In [11]:
data["SrcSent"] = data["SrcSent"].apply(src_sent_tokenizer.convert_tokens_to_ids)

In [12]:
Vs = src_sent_tokenizer.get_vocab()

In [13]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[4159, 23, 14018, 19, 460, 230, 5]","[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,"[4159, 23, 14018, 19, 460, 230, 5]","[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,"[466, 751, 31, 17, 1837, 5]","[वैसा, नहीं, होगा, ।]"
4,"[27, 3041, 25, 5]","[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [14]:
hindi_vocab = set()

for tokenized_hindi_sent in data["DstSent"]:
    hindi_vocab.update(tokenized_hindi_sent)

In [15]:
Vd = dict()
for idx, token in enumerate(hindi_vocab):
    Vd[token] = idx + 3
Vd["<PAD>"] = 0
Vd["<SOS>"] = 1
Vd["<EOS>"] = 2

In [16]:
def convert_hindi_tokens_to_ids(hindi_sent):
    return [Vd[token] for token in hindi_sent]

In [17]:
data["DstSent"] = data["DstSent"].apply(lambda x: convert_hindi_tokens_to_ids(x))

In [18]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[4159, 23, 14018, 19, 460, 230, 5]","[2515, 263, 3746, 5583, 2197, 3885, 1170, 4318..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[2515, 263, 3746, 5583, 2197, 4318, 5362]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[6747, 3550, 5424, 1866, 338, 6653, 672, 169, ..."
3,"[466, 751, 31, 17, 1837, 5]","[2101, 3713, 576, 5362]"
4,"[27, 3041, 25, 5]","[4175, 1968, 4400, 5825, 2702, 4318, 5362]"


In [19]:
def insert_sos_token_id(hindi_sent_token_ids):
    return [1] + hindi_sent_token_ids

In [20]:
data["DstSentInput"] = data["DstSent"].apply(lambda x: insert_sos_token_id(x))

In [21]:
def insert_eos_token_id(hindi_sent_token_ids):
    return hindi_sent_token_ids + [2]

In [22]:
data["DstSentLabel"] = data["DstSent"].apply(lambda x: insert_eos_token_id(x))

In [23]:
data.head()

Unnamed: 0,SrcSent,DstSent,DstSentInput,DstSentLabel
0,"[4159, 23, 14018, 19, 460, 230, 5]","[2515, 263, 3746, 5583, 2197, 3885, 1170, 4318...","[1, 2515, 263, 3746, 5583, 2197, 3885, 1170, 4...","[2515, 263, 3746, 5583, 2197, 3885, 1170, 4318..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[2515, 263, 3746, 5583, 2197, 4318, 5362]","[1, 2515, 263, 3746, 5583, 2197, 4318, 5362]","[2515, 263, 3746, 5583, 2197, 4318, 5362, 2]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[6747, 3550, 5424, 1866, 338, 6653, 672, 169, ...","[1, 6747, 3550, 5424, 1866, 338, 6653, 672, 16...","[6747, 3550, 5424, 1866, 338, 6653, 672, 169, ..."
3,"[466, 751, 31, 17, 1837, 5]","[2101, 3713, 576, 5362]","[1, 2101, 3713, 576, 5362]","[2101, 3713, 576, 5362, 2]"
4,"[27, 3041, 25, 5]","[4175, 1968, 4400, 5825, 2702, 4318, 5362]","[1, 4175, 1968, 4400, 5825, 2702, 4318, 5362]","[4175, 1968, 4400, 5825, 2702, 4318, 5362, 2]"


In [24]:
data.drop(labels=[data.columns[1]],axis=1,inplace=True)

In [26]:
X = list(data["SrcSent"])
Y_input = list(data["DstSentInput"])
Y_label = list(data["DstSentLabel"])

In [27]:
X_tensor = [torch.tensor(tokenized_eng_sent_ids) for tokenized_eng_sent_ids in X]
Y_input_tensor = [torch.tensor(tokenized_hin_sent_ids) for tokenized_hin_sent_ids in Y_input]
Y_label_tensor = [torch.tensor(tokenized_hin_sent_ids) for tokenized_hin_sent_ids in Y_label]

In [29]:
X_padded = torch.nn.utils.rnn.pad_sequence(X_tensor,batch_first=True)
Y_padded_input = torch.nn.utils.rnn.pad_sequence(Y_input_tensor,batch_first=True)
Y_padded_label = torch.nn.utils.rnn.pad_sequence(Y_label_tensor,batch_first=True)

In [31]:
Ns = X_padded.shape[1]
Nd = Y_padded_label.shape[1]