In [15]:
import pandas as pd
import numpy as np
import re
import string


In [16]:

# Read the CSV file
df = pd.read_csv('Data/Hindi/hindi_english_parallel.csv')

In [17]:
df.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [20]:
# Preprocessing function
def preprocess_text(text):
    try:
        # Convert to lowercase
        text = text.lower()

        # Remove numbers
        text = re.sub(r'\d+', '', text)

        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Additional preprocessing steps specific to Hindi may be required

        return text
    except:
        return text

# Apply preprocessing to the English and Hindi columns
df['english'] = df['english'].apply(preprocess_text)
df['hindi'] = df['hindi'].apply(preprocess_text)

In [21]:
df.dropna(inplace=True)

In [22]:
df.describe()

Unnamed: 0,hindi,english
count,1555727.0,1555727.0
unique,967488.0,974298.0
top,,
freq,2214.0,2637.0


In [156]:
df.tail()

Unnamed: 0,hindi,english
1561835,members making oathaffirmation,शपथ लेनेप्रतिज्ञान करने वाले सदस्य
1561836,स्पष्टीकरण–जहां इस उपधारा के अधीन हानि और लाभ ...,स्पष्टीकरण–जहां इस उपधारा के अधीन हानि और लाभ ...
1561837,मैंने गौर किया है कि यह न केवल अपने महत्त्वपूर...,है। i note that this is a landmark meeting – n...
1561838,उन्होंने मेरे समक्ष जो प्रदर्शन किया उसमें से ...,है। in the presentations that they made before...
1561839,खाद्य और जल सुरक्षा पर्यावरण की दृष्टि से वहनी...,्त है। issues such as food and water security ...


In [23]:
df.shape

(1555727, 2)

In [14]:
df.to_csv('preprocessed_english_hindi_dataset.csv', index=False)


In [24]:
df_hin = pd.read_csv('preprocessed_english_hindi_dataset.csv')

In [25]:
df_hin.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,accerciser accessibility explorer
2,निचले पटल के लिए डिफोल्ट प्लगइन खाका,the default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका,the default plugin layout for the top panel
4,उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष...,a list of plugins that are disabled by default


In [157]:
df = df.head(10000)

In [158]:
df.shape

(10000, 2)

In [159]:
df.describe()

Unnamed: 0,hindi,english
count,10000,10000.0
unique,2015,2652.0
top,number,
freq,56,65.0


In [160]:
SRC_LANGUAGE = 'english'
TGT_LANGUAGE = 'hindi'

In [161]:
# Get the source and target sentences from the dataframe
src_sentences = df['english'].tolist()
tgt_sentences = df['hindi'].tolist()

In [162]:
tokenizer = get_tokenizer('basic_english') 

In [163]:
# Place-holders
token_transform = {}
vocab_transform = {}

In [164]:
tokenized_src_sentences = [tokenizer(sentence) for sentence in src_sentences]
tokenized_tgt_sentences = [tokenizer(sentence) for sentence in tgt_sentences]

In [165]:
vocab_transform = build_vocab_from_iterator(tokenized_src_sentences + tokenized_tgt_sentences)

In [166]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [167]:
class TranslationDataset(Dataset):
    def __init__(self, src_sentences, tgt_sentences, src_vocab, tgt_vocab):
        self.src_sentences = src_sentences
        self.tgt_sentences = tgt_sentences
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.src_sentences)

    def __getitem__(self, index):
        src = self.src_sentences[index]
        tgt = self.tgt_sentences[index]
        return src, tgt

In [168]:
def custom_collate_fn(batch):
    src_sentences, tgt_sentences = zip(*batch)
    
    # Convert source and target sentences to tensors and indices
    src_tensor = [torch.tensor([src_vocab[word] for word in sentence]) for sentence in src_sentences]
    tgt_tensor = [torch.tensor([tgt_vocab[word] for word in sentence]) for sentence in tgt_sentences]

    # Pad the sequences within the batch
    src_padded = pad_sequence(src_tensor, batch_first=True)
    tgt_padded = pad_sequence(tgt_tensor, batch_first=True)

    return src_padded, tgt_padded

In [169]:
# Define your encoder-decoder translation model
class EncoderDecoder(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_size, hidden_size):
        super(EncoderDecoder, self).__init__()
        self.embedding = nn.Embedding(src_vocab_size, embed_size)
        self.encoder = nn.GRU(embed_size, hidden_size)
        self.decoder = nn.GRU(embed_size, hidden_size)
        self.fc = nn.Linear(hidden_size, tgt_vocab_size)

    def forward(self, src, tgt):
        src_embedded = self.embedding(src)
        encoder_output, encoder_hidden = self.encoder(src_embedded)

        # Ensure tgt tensor has valid indices within the range of tgt_vocab_size
        tgt_indices = (tgt >= 0) & (tgt < tgt_vocab_size)
        tgt_filtered = tgt.masked_fill(~tgt_indices, 0)  # Replace out-of-range indices with 0

        tgt_embedded = self.embedding(tgt_filtered)
        decoder_output, _ = self.decoder(tgt_embedded, encoder_hidden)
        output = self.fc(decoder_output)
        return output

In [182]:
input_size

200371

In [183]:
# Define hyperparameters
input_size = 512 #len(src_vocab)
hidden_size = 128
output_size = 512 #len(tgt_vocab)
learning_rate = 0.01
batch_size = 2
num_epochs = 5
embed_size = 128  

In [184]:
# Create your translation dataset and data loader with custom collate function
translation_dataset = TranslationDataset(tokenized_src_sentences, tokenized_tgt_sentences, src_vocab, tgt_vocab)
dataloader = DataLoader(translation_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)

In [185]:
# Initialize your encoder-decoder model
model = EncoderDecoder(input_size, hidden_size, output_size, embed_size)

In [186]:
print("Source sentence:", translation_dataset[0][0])
print("Target sentence:", translation_dataset[0][1])

Source sentence: ['give', 'your', 'application', 'an', 'accessibility', 'workout']
Target sentence: ['अपने', 'अनुप्रयोग', 'को', 'पहुंचनीयता', 'व्यायाम', 'का', 'लाभ', 'दें']


In [187]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [188]:
from torchtext.vocab import build_vocab_from_iterator

# Assuming `tokenized_src_sentences` and `tokenized_tgt_sentences` are your tokenized sentences

# Build source vocabulary
src_vocab = build_vocab_from_iterator(tokenized_src_sentences, specials=["<unk>", "<pad>", "<sos>", "<eos>"])
src_vocab.set_default_index(0)  # Set the default index for unknown words

# Build target vocabulary
tgt_vocab = build_vocab_from_iterator(tokenized_tgt_sentences, specials=["<unk>", "<pad>", "<sos>", "<eos>"])
tgt_vocab.set_default_index(0)  # Set the default index for unknown words


In [191]:
for epoch in range(num_epochs):
    running_loss = 0.0
    for src, tgt in dataloader:
        print(src)
        print(tgt)
        optimizer.zero_grad()

        # Forward pass
        outputs = model(src, tgt)

        # Calculate loss
        loss = criterion(outputs.view(-1, output_size), tgt.view(-1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Print average loss for the epoch
    average_loss = running_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}")

tensor([[ 69, 266,  18],
        [292,   0,   0]])
tensor([[284,  33, 134,   7],
        [353,   0,   0,   0]])


RuntimeError: Expected hidden size (1, 4, 128), got [1, 3, 128]

In [136]:
df

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,accerciser accessibility explorer
2,निचले पटल के लिए डिफोल्ट प्लगइन खाका,the default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका,the default plugin layout for the top panel
4,उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष...,a list of plugins that are disabled by default
...,...,...
1561835,members making oathaffirmation,शपथ लेनेप्रतिज्ञान करने वाले सदस्य
1561836,स्पष्टीकरण–जहां इस उपधारा के अधीन हानि और लाभ ...,स्पष्टीकरण–जहां इस उपधारा के अधीन हानि और लाभ ...
1561837,मैंने गौर किया है कि यह न केवल अपने महत्त्वपूर...,है। i note that this is a landmark meeting – n...
1561838,उन्होंने मेरे समक्ष जो प्रदर्शन किया उसमें से ...,है। in the presentations that they made before...
