In [1]:
import pandas as pd
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Reserved vocabulary indices. Lang starts numbering real letters at 2, so these
# two values never collide with a character.
SOW_token = 0  # start-of-word: the first input DecoderRNN feeds itself
EOW_token = 1  # end-of-word: appended to every encoded word
# The CSV is read without a header row, so its columns are addressed as 0 and 1.
# The rest of the notebook treats column 0 as the source word and column 1 as the
# target word.
x_train = pd.read_csv('/kaggle/input/aksharantar/aksharantar_sampled/hin/hin_train.csv', header=None)  # add nrows=10000 here for a quick smoke run

class Lang:
    """Character vocabulary for one side of a transliteration pair.

    Indices 0 and 1 are pre-assigned and rendered as the strings "0" and "1".
    Every other character receives the next free index, starting at 2, in
    order of first appearance.

    Attributes:
        name: label given at construction time.
        letter2index: character -> index.
        letter2count: character -> number of times it has been added.
        index2letter: index -> character (includes the two reserved entries).
        n_letters: number of indices handed out so far, reserved ones included.
    """

    def __init__(self, name):
        self.name = name
        self.letter2index = {}
        self.letter2count = {}
        self.index2letter = {0: "0", 1: "1"}
        self.n_letters = 2  # the two reserved indices are already taken

    def addWord(self, word):
        """Register every character of ``word``, in order."""
        for symbol in word:
            self.addLetter(symbol)

    def addLetter(self, ch):
        """Count one occurrence of ``ch``; assign it a fresh index if unseen."""
        if ch in self.letter2index:
            self.letter2count[ch] += 1
            return

        fresh_index = self.n_letters
        self.letter2index[ch] = fresh_index
        self.letter2count[ch] = 1
        self.index2letter[fresh_index] = ch
        self.n_letters = fresh_index + 1

In [3]:
# Both vocabularies start with only the two reserved indices; get_dataloader
# adds the real characters while it encodes the training pairs.
input_lang = Lang('eng')  # vocabulary passed as the source side to get_dataloader
output_lang = Lang('hin')  # vocabulary passed as the target side to get_dataloader

In [4]:
# Fixed sequence width: get_dataloader pads every encoded word to this many
# positions, and DecoderRNN.forward always unrolls this many decoding steps.
MAX_LENGTH = 45

def indexesFromWord(lang, word):
    """Return the vocabulary index of each character of ``word``, in order.

    Raises:
        KeyError: if a character is not present in ``lang.letter2index``.
    """
    indexes = []
    for symbol in word:
        indexes.append(lang.letter2index[symbol])
    return indexes

def tensorFromWord(lang, word):
    """Encode ``word`` as a single-row long tensor terminated by EOW_token.

    The result has shape (1, len(word) + 1) and is created on ``device``.
    """
    ids = indexesFromWord(lang, word) + [EOW_token]
    encoded = torch.tensor(ids, dtype=torch.long, device=device)
    return encoded.view(1, -1)

def wordFromTensor(lang, tensor):
    """Turn a sequence of index tensors back into text.

    Elements are looked up in ``lang.index2letter`` one by one; decoding stops
    (without emitting anything) at the first element equal to 1, which is the
    value of EOW_token in this notebook.
    """
    letters = []
    for element in tensor:
        index = element.item()
        if index == 1:
            break
        letters.append(lang.index2letter[index])
    return "".join(letters)

def get_dataloader(x, input_lang, output_lang, batch_size):
    """Encode the word pairs in ``x`` and wrap them in a shuffling DataLoader.

    ``x[0][i]`` is read as the i-th source word and ``x[1][i]`` as the i-th
    target word. Both vocabularies are extended in place with any new
    characters while the rows are encoded. Each word becomes one row of
    MAX_LENGTH indices: its character indices, then EOW_token, then zeros as
    padding (0 is also the value of SOW_token).

    Returns:
        DataLoader yielding (input_ids, target_ids) LongTensor batches that
        live on ``device``, drawn in random order, ``batch_size`` rows at a
        time.
    """
    n_pairs = len(x[0])
    padded_shape = (n_pairs, MAX_LENGTH)
    input_ids = np.zeros(padded_shape, dtype=np.int32)
    target_ids = np.zeros(padded_shape, dtype=np.int32)

    for row in range(n_pairs):
        # Grow the vocabularies first so the lookups below cannot miss.
        source_word = x[0][row]
        input_lang.addWord(source_word)
        target_word = x[1][row]
        output_lang.addWord(target_word)

        source_row = indexesFromWord(input_lang, source_word) + [EOW_token]
        target_row = indexesFromWord(output_lang, target_word) + [EOW_token]

        input_ids[row, :len(source_row)] = source_row
        target_ids[row, :len(target_row)] = target_row

    dataset = TensorDataset(
        torch.LongTensor(input_ids).to(device),
        torch.LongTensor(target_ids).to(device),
    )
    return DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

In [5]:
class EncoderRNN(nn.Module):
    """Character encoder: embedding -> dropout -> 5-layer batch-first GRU.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        hidden_size: width of both the embeddings and the GRU state.
        dropout_p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.algo = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=5,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Encode a (batch, seq) tensor of indices.

        Returns:
            (output, hidden) exactly as produced by the GRU: per-step outputs
            of shape (batch, seq, hidden_size) and the final state of shape
            (5, batch, hidden_size).
        """
        regularised = self.dropout(self.embedding(input))
        return self.algo(regularised)

In [6]:
class DecoderRNN(nn.Module):
    """Character decoder: embedding -> ReLU -> 5-layer batch-first GRU -> linear.

    Args:
        hidden_size: width of the embeddings and of the GRU state.
        output_size: number of output classes (target vocabulary size).
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=hidden_size)
        self.algo = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=5,
            batch_first=True,
        )
        self.out = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Unroll the decoder for exactly MAX_LENGTH steps.

        ``encoder_outputs`` is used only for its batch size. Decoding starts
        from SOW_token with ``encoder_hidden`` as the initial state. When
        ``target_tensor`` is given, step i + 1 is fed ``target_tensor[:, i]``
        (teacher forcing); otherwise it is fed the arg-max of step i. There is
        no early stop once the end-of-word index has been produced.

        Returns:
            (log_probs, final_hidden, None), where ``log_probs`` has shape
            (batch, MAX_LENGTH, output_size). The trailing None keeps the
            return arity fixed for the training loop.
        """
        n_words = encoder_outputs.size(0)
        step_input = torch.full((n_words, 1), SOW_token, dtype=torch.long, device=device)
        state = encoder_hidden
        step_scores = []
        teacher_forcing = target_tensor is not None

        for step in range(MAX_LENGTH):
            scores, state = self.forward_step(step_input, state)
            step_scores.append(scores)

            if teacher_forcing:
                # Ground-truth character at this position becomes the next input.
                step_input = target_tensor[:, step].unsqueeze(1)
            else:
                # Greedy choice; detached so it is treated as plain input data.
                best = scores.topk(1)[1]
                step_input = best.squeeze(-1).detach()

        log_probs = F.log_softmax(torch.cat(step_scores, dim=1), dim=-1)
        return log_probs, state, None

    def forward_step(self, input, hidden):
        """Advance one decoding step.

        Returns:
            (scores, hidden): unnormalised class scores from the linear layer
            and the updated GRU state.
        """
        activated = F.relu(self.embedding(input))
        gru_out, hidden = self.algo(activated, hidden)
        return self.out(gru_out), hidden

In [7]:
from sklearn.metrics import accuracy_score 

def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, batch_size):
    """Run one teacher-forced training pass over ``dataloader``.

    Every position of every target row, padding included, is passed to
    ``criterion`` and is included in the exact-match comparison.

    Args:
        dataloader: yields (input_tensor, target_tensor) index batches.
        encoder, decoder: the modules being trained.
        encoder_optimizer, decoder_optimizer: one optimizer per module.
        criterion: loss applied to (flattened log-probs, flattened targets).
        batch_size: nominal batch size; used only to label the progress log
            with an approximate count of samples seen.

    Returns:
        (mean loss per batch, number of words predicted exactly at every
        position).
    """
    total_loss = 0
    correct = 0

    for k, (input_tensor, target_tensor) in enumerate(dataloader):
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # Flatten (batch, steps, classes) -> (batch * steps, classes) for the loss.
        outputs = decoder_outputs.view(-1, decoder_outputs.size(-1))
        labels = target_tensor.view(-1)

        loss = criterion(outputs, labels)
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)

        # Sequence width comes from the batch itself rather than a literal, so
        # any batch size (including a short final batch) is handled.
        seq_len = target_tensor.size(1)

        if k % 100 == 0:
            # `correct` here is the running total *before* this batch is scored.
            print(k * batch_size, loss.item(), correct)
            print(wordFromTensor(input_lang, input_tensor[0]), wordFromTensor(output_lang, target_tensor[0]), wordFromTensor(output_lang, predicted[:seq_len]))

        # A word counts as correct only if all of its positions match.
        word_is_exact = (predicted.view_as(target_tensor) == target_tensor).all(dim=1)
        correct += int(word_is_exact.sum().item())

    print('\n')
    return total_loss / len(dataloader), correct

In [8]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001, batch_size=50):
    """Train ``encoder`` and ``decoder`` for ``n_epochs`` passes over the data.

    Each module gets its own Adam optimizer at ``learning_rate`` and the loss
    is NLLLoss. After every epoch the epoch number, the mean loss and the
    exact-match count returned by train_epoch are printed. Returns nothing.
    """
    enc_opt, dec_opt = (
        optim.Adam(module.parameters(), lr=learning_rate)
        for module in (encoder, decoder)
    )
    nll = nn.NLLLoss()

    for finished in range(n_epochs):
        epoch = finished + 1
        loss, acc = train_epoch(
            train_dataloader, encoder, decoder, enc_opt, dec_opt, nll, batch_size
        )
        print(epoch, loss, acc)

In [9]:
def evaluate(encoder, decoder, word, input_lang, output_lang):
    """Greedily decode the transliteration of a single ``word``.

    The word is encoded as a batch of one, run through the encoder and the
    decoder (no teacher forcing) under ``torch.no_grad()``, and the arg-max
    index at each step is mapped back through ``output_lang.index2letter``.

    This function does not switch the modules to eval mode; call
    ``encoder.eval()`` / ``decoder.eval()`` beforehand if dropout should be
    disabled.

    Returns:
        List of decoded characters. If the end-of-word index is produced, the
        list ends with the marker '1' and decoding stops there; otherwise it
        contains one character per decoding step.
    """
    with torch.no_grad():
        input_tensor = tensorFromWord(input_lang, word)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, _ = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        # topi is (1, steps, 1): drop the top-k axis, then select the single
        # batch row so the loop walks over steps rather than over the batch.
        decoded_ids = topi.squeeze(-1)[0]

        decoded_letters = []
        for idx in decoded_ids.tolist():
            if idx == EOW_token:
                decoded_letters.append('1')
                break
            decoded_letters.append(output_lang.index2letter[idx])
    return decoded_letters

def evaluateRandomly(encoder, decoder, input_lang, output_lang, n=10):
    """Print source, reference and prediction for the first ``n`` rows of x_train.

    Despite the name, rows 0 .. n-1 are taken in order; nothing is sampled.
    The predicted characters are printed separated by single spaces.
    """
    for row in range(n):
        source_word = x_train[0][row]
        print('in', source_word)
        print('out', x_train[1][row])
        predicted_letters = evaluate(encoder, decoder, source_word, input_lang, output_lang)
        print('pred', ' '.join(predicted_letters))
        print('')

In [None]:
# Model width (shared by the embeddings and the GRU state) and mini-batch size.
hidden_size = 128
batch_size = 32

# Building the loader also fills input_lang / output_lang with the characters
# it encounters, so it must run before the models are sized from n_letters.
train_dataloader = get_dataloader(x_train, input_lang, output_lang, batch_size)
# test_dataloader = get_dataloader(x_test, input_lang, output_lang, batch_size)

encoder = EncoderRNN(input_lang.n_letters, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, output_lang.n_letters).to(device)
# Vocabulary sizes (source, target), reserved indices included.
print(input_lang.n_letters, output_lang.n_letters)
# Positional arguments: 10 epochs at learning rate 0.01.
train(train_dataloader, encoder, decoder, 10, 0.01, batch_size)
# Evaluation is currently disabled; un-comment the lines below to print sample predictions.
# encoder.eval()
# decoder.eval()
# evaluateRandomly(encoder, decoder, input_lang, output_lang)

28 66
0 4.147501468658447 0
accumulator एक्यूमुलेटर ॅइइइइइइइइइइइइइइइइइइइइइझझझझझझझझझझझझझझझझझझझझझझझ
3200 0.740950882434845 0
accommodotion एकोमोडोशन किर्रा
6400 0.5725782513618469 0
awarenesh अवेयरनेश पल्ल्ाा
9600 0.5909155011177063 0
kebehtar केबेहतर बाडा
12800 0.5474437475204468 0
rangarezi रंगरेज़ी मालाा
16000 0.5747870802879333 0
sekhanbilag सेखनबिलग सुबाााय
19200 0.5049561262130737 0
azmatein अज़मतें अलाााा
22400 0.43273666501045227 0
maghrol माघरोल माडाेड
25600 0.42726466059684753 0
nooranganj नूरनगंज निरााार
28800 0.5175405740737915 0
malayaza मलयज माााा0000000000000000000000000000000000000000
32000 0.42226630449295044 1
pulindo पुलिंदो पाड्डडा
35200 0.40757158398628235 1
pixelmator पिक्सलमेटर पिरिराााल
38400 0.4173910319805145 2
dugai दुगई दाला
41600 0.5164890289306641 2
matadhikyanchi मताधिक्यांची मृ्ठ्च्याचचा
44800 0.4086759686470032 2
rajupark राजूपार्क रां्परर
48000 0.36685308814048767 3
allotments अलॉटमेंट्स अलममििसस


1 0.5067837456986308 4
0 0.42245355248451233 0
karjafedi