In [8]:
# Importing Libraries

from io import open
import unicodedata
import string
import re
import random
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import pandas as pd

# Load Training, Validation, and Test Data

df = pd.read_csv('/kaggle/input/aksharantar/aksharantar_sampled/hin/hin_train.csv',names = ["English",'Hindi'],header = None)
df_val=pd.read_csv('/kaggle/input/aksharantar/aksharantar_sampled/hin/hin_valid.csv',names=["English","Hindi"],header=None)
df_test=pd.read_csv('/kaggle/input/aksharantar/aksharantar_sampled/hin/hin_test.csv',names=["English","Hindi"],header=None)

maxlength_english=0
maxlength_hindi=0

# Encoder Dictionary Creation
hindi_to_index = {'SOS_token': 0, 'EOS_token': 1, 'PAD_token': 2}
english_to_index = {'SOS_token': 0, 'EOS_token': 1, 'PAD_token': 2}


# Make dictionary for English alphabets
english_alphabets = 'abcdefghijklmnopqrstuvwxyz'
for idx, alphabet in enumerate(english_alphabets):
    english_to_index[alphabet] = idx + 3

# Make dictionary for Hindi characters
hindi_characters = set()
for x in range(len(df)):
    english_word=df.iloc[x]['English']
    hindi_word = df.iloc[x]['Hindi']
    maxlength_english=max(maxlength_english,len(english_word))
    maxlength_hindi=max(maxlength_hindi,len(hindi_word))
    hindi_characters.update(hindi_word) 


for x in range(len(df_test)):
    english_word=df_test.iloc[x]['English']
    hindi_word = df_test.iloc[x]['Hindi']
    maxlength_english=max(maxlength_english,len(english_word))
    maxlength_hindi=max(maxlength_hindi,len(hindi_word))
    hindi_characters.update(hindi_word) 

start = 3
for i, char in enumerate(hindi_characters):
    hindi_to_index[char] = start + i

# Printing the created dictionaries
print(hindi_to_index)
print(english_to_index)
maxlength_hindi+=3

# Decoder Dictionary Creation
index_to_hindi = {v: k for k, v in hindi_to_index.items()}
index_to_english = {v: k for k, v in english_to_index.items()}
print(index_to_english)
print(index_to_hindi)


#functions to create the encodings required for English and hindi words
def encode_words_english(language,df):
    encoded_words=[]
    maxlength=maxlength_english+1
    to_index=english_to_index
    
    for _, row in df.iterrows():
        language_word = row['English']
        word = torch.zeros(maxlength, dtype=torch.long)+2
        for idx, char in enumerate(language_word):
            word[idx] = to_index[char]
        word[len(language_word)]=to_index['EOS_token']
        encoded_words.append(word)
    encoded_words = torch.stack(encoded_words)
    return encoded_words

def encode_words_hindi(language,df):
    encoded_words=[]
    maxlength=maxlength_hindi
    to_index=hindi_to_index
    
    for _, row in df.iterrows():
        language_word = row['Hindi']
        word = torch.zeros(maxlength, dtype=torch.long)+2
        word[0]=to_index['SOS_token']
        for idx, char in enumerate(language_word):
            word[idx+1] = to_index[char]
        word[len(language_word)]=to_index['EOS_token']
        encoded_words.append(word)
    encoded_words = torch.stack(encoded_words)
    return encoded_words


#contains encoding for training ,validation and testing data.
english_encoded_words=encode_words_english('English',df)
hindi_encoded_words=encode_words_hindi('Hindi',df)
english_encoded_words_val=encode_words_english('English',df_val)
hindi_encoded_words_val=encode_words_hindi('Hindi',df_val)
english_encoded_words_test=encode_words_english('English',df_test)
hindi_encoded_words_test=encode_words_hindi('Hindi',df_test)


{'SOS_token': 0, 'EOS_token': 1, 'PAD_token': 2, 'ऋ': 3, 'ब': 4, 'औ': 5, 'झ': 6, 'ज': 7, 'छ': 8, '्': 9, 'ङ': 10, 'ॉ': 11, 'ं': 12, 'ी': 13, 'फ': 14, 'ऑ': 15, 'त': 16, 'ौ': 17, 'ख': 18, 'थ': 19, 'क': 20, 'ह': 21, 'इ': 22, 'आ': 23, 'ु': 24, 'द': 25, 'ि': 26, 'य': 27, 'ँ': 28, 'ऽ': 29, 'भ': 30, 'ः': 31, 'ई': 32, 'ट': 33, 'ॅ': 34, 'ळ': 35, 'ऐ': 36, 'ऊ': 37, 'ओ': 38, '़': 39, 'ञ': 40, 'म': 41, 'ू': 42, 'ल': 43, 'न': 44, 'अ': 45, 'ठ': 46, 'ढ': 47, 'ध': 48, 'च': 49, 'उ': 50, 'ए': 51, 'प': 52, 'ण': 53, 'र': 54, 'ग': 55, 'ो': 56, 'व': 57, 'ष': 58, 'स': 59, 'ॊ': 60, 'ा': 61, 'ड': 62, 'ृ': 63, 'े': 64, 'श': 65, 'ै': 66, 'घ': 67}
{'SOS_token': 0, 'EOS_token': 1, 'PAD_token': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'y': 27, 'z': 28}
{0: 'SOS_token', 1: 'EOS_token', 2: 'PAD_token', 3: 'a', 4: 'b', 5: 'c', 6: 'd', 7: 'e', 8: 'f', 

In [9]:
#function to reshape the hidden layer 
def reshape_arr(x,num_layers):
    for i in range(1,num_layers,+2):
        if(i==1):tmp=x[i]
        else:tmp+=x[i]
    tmp1=tmp.repeat(num_layers,1,1)
    return tmp1

In [10]:
#importing wandb
import wandb
wandb.login(key='a9d4ee5e3628e01c0f6f0fa50e59e7be9438d147')
wandb.init(project="dl_assignment3")


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcs22m013[0m. Use [1m`wandb login --relogin`[0m to force relogin


**Q3.Implemented RNN,GRU and LSTM**

In [None]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size,embedding_size,num_layers,drop,cell_type,bidirection=True):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_size=embedding_size
        self. num_layers=num_layers
        self.dropout=nn.Dropout(drop)
        self.embedding = nn.Embedding(input_size, embedding_size).to(device)
        self.bidirectional=bidirection
        self.lstm=nn.LSTM(embedding_size,hidden_size,num_layers,dropout=drop,batch_first=False,bidirectional=bidirection).to(device)
        self.rnn = nn.RNN(embedding_size, hidden_size,num_layers,dropout=drop,batch_first=False,bidirectional=bidirection).to(device)
        self.gru = nn.GRU(embedding_size, hidden_size,num_layers,dropout=drop,batch_first=False,bidirectional=bidirection).to(device)
        self.cell_type=cell_type
    def forward(self, input):
        #input:(seq_length,N)
        input=input.T
#         print("einput ",input.shape)
        embedded = self.dropout(self.embedding(input))
#         print("eembed ",embedded.shape)
        #embedded:(seq_length,N,embedding_size)
        if(self.cell_type=="LSTM"):
            output,(hidden,cell)=self.lstm(embedded)
#             print('encodero',output.shape)
#             print('enchid',hidden.shape)
#             print('enccell',cell.shape)
            if(self.bidirectional):
                hidden=reshape_arr(hidden,self.num_layers)
                cell=reshape_arr(cell,self.num_layers)
            return output,(hidden,cell)
            
        if(self.cell_type=="GRU"):
            output, hidden = self.gru(embedded)

        if(self.cell_type=="RNN"):
            output,hidden=self.rnn(embedded)
             
        if(self.bidirectional):
            hidden=reshape_arr(hidden,self.num_layers)
        return  output,hidden
        


class DecoderRNN(nn.Module):
    def __init__(self, input_size,hidden_size, output_size,embedding_size,num_layers,drop,cell_type):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_size=embedding_size
        self. num_layers=num_layers
        self.dropout=nn.Dropout(drop)
        self.embedding = nn.Embedding(input_size, embedding_size).to(device)
        self.lstm=nn.LSTM(embedding_size,hidden_size,num_layers,dropout=drop,batch_first=False).to(device)
        self.rnn = nn.RNN(embedding_size, hidden_size,num_layers,dropout=drop,batch_first=False).to(device)
        self.gru = nn.GRU(embedding_size, hidden_size,num_layers,dropout=drop,batch_first=False).to(device)
        self.cell_type=cell_type
        
        self.fc_out = nn.Linear(hidden_size, output_size).to(device)

    def forward(self, input,hidden,cell):
        
        input=input.T
        
        embedded = self.dropout(self.embedding(input))
        #embedded = [1, batch size,embedding_size]
        
        if(self.cell_type=="RNN"):
            output,hidden = self.rnn(embedded,hidden)
        if(self.cell_type=='GRU'):
            output,hidden = self.gru(embedded,hidden)
        if(self.cell_type=="LSTM"):
            output,(hidden,cell)=self.lstm(embedded,(hidden,cell))
            prediction = self.fc_out(output)
            return prediction,hidden,cell
        #output:[1,batch_size,hidden_size]
        prediction = self.fc_out(output)
        return prediction, hidden


class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=maxlength_english):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder,cell_type):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.softmax = nn.Softmax(dim=2)
        self.cell_type=cell_type


        
    def forward(self, source, target, teacher_forcing_ratio=0.5):
        batch_size = source.shape[0]
        target_len = target.shape[1]
#         print(source.shape)
#         print(target.shape)
        target_vocab_size = len(hindi_to_index)
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)        
        if(self.cell_type=='LSTM'):
            output, (hidden,cell) = self.encoder.forward(source)
        else:
            output, hidden = self.encoder.forward(source)

#         print("output","hidden")
#         print(output,hidden)
        x = target[:,0].reshape(batch_size,1)
        #print(target_len)
        for t in range(1, target_len):
            if(self.cell_type=='LSTM'):
                output, hidden,cell = self.decoder.forward(x, hidden,cell)
            else:
                output, hidden = self.decoder.forward(x, hidden,None)
            
#             print("dout",output.shape)
            outputs[t] = output.squeeze(0)
            teacher_force = random.random() < teacher_forcing_ratio
            output = self.softmax(output)
#             print("doutput ",output.shape)
            top1 = torch.argmax(output,dim = 2)
#             print("top1 ",top1.shape)
            x = target[:,t].reshape(batch_size,1) if teacher_force else top1.T
        return outputs

In [None]:
file1 = open("predictions_vanilla.txt","a")
#file for predictions of the test dataset contains hindi predicted and english predicted.

 **Q4.b)Provide sample inputs from the test data and predictions made by your best model (more marks for presenting this grid creatively). Implemented using write_to_file function**

In [None]:
#to convert the tensors to calculate accuracy
def calculate_predictions(output,target):
    output1=nn.Softmax(dim=2)(output[1:])
    predictions=torch.argmax(output1,dim=2)
    pred=predictions.T
    target1=target[:,1:]
    return pred,target1

#for printing the prediction and target in text file.
def write_to_file(pred,target):
    pred_s=''
    for i in pred:
        if(i in index_to_hindi):
            pred_s+=index_to_hindi[i]
    pred_target=''
    for i in target:
        if(i in index_to_hindi):
            pred_target+=index_to_hindi[i]
    file1.write(pred_s+"        "+pred_target)

#to calculate accuracy 
def calculate_accuracy(model,english_encoded_words,hindi_encoded_words,batch_size,teacher_forcing_ratio):
    correct=0
    total_loss=0
    loss_function=nn.CrossEntropyLoss(reduction='sum')
    
    for i in range(0,len(english_encoded_words),batch_size):
        src=english_encoded_words[i:i+batch_size].to(device)
        target=hindi_encoded_words[i:i+batch_size].to(device)
        output=model.forward(src,target,0)
        pred,target1=calculate_predictions(output,target)
        out = output[1:].reshape(-1, output.shape[2])
        target2 = target[:,1:].T.reshape(-1)
        loss = loss_function(out, target2)
        total_loss += loss.item()
        for t in range(len(pred)):
            if(False):
                write_to_file(pred[t],target1[t])
            if(torch.equal(pred[t],target1[t])):
                correct+=1
    return correct,total_loss





def train(num_layers,enc_dropout,dec_dropout,num_epochs,learning_rate,batch_size,embedding_size,hidden_size,cell_type):
    input_size_encoder=len(english_to_index)
    input_size_decoder=len(hindi_to_index)
    output_size=len(hindi_to_index)
    encoder_net=EncoderRNN(input_size_encoder, hidden_size,embedding_size,num_layers,enc_dropout,cell_type).to(device)
    decoder_net=DecoderRNN(input_size_decoder,hidden_size,output_size,embedding_size,num_layers,dec_dropout,cell_type).to(device)
    model=Seq2Seq(encoder_net,decoder_net,cell_type).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    correct_predictions=0
    correct_predictions_val=0
    loss_function=nn.CrossEntropyLoss(reduction='sum')
    for epoch in range(num_epochs):
        print(epoch)
        total_loss = 0
        correct_predictions = 0
        total_predictions = 0
        for i in range(0,len(english_encoded_words),batch_size):
            src=english_encoded_words[i:i+batch_size].to(device)
            target=hindi_encoded_words[i:i+batch_size].to(device)
            
            output=model(src,target)
            output1=nn.Softmax(dim=2)(output[1:])

            predictions=torch.argmax(output1,dim=2)
   
            out = output[1:].reshape(-1, output.shape[2])
            target1 = target[:,1:].T.reshape(-1)
   
            optimizer.zero_grad()
            loss = loss_function(out, target1)
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()
        #to find the loss and accuracy
        correct_predictions,training_loss=calculate_accuracy(model,english_encoded_words,hindi_encoded_words,batch_size,0)
        correct_predictions_val,val_loss=calculate_accuracy(model,english_encoded_words_val,hindi_encoded_words_val,batch_size,0)
        correct_predictions_test,test_loss=calculate_accuracy(model,english_encoded_words_test,hindi_encoded_words_test,batch_size,0)
        
        Training_loss=total_loss/(len(english_encoded_words)*maxlength_hindi)
        Validation_loss=val_loss/(len(english_encoded_words_val)*maxlength_hindi)
        Validation_accuracy=(correct_predictions_val/len(english_encoded_words_val)*100)
        Test_accuracy=(correct_predictions_test/len(english_encoded_words_test)*100)
        Training_accuracy=(correct_predictions/51200)*100
        print("Training_accuracy:",Training_accuracy)
        print("Validation_accuracy:",Validation_accuracy)
        print("Test_accuracy:",Test_accuracy)
        wandb.log({'Training_accuracy':Training_accuracy,'Epoch':epoch+1,'Training_loss':Training_loss,'Validation_loss':Validation_loss,'Validation_accuracy':Validation_accuracy})


**Q4)Best configuration for without attetion**

In [None]:
#best config in attention
num_layers=4
enc_dropout=0.3
dec_dropout=0.3
num_epochs=1
learning_rate=0.001
batch_size=512
hidden_size=1024
embedding_size=256
cell_type="LSTM"
train(num_layers,enc_dropout,dec_dropout,num_epochs,learning_rate,batch_size,embedding_size,hidden_size,cell_type)

In [None]:
def withattention():
    wandb.init(project='dl_assignment3')
    config = wandb.config
    wandb.run.name = "cell_type_{}bidirec{}layers{}batchsize{}hidden{}embedding{}learning_rate{}".format(config.cell_type,config.bidirectional,config.no_of_layers,config.batchsize,config.hidden_size,config.input_embedding_size,config.learning_rate)
    hidden_size = config.hidden_size
    embedding_size = config.input_embedding_size
    num_layers = config.no_of_layers
    num_epochs = config.epochs
    batch_size = config.batchsize
    enc_dropout = config.dropout
    dec_dropout=config.dropout
    cell_type=config.cell_type
    learning_rate=config.learning_rate
    train(num_layers,enc_dropout,dec_dropout,num_epochs,learning_rate,batch_size,embedding_size,hidden_size,cell_type)
    


Q3. Sweep Configurations 

In [None]:
sweep_configuration = {
    'method' : 'bayes',
    'metric' : { 'goal' : 'maximize',
    'name' : 'Validation_accuracy'},
    'parameters':{
        'learning_rate': {'values':[0.001,0.002]},
        'batchsize' : {'values' : [128,256,512,1024]},
        'input_embedding_size' : {'values' : [256,512,1024]},
        'no_of_layers' : {'values' : [2,3,4]},
        'hidden_size' : {'values' : [256,512,1024]},
        'cell_type' : {'values' : ['RNN','LSTM','GRU']},
        'bidirectional' : {'values' : ['Yes']},
        'dropout' : {'values' : [0.2,0.3,0.4]},
        'epochs' : {'values' : [10,15,18]}
    }
}
sweep_id = wandb.sweep(sweep = sweep_configuration,project = 'dl_assignment3')
wandb.agent(sweep_id,function=withattention,count = 1)