**Data loading and Preprocessing**


*   We will load all the files directly from Google Drive, so we first have to mount the drive in the Colab notebook.

*   For preprocessing we will only use the `csv` module of Python.


In [None]:
import csv
import string
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Mounting Google Drive

- Upload "train.csv" and "hindistatements_week3.csv" into the "Colab Notebooks" folder in Google Drive

In [None]:
# Mount Google Drive at /content/drive so the CSV files (and the model checkpoint)
# stored under MyDrive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Data Preprocessing**

For data preprocessing, we will do following steps:

1. Do not process sentences whose length is greater than or equal to MAX_LENGTH
2. Do not process any null pair
3. Converting into lowercase characters
4. Removing all punctuation from the sentences.
5. Removing the pairs whose Hindi sentence contains some English words.
6. Change the encoding of the sentences.
7. Build vocabulary for hindi and english languages to map index to unique words and vice versa.
8. Converting sentence into tensors for further processing

In [None]:
# Sentence-length cutoff, counted in space-separated tokens.
MAX_LENGTH=12

class Vocab_builder:
    '''
    Two-way word <-> index vocabulary that also counts word occurrences.

    Indexes 0-3 are reserved for the special tokens <SOS>, <EOS>, <PAD> and <UKN>;
    every new word receives the next free index.
    '''
    def __init__(self):
        self.word_2_index={"<SOS>":0,"<EOS>":1,"<PAD>":2,"<UKN>":3}
        self.index_2_word={0:"<SOS>", 1:"<EOS>", 2:"<PAD>", 3:"<UKN>"}
        self.freq={}
        self.size=4

    def add_this_sentence(self,sentence):
        '''
        Register every space-separated word of `sentence` in the vocabulary.

        Unseen words are assigned the next index; the occurrence count of each
        word (new or known) is incremented by one.
        '''
        for word in sentence.split(" "):
            if word not in self.word_2_index:
                # New word: give it the next free index in both lookup tables.
                self.word_2_index[word]=self.size
                self.index_2_word[self.size]=word
                self.size+=1
            # The reserved tokens are indexed in __init__ but have no entry in
            # `freq`, so a plain `+= 1` would raise KeyError for them.
            self.freq[word]=self.freq.get(word,0)+1

In [None]:
# One independent vocabulary per language: Hindi (source) and English (target).
hindi_vocab, eng_vocab = Vocab_builder(), Vocab_builder()

In [None]:
def length(sentence):
    '''
    Number of tokens obtained by splitting the sentence on single spaces.
    '''
    # Splitting on " " always yields exactly one more piece than there are spaces.
    return sentence.count(" ") + 1

def is_mixed(sentence):
    '''
    Return True if the sentence contains at least one English (ASCII) letter.

    Both upper- and lowercase letters are checked. The previous version only
    looked for lowercase a-z, so an all-uppercase word such as "TV" was missed.

    NOTE(review): this can change which rows survive filtering, so a vocabulary
    or checkpoint built with the old filter may no longer line up - confirm
    before reusing saved models.
    '''
    english_letters=set(string.ascii_letters)
    return any(ch in english_letters for ch in sentence)

def preprocess(sentence):
    '''
        Strip every character listed in string.punctuation from the sentence
        and return the result in lowercase.
    '''
    kept_chars=[ch for ch in sentence if ch not in string.punctuation]
    return "".join(kept_chars).lower()

In [None]:
def _frame_and_pad(sentence,max_length):
    '''
    Wrap a cleaned sentence as "<SOS> ... <EOS>" and append <PAD> tokens.

    One <PAD> is appended for every token the sentence falls short of max_length.
    '''
    tokens=sentence.split(" ")
    padding=["<PAD>"]*(max_length-len(tokens))
    return " ".join(["<SOS>"]+tokens+["<EOS>"]+padding)

def clean_the_data(path):
    '''
    Read the training CSV and return a list of cleaned [hindi, english] pairs.

    The first row is treated as a header. For every other row, column 1 is the
    Hindi sentence and column 2 the English one. A row is dropped when:
      - it has fewer than three columns,
      - either raw sentence has MAX_LENGTH or more space-separated tokens,
      - either sentence is empty, before or after cleaning,
      - the Hindi sentence contains English letters (see is_mixed).

    Surviving sentences are cleaned with preprocess(), added to the global
    hindi_vocab / eng_vocab, and returned framed with <SOS>/<EOS> and padded
    with <PAD>.
    '''
    pairs=[]
    # Explicit UTF-8 so decoding of the Hindi text does not depend on the
    # platform locale; newline='' is what the csv module expects for files.
    with open(path,'rt',encoding='utf-8',newline='') as f:
        data=csv.reader(f, delimiter=',')
        next(data,None) # skip the header row
        for row in data:
            if len(row)<3:
                continue # malformed row: not enough columns
            hindi=row[1]
            eng=row[2]

            if length(hindi)>=MAX_LENGTH or length(eng)>=MAX_LENGTH:
                continue
            if not hindi or not eng:
                continue
            if is_mixed(hindi):
                continue
            hindi=hindi.encode('utf-8',errors='ignore').decode('utf-8')
            eng=eng.encode('ascii',errors='ignore').decode('utf-8') # drops non-ASCII characters
            hindi=preprocess(hindi).strip()
            eng=preprocess(eng).strip()
            # Cleaning can empty a sentence (e.g. punctuation only); such a
            # pair carries no training signal, so skip it.
            if not hindi or not eng:
                continue

            hindi_vocab.add_this_sentence(hindi)
            eng_vocab.add_this_sentence(eng)
            pairs.append([_frame_and_pad(hindi,MAX_LENGTH),_frame_and_pad(eng,MAX_LENGTH)])
    return pairs

In [None]:
# Drive folder that holds the notebook's data files, and the training CSV inside it.
file_path="/content/drive/MyDrive/Colab Notebooks/"
train_file_path=f"{file_path}train.csv"

In [None]:
# Build the cleaned, padded sentence pairs; this call also fills hindi_vocab and eng_vocab.
pairs=clean_the_data(path=train_file_path)

In [None]:
# Each sentence pair is turned into a pair of index tensors.
def pair_to_tensor(pair):
    '''
    Map a [hindi, english] sentence pair to two column tensors of vocabulary indexes.

    Each returned tensor has shape (number_of_tokens, 1), dtype long, and lives on `device`.
    '''
    def _as_column(sentence, vocab):
        token_ids=[]
        for word in sentence.split(' '):
            token_ids.append(vocab.word_2_index[word])
        return torch.tensor(token_ids, dtype=torch.long, device=device).view(-1,1)

    hindi_tensor=_as_column(pair[0], hindi_vocab)
    eng_tensor=_as_column(pair[1], eng_vocab)
    return (hindi_tensor, eng_tensor)

In [None]:
# Convert every pair once, then split the results into per-language lists.
tensor_pairs=[pair_to_tensor(pair) for pair in pairs]
hin_tensors=[hin for hin,_ in tensor_pairs]
eng_tensors=[eng for _,eng in tensor_pairs]

**Seq2Seq Model Implementation**

1. **Encoder RNN Implementation**

In [None]:
class EncoderLSTM(nn.Module):
    '''
    Encoder: embedding -> dropout -> LSTM. Only the final states are returned.

    Args:
        size_input: number of rows in the embedding table (source vocabulary size).
        size_embbeding: embedding dimension.
        size_hidden: LSTM hidden size.
        layers: number of stacked LSTM layers.
        p: dropout probability applied to the embeddings, and between LSTM
           layers when layers > 1.
    '''
    def __init__(self,size_input,size_embbeding,size_hidden,layers,p):
        super(EncoderLSTM,self).__init__()
        self.size_input=size_input
        self.size_embbeding=size_embbeding
        self.size_hidden=size_hidden
        self.layers=layers
        self.dropout=nn.Dropout(p)
        self.tag=True

        self.embbed_layer=nn.Embedding(self.size_input,self.size_embbeding)
        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer it has no effect and PyTorch warns, so pass 0 in that case.
        lstm_dropout=p if self.layers>1 else 0.0
        self.lstm=nn.LSTM(self.size_embbeding,self.size_hidden,self.layers,dropout=lstm_dropout)

    def forward(self, x):
        '''
        Encode a batch of index sequences.

        Args:
            x: long tensor of token indexes - assumed (seq_len, batch), as built
               by the training loop; TODO confirm for other callers.

        Returns:
            (hidden_st, cell_st): the LSTM's final hidden and cell states.
        '''
        embbeding=self.dropout(self.embbed_layer(x))
        output, (hidden_st,cell_st) = self.lstm(embbeding)
        return hidden_st, cell_st # the per-step outputs are not needed by the decoder

2. **Decoder RNN Implementation**

In [None]:
class DecoderLSTM(nn.Module):
    '''
    Decoder: embedding -> dropout -> LSTM -> linear projection, one step per call.

    Args:
        size_input: number of rows in the embedding table (target vocabulary size).
        size_embbeding: embedding dimension.
        size_hidden: LSTM hidden size.
        layers: number of stacked LSTM layers.
        p: dropout probability applied to the embeddings, and between LSTM
           layers when layers > 1.
        size_output: number of output scores produced per step.
    '''
    def __init__(self,size_input,size_embbeding,size_hidden,layers,p,size_output):
        super(DecoderLSTM,self).__init__()
        self.size_input=size_input
        self.size_embbeding=size_embbeding
        self.size_hidden=size_hidden
        self.layers=layers
        self.size_output=size_output
        self.dropout=nn.Dropout(p)

        self.embbed_layer=nn.Embedding(self.size_input,self.size_embbeding) # input_size X embedding_size
        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer it has no effect and PyTorch warns, so pass 0 in that case.
        lstm_dropout=p if self.layers>1 else 0.0
        self.lstm=nn.LSTM(self.size_embbeding,self.size_hidden,self.layers,dropout=lstm_dropout) #embedding_size * hidden_size
        self.fc=nn.Linear(self.size_hidden,self.size_output) # hidden_size*output_size

    def forward(self,x,hidden_st,cell_st):
        '''
        Run a single decoding step.

        Args:
            x: long tensor holding one token index per batch element, shape (batch,).
            hidden_st, cell_st: LSTM state from the previous step (or the encoder).

        Returns:
            (preds, hidden_st, cell_st): scores of shape (batch, size_output)
            and the updated LSTM state.
        '''
        x=x.unsqueeze(0) # add the sequence dimension: (1, batch)
        embbeding=self.dropout(self.embbed_layer(x))
        outputs, (hidden_st, cell_st) = self.lstm(embbeding, (hidden_st,cell_st))
        preds=self.fc(outputs)
        preds=preds.squeeze(0) # drop the sequence dimension again
        return preds,hidden_st,cell_st

3. **Encoder Decoder Interface Implementation**

In [None]:
class Seq2seq_model(nn.Module):
    '''
    Ties an encoder and a step-wise decoder together for sequence-to-sequence training.
    '''
    def __init__(self,encoder_net,decoder_net):
        super(Seq2seq_model,self).__init__()
        self.encoder_net=encoder_net
        self.decoder_net=decoder_net

    def forward(self,src,target,teacher_forcing=0.5):
        '''
        Encode `src`, then decode one token at a time.

        Args:
            src: source index tensor, shape (src_len, batch).
            target: target index tensor, shape (target_len, batch); row 0 is
                    used as the first decoder input.
            teacher_forcing: probability of feeding the ground-truth token
                    (rather than the model's own prediction) as the next input.

        Returns:
            Tensor of shape (target_len, batch, decoder size_output) with the
            decoder scores; row 0 is left as zeros.
        '''
        batch_length=src.shape[1]
        target_len=target.shape[0]
        # Read the output width from the decoder instead of a global vocabulary.
        target_vocab_len=self.decoder_net.size_output

        output_tensor=torch.zeros(target_len,batch_length,target_vocab_len,device=src.device)
        # The encoder's final state seeds the decoder...
        hidden_st, cell_st=self.encoder_net(src)
        x=target[0]

        for i in range(1,target_len):
            # ...and every step must continue from the state of the previous
            # step. Passing the encoder state each time would throw the
            # decoder's own history away.
            output,hidden_st,cell_st=self.decoder_net(x,hidden_st,cell_st)
            output_tensor[i]=output
            pred=output.argmax(1)
            x=target[i] if random.random()<teacher_forcing else pred

        return output_tensor

Creating objects of Encoder, Decoder and Seq2Seq model

In [None]:
# Encoder hyper-parameters.
encoder_ip_size=hindi_vocab.size  # one embedding row per Hindi vocabulary entry
encoder_embbeding_size=400  # chosen after trying several values
encoder_hidden_size=512  # 1024 was tried earlier; this value scored better
encoder_layers=1  # single LSTM layer
encoder_dropout=0.5  # dropout probability handed to the encoder

# Build the encoder and move it to the selected device.
encoder_obj=EncoderLSTM(
    size_input=encoder_ip_size,
    size_embbeding=encoder_embbeding_size,
    size_hidden=encoder_hidden_size,
    layers=encoder_layers,
    p=encoder_dropout,
)
encoder_obj=encoder_obj.to(device)

print(encoder_obj)

EncoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embbed_layer): Embedding(23444, 400)
  (lstm): LSTM(400, 512, dropout=0.5)
)


  "num_layers={}".format(dropout, num_layers))


In [None]:
# Decoder hyper-parameters: embedding size, hidden size and layer count mirror the encoder's.
decoder_ip_size=eng_vocab.size  # one embedding row per English vocabulary entry
decoder_embbed_size=400
decoder_hidden_size=512
decoder_layers=1
decoder_dropout=0.5
decoder_op_size=eng_vocab.size  # one output score per English vocabulary entry

# Build the decoder and move it to the selected device.
decoder_obj=DecoderLSTM(
    size_input=decoder_ip_size,
    size_embbeding=decoder_embbed_size,
    size_hidden=decoder_hidden_size,
    layers=decoder_layers,
    p=decoder_dropout,
    size_output=decoder_op_size,
)
decoder_obj=decoder_obj.to(device)

print(decoder_obj)

  "num_layers={}".format(dropout, num_layers))


DecoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embbed_layer): Embedding(17314, 400)
  (lstm): LSTM(400, 512, dropout=0.5)
  (fc): Linear(in_features=512, out_features=17314, bias=True)
)


In [None]:
# Combine the encoder and decoder into the full translation model and show its layout.
model=Seq2seq_model(encoder_net=encoder_obj, decoder_net=decoder_obj)
print(model)

Seq2seq_model(
  (encoder_net): EncoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embbed_layer): Embedding(23444, 400)
    (lstm): LSTM(400, 512, dropout=0.5)
  )
  (decoder_net): DecoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embbed_layer): Embedding(17314, 400)
    (lstm): LSTM(400, 512, dropout=0.5)
    (fc): Linear(in_features=512, out_features=17314, bias=True)
  )
)


**Training the Model**

In [None]:
batch_size=64
PATH="/content/drive/MyDrive/Colab Notebooks/phase3_v1.pth"

#model was trained for 90 epochs but due to session length limit it was trained in 3 steps of 30 epochs each
# NOTE(review): the comment above says 30 epochs per session but `epochs` is 50 - confirm which is intended.
epochs=50
epoch_loss=0.0
padding_idx=eng_vocab.word_2_index["<PAD>"]
criterion=nn.CrossEntropyLoss(ignore_index=padding_idx) #ignore padding index while calculating loss

train_model=False #if need to train the model again, set it to True
model_available=True

# Load the checkpoint (if any) BEFORE building the optimizer: torch.load rebinds
# `model` to a new object, and an optimizer built earlier would keep stepping
# the discarded model's parameters.
# SECURITY: torch.load unpickles the whole model object - only load checkpoints you trust.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which
# rejects whole-model pickles like this one - confirm against the installed version.
if model_available or not train_model:
    model=torch.load(PATH,map_location=device)

optimizer=optim.Adagrad(model.parameters(),lr=0.005)  #slowed down the learning rate to better convergence

if train_model:
    # Ceiling division so a final partial batch is counted as well.
    batches=(len(pairs)+batch_size-1)//batch_size
    for epoch in range(epochs):
        print(f"epoch {epoch+1}/{epochs}")
        model.train()
        epoch_loss=0.0  # reset so the reported value covers this epoch only
        cur_batch=0
        for idx in range(0,len(pairs),batch_size):
            cur_batch+=1
            if(cur_batch%100==0):
                print(f"    running batch {cur_batch} of {batches}")

            # Slicing past the end just yields the shorter final batch.
            src_batch=torch.cat(hin_tensors[idx:idx+batch_size],dim=1)     #max_len*batch_size
            target_batch=torch.cat(eng_tensors[idx:idx+batch_size],dim=1)

            output=model(src_batch,target_batch)
            # Drop position 0 (<SOS>) and flatten to (tokens, vocab) / (tokens,)
            output=output[1:].reshape(-1,output.shape[2])
            target=target_batch[1:].reshape(-1)

            optimizer.zero_grad()
            loss=criterion(output,target)

            loss.backward()
            # restrict gradients from exploding
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            optimizer.step()
            epoch_loss += loss.item()

        # Report the mean batch loss of the epoch, not just the last batch's loss.
        print(f"Epoch loss : {epoch_loss/max(cur_batch,1)}")

        torch.save(model,PATH)
        model_available=True

# Whatever path was taken, leave the model in inference mode (dropout off)
# for the prediction cells that follow.
model.eval()

In [None]:
def clean_sentence(sentence):
    '''
    Remove punctuation from the sentence, keeping '<' and '>'.
    '''
    # Everything in string.punctuation is dropped except the two angle brackets.
    removable=set(string.punctuation)-{'<','>'}
    return "".join(ch for ch in sentence if ch not in removable)

def predict_translation(model,sentence,device,max_length=MAX_LENGTH):
    '''
    Greedily translate one Hindi sentence with the trained model.

    Args:
        model: object exposing `encoder_net` and `decoder_net` (e.g. Seq2seq_model).
        sentence: Hindi sentence; tokens missing from hindi_vocab map to <UKN>.
        device: torch device on which the input tensors are created.
        max_length: maximum number of decoding steps.

    Returns:
        The predicted English tokens joined by spaces. The string starts with
        "<SOS>" and ends with "<EOS>" when decoding stopped before max_length.
    '''
    sentence=clean_sentence(sentence)
    unknown_idx=hindi_vocab.word_2_index["<UKN>"]
    indexes=[hindi_vocab.word_2_index.get(token,unknown_idx) for token in sentence.split(" ")]
    tensor_of_sentence=torch.LongTensor(indexes).unsqueeze(1).to(device)

    sos_idx=eng_vocab.word_2_index["<SOS>"]
    eos_idx=eng_vocab.word_2_index["<EOS>"]
    pad_idx=eng_vocab.word_2_index["<PAD>"]

    # Inference must run with dropout disabled; remember the current mode and
    # restore it afterwards so calling this mid-training has no side effect.
    was_training=model.training
    model.eval()
    try:
        with torch.no_grad():
            hidden,cell=model.encoder_net(tensor_of_sentence)
            outputs=[sos_idx]
            for _ in range(max_length):
                prev_word=torch.LongTensor([outputs[-1]]).to(device)
                output,hidden,cell=model.decoder_net(prev_word, hidden,cell)
                pred=output.argmax(1).item()
                outputs.append(pred)
                if pred==eos_idx:
                    break
    finally:
        model.train(was_training)

    final=[]
    for idx in outputs:
        # `outputs` holds indexes, so compare against the <PAD> index (the old
        # comparison with the string "<PAD>" could never be true).
        if idx==pad_idx:
            break
        final.append(eng_vocab.index_2_word[idx])

    translated=" ".join(final)
    return translated

In [None]:
# Spot-check the model on a slice of the training pairs (items 50..99).
sample_pairs=pairs[50:100]
test_sentences=[hindi for hindi,_ in sample_pairs]
actual_sentences=[english for _,english in sample_pairs]
pred_sentences=[]

banner="*"*20
for source,reference in zip(test_sentences,actual_sentences):
    translated=predict_translation(model,source,device)
    print(banner)
    print(f"Hindi: {source}")
    print(f"Actual: {reference}")
    print(f"Predicted: {translated}")
    print(banner)

********************
Hindi: <SOS> पर जाएं <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Actual: <SOS> go on <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Predicted: <SOS> go on the way <EOS>
********************
********************
Hindi: <SOS> तो हमने दुनिया को तीनतीन मीटर के वर्गों में विभाजित किया। <EOS> <PAD>
Actual: <SOS> so we divided the world into threemeter squares <EOS> <PAD> <PAD> <PAD> <PAD>
Predicted: <SOS> so we we need in the world the same thing time we
********************
********************
Hindi: <SOS> अजीब बात है। <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Actual: <SOS> its funny <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Predicted: <SOS> thats the good good idea idea idea to squeeze me see how
********************
********************
Hindi: <SOS> मैं अभी तक मरा नहीं हूँ <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Actual: <SOS> im not dead yet <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

In [None]:
# Output file for the translated validation sentences (written and closed by the next cell).
fp=open("/content/drive/MyDrive/Colab Notebooks/answer_week3_v1.txt",mode="w")

In [None]:
val_data_path="/content/drive/MyDrive/Colab Notebooks/hindistatements_week3.csv"
special_tokens=("<SOS>","<EOS>","<PAD>")

# NOTE(review): sentences are passed to the model as-is, while the training inputs
# were lowercased, wrapped in <SOS>/<EOS> and padded - confirm this mismatch is intended.
try:
    # Explicit UTF-8 so reading the Hindi text does not depend on the platform locale.
    with open(val_data_path,'rt',encoding='utf-8',newline='') as f:
        data=csv.reader(f, delimiter=',')
        next(data,None) # skip the header row
        for row_num,row in enumerate(data,start=1):
            sentence=row[2].strip()
            translated=predict_translation(model,sentence,device)
            # Remove only the special tokens. Slicing with [1:-1] would also
            # drop a real word whenever decoding stopped without <EOS>.
            words=[word for word in translated.split(" ") if word not in special_tokens]
            translated=" ".join(words)
            fp.write(translated+'\n')
            print(f"sentence : {row_num}")
finally:
    # Always release the output file, even if a row fails.
    fp.close()
        

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
sentence : 1
sentence : 2
sentence : 3
sentence : 4
sentence : 5
sentence : 6
sentence : 7
sentence : 8
sentence : 9
sentence : 10
sentence : 11
sentence : 12
sentence : 13
sentence : 14
sentence : 15
sentence : 16
sentence : 17
sentence : 18
sentence : 19
sentence : 20
sentence : 21
sentence : 22
sentence : 23
sentence : 24
sentence : 25
sentence : 26
sentence : 27
sentence : 28
sentence : 29
sentence : 30
sentence : 31
sentence : 32
sentence : 33
sentence : 34
sentence : 35
sentence : 36
sentence : 37
sentence : 38
sentence : 39
sentence : 40
sentence : 41
sentence : 42
sentence : 43
sentence : 44
sentence : 45
sentence : 46
sentence : 47
sentence : 48
sentence : 49
sentence : 50
sentence : 51
sentence : 52
sentence : 53
sentence : 54
sentence : 55
sentence : 56
sentence : 57
sentence : 58
sentence : 59
sentence : 60
sentence : 61
sentence : 62
sentence : 63
sentence : 64
sentence : 65
sentence : 66
sentence : 67
senten