<a href="https://colab.research.google.com/github/arunm917/CS6910_Assignment_3/blob/main/CS6910_Assignment_3_V4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downloading necessary packages and files

In [11]:
import csv
import gdown
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import random
import matplotlib.pyplot as plt


In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [13]:
!nvidia-smi

Thu May 11 15:20:14 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8     9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [14]:
# downloading file from gdrive
output = 'tam_train'
file_id = '1pdJVD8P71fpqGRnvFfOp_6TbVft9NlnH' # Google drive ID
#Download the file
gdown.download('https://drive.google.com/uc?id=' + file_id, output, quiet=False)
print('DONE.')

Downloading...
From: https://drive.google.com/uc?id=1pdJVD8P71fpqGRnvFfOp_6TbVft9NlnH
To: /content/tam_train
100%|██████████| 2.69M/2.69M [00:00<00:00, 209MB/s]

DONE.





In [15]:
# downloading file from gdrive
output = 'tam_valid'
file_id = '1pdp6ojHltRRNLXsmoQbGRc2Qn8X1EUJV' # Google drive ID
#Download the file
gdown.download('https://drive.google.com/uc?id=' + file_id, output, quiet=False)
print('DONE.')

Downloading...
From: https://drive.google.com/uc?id=1pdp6ojHltRRNLXsmoQbGRc2Qn8X1EUJV
To: /content/tam_valid
100%|██████████| 164k/164k [00:00<00:00, 64.1MB/s]

DONE.





In [16]:
# downloading file from gdrive
output = 'tam_test'
file_id = '1pdaTq-g2ZKhRKv6fRrSbEsJkOH5gdrEQ' # Google drive ID
#Download the file
gdown.download('https://drive.google.com/uc?id=' + file_id, output, quiet=False)
print('DONE.')

Downloading...
From: https://drive.google.com/uc?id=1pdaTq-g2ZKhRKv6fRrSbEsJkOH5gdrEQ
To: /content/tam_test
100%|██████████| 157k/157k [00:00<00:00, 51.1MB/s]

DONE.





#Preprocessing

In [17]:
train_data_df = pd.read_csv('tam_train')
valid_data_df = pd.read_csv('tam_valid')
test_data_df = pd.read_csv('tam_test')

In [18]:
train_data_df.columns = ['English','Tamil']
# valid_data_df.columns = ['English','Tamil']
# test_data_df.columns = ['English','Tamil']

# Creating vocabulary and padding

In [19]:
# Creating vocabulary

char_list_eng = []
for i in range(len(train_data_df['English'])):
  char = [*train_data_df.loc[i, 'English']]
  char_list_eng.extend(char)

char_list_tam = []
for i in range(len(train_data_df['Tamil'])):
  char = [*train_data_df.loc[i, 'Tamil']]
  char_list_tam.extend(char)

print(len(char_list_eng))
print(len(char_list_tam))

721198
621903


In [20]:
# Indexing

SOS_token = '<SOS>'
EOS_token = '<EOS>'
PAD_token = '<PAD>'
UNK_token = '<UNK>'

vocabulary_eng = list(set(char_list_eng))
vocabulary_eng = [PAD_token] + [UNK_token] + [SOS_token] + [EOS_token] + vocabulary_eng 

vocabulary_tam = list(set(char_list_tam))
vocabulary_tam = [PAD_token] + [UNK_token] + [SOS_token] + [EOS_token] + vocabulary_tam

In [21]:
print(len(vocabulary_eng))
print(vocabulary_eng)
print(len(vocabulary_tam))
print(vocabulary_tam)

30
['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'n', 'r', 'z', 'i', 'o', 'e', 'y', 'x', 'c', 't', 'b', 'g', 'm', 'h', 'u', 's', 'a', 'v', 'p', 'w', 'j', 'l', 'f', 'd', 'q', 'k']
50
['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ன', 'ள', 'ம', 'ஜ', 'ஓ', 'ெ', 'ை', 'ற', 'ே', '்', 'ய', 'எ', 'த', 'ஷ', 'ோ', 'உ', 'ூ', 'ஏ', 'ி', 'ா', 'ண', 'ப', 'ஸ', 'ந', 'ீ', 'ஊ', 'ு', 'ட', 'ஒ', 'ஈ', 'க', 'ல', 'ொ', 'ஆ', 'வ', 'ஞ', 'ர', 'அ', 'ஹ', 'ச', 'இ', 'ௌ', 'ழ', 'ஃ', 'ஐ', 'ங']


In [22]:
char_index_eng = {value: index for index, value in enumerate(vocabulary_eng)}
char_index_tam = {value: index for index, value in enumerate(vocabulary_tam)}
# num_list = [char_index[char] for char in vocabulary]

print(char_index_eng)
print(char_index_tam)
# print(num_list)

{'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3, 'n': 4, 'r': 5, 'z': 6, 'i': 7, 'o': 8, 'e': 9, 'y': 10, 'x': 11, 'c': 12, 't': 13, 'b': 14, 'g': 15, 'm': 16, 'h': 17, 'u': 18, 's': 19, 'a': 20, 'v': 21, 'p': 22, 'w': 23, 'j': 24, 'l': 25, 'f': 26, 'd': 27, 'q': 28, 'k': 29}
{'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3, 'ன': 4, 'ள': 5, 'ம': 6, 'ஜ': 7, 'ஓ': 8, 'ெ': 9, 'ை': 10, 'ற': 11, 'ே': 12, '்': 13, 'ய': 14, 'எ': 15, 'த': 16, 'ஷ': 17, 'ோ': 18, 'உ': 19, 'ூ': 20, 'ஏ': 21, 'ி': 22, 'ா': 23, 'ண': 24, 'ப': 25, 'ஸ': 26, 'ந': 27, 'ீ': 28, 'ஊ': 29, 'ு': 30, 'ட': 31, 'ஒ': 32, 'ஈ': 33, 'க': 34, 'ல': 35, 'ொ': 36, 'ஆ': 37, 'வ': 38, 'ஞ': 39, 'ர': 40, 'அ': 41, 'ஹ': 42, 'ச': 43, 'இ': 44, 'ௌ': 45, 'ழ': 46, 'ஃ': 47, 'ஐ': 48, 'ங': 49}


In [23]:
idx2char_eng = {value: key for key, value in char_index_eng.items()}
idx2char_tam = {value: key for key, value in char_index_tam.items()}
print(idx2char_eng)
print(idx2char_tam)

{0: '<PAD>', 1: '<UNK>', 2: '<SOS>', 3: '<EOS>', 4: 'n', 5: 'r', 6: 'z', 7: 'i', 8: 'o', 9: 'e', 10: 'y', 11: 'x', 12: 'c', 13: 't', 14: 'b', 15: 'g', 16: 'm', 17: 'h', 18: 'u', 19: 's', 20: 'a', 21: 'v', 22: 'p', 23: 'w', 24: 'j', 25: 'l', 26: 'f', 27: 'd', 28: 'q', 29: 'k'}
{0: '<PAD>', 1: '<UNK>', 2: '<SOS>', 3: '<EOS>', 4: 'ன', 5: 'ள', 6: 'ம', 7: 'ஜ', 8: 'ஓ', 9: 'ெ', 10: 'ை', 11: 'ற', 12: 'ே', 13: '்', 14: 'ய', 15: 'எ', 16: 'த', 17: 'ஷ', 18: 'ோ', 19: 'உ', 20: 'ூ', 21: 'ஏ', 22: 'ி', 23: 'ா', 24: 'ண', 25: 'ப', 26: 'ஸ', 27: 'ந', 28: 'ீ', 29: 'ஊ', 30: 'ு', 31: 'ட', 32: 'ஒ', 33: 'ஈ', 34: 'க', 35: 'ல', 36: 'ொ', 37: 'ஆ', 38: 'வ', 39: 'ஞ', 40: 'ர', 41: 'அ', 42: 'ஹ', 43: 'ச', 44: 'இ', 45: 'ௌ', 46: 'ழ', 47: 'ஃ', 48: 'ஐ', 49: 'ங'}


In [24]:
# Define the tokenizer
# max_length = 10
def tokenize_eng(word):
    chars = [*word]
    tokens_eng = [char_index_eng[char] if char in char_index_eng else 0 for char in chars]
    
    return tokens_eng

def tokenize_tam(word):
    chars = [*word]
    tokens_tam = [char_index_tam[char] if char in char_index_tam else 0 for char in chars]
    
    return tokens_tam

In [25]:
# Define the training pairs
training_pairs = train_data_df.values.tolist()
val_pairs = valid_data_df.values.tolist()
test_pairs = test_data_df.values.tolist()

In [26]:
len(val_pairs)

4095

In [27]:
# testing the tokenize function
tokenize_eng('arun')

[20, 5, 18, 4]

In [28]:
eng_words = [tokenize_eng(pair[0]) for pair in training_pairs]
tam_words = [tokenize_tam(pair[1]) for pair in training_pairs]

In [29]:
# Determining max length english

lengths_eng = []
# max_length_eng = max([len(words) for words in eng_words])
for word in eng_words:

    word_length = len(word)
    lengths_eng.append(word_length)
print(lengths_eng)
# lengths_eng = np.array(lengths_eng)


[11, 9, 11, 15, 22, 23, 16, 13, 13, 8, 8, 11, 17, 13, 18, 19, 9, 18, 18, 14, 18, 11, 9, 12, 10, 16, 16, 12, 14, 17, 16, 12, 5, 11, 16, 18, 11, 12, 14, 14, 7, 17, 18, 16, 11, 16, 23, 11, 22, 9, 15, 17, 16, 10, 15, 11, 9, 6, 12, 15, 6, 23, 13, 14, 13, 22, 17, 16, 14, 13, 13, 10, 17, 7, 17, 13, 14, 12, 9, 18, 18, 13, 21, 21, 14, 14, 9, 13, 10, 7, 20, 10, 17, 12, 16, 23, 8, 19, 9, 17, 14, 12, 12, 14, 19, 5, 15, 11, 9, 12, 6, 18, 15, 17, 16, 20, 14, 11, 19, 11, 14, 14, 17, 20, 12, 9, 21, 8, 12, 18, 9, 15, 9, 13, 15, 11, 10, 15, 15, 13, 14, 19, 13, 23, 7, 15, 11, 20, 17, 11, 7, 7, 12, 21, 16, 13, 13, 16, 5, 8, 19, 21, 21, 8, 15, 17, 21, 12, 17, 15, 16, 22, 19, 9, 7, 13, 18, 11, 11, 10, 12, 13, 13, 10, 16, 13, 11, 14, 13, 14, 18, 16, 15, 14, 7, 9, 18, 15, 18, 14, 10, 5, 14, 15, 14, 20, 11, 16, 20, 8, 15, 9, 9, 13, 18, 11, 18, 7, 11, 7, 14, 12, 14, 12, 21, 15, 16, 10, 16, 9, 19, 13, 10, 13, 12, 13, 18, 20, 23, 10, 17, 15, 20, 18, 19, 21, 13, 16, 10, 16, 12, 19, 19, 11, 16, 10, 22, 14, 18, 15, 

In [30]:
# Determining max length tamil
max_length_tam = max([len(words) for words in tam_words])
print(max_length_tam)

# Determining max length english and tamil
max_length = max([len(words) for words in eng_words + tam_words])
print(max_length)

23
30


In [31]:
def padding(word_pairs):
  ''' Function to pad the input and target sequences. Padding is done to ensure that
      all the training, validation and test samples are of equal size.'''
  # padded_input_sequences = []
  # padded_target_sequences = []
  
  eng_words = [tokenize_eng(pair[0]) for pair in word_pairs]
  tam_words = [tokenize_tam(pair[1]) for pair in word_pairs]

  
  padded_input_sequences = [torch.tensor([char_index_eng['<SOS>']] + eng_words + [char_index_eng['<EOS>']] + [(char_index_eng['<PAD>'])]*(max_length - len(eng_words))) for eng_words in eng_words]
  padded_target_sequences = [torch.tensor([char_index_eng['<SOS>']] + tam_words + [char_index_tam['<EOS>']] + [(char_index_tam['<PAD>'])]*(max_length - len(tam_words))) for tam_words in tam_words]
  tensor = torch.tensor([char_index_eng['<PAD>']]*(max_length+2))
  padded_input_sequences.append(tensor)
  padded_target_sequences.append(tensor)
  padded_input_sequences = torch.stack(padded_input_sequences)
  padded_target_sequences = torch.stack(padded_target_sequences)
  
  return(padded_input_sequences,padded_target_sequences)


In [32]:
# Creating datasets
training_input_sequences, training_target_sequences = padding(training_pairs)
train_dataset = torch.utils.data.TensorDataset(training_input_sequences, training_target_sequences)

val_input_sequences, val_target_sequences = padding(val_pairs)
val_dataset = torch.utils.data.TensorDataset(val_input_sequences, val_target_sequences)

test_input_sequences, test_target_sequences = padding(test_pairs)
test_dataset = torch.utils.data.TensorDataset(test_input_sequences, test_target_sequences)

# Architecture

In [33]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

    def forward(self, x):
        

        embedding = self.dropout(self.embedding(x))
       

        outputs, (hidden, cell) = self.rnn(embedding)
        

        return hidden, cell

In [34]:
class Decoder(nn.Module):
    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super(Decoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))
        
        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        
        predictions = self.fc(outputs)

        
        predictions = predictions.squeeze(0)

        return predictions, hidden, cell

In [35]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(vocabulary_tam)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden, cell = self.encoder(source)

        
        x = target[0]

        for t in range(1, target_len):
            
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

# Hyperparameters

In [41]:
# Training hyperparameters
num_epochs = 100
learning_rate = 0.001
batch_size = 64

# Model hyperparameters
load_model = False
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
input_size_encoder = len(vocabulary_eng)
input_size_decoder = len(vocabulary_tam)
output_size = len(vocabulary_tam)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 512 
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

In [42]:
encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)
decoder_net = Decoder(input_size_decoder,decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout,).to(device)

In [43]:
model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
pad_idx = char_index_eng['<PAD>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Training

In [44]:
# Creating Dataloaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size, shuffle=True)

In [None]:
for epoch in tqdm(range(num_epochs)):

    total_loss = 0


    for batch_idx, (input_seq, target_seq) in enumerate(train_loader):
        # Get input and targets and get to cuda
        inp_data = input_seq.t().to(device)
        target = target_seq.t().to(device)

        # Forward prop
        output = model(inp_data, target)

        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)
        total_loss += loss.item()

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

    loss_epoc = total_loss/batch_idx
    print('Epoc loss:', loss_epoc)