<a href="https://colab.research.google.com/github/arunm917/CS6910_Assignment_3/blob/main/CS6910_Assignment_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downloading necessary packages and files

In [1]:
import csv
import gdown
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# downloading file from gdrive
output = 'tam_train'
file_id = '1pdJVD8P71fpqGRnvFfOp_6TbVft9NlnH' # Google drive ID
#Download the file
gdown.download('https://drive.google.com/uc?id=' + file_id, output, quiet=False)
print('DONE.')

Downloading...
From: https://drive.google.com/uc?id=1pdJVD8P71fpqGRnvFfOp_6TbVft9NlnH
To: /content/tam_train
100%|██████████| 2.69M/2.69M [00:00<00:00, 17.0MB/s]


DONE.


In [4]:
# downloading file from gdrive
output = 'tam_valid'
file_id = '1pdp6ojHltRRNLXsmoQbGRc2Qn8X1EUJV' # Google drive ID
#Download the file
gdown.download('https://drive.google.com/uc?id=' + file_id, output, quiet=False)
print('DONE.')

Downloading...
From: https://drive.google.com/uc?id=1pdp6ojHltRRNLXsmoQbGRc2Qn8X1EUJV
To: /content/tam_valid
100%|██████████| 164k/164k [00:00<00:00, 44.8MB/s]

DONE.





In [5]:
# downloading file from gdrive
output = 'tam_test'
file_id = '1pdaTq-g2ZKhRKv6fRrSbEsJkOH5gdrEQ' # Google drive ID
#Download the file
gdown.download('https://drive.google.com/uc?id=' + file_id, output, quiet=False)
print('DONE.')

Downloading...
From: https://drive.google.com/uc?id=1pdaTq-g2ZKhRKv6fRrSbEsJkOH5gdrEQ
To: /content/tam_test
100%|██████████| 157k/157k [00:00<00:00, 27.8MB/s]

DONE.





#Preprocessing

In [14]:
train_data_df = pd.read_csv('tam_train')
valid_data_df = pd.read_csv('tam_valid')
test_data_df = pd.read_csv('tam_test')

In [15]:
train_data_df.columns = ['English','Tamil']
# valid_data_df.columns = ['English','Tamil']
# test_data_df.columns = ['English','Tamil']

# Creating vocabulary

In [16]:
# Creating vocabulary

char_list = []
for i in range(len(train_data_df['English'])):
  char = [*train_data_df.loc[i, 'English']]
  char_list.extend(char)

for i in range(len(train_data_df['Tamil'])):
  char = [*train_data_df.loc[i, 'Tamil']]
  char_list.extend(char)
# vocabulary = set(vocabulary)
print(len(char_list))

1343101


In [17]:
# Indexing

SOS_token = '<SOS>'
EOS_token = '<EOS>'
PAD_token = '<PAD>'
UNK_token = '<UNK>'

vocabulary = list(set(char_list))
vocabulary = [PAD_token] + [UNK_token] + vocabulary + [SOS_token] + [EOS_token]

In [18]:
print(len(vocabulary))
print(vocabulary)

76
['<PAD>', '<UNK>', 'உ', 'd', 'ஃ', 'ஐ', 'f', 'ழ', 'ச', 'எ', 'z', 'r', 'ஸ', 'ன', 'v', 'p', 'அ', 'ஊ', 'ற', 'ஈ', 'n', 'ீ', 'ௌ', 'ய', 'i', 'ு', 'இ', 'ா', '்', 'ப', 'ள', 'ோ', 'ண', 'ஹ', 'வ', 'ஞ', 'ல', 'g', 'ஒ', 'j', 'w', 'ங', 'ெ', 'h', 'ூ', 'ஏ', 'a', 'ஆ', 'ஜ', 'ை', 'y', 'ட', 'o', 'b', 'ஓ', 'q', 'க', 'ந', 'ே', 'ர', 'l', 'k', 'e', 'x', 'ொ', 't', 'm', 's', 'ஷ', 'த', 'c', 'ி', 'ம', 'u', '<SOS>', '<EOS>']


In [19]:
char_index = {value: index for index, value in enumerate(vocabulary)}
num_list = [char_index[char] for char in vocabulary]
print(char_index)
print(num_list)

{'<PAD>': 0, '<UNK>': 1, 'உ': 2, 'd': 3, 'ஃ': 4, 'ஐ': 5, 'f': 6, 'ழ': 7, 'ச': 8, 'எ': 9, 'z': 10, 'r': 11, 'ஸ': 12, 'ன': 13, 'v': 14, 'p': 15, 'அ': 16, 'ஊ': 17, 'ற': 18, 'ஈ': 19, 'n': 20, 'ீ': 21, 'ௌ': 22, 'ய': 23, 'i': 24, 'ு': 25, 'இ': 26, 'ா': 27, '்': 28, 'ப': 29, 'ள': 30, 'ோ': 31, 'ண': 32, 'ஹ': 33, 'வ': 34, 'ஞ': 35, 'ல': 36, 'g': 37, 'ஒ': 38, 'j': 39, 'w': 40, 'ங': 41, 'ெ': 42, 'h': 43, 'ூ': 44, 'ஏ': 45, 'a': 46, 'ஆ': 47, 'ஜ': 48, 'ை': 49, 'y': 50, 'ட': 51, 'o': 52, 'b': 53, 'ஓ': 54, 'q': 55, 'க': 56, 'ந': 57, 'ே': 58, 'ர': 59, 'l': 60, 'k': 61, 'e': 62, 'x': 63, 'ொ': 64, 't': 65, 'm': 66, 's': 67, 'ஷ': 68, 'த': 69, 'c': 70, 'ி': 71, 'ம': 72, 'u': 73, '<SOS>': 74, '<EOS>': 75}
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75]


In [24]:
# Define the tokenizer
max_length = 10
def tokenize(word):
    # Convert the sentence to lowercase and split it into words
    # print(word)
    chars = [*word]

    # print(chars)
    # chars = chars + ['<EOS>'] + ['<PAD>'] * (max_length - len(chars) - 1)
    # Map each word to its index in the vocabulary
    tokens = [char_index[char] if char in char_index else 0 for char in chars]
    
    return tokens

In [59]:
# Define the training pairs
training_pairs = train_data_df.values.tolist()
val_pairs = valid_data_df.values.tolist()
test_pairs = test_data_df.values.tolist()

In [60]:
len(val_pairs)

4095

In [48]:
# testing the tokenize function
tokenize('arun')

[46, 11, 73, 20]

In [28]:
eng_words = [tokenize(pair[0]) for pair in training_pairs]
tam_words = [tokenize(pair[1]) for pair in training_pairs]

In [49]:
# Determining max length
max_length = max([len(words) for words in eng_words + tam_words])
print(max_length)

30


In [64]:
def padding(word_pairs):
  ''' Function to pad the input and target sequences. Padding is done to ensure that
      all the training, validation and test samples are of equal size.'''

  eng_words = [tokenize(pair[0]) for pair in word_pairs]
  tam_words = [tokenize(pair[1]) for pair in word_pairs]
  padded_input_sequences = [torch.tensor(words + [char_index['<EOS>']] + [(char_index['<PAD>'])]*(max_length - len(words))) for words in eng_words]
  padded_target_sequences = [torch.tensor(words + [char_index['<EOS>']] + [(char_index['<PAD>'])]*(max_length - len(words))) for words in tam_words]
  tensor = torch.tensor([char_index['<PAD>']]*(max_length+1))
  padded_input_sequences.append(tensor)
  padded_target_sequences.append(tensor)
  padded_input_sequences = torch.stack(padded_input_sequences)
  padded_target_sequences = torch.stack(padded_target_sequences)
  
  return(padded_input_sequences,padded_target_sequences)


In [65]:
# Creating datasets
training_input_sequences, training_target_sequences = padding(training_pairs)
train_dataset = torch.utils.data.TensorDataset(training_input_sequences, training_target_sequences)

val_input_sequences, val_target_sequences = padding(val_pairs)
val_dataset = torch.utils.data.TensorDataset(val_input_sequences, val_target_sequences)

test_input_sequences, test_target_sequences = padding(test_pairs)
test_dataset = torch.utils.data.TensorDataset(test_input_sequences, test_target_sequences)

# Architecture

In [44]:
# Defining the EncoderRNN model
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, cell_type):
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell_type = cell_type

        self.embedding = nn.Embedding(input_size, hidden_size)

        if cell_type == "RNN":
            self.rnn = nn.RNN(hidden_size, hidden_size, num_layers)
        elif cell_type == "LSTM":
            self.rnn = nn.LSTM(hidden_size, hidden_size, num_layers)
        elif cell_type == "GRU":
            self.rnn = nn.GRU(hidden_size, hidden_size, num_layers)


    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, -1, self.hidden_size)
        output, hidden = self.rnn(embedded, hidden)
        return output, hidden
    
    def initHidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)

In [45]:
# Defining the DecoderRNN model
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers, cell_type):
        super(DecoderRNN, self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.cell_type = cell_type

        self.embedding = nn.Embedding(output_size, hidden_size)
        if cell_type == "RNN":
            self.rnn = nn.RNN(hidden_size, hidden_size, num_layers)
        elif cell_type == "LSTM":
            self.rnn = nn.LSTM(hidden_size, hidden_size, num_layers)
        elif cell_type == "GRU":
            self.rnn = nn.GRU(hidden_size, hidden_size, num_layers)

        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.rnn(output, hidden)
        output = self.softmax(self.out(output))
        return output, hidden


In [75]:
def accuracy(encoder, decoder, val_loader, device):
    ''' This function calculates word-level accuracy which is used as the
        evaluation metric.'''
    encoder.eval()
    decoder.eval()

    total_correct = 0
    total_words = 0

    with torch.no_grad():
        for input_seq, target_seq in val_loader:
            batch_size = input_seq.size(0)

            # Initializing the encoder and decoder hidden states
            encoder_hidden = encoder.initHidden(batch_size)
            decoder_hidden = encoder_hidden

            # Forward pass through the encoder
            input_length = input_seq.size(1)
            for i in range(input_length):
                encoder_output, encoder_hidden = encoder(input_seq[:, i], encoder_hidden)

            # Initialize the decoder input to the start token
            decoder_input = torch.tensor([[74] * batch_size]).to(device)

            # Forward pass through the decoder
            target_length = target_seq.size(1)
            decoded_words = []
            for i in range(target_length):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                topv, topi = decoder_output.data.topk(1)
                decoded_words.append(topi.view(-1).tolist())

                # Setting decoder input to the current target token
                decoder_input = topi.squeeze().detach()

            # Calculating the word-level accuracy
            for i in range(batch_size):
                target_words = [val_loader.dataset.idx2word[idx] for idx in target_seq[i].tolist()]
                decoded_words_i = [val_loader.dataset.idx2word[idx] for idx in decoded_words[i]]
                correct = sum([1 if target_words[j] == decoded_words_i[j] else 0 for j in range(len(target_words))])
                total_correct += correct
                total_words += len(target_words)

    return total_correct / total_words

# Hyperparameters

In [68]:
# Defining hyperparameters
input_size = len(vocabulary)  # size of the vocabulary
output_size = len(vocabulary)
hidden_size = 256
learning_rate = 0.01
num_layers = 2
cell_type = 'RNN'
num_epochs = 2
batch_size = 128

# Defining EncoderRNN model and optimizer
encoder = EncoderRNN(input_size, hidden_size, num_layers, cell_type)
decoder = DecoderRNN(hidden_size, output_size, num_layers, cell_type)

#Training

In [69]:
# Creating Dataloaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size, shuffle=True)

In [70]:
# Defining the loss function and the optimizer
criterion = nn.NLLLoss()
optimizer = torch.optim.SGD(list(encoder.parameters()) + list(decoder.parameters()), lr=learning_rate)

In [None]:
# Training the model
'''In this part of the code the encoder-decoder model is trained.
   The input sequence is given to the encoder which outputs an encoded
   represenatation and a hidden state. The output of the encoder (context vector)
   is then fed to the decoder to be decoded'''
   
for epoch in tqdm(range(num_epochs)):
    loss = 0
    for batch_idx, (input_seq, target_seq) in enumerate(train_loader):
        # Zero gradients and reset the hidden states
        optimizer.zero_grad()
        encoder_hidden = encoder.initHidden(input_seq.size(0))
        decoder_hidden = encoder_hidden

        # print('\nbatch:', batch_idx, '\ninput:', input_seq, '\ntarget:', target_seq)

        # Forward pass through the encoder
        input_length = input_seq.size(1)
        # print('input length:',input_length)
        
        for i in range(input_length):
            encoder_output, encoder_hidden = encoder(input_seq[:, i], encoder_hidden)
        # print('encoder output', encoder_output.shape)
        # print('encoder hidden', encoder_hidden.shape)

        # Initialize the decoder input to the start token
        decoder_input = torch.tensor([[74] * batch_size])
        # print('decoder_input:', decoder_input.shape)
        # print(decoder_input)

        # Forward pass through the decoder
        target_length = target_seq.size(1)
        for i in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss = criterion(decoder_output.squeeze(), target_seq[:, i])
            loss += loss.item()

            # Setting decoder input to the current target token
            decoder_input = target_seq[:, i].unsqueeze(0)

        # Backward pass and optimizer step
        loss.backward()
        optimizer.step()

    val_accuracy = accuracy(encoder, decoder, val_loader, device)
    print('Epoch [{}/{}], Train Loss: {:.4f}, Val Accuracy: {:.4f}'.format(epoch+1, num_epochs, loss, val_accuracy))
