<a href="https://colab.research.google.com/github/ch23s020/Assignment3/blob/main/Assignment3_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.2.0-py2.py3-none-any.whl (281 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.1/281.1 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86

In [3]:
import csv
import gdown
import random
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import wandb

In [4]:
# Download URLs for Google Sheets as CSV. Separated Marathi Files into three different folders as Training , Validation and Test

train_url = "https://docs.google.com/spreadsheets/d/11duz5Vbqay5TVn_uyglVQVcEZllTbWQt_8zTt2TcBSA/export?format=csv"

valid_url = "https://docs.google.com/spreadsheets/d/1KbKFfxFkMddkZde0r5PWKnQ0vzdh-XihxsMP7XUFDJc/export?format=csv"

test_url = "https://docs.google.com/spreadsheets/d/1ItKDweGPNtzWiF3rs0jzKjh7ZRRkas2hz7yWvbt4yzQ/export?format=csv"

# Paths to save the files

train_output = 'train_data.csv'

valid_output = 'valid_data.csv'

test_output = 'test_data.csv'

# Downloading the files

gdown.download(train_url, train_output, quiet=False)

gdown.download(valid_url, valid_output, quiet=False)

gdown.download(test_url, test_output, quiet=False)


Downloading...
From: https://docs.google.com/spreadsheets/d/11duz5Vbqay5TVn_uyglVQVcEZllTbWQt_8zTt2TcBSA/export?format=csv
To: /content/train_data.csv
2.23MB [00:00, 6.07MB/s]
Downloading...
From: https://docs.google.com/spreadsheets/d/1KbKFfxFkMddkZde0r5PWKnQ0vzdh-XihxsMP7XUFDJc/export?format=csv
To: /content/valid_data.csv
143kB [00:00, 8.18MB/s]
Downloading...
From: https://docs.google.com/spreadsheets/d/1ItKDweGPNtzWiF3rs0jzKjh7ZRRkas2hz7yWvbt4yzQ/export?format=csv
To: /content/test_data.csv
149kB [00:00, 8.76MB/s]


'test_data.csv'

Data laod and  Pre-Processing

In [6]:
# Function to load data

def load_data(file_path):

    data = []

    with open(file_path, 'r', encoding='utf-8') as csvfile:

        csvreader = csv.reader(csvfile)

        for idx, row in enumerate(csvreader):

            try:

                x = str(row[0])  # first column contains Romanized strings

                y = str(row[1])  # second column contains Devanagari strings

                data.append((x, y))

            except IndexError:

                print(f"IndexError in row {idx + 1}: {row}")
                # To resolve and verify the index error

    return data

# Load the data

train_data = load_data(train_output)

valid_data = load_data(valid_output)

test_data = load_data(test_output)




# Print a sample from each dataset to verify

print("Sample from train data:", train_data[0])

print("Sample from valid data:", valid_data[0])

print("Sample from test data:", test_data[0])

Sample from train data: ('fusharun', 'फुशारुन')
Sample from valid data: ('garvyabarobarach', 'गारव्याबरोबरच')
Sample from test data: ('heetler', 'हिटलर')


RNN


In [7]:
# Data Preparation

class TransliterationDataset(Dataset):

    def __init__(self, data, char2index, max_length=20):

        self.data = data

        self.char2index = char2index

        self.max_length = max_length

    def __len__(self):

        return len(self.data)

    def __getitem__(self, idx):

        x, y = self.data[idx]

        x_indices = [self.char2index[c] for c in x] + [self.char2index['<PAD>']] * (self.max_length - len(x))

        y_indices = [self.char2index[c] for c in y] + [self.char2index['<PAD>']] * (self.max_length - len(y))

        return torch.tensor(x_indices), torch.tensor(y_indices), len(x), len(y)

def collate_fn(batch):

    x, y, x_lengths, y_lengths = zip(*batch)

    x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=char2index['<PAD>'])  #P

    y = torch.nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=char2index['<PAD>'])

    return x, y

# Create character to index mappings

all_chars = sorted(set(''.join([x for x, y in train_data + valid_data + test_data]) + ''.join([y for x, y in train_data + valid_data + test_data])))

char2index = {char: idx for idx, char in enumerate(all_chars)}

char2index['<PAD>'] = len(char2index)

char2index['<SOS>'] = len(char2index) + 1

char2index['<EOS>'] = len(char2index) + 2


Training


In [8]:
# Initialize wandb

sweep_config = {

    "method": "random",

    "parameters": {

        "learning_rate": {"values": [0.001, 0.01, 0.1]},
        "batch_size": {"values": [32]},
        "num_epochs": {"values": [5, 10, 15, 20, 40, 60]},
        "encoder_layers": {"values": [1, 2, 3]},
        "decoder_layers": {"values": [1, 2, 3]},
        "hidden_dim": {"values": [128, 256, 512]},
        "embedding_dim": {"values": [128, 256, 512]},
        "dropout_rate": {"values": [0, 0.1, 0.2]},
        "rnn_cell_type": {"values": ["lstm", "rnn", "gru"]},
        "bidirectional": {"values": [False]},
        "max_length": {"values": [20, 60, 100, 150]},
        "gradient_clip": {"values": [1, 2]},

    }
}

sweep_id = wandb.sweep(sweep_config, project="transliteration-Assign3")


# Datasets and Dataloaders

train_dataset = TransliterationDataset(train_data, char2index)

valid_dataset = TransliterationDataset(valid_data, char2index)

test_dataset = TransliterationDataset(test_data, char2index)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

valid_loader = DataLoader(valid_dataset, batch_size=32, collate_fn=collate_fn)

test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: e9i15px2
Sweep URL: https://wandb.ai/ch23s020/transliteration-Assign3/sweeps/e9i15px2


In [9]:
# Model Components and Classes, Training Function

class EmbeddingLayer(nn.Module):

    def __init__(self, input_dim, embedding_dim):

        super(EmbeddingLayer, self).__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)

    def forward(self, x):

        return self.embedding(x)

class EncoderRNN(nn.Module):

    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, dropout, rnn_type='lstm', bidirectional=False):

        super(EncoderRNN, self).__init__()

        self.embedding = EmbeddingLayer(input_dim, embedding_dim)

        rnn_cls = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}[rnn_type]

        self.rnn = rnn_cls(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True, bidirectional=bidirectional)

    def forward(self, x):

        x = self.embedding(x)

        outputs, hidden = self.rnn(x)

        return outputs, hidden

class DecoderRNN(nn.Module):

    def __init__(self, output_dim, embedding_dim, hidden_dim, num_layers, dropout, rnn_type='lstm', bidirectional=False):

        super(DecoderRNN, self).__init__()

        self.embedding = EmbeddingLayer(output_dim, embedding_dim)

        rnn_cls = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}[rnn_type]

        self.rnn = rnn_cls(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True, bidirectional=bidirectional)

        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim)

    def forward(self, x, hidden):

        x = self.embedding(x).unsqueeze(1)

        outputs, hidden = self.rnn(x, hidden)

        predictions = self.fc(outputs.squeeze(1))

        return predictions, hidden

class Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder, device):

        super(Seq2Seq, self).__init__()

        self.encoder = encoder

        self.decoder = decoder

        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):

        batch_size = trg.shape[0]

        trg_len = trg.shape[1]

        trg_vocab_size = self.decoder.embedding.embedding.num_embeddings

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src)

        if isinstance(hidden, tuple):  # LSTM

            hidden = (hidden[0][:self.decoder.rnn.num_layers], hidden[1][:self.decoder.rnn.num_layers])

        else:  # RNN or GRU

            hidden = hidden[:self.decoder.rnn.num_layers]

        input = trg[:, 0]

        for t in range(1, trg_len):

            output, hidden = self.decoder(input, hidden)

            outputs[:, t] = output

            top1 = output.argmax(1)

            input = trg[:, t] if random.random() < teacher_forcing_ratio else top1

        return outputs



In [10]:
def calculate_word_accuracy(output, target):

    pred_tokens = output.argmax(dim=2)

    non_pad_elements = (target != char2index['<PAD>']).float()

    correct = (pred_tokens == target).float() * non_pad_elements

    accuracy = correct.sum() / non_pad_elements.sum()

    return accuracy.item() * 100



In [11]:
def train(model, iterator, optimizer, criterion, clip):

    model.train()

    epoch_loss = 0

    epoch_acc = 0

    for i, (src, trg) in enumerate(iterator):

        src = src.to(model.device)

        trg = trg.to(model.device)

        optimizer.zero_grad()

        output = model(src, trg)

        output_dim = output.shape[-1]

        output = output[:, 1:].reshape(-1, output_dim)

        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)

        acc = calculate_word_accuracy(output.view(src.size(0), -1, output_dim), trg.view(src.size(0), -1))

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):

    model.eval()

    epoch_loss = 0

    epoch_acc = 0

    with torch.no_grad():

        for i, (src, trg) in enumerate(iterator):

            src = src.to(model.device)

            trg = trg.to(model.device)

            output = model(src, trg)

            output_dim = output.shape[-1]

            output = output[:, 1:].reshape(-1, output_dim)

            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)

            acc = calculate_word_accuracy(output.view(src.size(0), -1, output_dim), trg.view(src.size(0), -1))

            epoch_loss += loss.item()

            epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def epoch_time(start_time, end_time):

    elapsed_time = end_time - start_time

    elapsed_mins = int(elapsed_time / 60)

    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

    return elapsed_mins, elapsed_secs

# Hyperparameters

INPUT_DIMENSION = len(char2index)

OUTPUT_DIMENSION = len(char2index)

def train_model():

    wandb.init(project="transliteration-Assign3", config=sweep_config)

    config = wandb.config

    # Creating datasets and dataloaders

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)

    valid_loader = DataLoader(valid_dataset, batch_size=config.batch_size, collate_fn=collate_fn)

    test_loader = DataLoader(test_dataset, batch_size=config.batch_size, collate_fn=collate_fn)

    ENCODER_EMBEDDING_DIMENSION = config.embedding_dim

    DECODER_EMBEDDING_DIMENSION = config.embedding_dim

    HIDDEN_DIMENSION = config.hidden_dim

    NUM_ENCODER_LAYERS = config.encoder_layers

    NUM_DECODER_LAYERS = config.decoder_layers

    RNN_CELL_TYPE = config.rnn_cell_type

    BIDIRECTIONAL = config.bidirectional

    DROPOUT_RATE = config.dropout_rate

    GRADIENT_CLIP = config.gradient_clip

    encoder = EncoderRNN(INPUT_DIMENSION, ENCODER_EMBEDDING_DIMENSION, HIDDEN_DIMENSION, NUM_ENCODER_LAYERS, DROPOUT_RATE, RNN_CELL_TYPE, BIDIRECTIONAL)

    decoder = DecoderRNN(OUTPUT_DIMENSION, DECODER_EMBEDDING_DIMENSION, HIDDEN_DIMENSION, NUM_DECODER_LAYERS, DROPOUT_RATE, RNN_CELL_TYPE, BIDIRECTIONAL)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = Seq2Seq(encoder, decoder, device).to(device)

    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    criterion = nn.CrossEntropyLoss(ignore_index=char2index['<PAD>'])

    wandb.watch(model, criterion, log="all")

    N_EPOCHS = config.num_epochs

    CLIP = GRADIENT_CLIP

    best_valid_loss = float('inf')

    for epoch in range(N_EPOCHS):

        start_time = time.time()

        train_loss, train_acc = train(model, train_loader, optimizer, criterion, CLIP)

        valid_loss, valid_acc = evaluate(model, valid_loader, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:

            best_valid_loss = valid_loss

            torch.save(model.state_dict(), 'tut1-model.pt')

        # Logging hyperparameters and metrics

        wandb.log({

            "train_loss": train_loss,

            "train_acc": train_acc,

            "valid_loss": valid_loss,

            "valid_acc": valid_acc,

            "input_dimension": INPUT_DIMENSION,

            "output_dimension": OUTPUT_DIMENSION,

            "encoder_embedding_dimension": ENCODER_EMBEDDING_DIMENSION,

            "decoder_embedding_dimension": DECODER_EMBEDDING_DIMENSION,

            "hidden_dimension": HIDDEN_DIMENSION,

            "num_encoder_layers": NUM_ENCODER_LAYERS,

            "num_decoder_layers": NUM_DECODER_LAYERS,

            "rnn_cell_type": RNN_CELL_TYPE,

            "bidirectional": BIDIRECTIONAL,

            "dropout_rate": DROPOUT_RATE,

            "gradient_clip": CLIP

        })

        print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')

        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%')

        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc:.2f}%')

    wandb.finish()


In [None]:
wandb.agent(sweep_id, function=train_model)

[34m[1mwandb[0m: Agent Starting Run: 3tscej4p with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	bidirectional: False
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout_rate: 0
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	gradient_clip: 2
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_length: 150
[34m[1mwandb[0m: 	num_epochs: 5
[34m[1mwandb[0m: 	rnn_cell_type: lstm
[34m[1mwandb[0m: Currently logged in as: [33mch23s020[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch: 01 | Time: 86m 50s
	Train Loss: 2.600 | Train Acc: 29.92%
	 Val. Loss: 2.067 |  Val. Acc: 42.67%
