In [None]:
# Importing required libraries
import torch, torchtext
import torch.nn as nn
import os
from torchtext.data import get_tokenizer
from collections import Counter, OrderedDict
import torch
import torchvision
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor
from torchvision.io import read_image
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch import nn

In [None]:
# Defining dataset class
class bookDataset(Dataset):
    """Text-classification dataset read from a pair of CSV files.

    The text is taken from the third column (index 2) of the ``path_to_x``
    CSV and the integer class ids from the second column (index 1) of the
    ``path_to_y`` CSV. Each text is tokenized with torchtext's
    ``basic_english`` tokenizer and encoded as a list of vocabulary indices;
    each label is stored as a one-hot row.

    Args:
        path_to_x: File name of the CSV holding the text column.
        path_to_y: File name of the CSV holding the label column.
        dataDir: Directory both file names are resolved against.
        vocabulary: Optional, already-built torchtext vocabulary used to
            encode the tokens. Pass the *training* vocabulary when building
            validation/test datasets so token ids agree with the model's
            embedding matrix. It must have a default index set, otherwise
            unknown tokens raise on lookup. When ``None`` (default) a
            vocabulary is built from this dataset's own alphabetic tokens.
        num_classes: Width of the one-hot label rows (default 30).

    Attributes:
        titles / x: The same list object; one list of token ids per sample.
        labels / y: One-hot label tensor of shape ``(n_samples, num_classes)``.
        vocabulary: The vocabulary used for encoding.
        data_len: Number of samples.
    """

    def __init__(self, path_to_x, path_to_y, dataDir, vocabulary=None, num_classes=30):
        tokenizer = get_tokenizer("basic_english")

        raw_titles = pd.read_csv(os.path.join(dataDir, path_to_x)).iloc[:, 2].values
        tokenized = [tokenizer(title) for title in raw_titles]

        raw_labels = pd.read_csv(os.path.join(dataDir, path_to_y)).iloc[:, 1].values
        self.labels = nn.functional.one_hot(
            torch.tensor(raw_labels), num_classes=num_classes
        ).reshape(-1, num_classes)

        if vocabulary is None:
            # Only purely alphabetic tokens enter the vocabulary; everything
            # else falls back to '<unk>' through the default index.
            counter = Counter(
                token for tokens in tokenized for token in tokens if token.isalpha()
            )
            # Most frequent words first, so they receive the smallest ids.
            words_dict = OrderedDict(
                sorted(counter.items(), key=lambda x: x[1], reverse=True)
            )
            vocabulary = torchtext.vocab.vocab(words_dict, specials=['<unk>', '<pad>'])
            vocabulary.set_default_index(vocabulary['<unk>'])
        self.vocabulary = vocabulary

        # Encode into a fresh list instead of mutating the token lists in place.
        self.titles = [
            [self.vocabulary[token] for token in tokens] for tokens in tokenized
        ]

        # Aliases kept for code that reads x / y / data_len.
        self.x = self.titles
        self.data_len = len(self.x)
        self.y = self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.titles)

    def __getitem__(self, idx):
        """Return ``(token_id_list, one_hot_label)`` for sample ``idx``."""
        return self.titles[idx], self.labels[idx]
    
def collate_fn(batch):
    """Collate ``(token_ids, label)`` samples into padded batch tensors.

    Args:
        batch: Sequence of samples; element 0 of each sample is a list of
            integer token ids, element 1 is a label tensor.

    Returns:
        A tuple ``(inputs, labels)`` where ``inputs`` is an int64 tensor of
        shape ``(batch, longest_sequence)`` right-padded with 0, and
        ``labels`` is the stacked label tensor.
    """
    # Force int64: torch.tensor([]) would otherwise be float32, and an empty
    # sequence at position 0 would make the whole padded batch float, which
    # nn.Embedding rejects.
    inputs = [torch.tensor(sample[0], dtype=torch.long) for sample in batch]
    labels = [sample[1] for sample in batch]

    # NOTE(review): the padding value is 0; with specials=['<unk>', '<pad>']
    # index 0 is '<unk>', not '<pad>' - confirm this is intended.
    padded = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True)
    return padded, torch.stack(labels)
    
# dataset & dataloaders initialization
DATA_DIR = "/kaggle/input/col774-2022/"
trainDataset = bookDataset("train_x.csv","train_y.csv",DATA_DIR)
testDataset = bookDataset("non_comp_test_x.csv","non_comp_test_y.csv",DATA_DIR)

# The model's embedding matrix is indexed by the TRAINING vocabulary.
vocab = trainDataset.vocabulary

# Re-encode the test set with the training vocabulary. The test dataset was
# encoded with a vocabulary built from its own words, so its ids would point
# at unrelated embedding rows. Each test id is mapped back to its token and
# looked up in `vocab`; tokens unseen in training fall back to '<unk>'.
test_itos = testDataset.vocabulary.get_itos()
for row, ids in enumerate(testDataset.titles):
    testDataset.titles[row] = [vocab[test_itos[idx]] for idx in ids]
testDataset.vocabulary = vocab

trainloader = DataLoader(trainDataset,batch_size = 64,shuffle = True, collate_fn=collate_fn)
# Evaluation does not need shuffling.
testloader = DataLoader(testDataset,batch_size = 64,shuffle = False, collate_fn=collate_fn)

In [None]:
# Pretrained GloVe word vectors ("6B" release, 300 dimensions per word).
glove = torchtext.vocab.GloVe(
    dim=300,
    name='6B',
)

In [None]:
class A4_RNN(nn.Module):
    """Bidirectional tanh-RNN classifier on top of pretrained word embeddings.

    Args:
        input_size: Embedding dimension fed to the RNN; must match the
            dimension of the pretrained vectors (default 300).
        hidden_size: RNN hidden size per direction.
        num_layers: Number of stacked RNN layers.
        num_classes: Number of output classes.
        vocabulary: Vocabulary object providing ``__len__`` and ``get_itos()``.
            Defaults to the notebook-level ``vocab``.
        pretrained: Pretrained vectors providing a ``stoi`` mapping and
            ``__getitem__(word)``. Defaults to the notebook-level ``glove``.
    """

    def __init__(self, input_size=300, hidden_size=128, num_layers=1, num_classes=30,
                 vocabulary=None, pretrained=None):
        super(A4_RNN, self).__init__()

        # Fall back to the notebook globals so `A4_RNN()` keeps working.
        if vocabulary is None:
            vocabulary = vocab
        if pretrained is None:
            pretrained = glove

        # Row i of the embedding matrix must belong to the token whose
        # vocabulary index is i, so iterate index -> token (get_itos).
        # Enumerating the token -> index dict gives an arbitrary order.
        mat = torch.zeros(len(vocabulary), input_size)
        for i, word in enumerate(vocabulary.get_itos()):
            if word in pretrained.stoi:
                mat[i] = pretrained[word]
            else:
                # No pretrained vector: start from a random one.
                mat[i] = torch.randn(input_size)

        # freeze=False: the embeddings are fine-tuned with the model.
        self.embedding = nn.Embedding.from_pretrained(mat, freeze=False)
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, nonlinearity='tanh', bias=True, batch_first=True, dropout=0, bidirectional=True)
        self.FC1 = nn.Linear(hidden_size*2,128)
        self.FC2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to class logits.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with raw, unnormalized
            logits, as expected by ``CrossEntropyLoss``.
        """
        x = self.embedding(x)
        x,_ = self.rnn(x)
        # NOTE(review): only the output at time step 0 is used. For the
        # forward direction that state has seen just the first token -
        # confirm this is the intended sequence summary.
        x = self.FC1(x[:,0,:])
        x = torch.tanh(x)
        # No activation on the output: a tanh here would bound the logits to
        # [-1, 1] and cap the confidence the softmax can express.
        x = self.FC2(x)
        return x
    
# Build the classifier and make sure every parameter is float32
# (Module.float() casts in place and returns the module itself).
model = A4_RNN().float()

In [None]:
# Training configuration
EPOCHS = 150
EPSILON = 1e-4
DEBUG = True

# Loss and optimizer for the model
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Use the first GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    thisDevice = 'cuda:0'
else:
    thisDevice = 'cpu'
device = torch.device(thisDevice)
model = model.to(device)
print("The device in use is : "+thisDevice)

# Testing on the test data
def calAccuracy(model,dataloader,device):
    """Compute, print and return the classification accuracy in percent.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to per-class scores of
            shape ``(batch, num_classes)``.
        dataloader: Iterable of ``(inputs, labels)`` batches, where ``labels``
            are one-hot rows of shape ``(batch, num_classes)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Accuracy as a float in ``[0, 100]``; ``0.0`` if the loader yields no
        samples.
    """
    correct = 0
    total = 0

    # Evaluate in eval mode, then restore the caller's mode even on error.
    was_training = model.training
    model.eval()
    try:
        # no_grad is scoped and exception-safe, unlike toggling the global
        # grad flag (which would also re-enable grad unconditionally).
        with torch.no_grad():
            for inputs, labels in dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                predicted = model(inputs).argmax(dim=1)
                truth = labels.argmax(dim=1)  # one-hot -> class id

                correct += int((predicted == truth).sum().item())
                total += labels.size(0)
    finally:
        model.train(was_training)

    accuracy = (float(correct) * 100) / total if total else 0.0
    print("The accuracy of the model : "+str(accuracy))
    return accuracy

In [None]:
# training model function
def trainModel(model,dataloader,loss_fn,optimizer,EPOCHS,EPSILON,device,eval_loader=None,debug=None):
    """Train ``model`` for ``EPOCHS`` passes over ``dataloader``.

    Args:
        model: Network to optimize (already on ``device``).
        dataloader: Sized iterable of ``(inputs, labels)`` batches; labels
            are cast to float before being passed to ``loss_fn``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar
            tensor.
        optimizer: Optimizer over ``model``'s parameters.
        EPOCHS: Number of epochs to run.
        EPSILON: Currently unused; kept for interface compatibility.
        device: Device each batch is moved to.
        eval_loader: Loader evaluated after every epoch when debugging.
            Defaults to the notebook-level ``testloader``.
        debug: Whether to print the per-epoch loss and evaluation accuracy.
            Defaults to the notebook-level ``DEBUG`` flag.

    Returns:
        The trained ``model`` (same object that was passed in).
    """
    if debug is None:
        debug = DEBUG

    num_batches = len(dataloader)
    last_loss = float("inf")

    for epoch in range(EPOCHS):
        # Re-enter train mode each epoch in case evaluation switched it off.
        model.train()
        this_loss = 0.0

        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device).float()

            optimizer.zero_grad()
            loss = loss_fn(model(inputs), labels)
            loss.backward()
            optimizer.step()

            # .item() gives a plain float, so the running sum does not keep
            # autograd history alive.
            this_loss += loss.item()

        # Mean batch loss; guard against an empty loader.
        this_loss = this_loss / max(num_batches, 1)
        last_loss = this_loss

        if debug:
            print("Epoch : "+str(epoch)+", Loss ==> "+str(last_loss))
            print("Testing Accuracy ==>")
            calAccuracy(model, testloader if eval_loader is None else eval_loader, device)

    return model

In [None]:
# Run the full training loop on the training set.
model = trainModel(
    model=model,
    dataloader=trainloader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    EPOCHS=EPOCHS,
    EPSILON=EPSILON,
    device=device,
)
# NOTE(review): PATH is only defined here; no visible cell saves the model
# to it - confirm whether a torch.save call is missing.
PATH = "/kaggle/input/col774-2022/RNNModel.pth"

In [None]:
# Report the final accuracy on the training split, then on the test split.
for caption, loader in (
    ("Training Accuracy ==>", trainloader),
    ("Testing Accuracy ==>", testloader),
):
    print(caption)
    calAccuracy(model, loader, device)