In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Traverse the read-only input directory. Printing of each path is disabled,
# so the traversal has no visible effect beyond binding the loop variables.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # print(os.path.join(dirname, filename))
        continue

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

# Importing libraries
import numpy as np 
import pandas as pd
import os
import torch
import torchvision
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor
from torchvision.io import read_image
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch import nn
import sys

# Root folder of the competition data
inputPath = '/kaggle/input/col774-2022'

# Seed every random number generator in use so that runs are repeatable
SEED = 661077
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Hyperparameters
EPOCHS, EPSILON, DEBUG = 2, 1e-4, True
LR, MOMENTUM = 0.01, 0.9

# Use CUDA when it is available, otherwise fall back to the CPU
thisDevice = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(thisDevice)
print(f"The device in use is : {thisDevice}")

# Function for dataset class
class bookDataset(Dataset):
    """Image dataset indexed by CSV files.

    Args:
        images_file: CSV (relative to ``dataDir``) whose column 1 holds the
            image file name of each sample.
        labels_file: CSV (relative to ``dataDir``) whose column 1 holds the
            label of each sample; ignored when ``comp`` is True.
        dataDir: directory containing the CSV files and ``images/images``.
        comp: when True no label file is read and every item gets label -1.
    """

    def __init__(self,images_file,labels_file,dataDir,comp = False):
        # store the dataframe for the csv files
        self.images = pd.read_csv(os.path.join(dataDir,images_file))

        # comp part labels are not given
        self.comp = comp
        self.labels = None
        if not self.comp:
            self.labels = pd.read_csv(os.path.join(dataDir,labels_file))

        self.img_dir = os.path.join(dataDir,"images","images")

        # Build the conversion pipeline once instead of on every item access
        self.transform = transforms.Compose([transforms.PILToTensor()])

    def __len__(self):
        """Number of samples listed in the image index."""
        return len(self.images)

    def __getitem__(self,idx):
        """Return ``(image_tensor, label)`` for sample ``idx``.

        The image is returned as a float tensor scaled by 1/255. It is forced
        to three channels so that grayscale/RGBA/palette files still match the
        3-channel input expected by the network in this cell.
        """
        img_path = os.path.join(self.img_dir,self.images.iloc[idx,1])
        # The context manager releases the file handle deterministically
        with Image.open(img_path) as thisImg:
            img_tensor = self.transform(thisImg.convert("RGB"))/255

        thisLab = -1
        if not self.comp:
            thisLab = self.labels.iloc[idx,1]
        return img_tensor,thisLab

# dataset & dataloaders initialization
# Labelled datasets: training split and the non-competition test split
trainDataset = bookDataset("train_x.csv","train_y.csv",inputPath)
testDataset = bookDataset("non_comp_test_x.csv","non_comp_test_y.csv",inputPath)
# Mini-batch loaders; both reshuffle on every pass (the test loader too)
trainloader = DataLoader(trainDataset,batch_size = 64,shuffle = True)
testloader = DataLoader(testDataset,batch_size = 64,shuffle = True)


# Sanity check: print the tensor size of the first training image
print(trainDataset[0][0].size())
#plt.imshow(trainDataset[0][0].numpy().transpose(1, 2, 0))

# model class
class CNNModel(nn.Module):
    """Three conv/ReLU/max-pool stages followed by a two-layer classifier.

    Every stage uses an unpadded 5x5 convolution and a 2x2 max-pool with
    stride 2; channel widths go 3 -> 32 -> 64 -> 128. The classifier flattens
    to 128*24*24 features (the size produced by 224x224 inputs), maps them to
    128 hidden units and then to 30 output scores.
    """

    def __init__(self):
        super().__init__()

        # Stages are created in order so parameter names match conv.0/3/6
        stages = []
        for in_channels, out_channels in ((3, 32), (32, 64), (64, 128)):
            stages.append(nn.Conv2d(in_channels, out_channels, kernel_size=5))
            stages.append(nn.ReLU())
            stages.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.conv = nn.Sequential(*stages)

        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 24 * 24, 128),
            nn.ReLU(),
            nn.Linear(128, 30),
        )

    def forward(self,x):
        """Return the 30 raw class scores for a batch of images."""
        return self.fc(self.conv(x))

# Instantiate the network (parameters are created on the CPU)
model = CNNModel()

# defining the loss function and optimizer for the model
# (cross-entropy on raw scores, SGD with the LR / MOMENTUM set above)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr = LR,momentum = MOMENTUM)   

# Move the network to the selected device
model = model.to(device)

# Testing on the test data
def calAccuracy(model,dataloader,device):
    """Compute, print and return the classification accuracy in percent.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to per-class scores.
        dataloader: iterable yielding ``(inputs, labels)`` batches, where
            ``labels`` holds integer class indices.
        device: device the batches are moved to before the forward pass.

    Returns:
        Accuracy as a float in [0, 100]; 0.0 when the loader yields nothing.

    The model is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, even on error. Gradient
    tracking is disabled with a context manager so that the global autograd
    state cannot stay switched off if a batch raises.
    """
    correct = 0
    total = 0

    was_training = model.training
    model.eval()
    try:
        # no need to track the forward computation
        with torch.no_grad():
            for inputs,labels in dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                _,predicted = torch.max(model(inputs),1)
                correct += (predicted==labels).sum().item()
                total += labels.size(0)
    finally:
        model.train(was_training)

    # Guard against an empty loader instead of dividing by zero
    finalAcc = (correct*100)/total if total else 0.0
    print("The accuracy of the model : "+str(finalAcc))
    return finalAcc


# training model function
def trainModel(model,dataloader,loss_fn,optimizer,EPOCHS,EPSILON,device,PATH,valloader=None):
    """Train ``model`` and checkpoint the weights with the best validation accuracy.

    Args:
        model: network to optimise; it is trained in place.
        dataloader: iterable of ``(inputs, labels)`` training batches.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        optimizer: optimizer built over ``model``'s parameters.
        EPOCHS: number of passes over ``dataloader``.
        EPSILON: unused; kept for interface compatibility (the loss-based
            early stop it belonged to is disabled).
        device: device the batches are moved to.
        PATH: file the best ``state_dict`` is written to.
        valloader: loader used for the per-epoch accuracy check. Defaults to
            the notebook-level ``testloader``.

    Returns:
        The trained model (its final weights, not necessarily the best ones).

    Relies on the notebook-level ``DEBUG`` flag and ``calAccuracy`` function.
    """
    if valloader is None:
        valloader = testloader

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint; otherwise a run stuck at 0% would leave PATH missing.
    max_valAcc = float("-inf")

    for epoch in range(EPOCHS):
        model.train()
        running_loss = 0.0

        for idx,(inputs,labels) in enumerate(dataloader):
            if DEBUG and idx%100==0:
                print("Iteration : "+str(idx))
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            loss = loss_fn(model(inputs),labels)
            loss.backward()
            optimizer.step()

            # .item() detaches the value; summing the tensor itself would
            # keep autograd history alive for the whole epoch
            running_loss += loss.item()

        epoch_loss = running_loss/len(dataloader)

        print("Epoch : "+str(epoch)+", Loss ==> "+str(epoch_loss))
        print("Testing Accuracy ==>")

        model.eval()
        this_valAcc = calAccuracy(model,valloader,device)

        # Save the model only if validation accuracy is greater than previous max
        if this_valAcc>max_valAcc:
            max_valAcc = this_valAcc
            torch.save(model.state_dict(),PATH)
            print("Model saved")

    return model

# Train the CNN; trainModel writes the best-scoring weights to PATH
PATH = "/kaggle/working/CNNModel.pth"
trainModel(model,trainloader,loss_fn,optimizer,EPOCHS,EPSILON,device,PATH)


# Rebuild a fresh network and load the checkpoint written during training,
# so `model` holds the saved (best) weights rather than the last-epoch ones
model = CNNModel()
model.load_state_dict(torch.load(PATH))
model.to(device)

#print("Training Accuracy ==>"+str(calAccuracy(model,trainloader,device)))
#print("Testing Accuracy ==>"+str(calAccuracy(model,testloader,device)))

# Same test images again, but with comp=True so the label CSV is not read and
# every item carries label -1. shuffle=False keeps predictions in file order.
compDataset = bookDataset("non_comp_test_x.csv","non_comp_test_y.csv",inputPath,comp = True)
comploader = DataLoader(compDataset,batch_size = 64,shuffle = False)

# printing the output dataframe to csv file
def outputToFile(model,dataloader,device,outFileName):
    """Write the model's predicted class for every sample to a CSV file.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to per-class scores.
        dataloader: iterable of ``(inputs, labels)`` batches; labels are
            ignored. It must not shuffle, because the ``Id`` column is simply
            the running position of each sample.
        device: device the inputs are moved to.
        outFileName: destination CSV path; columns are ``Id`` and ``Genre``.

    Inference runs in eval mode without gradient tracking; the model's
    previous train/eval mode is restored afterwards, even on error.
    """
    outputs = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs,_ in dataloader:
                _,predicted = torch.max(model(inputs.to(device)),1)
                outputs.extend(predicted.tolist())
    finally:
        model.train(was_training)

    df = pd.DataFrame({"Id": list(range(len(outputs))), "Genre": outputs})
    df.to_csv(outFileName,index=False)

# Write the reloaded CNN's predictions for the unshuffled comploader to CSV
outputToFile(model,comploader,device,"/kaggle/working/non_comp_test_pred_y.csv")


In [None]:
import torch, torchtext
import torch.nn as nn
import os, sys
from torchtext.data import get_tokenizer
from collections import Counter, OrderedDict
import torch
import torchvision
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor
from torchvision.io import read_image
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch import nn
import nltk
from nltk.corpus import stopwords

# Root folder of the competition data; CSV names below are joined onto it
dataset_dir_path = "/kaggle/input/col774-2022/"

class bookDataset(Dataset):
    """Tokenized book titles paired with one-hot genre labels.

    Column 2 of the ``path_to_x`` CSV supplies the titles and column 1 of the
    ``path_to_y`` CSV supplies the integer labels; both paths are relative to
    ``dataDir``. ``x`` and ``titles`` refer to the same list, as do ``y`` and
    ``labels``.
    """

    def __init__(self,path_to_x,path_to_y,dataDir):
        # Titles: read the raw strings, then split each with the
        # "basic_english" tokenizer
        raw_titles = pd.read_csv(os.path.join(dataDir,path_to_x)).iloc[:,2].values
        tokenize = get_tokenizer("basic_english")
        self.titles = list(map(tokenize, raw_titles))

        # Labels: integer class ids expanded to 30-way one-hot rows
        class_ids = pd.read_csv(os.path.join(dataDir,path_to_y)).iloc[:,1].values
        one_hot_rows = nn.functional.one_hot(torch.tensor(class_ids),num_classes=30)
        self.labels = one_hot_rows.reshape(-1,30)

        self.x, self.y = self.titles, self.labels

    def __len__(self):
        """Number of titles."""
        return len(self.x)

    def __getitem__(self,idx):
        """Return the ``(title, one_hot_label)`` pair at position ``idx``."""
        title, target = self.x[idx], self.y[idx]
        return title, target
    
    
# padding to make equal length
def collate_fn(batch):
    """Collate ``(token_indices, label)`` samples into padded batch tensors.

    Args:
        batch: sequence of samples; element 0 of each is a list of integer
            token indices (possibly empty), element 1 is a label tensor.

    Returns:
        ``(inputs, labels)`` where ``inputs`` is an int64 tensor of shape
        (batch, longest_sequence), right-padded with 0 (the ``pad_sequence``
        default), and ``labels`` is the stacked label tensor.
    """
    # An explicit dtype matters: torch.tensor([]) would otherwise be float32,
    # and the padded batch would take its dtype from whichever title is first.
    inputs = [torch.tensor(b[0], dtype=torch.long) for b in batch]
    labels = [b[1] for b in batch]

    return torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True), torch.stack(labels)

def buildVocabulary(x):
    """Build a frequency-ordered torchtext vocabulary from tokenized titles.

    Args:
        x: iterable of token lists.

    Returns:
        A torchtext vocab over the alphabetic, non-stopword tokens, ordered by
        descending frequency (ties keep first-seen order), built with the
        specials ``<unk>`` and ``<pad>``. Unknown tokens resolve to ``<unk>``.
    """
    # Load the stopword list once and use a set for O(1) membership tests,
    # instead of re-reading and linearly scanning it for every token
    stop_words = frozenset(stopwords.words('english'))

    counter = Counter(
        token
        for tokens in x
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

    # most_common() sorts by count, descending, with a stable sort
    ordered_dict = OrderedDict(counter.most_common())
    vocab = torchtext.vocab.vocab(ordered_dict, specials=['<unk>', '<pad>'])
    vocab.set_default_index(vocab['<unk>'])

    return vocab

# get indices for all words from the vocabulary
def indexWords(x, vocab):
    """Replace every token list in ``x`` by its list of vocabulary indices.

    Tokens whose index is 0 are dropped. ``x`` is modified in place and also
    returned.
    """
    for position in range(len(x)):
        looked_up = (vocab[word] for word in x[position])
        x[position] = [index for index in looked_up if index != 0]
    return x
    
# dataset & dataloaders initialization
# Tokenized title datasets for the training and non-competition test splits
trainDataset = bookDataset("train_x.csv","train_y.csv",dataset_dir_path)
testDataset = bookDataset("non_comp_test_x.csv","non_comp_test_y.csv",dataset_dir_path)

# The vocabulary comes from training titles only; both splits are then
# converted from tokens to indices (indexWords rewrites the lists in place)
vocab = buildVocabulary(trainDataset.x)
trainDataset.x = indexWords(trainDataset.x, vocab)
testDataset.x = indexWords(testDataset.x, vocab)

# Batches of 1000 titles, padded to equal length by collate_fn
trainloader = DataLoader(trainDataset,batch_size = 1000,shuffle = True, collate_fn=collate_fn)
testloader = DataLoader(testDataset,batch_size = 1000,shuffle = True, collate_fn=collate_fn)

# fetch glove embeddings (the '6B' set, 300 dimensions)
glove = torchtext.vocab.GloVe(name='6B', dim=300)

class A4_RNN(nn.Module):
    """Bidirectional tanh-RNN title classifier on GloVe-initialised embeddings.

    Args:
        input_size: feature size fed to the RNN; must equal the GloVe vector
            dimension (300 for the vectors loaded in this notebook).
        hidden_size: hidden units per RNN direction.
        num_layers: number of stacked RNN layers.
        num_classes: number of output scores.

    Reads the notebook-level ``vocab`` and ``glove`` objects to build the
    embedding table.
    """

    def __init__(self, input_size=300, hidden_size=128, num_layers=1, num_classes=30):
        super(A4_RNN, self).__init__()

        # Mean GloVe vector, used for vocabulary words GloVe does not contain.
        # Averaging the stored matrix replaces a Python loop over every word.
        mean_embedding = glove.vectors.mean(dim=0)

        # Row `idx` must hold the vector of the token whose vocabulary index
        # is `idx`. The position of a key while iterating get_stoi() is not
        # that index, so the stored index is used directly.
        mat = torch.zeros(len(vocab), glove.vectors.size(1))
        for word, idx in vocab.get_stoi().items():
            if word in glove.stoi:
                mat[idx] = glove[word]
            else:
                mat[idx] = mean_embedding

        self.embedding = nn.Embedding.from_pretrained(mat, freeze=False)
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, nonlinearity='tanh', bias=True, batch_first=True, dropout=0, bidirectional=True)
        self.FC1 = nn.Linear(hidden_size*2,128)
        self.FC2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a padded (batch, seq) index tensor to (batch, num_classes) scores."""
        x = self.embedding(x.long())
        x,_ = self.rnn(x)
        # Features are the RNN output at the first time step only.
        # NOTE(review): the forward direction has seen a single token at this
        # position; confirm this is the intended summary of the sequence.
        x = self.FC1(x[:,0,:])
        x = torch.tanh(x)
        x = self.FC2(x)
        # NOTE(review): this tanh bounds the scores to [-1, 1] before they are
        # passed to CrossEntropyLoss; left as-is to keep the model unchanged.
        x = torch.tanh(x)
        return x
    
# Build the RNN classifier with its default sizes and make sure all
# floating-point parameters are float32
model = A4_RNN()
model.float()

# compute accuracy
def calAccuracy(model,dataloader,device):
    """Compute, print and return the classification accuracy in percent.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to per-class scores.
        dataloader: iterable yielding ``(inputs, labels)`` batches, where
            ``labels`` is one-hot with shape (batch, num_classes).
        device: device the batches are moved to.

    Returns:
        Accuracy as a float in [0, 100]; 0.0 when the loader yields nothing.

    The model is left in eval mode, so callers that continue training must
    call ``model.train()`` again.
    """
    correct = 0
    total = 0

    model.eval()

    # Evaluation needs no gradients; skipping them saves memory and time
    with torch.no_grad():
        for inputs,labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            _,predicted = torch.max(model(inputs),1)
            # One-hot rows -> class indices, compared for the whole batch at once
            truth = labels.argmax(dim=1)

            correct += (predicted==truth).sum().item()
            total += labels.size(0)

    accuracy = (correct*100)/total if total else 0.0
    print("The accuracy of the model : "+str(accuracy))
    return accuracy

# Training settings
EPOCHS, EPSILON, DEBUG = 20, 1e-4, True

# Use CUDA when it is available, otherwise fall back to the CPU
thisDevice = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(thisDevice)
model = model.to(device)

# Cross-entropy loss and Adam over the model's parameters
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print(f"The device in use is : {thisDevice}")

# Training the model
def trainModel(model,dataloader,loss_fn,optimizer,EPOCHS,EPSILON,device,valloader=None):
    """Train ``model`` for ``EPOCHS`` passes over ``dataloader``.

    Args:
        model: network to optimise; it is trained in place.
        dataloader: iterable of ``(inputs, labels)`` batches; labels are cast
            to float before being passed to ``loss_fn``.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        optimizer: optimizer built over ``model``'s parameters.
        EPOCHS: number of training epochs.
        EPSILON: unused; kept for interface compatibility.
        device: device the batches are moved to.
        valloader: loader evaluated after every epoch when the notebook-level
            ``DEBUG`` flag is set. Defaults to the notebook-level ``testloader``.

    Returns:
        The trained model, left in training mode.
    """
    model.train()

    for epoch in range(EPOCHS):
        running_loss = 0.0

        for inputs,labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device).float()

            optimizer.zero_grad()
            loss = loss_fn(model(inputs),labels)

            loss.backward()
            optimizer.step()

            # .item() detaches the value; summing the tensor itself would
            # keep autograd history alive for the whole epoch
            running_loss += loss.item()

        epoch_loss = running_loss/len(dataloader)

        if DEBUG:
            print("Epoch : "+str(epoch)+", Loss ==> "+str(epoch_loss))
            print("Testing Accuracy ==>")
            calAccuracy(model,testloader if valloader is None else valloader,device)
            # calAccuracy switches to eval mode; go back to training mode
            model.train()

    return model

# Train the RNN and persist the final weights
model = trainModel(model,trainloader,loss_fn,optimizer,EPOCHS,EPSILON,device)
PATH = "/kaggle/working/RNNModel.pth"
torch.save(model.state_dict(),PATH)

print("Training Accuracy ==>")
calAccuracy(model, trainloader,device)
print("Testing Accuracy ==>")
calAccuracy(model,testloader,device)

# Fresh copy of the non-competition test split for writing predictions
noncompDataset = bookDataset("non_comp_test_x.csv","non_comp_test_y.csv",dataset_dir_path)
noncompDataset.x = indexWords(noncompDataset.x, vocab)
# shuffle must stay off here: outputToFile numbers predictions by the order
# in which samples arrive, so a shuffled loader would pair every Id with the
# prediction of some other title.
noncomploader = DataLoader(noncompDataset,batch_size = 1000,shuffle = False, collate_fn=collate_fn)

# printing the output dataframe to csv file
def outputToFile(model,dataloader,device,outFileName):
    """Write the model's predicted class for every sample to a CSV file.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to per-class scores.
        dataloader: iterable of ``(inputs, labels)`` batches; labels are
            ignored. It must not shuffle, because the ``Id`` column is simply
            the running position of each sample.
        device: device the inputs are moved to.
        outFileName: destination CSV path; columns are ``Id`` and ``Genre``.

    Inference runs in eval mode without gradient tracking; the model's
    previous train/eval mode is restored afterwards, even on error.
    """
    outputs = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs,_ in dataloader:
                _,predicted = torch.max(model(inputs.to(device)),1)
                outputs.extend(predicted.tolist())
    finally:
        model.train(was_training)

    df = pd.DataFrame({"Id": list(range(len(outputs))), "Genre": outputs})
    df.to_csv(outFileName,index=False)

# Write the RNN's predictions for noncomploader to CSV; Ids are assigned in
# the order the loader yields samples
outputToFile(model,noncomploader,device,"/kaggle/working/non_comp_test_pred_y2.csv")

In [2]:
# Importing libraries
import numpy as np
import pandas as pd
import os
import sys
import torch
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
from transformers import EarlyStoppingCallback

# Root folder of the competition data
inputPath = "/kaggle/input/col774-2022/"

# Seed every random number generator in use so that runs are repeatable
SEED = 661077
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Tokenization and model settings
MAX_LENGTH, NUM_LABELS = 60, 30
modelName = "bert-large-uncased"

# Optimization settings
EPOCHS, EPSILON, DEBUG = 10, 1e-4, True
LR = 2.5*0.00001
MOMENTUM = 0.9

# Use CUDA when it is available, otherwise fall back to the CPU
thisDevice = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(thisDevice)
print(f"The device in use is : {thisDevice}")


# Initializing the pretrained BERT classifier and its tokenizer
tokenizer = BertTokenizer.from_pretrained(modelName)
model = BertForSequenceClassification.from_pretrained(modelName, num_labels=NUM_LABELS)


# Function for data class
class bookDataset(Dataset):
    """Tokenizer-encoded book titles with optional integer labels.

    Args:
        path_to_x: CSV whose column 2 holds the titles. Joined onto
            ``dataDir`` with ``os.path.join``.
        path_to_y: CSV whose column 1 holds the labels, joined onto
            ``dataDir``; pass ``""`` (or ``None``) for unlabeled data.
        dataDir: base directory for the CSV files.
        tokenizer: callable applied to the list of titles with padding,
            truncation and ``max_length=MAX_LENGTH`` (notebook-level constant);
            its result must support ``.items()`` and ``["input_ids"]``.
    """

    def __init__(self,path_to_x,path_to_y,dataDir,tokenizer):

        self.temp_titles = list(pd.read_csv(os.path.join(dataDir,path_to_x)).iloc[:,2].values)
        self.titles = tokenizer(self.temp_titles,padding = True,truncation = True,max_length = MAX_LENGTH)

        self.labels = None
        if path_to_y:
            self.labels = list(pd.read_csv(os.path.join(dataDir,path_to_y)).iloc[:,1].values)

    def __len__(self):
        """Number of encoded titles."""
        return len(self.titles["input_ids"])

    def __getitem__(self,idx):
        """Return a dict of tensors for sample ``idx``.

        Contains one tensor per tokenizer field, plus ``"labels"`` when the
        dataset was built with labels.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.titles.items()}
        # Identity test: `!= None` is evaluated element-wise by array-like
        # containers and then cannot be used as a condition
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

# Encoded datasets for the training and non-competition test splits
trainDataset = bookDataset("train_x.csv","train_y.csv",inputPath,tokenizer)
testDataset = bookDataset("non_comp_test_x.csv","non_comp_test_y.csv",inputPath,tokenizer)
# Mini-batch loaders; both reshuffle on every pass (the test loader too)
trainloader = DataLoader(trainDataset,batch_size = 64,shuffle = True)
testloader = DataLoader(testDataset,batch_size = 64,shuffle = True)
# loss_fn is handed to trainModel below, which takes the loss from the
# model's own output instead of calling it
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)


# Move the classifier to the selected device
model = model.to(device)

# Testing on the test data
def calAccuracy(model,dataloader,device):
    """Compute, print and return the classification accuracy in percent.

    Args:
        model: ``nn.Module`` called as ``model(input_ids, attention_mask=...)``
            whose result exposes the class scores under ``["logits"]``.
        dataloader: iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        device: device the batches are moved to.

    Returns:
        Accuracy as a float in [0, 100]; 0.0 when the loader yields nothing.

    The model is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, even on error. Gradient
    tracking is disabled with a context manager so that the global autograd
    state cannot stay switched off if a batch raises.
    """
    correct = 0
    total = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for data in dataloader:
                input_ids = data["input_ids"].to(device)
                attention_mask = data["attention_mask"].to(device)
                labels = data["labels"].to(device)

                outputs = model(input_ids,attention_mask = attention_mask)
                # softmax is monotonic, so the argmax of the raw logits is the
                # same class; this also avoids softmax's implicit-dim warning
                predicted = outputs["logits"].argmax(dim=1)
                correct += (predicted==labels).sum().item()
                total += input_ids.size(0)
    finally:
        model.train(was_training)

    # Guard against an empty loader instead of dividing by zero
    finalAcc = (correct*100)/total if total else 0.0
    print("The accuracy of the model : "+str(finalAcc))
    return finalAcc


# training model function
def trainModel(model,dataloader,loss_fn,optimizer,EPOCHS,EPSILON,device,PATH,valloader=None):
    """Fine-tune ``model`` and checkpoint the weights with the best validation accuracy.

    Args:
        model: classifier called as
            ``model(input_ids, attention_mask=..., labels=...)``; element 0 of
            its output is used as the loss.
        dataloader: iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        loss_fn: unused (the model returns its own loss); kept for interface
            compatibility.
        optimizer: optimizer built over ``model``'s parameters.
        EPOCHS: number of passes over ``dataloader``.
        EPSILON: unused; kept for interface compatibility (the loss-based
            early stop it belonged to is disabled).
        device: device the batches are moved to.
        PATH: file the best ``state_dict`` is written to.
        valloader: loader used for the per-epoch accuracy check. Defaults to
            the notebook-level ``testloader``.

    Returns:
        The trained model (its final weights, not necessarily the best ones).

    Relies on the notebook-level ``DEBUG`` flag and ``calAccuracy`` function.
    """
    if valloader is None:
        valloader = testloader

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint; otherwise a run stuck at 0% would leave PATH missing.
    max_valAcc = float("-inf")

    for epoch in range(EPOCHS):
        # Models returned by from_pretrained start in eval mode; training mode
        # must be enabled explicitly or dropout stays off during fine-tuning.
        model.train()
        running_loss = 0.0

        for idx,data in enumerate(dataloader):
            if DEBUG and idx%100==0:
                print("Iteration : "+str(idx))

            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            labels = data["labels"].to(device)

            optimizer.zero_grad()
            output = model(input_ids,attention_mask = attention_mask,labels=labels)
            loss = output[0]
            loss.backward()
            optimizer.step()

            # .item() detaches the value; summing the tensor itself would
            # keep autograd history alive for the whole epoch
            running_loss += loss.item()

        epoch_loss = running_loss/len(dataloader)

        print("Epoch : "+str(epoch)+", Loss ==> "+str(epoch_loss))
        print("Testing Accuracy ==>")

        # Validate with dropout disabled
        model.eval()
        this_valAcc = calAccuracy(model,valloader,device)

        # Save the model only if validation accuracy is greater than previous max
        if this_valAcc>max_valAcc:
            max_valAcc = this_valAcc
            torch.save(model.state_dict(),PATH)
            print("Model saved")

    return model

# Fine-tune BERT; trainModel writes the best-scoring weights to PATH
PATH = "/kaggle/working/Transformers.pth"
trainModel(model,trainloader,loss_fn,optimizer,EPOCHS,EPSILON,device,PATH)


# Rebuild the classifier from the pretrained checkpoint and overwrite its
# weights with the state_dict saved during training
model = BertForSequenceClassification.from_pretrained(modelName, num_labels=NUM_LABELS)
model.load_state_dict(torch.load(PATH))
model.to(device)

# print("Training Accuracy ==>"+str(calAccuracy(model,trainloader,device)))
# print("Testing Accuracy ==>"+str(calAccuracy(model,testloader,device)))


# Unlabeled competition titles (empty label path => no labels are loaded).
# The first argument is an absolute path, so os.path.join inside bookDataset
# discards inputPath and the file is read from /kaggle/working.
compDataset = bookDataset("/kaggle/working/comp_test_x.csv","",inputPath,tokenizer)
# shuffle=False keeps predictions in file order for outputToFile
comploader = DataLoader(compDataset,batch_size = 64,shuffle = False)

# printing the output dataframe to csv file
def outputToFile(model,dataloader,device,outFileName):
    """Write the model's predicted class for every sample to a CSV file.

    Args:
        model: ``nn.Module`` called as ``model(input_ids, attention_mask=...)``
            whose result exposes the class scores under ``["logits"]``.
        dataloader: iterable of dict batches with ``"input_ids"`` and
            ``"attention_mask"`` tensors. It must not shuffle, because the
            ``Id`` column is simply the running position of each sample.
        device: device the inputs are moved to.
        outFileName: destination CSV path; columns are ``Id`` and ``Genre``.

    Inference runs in eval mode without gradient tracking; the model's
    previous train/eval mode is restored afterwards, even on error.
    """
    outputs = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for data in dataloader:
                input_ids = data["input_ids"].to(device)
                attention_mask = data["attention_mask"].to(device)
                model_outputs = model(input_ids,attention_mask = attention_mask)
                # softmax is monotonic, so the argmax of the raw logits is the
                # same class; this also avoids softmax's implicit-dim warning
                predicted = model_outputs["logits"].argmax(dim=1)
                outputs.extend(predicted.tolist())
    finally:
        model.train(was_training)

    df = pd.DataFrame({"Id": list(range(len(outputs))), "Genre": outputs})
    df.to_csv(outFileName,index=False)

# Write the reloaded BERT model's predictions for the unshuffled comploader to CSV
outputToFile(model,comploader,device,"/kaggle/working/comp_test_y.csv")

The device in use is : cuda:0


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Iteration : 0
Iteration : 100


KeyboardInterrupt: 