In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Importing libraries
import numpy as np
import pandas as pd
import os
import torch
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModel, AutoConfig
from torch.utils.data import DataLoader
from transformers import EarlyStoppingCallback
from PIL import Image
import torchvision
import torchvision.transforms as transforms
from transformers import ViTFeatureExtractor
from transformers import ViTForImageClassification
import tqdm 

SEED = 661077
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

MAX_LENGTH = 100
NUM_LABELS = 30
modelName = "bert-base-uncased"
cnn_modelName = 'google/vit-base-patch16-224-in21k'

BERT_DIMENSION = 1024 if "large" in modelName else 768
EPOCHS = 10
EPSILON = 1e-4
DEBUG = True 
LR = 2.6*0.00001
MOMENTUM = 0.9

BATCH_SIZE = 64
gradAcc = 1

thisDevice = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(thisDevice)
print("The device in use is : "+thisDevice)

PATH = "/kaggle/working/Transformers+CNN.pth"
load_saved = False

The device in use is : cuda:0


In [3]:
# inintializing pretrained bert and its tokenizer
tokenizer = AutoTokenizer.from_pretrained(modelName)
feature_extractor = ViTFeatureExtractor.from_pretrained(cnn_modelName)

# Function for data class
class bookDataset(Dataset):
    def __init__(self,path_to_x,path_to_y,dataDir,tokenizer):
        
        self.images = pd.read_csv(os.path.join(dataDir,path_to_x))
        self.img_dir = os.path.join(dataDir,"images","images")
        
        self.temp_titles = list(pd.read_csv(os.path.join(dataDir,path_to_x)).iloc[:,2].values)
        self.titles = tokenizer(self.temp_titles,padding = True,truncation = True,max_length = MAX_LENGTH)
        
        self.labels = None
        if path_to_y!="":
            self.labels = list(pd.read_csv(os.path.join(dataDir,path_to_y)).iloc[:,1].values)
    
    def __len__(self):
        return len(self.titles["input_ids"])
    
    def __getitem__(self,idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.titles.items()}
        if self.labels!=None:
            item["labels"] = torch.tensor(self.labels[idx])
        
        img_path = os.path.join(self.img_dir,self.images.iloc[idx,1])
        thisImg = Image.open(img_path)
        thisTransform = transforms.Compose([
                        transforms.Resize(256),
                        transforms.CenterCrop(224),
                        transforms.ToTensor(),
                        transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                                             std=[0.229, 0.224, 0.225])])
        item["img"] = thisTransform(thisImg)
        
        return item

trainDataset = bookDataset("train_x.csv","train_y.csv","/kaggle/input/col774-2022/",tokenizer)
testDataset = bookDataset("non_comp_test_x.csv","non_comp_test_y.csv","/kaggle/input/col774-2022/",tokenizer)
trainloader = DataLoader(trainDataset,batch_size = BATCH_SIZE,shuffle = True)
testloader = DataLoader(testDataset,batch_size = BATCH_SIZE,shuffle = True)

In [4]:
# model class
class MultiModel(torch.nn.Module):
    def __init__(self):
        super(MultiModel,self).__init__()
        
        self.config = AutoConfig.from_pretrained(modelName)
        self.bert = AutoModel.from_pretrained(modelName, config = self.config)
        
        self.cnn = torchvision.models.efficientnet_b5(pretrained = True)
        
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(1000+BERT_DIMENSION,30))
        
    def forward(self,input_ids,attention_mask,imgs):
        pooler_output = self.bert(input_ids,attention_mask = attention_mask)[1]
        convOut = self.cnn(imgs)
        
        fcIn = torch.cat((pooler_output,convOut),dim = 1)
        fcOut = self.fc(fcIn)
        return fcOut

model = MultiModel()
if load_saved:
    print("Loading from pretrained model")
    model.load_state_dict(torch.load(PATH))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: "https://download.pytorch.org/models/efficientnet_b5_lukemelas-b6417697.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b5_lukemelas-b

  0%|          | 0.00/117M [00:00<?, ?B/s]

In [5]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

model = model.to(device)

# Testing on the test data
def calAccuracy(model,dataloader,device):
    correct = 0
    total = 0
    
    torch.set_grad_enabled(False)
    
    for data in dataloader:
        input_ids = data["input_ids"].to(device)
        attention_mask = data["attention_mask"].to(device)
        labels = data["labels"].to(device)
        imgs = data["img"].to(device)
        
        total += input_ids.size(0)
        
        outputs = model(input_ids,attention_mask,imgs)
        predicted = torch.nn.functional.softmax(outputs).argmax(1)
        correct += (predicted==labels).sum().item()
        
    torch.set_grad_enabled(True)
    finalAcc = (correct*100)/total
    print("The accuracy of the model : "+str(finalAcc))
    return finalAcc

In [6]:
# training model function
def trainModel(model,dataloader,loss_fn,optimizer,EPOCHS,EPSILON,device,PATH):
    
    last_loss = (np.inf)/2
    max_valAcc = 0
    
    optimizer.zero_grad()
    for epoch in range(EPOCHS):
        this_loss = 0.0

        for idx,data in enumerate((dataloader)):
            if(DEBUG and idx%100==0): print("Iteration : "+str(idx))
            
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            imgs = data["img"].to(device)
            labels = data["labels"].to(device)
            
            output = model(input_ids,attention_mask,imgs)
            loss = loss_fn(output,labels)
            
            (loss/gradAcc).backward()
            
            if (idx+1)%gradAcc == 0:
                optimizer.step()
                optimizer.zero_grad()

            this_loss += loss
        
        this_loss = this_loss/len(dataloader)
        #if(abs(this_loss-last_loss)<EPSILON): break
        last_loss = this_loss
        
        print("Epoch : "+str(epoch)+", Loss ==> "+str(last_loss))
        print("Testing Accuracy ==>")
        
        this_valAcc = calAccuracy(model,testloader,device)
        if(this_valAcc>max_valAcc):
            max_valAcc = this_valAcc
            torch.save(model.state_dict(),PATH)
            print("Model saved")
        
    return model

In [7]:
# Running the loop
trainModel(model,trainloader,loss_fn,optimizer,EPOCHS,EPSILON,device,PATH)

Iteration : 0


RuntimeError: CUDA out of memory. Tried to allocate 184.00 MiB (GPU 0; 14.76 GiB total capacity; 13.29 GiB already allocated; 133.75 MiB free; 13.46 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# reloading the last saved(best) model
model = MultiModel()
model.load_state_dict(torch.load(PATH))
model.to(device)


#print("Training Accuracy ==>"+str(calAccuracy(model,trainloader,device)))
#print("Testing Accuracy ==>"+str(calAccuracy(model,testloader,device)))

In [None]:
compDataset = bookDataset("comp_test_x.csv","","/kaggle/input/col774-2022/",tokenizer)
comploader = DataLoader(compDataset,batch_size = BATCH_SIZE,shuffle = False)

# printing the output dataframe to csv file
def outputToFile(model,dataloader,device,outFileName):
    torch.set_grad_enabled(False)
    
    counter = 0
    Ids = []
    outputs = []
    
    for data in dataloader:
        input_ids = data["input_ids"].to(device)
        attention_mask = data["attention_mask"].to(device)
        imgs = data["img"].to(device)
        model_outputs = model(input_ids,attention_mask,imgs)
        predicted = torch.nn.functional.softmax(model_outputs).argmax(1)
        
        for pred in predicted:
            Ids.append(counter)
            outputs.append(int(pred))
            counter += 1
        
    torch.set_grad_enabled(True)
    df = pd.DataFrame(list(zip(Ids,outputs)),columns = ["Id","Genre"])
    df.to_csv(outFileName,index=False)

outputToFile(model,comploader,device,"/kaggle/working/efficientNet.csv")