In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hugging-face-vqa-small/data-00004-of-00136.arrow
/kaggle/input/hugging-face-vqa-small/data-00002-of-00136.arrow
/kaggle/input/hugging-face-vqa-small/data-00005-of-00136.arrow
/kaggle/input/hugging-face-vqa-small/data-00000-of-00136.arrow
/kaggle/input/hugging-face-vqa-small/data-00003-of-00136.arrow


**Data Loading**

In [2]:
import torch
from torch import nn
from transformers import AutoImageProcessor, AutoTokenizer, AutoModel, AutoConfig

2024-05-18 07:09:06.228766: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-18 07:09:06.228877: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-18 07:09:06.354547: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [4]:
from datasets import load_dataset, load_from_disk, Dataset
dataset = load_dataset("/kaggle/input/hugging-face-vqa-small")

Generating train split: 0 examples [00:00, ? examples/s]

**Model Definition:**

In [5]:
class CustomDataset(Dataset):
    def __init__(self, data_source, one_hot_encoder,transform=None):
        self.data_source = data_source
        self.one_hot_encoder = one_hot_encoder
        self.transform = transform
                
    def __len__(self):
        return len(self.data_source)

    def __getitem__(self, idx):
        item = self.data_source[idx]
#         print(type(item))
#         print(len(item))
#         print(item)
        image = item["image"]
        if self.transform:
            image = self.transform(image)
#         image.resize((224, 224))
        question = item["question"]
        label = item["multiple_choice_answer"]
        
        # Transform the label using the fitted one-hot encoder
        label_encoded = self.one_hot_encoder.transform(np.array(label).reshape(-1, 1))
        # Convert label_encoded to a PyTorch tensor
        label_tensor = torch.tensor(label_encoded, dtype=torch.float32)        
        return image, question, label_tensor

In [6]:
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder(sparse=False,max_categories=300)
one_hot_encoded_fit = one_hot_encoder.fit(np.array(dataset["train"]["multiple_choice_answer"]).reshape(-1, 1))
temp = one_hot_encoded_fit.transform(np.array(dataset["train"][3]["multiple_choice_answer"]).reshape(-1, 1))
# temp



In [7]:
from torchvision import transforms
from torchvision.transforms import v2

# transform = v2.Compose([
#     v2.Resize((224, 244), antialias=True),
# ])

transform = transforms.Compose([
#     transforms.ToPILImage(),
    transforms.Resize((256, 256)),
    transforms.ToTensor()
])
train_dataset = CustomDataset(dataset["train"],one_hot_encoded_fit,transform)

In [8]:
from transformers import BertConfig, BertModel, BertTokenizer

class VQAModel(nn.Module):
    def __init__(self, config, image_transformer="facebook/dinov2-base", text_transformer="google-bert/bert-base-uncased", output_size=300):
        super().__init__()
        
        # For image encoding
        self.image_processor = AutoImageProcessor.from_pretrained(image_transformer)
        self.image_model = AutoModel.from_pretrained(image_transformer).to(device)
        
        self.text_model_config = AutoConfig.from_pretrained("google-bert/bert-base-uncased", is_decoder=True,add_cross_attention=True)
        
        # For text encoding
        self.text_processor = AutoTokenizer.from_pretrained(text_transformer)
        self.text_model = AutoModel.from_config(self.text_model_config).to(device)
        
        # Freeze the parameters of the transformer models
        # As unable to train model with transformer weights
        for param in self.image_model.parameters():
            param.requires_grad = False
        for param in self.text_model.parameters():
            param.requires_grad = False

        # Concatenate the output of image and text and input to linear layer
        self.fc1 = nn.Linear(self.text_model.config.hidden_size, 2048)
        self.act_fc1 = nn.ReLU()
        
        self.fc2 = nn.Linear(2048, 1024)
        self.act_fc2 = nn.ReLU()

        self.output_logits = nn.Linear(1024, output_size)
        self.logsoftmax = nn.LogSoftmax(dim=1)

            
    def forward(self, image, text):
        # Image encoding 
        # pt for PyTorch tensor
        image_token = self.image_processor(image, return_tensors="pt").to(device)
        image_output = self.image_model(**image_token)
        last_hidden_states_image = image_output.last_hidden_state
    
        # Text encoding
        text_token = self.text_processor(text, return_tensors="pt").to(device)
        text_output = self.text_model(**text_token, encoder_hidden_states=last_hidden_states_image)
        pooler_outputs_text = text_output.pooler_output
        
        #input_to_linear = torch.cat((last_hidden_states_image, pooler_outputs_text), dim=1)
        
        x = self.act_fc1(self.fc1(pooler_outputs_text))
        x = self.act_fc2(self.fc2(x))        
        x = self.logsoftmax(self.output_logits(x))
        
        return x   
    
config = BertConfig(is_decoder="true")
model = VQAModel(config=config).to(device)

preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

**Training Loop**

In [9]:
from torch.utils.data import DataLoader
import torch.optim as optim
from datetime import datetime

# Hyperparameters
batch_size = 16
num_epochs = 5
learning_rate = 1e-2

# Create DataLoader for the training dataset
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)





def Train(model,train_dataset,n_epochs=5,loss_fn=nn.CrossEntropyLoss(),optimizer=optim.SGD(model.parameters(), lr=0.001, momentum=0.9)):
    epoch_loss = []
    epoch_no=[]
    no_image_seen=[]
    loss_batch = []
    epoch_batch = []
    # Training loop
    # for epoch in range(num_epochs):
    for epoch in range(n_epochs):
        model.train()
        running_loss = 0.0
        loss_add=0.0
        print(f"epochs  {epoch}")
        start_epoch = datetime.now()
        for i in range(0,len(train_dataset)):
            images, questions, labels = train_dataset[i]
            labels = labels.to(device)
            # ignore single channel image
            if(images.shape[0]!=1):
                # Forward pass
                outputs = model(images, questions)
                loss = criterion(outputs, labels)
                loss_add+=loss
                #back propogate for every 100 inputs loss 
                if(i%200==0):
                    loss_add=loss_add/200 # average loss
                    print(f"epoch= {epoch}, i= {i} loss_add= {loss_add}")        
                    # Backward pass and optimization
                    optimizer.zero_grad()
                    loss_add.backward()
                    optimizer.step()
                    running_loss+=loss_add
                    no_image_seen.append(i)
                    loss_batch.append(loss_add)
                    epoch_batch.append(epoch)
                    loss_add=0
                    
                    
        epoch_loss.append(running_loss)
        epoch_no.append(epoch)
        end_epoch = datetime.now()
        td = (end_epoch - start_epoch).total_seconds()
        print(f"The time of execution of epoch {epoch} : {td:.03f}s")
            
    print('Training complete')
    
    return epoch_loss ,epoch_no,no_image_seen,loss_batch,epoch_batch

In [10]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )
    
print_trainable_parameters(model)

trainable params: 3980588 || all params: 228410156 || trainable%: 1.74


In [11]:
n_epochs=5
loss_fn=nn.CrossEntropyLoss()
optimizer=optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

start = datetime.now()
epoch_loss ,epoch_no,no_image_seen,loss_batch,epoch_batch = Train(model,train_dataset,n_epochs,loss_fn,optimizer)
end = datetime.now()

It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


epochs  0
epoch= 0, i= 0 loss_add= 0.028228122740983963
epoch= 0, i= 200 loss_add= 5.6877923011779785
epoch= 0, i= 400 loss_add= 5.6039347648620605
epoch= 0, i= 600 loss_add= 5.689967632293701
epoch= 0, i= 800 loss_add= 5.678378582000732
epoch= 0, i= 1000 loss_add= 4.772676944732666
epoch= 0, i= 1200 loss_add= 5.6781229972839355
epoch= 0, i= 1400 loss_add= 5.663033485412598
epoch= 0, i= 1600 loss_add= 5.666836261749268
epoch= 0, i= 1800 loss_add= 5.651191234588623
epoch= 0, i= 2000 loss_add= 5.649023532867432
epoch= 0, i= 2200 loss_add= 5.646534442901611
epoch= 0, i= 2400 loss_add= 5.625734329223633
epoch= 0, i= 2600 loss_add= 5.619443416595459
epoch= 0, i= 2800 loss_add= 5.611923694610596
epoch= 0, i= 3000 loss_add= 5.2950663566589355
epoch= 0, i= 3200 loss_add= 5.599128723144531
epoch= 0, i= 3400 loss_add= 5.576496601104736
epoch= 0, i= 3600 loss_add= 5.5655975341796875
epoch= 0, i= 3800 loss_add= 5.487366676330566
epoch= 0, i= 4000 loss_add= 5.556793212890625
epoch= 0, i= 4200 loss_

**Model Saving**

In [12]:
torch.save(model.state_dict(), 'd_b_concat_cross.pth')

In [13]:
model.save_pretrained("/kaggle/working/dbconcatcross")

AttributeError: 'VQAModel' object has no attribute 'save_pretrained'