In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hugging-face-vqa-small/data-00004-of-00136.arrow
/kaggle/input/hugging-face-vqa-small/data-00002-of-00136.arrow
/kaggle/input/hugging-face-vqa-small/data-00005-of-00136.arrow
/kaggle/input/hugging-face-vqa-small/data-00000-of-00136.arrow
/kaggle/input/hugging-face-vqa-small/data-00003-of-00136.arrow


**Data Loading**

In [3]:
import torch
from torch import nn
from transformers import AutoImageProcessor, AutoTokenizer, AutoModel

2024-05-18 07:09:30.760120: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-18 07:09:30.760223: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-18 07:09:30.893097: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [5]:
from datasets import load_dataset, load_from_disk, Dataset
dataset = load_dataset("/kaggle/input/hugging-face-vqa-small")

Generating train split: 0 examples [00:00, ? examples/s]

**Custom Dataset**

In [6]:
class CustomDataset(Dataset):
    def __init__(self, data_source, one_hot_encoder,transform=None):
        self.data_source = data_source
        self.one_hot_encoder = one_hot_encoder
        self.transform = transform
                
    def __len__(self):
        return len(self.data_source)

    def __getitem__(self, idx):
        item = self.data_source[idx]
#         print(type(item))
#         print(len(item))
#         print(item)
        image = item["image"]
        if self.transform:
            image = self.transform(image)
#         image.resize((224, 224))
        question = item["question"]
        label = item["multiple_choice_answer"]
        
        # Transform the label using the fitted one-hot encoder
        label_encoded = self.one_hot_encoder.transform(np.array(label).reshape(-1, 1))
        # Convert label_encoded to a PyTorch tensor
        label_tensor = torch.tensor(label_encoded, dtype=torch.float32)        
        return image, question, label_tensor

In [7]:
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder(sparse=False,max_categories=300)
one_hot_encoded_fit = one_hot_encoder.fit(np.array(dataset["train"]["multiple_choice_answer"]).reshape(-1, 1))
temp = one_hot_encoded_fit.transform(np.array(dataset["train"][3]["multiple_choice_answer"]).reshape(-1, 1))
# temp



In [8]:
from torchvision import transforms
from torchvision.transforms import v2

# transform = v2.Compose([
#     v2.Resize((224, 244), antialias=True),
# ])

transform = transforms.Compose([
#     transforms.ToPILImage(),
    transforms.Resize((256, 256)),
    transforms.ToTensor()
])
train_dataset = CustomDataset(dataset["train"],one_hot_encoded_fit,transform)

**Define Model**

In [9]:
class VQAModel(nn.Module):
    def __init__(self, image_transformer="facebook/dinov2-base",text_transformer="google-bert/bert-base-uncased", output_size=300):
        super().__init__()
        
        #for image encoding
        self.image_processor = AutoImageProcessor.from_pretrained(image_transformer)
        self.image_model = AutoModel.from_pretrained(image_transformer).to(device)

        #for text encoding
        self.text_processor = AutoTokenizer.from_pretrained(text_transformer)
        self.text_model = AutoModel.from_pretrained(text_transformer).to(device)
        
        # freeze the parameters of the transformer models
        # As unabel to train model with transformer weights
        for param in self.image_model.parameters():
            param.requires_grad = False
        for param in self.text_model.parameters():
            param.requires_grad = False

        # concat the output of image and text and input to linear layer
        self.fc1 = nn.Linear(self.image_model.config.hidden_size+self.text_model.config.hidden_size, 2048)
        self.act_fc1 = nn.ReLU()
        
        self.fc2 = nn.Linear(2048, 1024)
        self.act_fc2 = nn.ReLU()
        

        self.output_logits = nn.Linear(1024, output_size)
        self.logsoftmax = nn.LogSoftmax(dim=1)

            
    def forward(self, image,text):
        # image encoding 
        #pt for pytorch tensor
        image_token = self.image_processor(image, return_tensors="pt").to(device)
        image_output = self.image_model(**image_token)
#         last_hidden_states_image = image_output.last_hidden_state
        pooler_outputs_image = image_output.pooler_output
    
        #Text encoding
        text_token = self.text_processor(text, return_tensors="pt").to(device)
        text_output = self.text_model(**text_token)
        pooler_outputs_text = text_output.pooler_output
        
        input_to_linear = torch.cat((pooler_outputs_image, pooler_outputs_text), dim=1)
        
        x = self.act_fc1(self.fc1(input_to_linear))
        x = self.act_fc2(self.fc2(x))        
        x = self.logsoftmax(self.output_logits(x))
        
        return x   
    

model = VQAModel()

preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [10]:
# to see execution time of each cell
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.3.2
time: 292 µs (started: 2024-05-18 07:10:34 +00:00)


**Training Setup**

In [11]:
from torch.utils.data import DataLoader
import torch.optim as optim
from datetime import datetime

# Hyperparameters
batch_size = 16
num_epochs = 5
learning_rate = 1e-2

# Create DataLoader for the training dataset
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

def Train(model, train_dataset, n_epochs=5, loss_fn=nn.CrossEntropyLoss(), optimizer=optim.SGD(model.parameters(), lr=0.001, momentum=0.9)):
    epoch_loss = []
    epoch_no = []
    no_image_seen = []
    loss_batch = []
    epoch_batch = []
    
    # Training loop
    for epoch in range(n_epochs):
        model.train()
        running_loss = 0.0
        loss_add = 0.0
        all_labels = []
        all_predictions = []
        
        print(f"Epoch {epoch + 1}/{n_epochs}")
        start_epoch = datetime.now()
        
        for i in range(len(train_dataset)):
            images, questions, labels = train_dataset[i]
            images, labels = images.to(device), labels.to(device)
            
            # Ignore single channel image
            if images.shape[0] != 1:
                # Forward pass
                with torch.cuda.amp.autocast():
                    outputs = model(images, questions)
                    loss = criterion(outputs, labels)
                
                loss_add += loss
                _, predicted = torch.max(outputs, 1)
                
                all_labels.append(labels.cpu().numpy())
                all_predictions.append(predicted.cpu().numpy())
                
                # Back propagate for every 200 inputs loss
                if (i + 1) % 200 == 0:
                    loss_add = loss_add / 200  # average loss
                    print(f"Epoch {epoch + 1}, Batch {i + 1}, Average Loss: {loss_add:.4f}")
                    
                    # Backward pass and optimization
                    optimizer.zero_grad()
                    loss_add.backward()
                    optimizer.step()
                    
                    running_loss += loss_add.item()
                    no_image_seen.append(i)
                    loss_batch.append(loss_add.item())
                    epoch_batch.append(epoch)
                    loss_add = 0.0
        
        epoch_loss.append(running_loss / len(train_dataset))
        epoch_no.append(epoch)
        
        end_epoch = datetime.now()
        td = (end_epoch - start_epoch).total_seconds()
        print(f"Time of execution of epoch {epoch + 1}: {td:.03f}s")
        
        # Calculate accuracy
        all_labels = np.concatenate(all_labels)
        all_predictions = np.concatenate(all_predictions)
        accuracy = accuracy_score(all_labels, all_predictions)
        print(f"Training Loss: {running_loss / len(train_dataset):.4f}, Training Accuracy: {accuracy:.4f}")
    
    print('Training complete')
    
    return epoch_loss, epoch_no, no_image_seen, loss_batch, epoch_batch



# def Train(model,train_dataset,n_epochs=5,loss_fn=nn.CrossEntropyLoss(),optimizer=optim.SGD(model.parameters(), lr=0.001, momentum=0.9)):
#     epoch_loss = []
#     epoch_no=[]
#     no_image_seen=[]
#     loss_batch = []
#     epoch_batch = []
#     # Training loop
#     # for epoch in range(num_epochs):
#     for epoch in range(n_epochs):
#         model.train()
#         running_loss = 0.0
#         loss_add=0.0
#         print(f"epochs  {epoch}")
#         start_epoch = datetime.now()
#         for i in range(0,len(train_dataset)):
#             images, questions, labels = train_dataset[i]
#             labels = labels.to(device)
#             # ignore single channel image
#             if(images.shape[0]!=1):
#                 # Forward pass
#                 outputs = model(images, questions)
#                 loss = criterion(outputs, labels)
#                 loss_add+=loss
#                 #back propogate for every 100 inputs loss 
#                 if(i%200==0):
#                     loss_add=loss_add/200 # average loss
#                     print(f"epoch= {epoch}, i= {i} loss_add= {loss_add}")        
#                     # Backward pass and optimization
#                     optimizer.zero_grad()
#                     loss_add.backward()
#                     optimizer.step()
#                     running_loss+=loss_add
#                     no_image_seen.append(i)
#                     loss_batch.append(loss_add)
#                     epoch_batch.append(epoch)
#                     loss_add=0
                    
                    
#         epoch_loss.append(running_loss)
#         epoch_no.append(epoch)
#         end_epoch = datetime.now()
#         td = (end_epoch - start_epoch).total_seconds()
#         print(f"The time of execution of epoch {epoch} : {td:.03f}s")
            
#     print('Training complete')
    
#     return epoch_loss ,epoch_no,no_image_seen,loss_batch,epoch_batch

time: 12.3 ms (started: 2024-05-18 07:10:34 +00:00)


In [12]:
!pip install peft

Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.11.1
time: 14 s (started: 2024-05-18 07:10:34 +00:00)


**Print_trainable_parameters**

In [13]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

time: 986 µs (started: 2024-05-18 07:10:48 +00:00)


In [14]:
#number of parameter before lora
print_trainable_parameters(model)

trainable params: 5553452 || all params: 201616172 || trainable%: 2.75
time: 11.6 ms (started: 2024-05-18 07:10:48 +00:00)


In [15]:
# print([(n, type(m)) for n, m in VQAModel().named_modules()])
# for n, m in VQAModel().named_modules():
#     print(f"n = {n}  m= {m}")

time: 7.35 ms (started: 2024-05-18 07:10:48 +00:00)


**Applying LoRA**

In [16]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=3,
    target_modules=["fc1","fc2"],
    modules_to_save = ["output_logits"]
)
# lora_model = get_peft_model(model_for_lora, config).to(device)
lora_model = get_peft_model(model, config)
lora_model.to(device)

PeftModel(
  (base_model): LoraModel(
    (model): VQAModel(
      (image_model): Dinov2Model(
        (embeddings): Dinov2Embeddings(
          (patch_embeddings): Dinov2PatchEmbeddings(
            (projection): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
          )
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (encoder): Dinov2Encoder(
          (layer): ModuleList(
            (0-11): 12 x Dinov2Layer(
              (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
              (attention): Dinov2Attention(
                (attention): Dinov2SelfAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True)
                  (value): Linear(in_features=768, out_features=768, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
                )
                (output): Dinov2SelfOutput(
                  (dense):

time: 1.36 s (started: 2024-05-18 07:10:48 +00:00)


In [17]:
print_trainable_parameters(lora_model)

trainable params: 603948 || all params: 202220120 || trainable%: 0.30
time: 3.88 ms (started: 2024-05-18 07:10:50 +00:00)


In [18]:
n_epochs=5
loss_fn=nn.CrossEntropyLoss()
optimizer=optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

start = datetime.now()
epoch_loss ,epoch_no,no_image_seen,loss_batch,epoch_batch = Train(lora_model,train_dataset,n_epochs,loss_fn,optimizer)
end = datetime.now()

It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


Epoch 1/5


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacty of 15.89 GiB of which 38.12 MiB is free. Process 2607 has 15.86 GiB memory in use. Of the allocated memory 15.47 GiB is allocated by PyTorch, and 87.32 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

time: 6.66 s (started: 2024-05-18 07:10:50 +00:00)


In [None]:
torch.save(model.state_dict(), 'd_b__lora_concat.pth')

In [None]:
# test_model = VQAModel()
# test_model.load_state_dict(torch.load('/kaggle/working/d_b_concat.pth'))