In [1]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from dataclasses import dataclass
from torchtune.modules import RMSNorm
from tokenizers import Tokenizer
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torch.utils.data import random_split

In [18]:
@dataclass
class llavaArgs:
    batch_size = 32
    device = 'cuda'
    vis_embd_out = 768
    text_embd_out = 768
    vocab_size = 50257
    block_size = 256
    lr = 1e-3
    text_hidden =  768 * 4
    img_seq_len = 256

In [3]:
#Loading SigLip Vision Encoder
import torch
import torch.nn as nn
from transformers import CLIPModel, CLIPFeatureExtractor

# Vision model class using CLIP
class VisionModel(nn.Module):
    def __init__(self):
        super().__init__()

        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").vision_model


        self.feature_extractor = CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch16")

        self.multimodalVisionLayerProjector = nn.Linear(in_features=llavaArgs.vis_embd_out, out_features=llavaArgs.text_embd_out, device=llavaArgs.device) # Use proper dimensions

        self.main = nn.Sequential(
            nn.Flatten()
        )


        for p in self.model.parameters():
            p.requires_grad = False

    def forward(self, x):

        # inputs = self.feature_extractor(x['image'], return_tensors="pt")
        # inputs = inputs.to(llavaArgs.device)


        with torch.no_grad():
            outputs = self.model(x)


        x = outputs.pooler_output  # Get the pooled image embeddings (shape: [batch_size, 768])


        x = self.main(x)
        # return x
        return self.multimodalVisionLayerProjector(x)


In [13]:
#Language Decoder

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
     bnb_4bit_compute_dtype=torch.bfloat16
    )

tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")

class TextModel(nn.Module):
    def __init__(self):
        super().__init__()




        self.model = AutoModelForCausalLM.from_pretrained("google/gemma-7b", device_map='cuda', torch_dtype='auto', output_hidden_states=True)
        self.tokenizer = tokenizer
        self.linear_layer = nn.Linear(in_features=llavaArgs.text_embd_out, out_features=llavaArgs.vocab_size, device=llavaArgs.device, bias=False) # Takes in logits of dimensions- embeds_dims and converts it into dimension of vocab_size (logits in range of vocab_size)


        for p in self.model.parameters():
            p.requires_grad = False

    def forward(self, x, embeds=True):

        if(embeds):

          x = self.model(inputs_embeds=x).hidden_states[-1]
          x = self.linear_layer(x)
          return x
        else:
          x = self.model(input_ids = x['input_ids'], attention_mask = x['attention_mask'])
          return x




tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [5]:

# @dataclass
# class ModelArgs:
#     #Hyperparameters

#     block_size = 128
#     batch_size = 64
#     embeddings_dims = 768
#     attn_dropout = 0.1
#     no_of_heads = 12 #IMP needs to be thoroughly calculated
#     dropout = 0.1
#     epochs = 100
#     max_lr = 2.5e-4
#     no_of_decoder_layers = 12 #IMP needs to be thoroughly calculated
#     weight_decay_optim = 0.1
#     beta_1 = 0.9
#     beta_2 = 0.95
#     device = 'cuda'
#     no_kv_heads = 2
#     vocab_size = 2000

In [6]:
# from gemma import Gemma

# gemma_model = Gemma(embeddings_dims=ModelArgs.embeddings_dims, no_of_decoder_layers=ModelArgs.no_of_decoder_layers, block_size=ModelArgs.block_size, vocab_size=ModelArgs.vocab_size, dropout=ModelArgs.dropout)

In [7]:
# @dataclass
# class VisionModelArgs:
#     embedding_dims = 768

In [8]:
# @dataclass
# class TextModelArgs:
#     embedding_dims = 768

In [9]:
# class PaliGemmaVisionProjector(nn.Module):
#     def __init__(self):
        
#         self.linear = nn.Linear(in_features=VisionModelArgs.embedding_dims, out_features=(TextModelArgs.embedding_dims), device='cuda')
#         nn.init.zeros_(self.linear_layer.weight)  # Zero-initialize weights
#         nn.init.zeros_(self.linear_layer.bias)  

#     def forward(self, x):
        
#         x = self.linear(x)
        
#         return x

In [11]:
import pandas as pd
df = pd.read_csv('data/flickr8000/captions.txt', sep=",")

In [12]:
df

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...
...,...,...
40450,997722733_0cb5439472.jpg,A man in a pink shirt climbs a rock face
40451,997722733_0cb5439472.jpg,A man is rock climbing high in the air .
40452,997722733_0cb5439472.jpg,A person in a red shirt climbing up a rock fac...
40453,997722733_0cb5439472.jpg,A rock climber in a red shirt .


In [14]:
tokenizer.all_special_tokens

['<bos>', '<eos>', '<unk>', '<pad>', '<start_of_turn>', '<end_of_turn>']

In [15]:
[tokenizer.eos_token_id] + tokenizer.encode("Hi")

[1, 2, 2151]

In [16]:
tokenizer.encode("Hello My name is")

[2, 4521, 2961, 1503, 603]

In [17]:
tokenizer("Hello My name is", return_tensors='pt')

{'input_ids': tensor([[   2, 4521, 2961, 1503,  603]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [35]:
ix = torch.randint(df.shape[0] - ModelArgs.block_size, (ModelArgs.batch_size,))   
x = torch.stack([df.shape[0][i:i+ModelArgs.block_size] for i in ix])
y = torch.stack([df.shape[0][i+1:i+ModelArgs.block_size+1] for i in ix])
x, y = x.to(ModelArgs.device), y.to(ModelArgs.device)
x, y

TypeError: 'int' object is not subscriptable

In [53]:
import os
import numpy as np
from PIL import Image

train_transforms = A.Compose(
    [
        A.Resize(height=224, width=224),
        A.CenterCrop(height=224, width=224),
        # A.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], max_pixel_value=224.0,),
        # A.ToFloat(max_value=224),
        ToTensorV2(),
    ]
)

test_tyransforms = A.Compose(
    [
        # A.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], max_pixel_value=224.0,),
        # A.ToFloat(max_value=224),
        ToTensorV2(),
    ]
)



IMAGE_TOKEN = "<image>"

tokens_to_add = {"additional_special_tokens": [IMAGE_TOKEN]}
tokenizer.add_special_tokens(tokens_to_add)

#IMP!!!
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
        
class PaliGemmaDataset(Dataset):
    def __init__(self):
        super(PaliGemmaDataset, self).__init__(
        )
        
        self.dir = os.listdir('data/flickr8000/images')
        self.tokenizer = tokenizer
        self.block_size = 128
        
    def __len__(self):
        
        return df.shape[0]

    def __getitem__(self, idx):
        
        txt, img = df.iloc[idx][0], df.iloc[idx][1]
        
        img_path = os.path.join(self.path, img)
        # print(img_path)
        img = np.array(Image.open(img_path))

        input_transformed = train_transforms(image = img)['image']
        
        prompt = "Explain the different components of the picture."
        prompt_and_captions = self.tokenizer.encode("<image>") * llavaArgs.img_seq_len + self.tokenizer.encode(prompt) + self.tokenizer.encode('<bos>')   + self.tokenizer.encode('<sep>') + self.tokenizer.encode(txt) + self.tokenizer.encode('<eos>')
        temp =  self.tokenizer.encode("<image>") * llavaArgs.img_seq_len + self.self.tokenizer.encode(prompt) + self.tokenizer.encode('<bos>') + ['<sep>'] + self.tokenizer.encode(txt) + self.tokenizer.encode('<eos>')
        final_text = prompt_and_captions
        if(len(final_text) < self.block_size):
            for i in range(self.block_size - len(prompt_and_captions)):
                final_text += self.tokenizer.encode('<pad>')
                
        sep_index = 0
        # print(torch.Tensor(final_text))
        
        
        for i in range(len(temp)):
            if(temp[i] == ['<sep>']):
                sep_index = i
        
        print(prompt_and_captions)
        print(sep_index)   
        # ix = torch.randint(df.shape[0] - ModelArgs.block_size, (ModelArgs.batch_size,))   
        x = torch.stack([prompt_and_captions[i:i+llavaArgs.block_size] for i in range(len(prompt_and_captions) - sep_index)])
        y = torch.stack([prompt_and_captions[i+1:i+llavaArgs.block_size+1] for i in range(len(prompt_and_captions) - sep_index)])
        x, y = x.to(llavaArgs.device), y.to(llavaArgs.device)
        # return x, y

  
        # text_embeddings = gemma_model(torch.Tensor(final_text))
        # image_embeddings = siglip_vision(img)
        # final_embds = image_embeddings + x
        
        x_values = {
            'input_ids' : x,
            'img_embeddings' : input_transformed,
            
        }
        y_values = {
            "input_ids": y
        }
        
        x_values = {key: torch.tensor(value) for key, value in x_values.items()}
        y_values = {key: torch.tensor(value) for key, value in y_values.items()}
        
        return x_values, y_values

In [54]:
#Creating an instance of the dataset class
dataset = PaliGemmaDataset()

# Assuming 'dataset' is already created
# Split the dataset into training and validation sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

import os
#Creating a dataloader
# Create DataLoaders for training and validation
train_loader = DataLoader(train_dataset, batch_size=ModelArgs.batch_size, shuffle=True, pin_memory=False, num_workers=os.cpu_count())
val_loader = DataLoader(val_dataset, batch_size=ModelArgs.batch_size, shuffle=True, pin_memory=False, num_workers=os.cpu_count())



In [55]:
sample = next(iter(train_loader))
print(sample)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235284, 235310, 235274, 235304, 235310, 235308, 235304, 235284, 235304, 235298, 235266, 235308, 235304, 1256, 235308, 235249, 546, 235310, 235265, 6001, 2, 1][2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235315, 235321, 235308, 235276, 235318, 235324, 235276, 235274, 235315, 235298, 235324, 235276, 235308, 1039, 235310, 235250, 235310, 830, 235265, 6001, 2, 1][2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235304, 235274, 235324, 235276, 235321, 235276, 235284, 235324, 235315, 235324, 235298, 235304, 235260, 235321, 235308, 235274, 3864, 235310, 235324, 235308, 235265, 6001, 2, 1][2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235304, 235310, 235310, 235308, 235310, 235284, 235321, 235304, 235318, 235324, 235298, 235284, 235308, 2070, 761, 1039, 235324, 235308, 235265, 6001, 2, 1]

  txt, img = df.iloc[idx][0], df.iloc[idx][1]
  txt, img = df.iloc[idx][0], df.iloc[idx][1]


[2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235304, 235318, 235274, 235304, 235276, 235304, 235276, 235324, 235304, 235276, 235298, 235276, 235268, 235284, 235321, 235268, 235276, 235324, 235315, 2070, 235265, 6001, 2, 1][2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235304, 235284, 235310, 235315, 235321, 235315, 235274, 235321, 235324, 235310, 235298, 235318, 235250, 235276, 235315, 235276, 1029, 235276, 235315, 235324, 235265, 6001, 2, 1]

[2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235304, 235308, 235284, 235284, 235315, 235321, 235315, 235315, 235274, 235318, 235298, 235266, 235284, 235276, 235304, 235274, 235315, 830, 235308, 235315, 235265, 6001, 2, 1]

  txt, img = df.iloc[idx][0], df.iloc[idx][1]


[2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235304, 235276, 235274, 235310, 235324, 235324, 235304, 235304, 235308, 235324, 235298, 235266, 235318, 235318, 11203, 235276, 235315, 235284, 235315, 235276, 235265, 6001, 2, 1]

  txt, img = df.iloc[idx][0], df.iloc[idx][1]



[2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235274, 235276, 235284, 235284, 235315, 235324, 235308, 235324, 235284, 235321, 235298, 235324, 235308, 235308, 235274, 235308, 235284, 235304, 235321, 235258, 235321, 235265, 6001, 2, 1][2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235284, 235321, 235324, 235324, 235308, 235276, 235304, 235321, 235274, 235274, 235298, 235310, 235249, 235304, 235274, 235274, 235284, 235308, 235304, 546, 235265, 6001, 2, 1]


  txt, img = df.iloc[idx][0], df.iloc[idx][1]





00
[2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235284, 235321, 235315, 235321, 235321, 235274, 235276, 235318, 235304, 235318, 235298, 235321, 235310, 18563, 235308, 235260, 235276, 235268, 235318, 235304, 235265, 6001, 2, 1]0[2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235304, 235308, 235304, 235284, 235276, 235315, 235321, 235315, 235315, 235315, 235298, 235310, 235249, 235276, 235324, 235250, 235276, 235250, 235274, 235324, 235249, 235265, 6001, 2, 1]

00
000


0




[2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235284, 235304, 235308, 235304, 235276, 235321, 235321, 235310, 235274, 235284, 235298, 235308, 235249, 235308, 235321, 235276, 235310, 235260, 235318, 235266, 235308, 235265, 6001, 2, 1]
0[2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235284, 235304, 235324, 235310, 235284, 235310, 235324, 235304, 235321, 23

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  txt, img = df.iloc[idx][0], df.iloc[idx][1]


[2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235304, 235310, 235310, 235304, 235276, 235304, 235276, 235315, 235310, 235284, 235298, 235266, 235310, 235276, 235315, 235308, 235321, 235318, 235284, 235308, 235321, 235265, 6001, 2, 1]


[2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235284, 235274, 235276, 235274, 235274, 235284, 235321, 235315, 235318, 235304, 235298, 182877, 235321, 235268, 235284, 235250, 235276, 235258, 235324, 235265, 6001, 2, 1][2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235304, 235310, 235318, 235276, 235308, 235308, 235274, 235324, 235284, 235321, 235298, 235318, 235304, 235284, 235308, 235308, 159622, 235274, 235321, 235265, 6001, 2, 1]

[2, 74198, 573, 2167, 8832, 576, 573, 5642, 235265, 2, 2, 2, 235322, 17062, 235313, 2, 235284, 235321, 235321, 235274, 235310, 235318, 235321, 235276, 235315, 235308, 235298, 235258, 235310, 532, 235321, 2

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/yuvrajsingh/miniconda3/envs/unsloth_env/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/home/yuvrajsingh/miniconda3/envs/unsloth_env/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
    data = self.dataset.__getitems__(possibly_batched_index)
  File "/home/yuvrajsingh/miniconda3/envs/unsloth_env/lib/python3.10/site-packages/torch/utils/data/dataset.py", line 419, in __getitems__
    return [self.dataset[self.indices[idx]] for idx in indices]
  File "/home/yuvrajsingh/miniconda3/envs/unsloth_env/lib/python3.10/site-packages/torch/utils/data/dataset.py", line 419, in <listcomp>
    return [self.dataset[self.indices[idx]] for idx in indices]
  File "/tmp/ipykernel_333157/2273889209.py", line 37, in __getitem__
    x = torch.stack([prompt_and_captions[i:i+ModelArgs.block_size] for i in range(len(prompt_and_captions) - sep_index)])
TypeError: expected Tensor as element 0 in argument 0, but got list


In [17]:
class PaliGemma(nn.Module):
    def __init__(self):
        
        self.vision_encoder = SiglipVisionModel()
        self.text_decoder = Gemma()
        self.linear_proj = PaliGemmaVisionProjector()
        
    def forward(self, img, text):
        vision_embd = self.vision_encoder(img)
        vision_embd = self.linear(vision_embd)
        text_embd = self.text_decoder(text)
        combined = vision_embd + text_embd
        return combined