In [1]:
from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, PhiConfig

# Initializing a CLIP-vision config
vision_config = CLIPVisionConfig()

# Initializing a Llama config
text_config = PhiConfig()

# Initializing a Llava llava-1.5-7b style configuration
configuration = LlavaConfig(vision_config, text_config)

# Initializing a model from the llava-1.5-7b style configuration
model = LlavaForConditionalGeneration(configuration)


In [2]:
model.config

LlavaConfig {
  "ignore_index": -100,
  "image_token_index": 32000,
  "model_type": "llava",
  "projector_hidden_act": "gelu",
  "text_config": {
    "embd_pdrop": 0.0,
    "hidden_act": "gelu_new",
    "hidden_size": 2048,
    "intermediate_size": 8192,
    "layer_norm_eps": 1e-05,
    "model_type": "phi",
    "num_hidden_layers": 24,
    "partial_rotary_factor": 0.5,
    "qk_layernorm": false,
    "resid_pdrop": 0.0,
    "vocab_size": 51200
  },
  "transformers_version": "4.41.2",
  "vision_config": {
    "hidden_size": 768,
    "image_size": 224,
    "intermediate_size": 3072,
    "model_type": "clip_vision_model",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "patch_size": 32,
    "projection_dim": 512
  },
  "vision_feature_layer": -2,
  "vision_feature_select_strategy": "default"
}

In [4]:
from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, LlamaConfig

# Initializing a CLIP-vision config
vision_config2 = CLIPVisionConfig()

# Initializing a Llama config
text_config2 = LlamaConfig()

# Initializing a Llava llava-1.5-7b style configuration
configuration2 = LlavaConfig(vision_config2, text_config2)

# Initializing a model from the llava-1.5-7b style configuration
model2 = LlavaForConditionalGeneration(configuration2)

In [5]:
model2.config.text_config

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [13]:
model.config.text_config

PhiConfig {
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 2,
  "hidden_act": "gelu_new",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "phi",
  "num_attention_heads": 32,
  "num_hidden_layers": 24,
  "num_key_value_heads": 32,
  "partial_rotary_factor": 0.5,
  "qk_layernorm": false,
  "resid_pdrop": 0.0,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 51200
}

In [21]:
for key, value in temp.items():
    if key in temp2.keys():
        if temp2[key] != value:
            print(f"not equal: {key}")
    if key not in temp2.keys():
        print(f"not exist: {key}")

not exist: embd_pdrop
not equal: hidden_act
not equal: hidden_size
not equal: intermediate_size
not exist: layer_norm_eps
not equal: model_type
not equal: num_hidden_layers
not exist: partial_rotary_factor
not exist: qk_layernorm
not exist: resid_pdrop
not equal: vocab_size


In [None]:
hidden_size
vocab_size

In [30]:
print("phi:")
print(f"hidden_size: {model.config.text_config.hidden_size}")
print(f"vocab_size: {model.config.text_config.vocab_size}")

phi:
hidden_size: 2048
vocab_size: 51200


In [31]:
print("llama:")
print(f"hidden_size: {model2.config.text_config.hidden_size}")
print(f"vocab_size: {model2.config.text_config.vocab_size}")

llama:
hidden_size: 4096
vocab_size: 32000


In [35]:
configuration.text_config

PhiConfig {
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 2,
  "hidden_act": "gelu_new",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "phi",
  "num_attention_heads": 32,
  "num_hidden_layers": 24,
  "num_key_value_heads": 32,
  "partial_rotary_factor": 0.5,
  "qk_layernorm": false,
  "resid_pdrop": 0.0,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 51200
}

In [36]:
from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [37]:
processor.image_processor

CLIPImageProcessor {
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_center_crop",
    "crop_size",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "do_convert_rgb",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "crop_size": {
    "height": 336,
    "width": 336
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "LlavaProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 336
  }
}

In [49]:
processor.tokenizer

LlamaTokenizerFast(name_or_path='llava-hf/llava-1.5-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [54]:
processor.tokenizer.decode([32000])

'<image>'

In [58]:
new_added_token = "kirimirikir"

In [55]:
processor.tokenizer.add_tokens(["kirimirikir"])

1

In [56]:
processor.tokenizer.vocab_size

32000

In [57]:
processor.tokenizer.convert_tokens_to_ids("kirimirikir")

32002

In [59]:
processor.tokenizer.decode(32002)

'kirimirikir'

In [60]:
processor.tokenizer.pad_token_id

32001

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

In [26]:
tokenizer.padding_side = 'left'

In [29]:
tokenizer("#")

{'input_ids': [2], 'attention_mask': [1]}

In [30]:
tokenizer("<pad>")

{'input_ids': [27, 15636, 29], 'attention_mask': [1, 1, 1]}

In [25]:
tokenizer(["amir is here", "arsalan was here yesterday and today and he is"], padding=True)

{'input_ids': [[2, 2, 2, 2, 2, 2, 321, 343, 318, 994], [945, 25786, 373, 994, 7415, 290, 1909, 290, 339, 318]], 'attention_mask': [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [62]:
tokenizer.pad_token_id

In [66]:
configuration.text_config.pad_token_id

In [68]:
model.config.image_token_index

32000

In [69]:
model2.config.image_token_index

32000

In [70]:
tokenizer.batch_decode([32000])

[' coercion']

In [74]:
configuration

LlavaConfig {
  "ignore_index": -100,
  "image_token_index": 32000,
  "model_type": "llava",
  "projector_hidden_act": "gelu",
  "text_config": {
    "embd_pdrop": 0.0,
    "hidden_act": "gelu_new",
    "hidden_size": 2048,
    "intermediate_size": 8192,
    "layer_norm_eps": 1e-05,
    "model_type": "phi",
    "num_hidden_layers": 24,
    "partial_rotary_factor": 0.5,
    "qk_layernorm": false,
    "resid_pdrop": 0.0,
    "vocab_size": 51200
  },
  "transformers_version": "4.41.2",
  "vision_config": {
    "hidden_size": 768,
    "image_size": 224,
    "intermediate_size": 3072,
    "model_type": "clip_vision_model",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "patch_size": 32,
    "projection_dim": 512
  },
  "vision_feature_layer": -2,
  "vision_feature_select_strategy": "default"
}

In [104]:
from transformers import CLIPImageProcessor, AutoTokenizer

image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

loading configuration file preprocessor_config.json from cache at /home/arsalan/.cache/huggingface/hub/models--openai--clip-vit-large-patch14/snapshots/32bd64288804d66eefd0ccbe215aa642df71cc41/preprocessor_config.json
size should be a dictionary on of the following set of keys: ({'height', 'width'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'shortest_edge': 224}.
crop_size should be a dictionary on of the following set of keys: ({'height', 'width'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'height': 224, 'width': 224}.
Image processor CLIPImageProcessor {
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_center_crop",
    "crop_size",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "do_convert_rgb",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "crop_size": 

In [105]:
image_processor

CLIPImageProcessor {
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_center_crop",
    "crop_size",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "do_convert_rgb",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}

In [106]:
tokenizer.vocab_size

50257

In [107]:
vocab = tokenizer.get_vocab()

In [108]:
max_token_id = max(vocab.values())

In [109]:
max_token_id

50294

In [110]:
tokenizer(["<image>"])

{'input_ids': [[27, 9060, 29]], 'attention_mask': [[1, 1, 1]]}

In [111]:
print(tokenizer.decode([27]))
print(tokenizer.decode([9060]))
print(tokenizer.decode([29]))

<
image
>


In [112]:
tokenizer.eos_token

'<|endoftext|>'

In [113]:
image_token = "<image>"
tokenizer.add_tokens([image_token])

#pad_token = "<pad>"
#tokenizer.add_tokens([pad_token])

1

In [114]:
print(tokenizer(image_token))
#print(tokenizer(pad_token))

{'input_ids': [50295], 'attention_mask': [1]}


In [115]:
tokenizer.pad_token = tokenizer.eos_token

In [116]:
tokenizer.pad_token_id

50256

In [12]:
tokenizer.padding_side = "left"
#tokenizer.pad_token_id = 50296

In [13]:
tokenizer(["this is short", "this is a longer text with <image>"], padding=True)

{'input_ids': [[50256, 50256, 50256, 50256, 50256, 5661, 318, 1790], [5661, 318, 257, 2392, 2420, 351, 220, 50295]], 'attention_mask': [[0, 0, 0, 0, 0, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [16]:
from transformers import LlavaProcessor
processor = LlavaProcessor(image_processor=image_processor, tokenizer=tokenizer)

In [17]:
import torch
from PIL import Image


class ImageTextInstructionFollowingDataset(torch.utils.data.Dataset):
    def __init__(self, data, image_folder_path=None):
        self.data = data
        self.image_folder_path = image_folder_path

        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        conversation = self.data["conversations"][index]
        image_path = self.data["image"][index]
        if self.image_folder_path is not None:
            image_path = f"{self.image_folder_path}/{image_path}"
        img = Image.open(image_path)
        instruction = conversation[0]["value"]
        answer = conversation[1]["value"]

        instruction_answer = instruction + "\n\nAnswer:" + answer
        return instruction_answer, img

In [35]:
class MyCustomDataCollator:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, examples):
        instruction_answers = [item[0] for item in examples]
        images = [item[1] for item in examples]
        
        input_ids = self.processor.tokenizer(
            instruction_answers, padding=True, truncation=True, return_tensors="pt"
        )["input_ids"]

        pixel_values = self.processor.image_processor(
            #images=images, return_tensors="pt", padding=True
            images=images, return_tensors="pt"
        )["pixel_values"]

        batch = {
            'input_ids': input_ids,
            'pixel_values': pixel_values,
            'labels': input_ids
        }
        return batch

In [36]:
import pandas as pd
 
train_df = pd.read_json("/home/arsalan/Desktop/multimodal_LLM-master/multimodal_LLM-master/data/chat_train.json", lines=True)
dataset = ImageTextInstructionFollowingDataset(data=train_df, image_folder_path="/home/arsalan/Desktop/multimodal_LLM-master/multimodal_LLM-master/data/images")

In [37]:
from torch.utils.data import DataLoader

data_collator = MyCustomDataCollator(processor)
dataloader = DataLoader(dataset, batch_size=2, collate_fn=data_collator)

In [38]:
batch = next(iter(dataloader))

# Defining the model

In [1]:
import torch
from transformers import CLIPVisionConfig, AutoConfig, LlavaConfig
vision_config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14")
vision_config._name_or_path = "openai/clip-vit-large-patch14"
text_config = AutoConfig.from_pretrained("microsoft/phi-1_5")



In [2]:
configuration = LlavaConfig(vision_config, text_config)
configuration._attn_implementation = "flash_attention_2"
configuration.image_token_index = 50295
configuration.pad_token_id = 50256

In [3]:
import torch.nn as nn
from transformers.activations import ACT2FN
from transformers import LlavaForConditionalGeneration, CLIPVisionModel,AutoModelForCausalLM

class LlavaMultiModalProjector(nn.Module):
    def __init__(self, config: LlavaConfig):
        super().__init__()

        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)

    def forward(self, image_features):
        hidden_states = self.linear_1(image_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states

class CustomLlavaForConditionalGeneration(LlavaForConditionalGeneration):
    def __init__(self, config: LlavaConfig):
        super(LlavaForConditionalGeneration, self).__init__(config)  # Use super() to call the parent class constructor
        self.vision_tower = CLIPVisionModel.from_pretrained(config.vision_config._name_or_path,
                                                            torch_dtype=torch.float32)

        self.multi_modal_projector = LlavaMultiModalProjector(config).to(torch.float32)
        self.vocab_size = config.text_config.vocab_size
        self.language_model = AutoModelForCausalLM.from_pretrained(config.text_config._name_or_path,
                                                                   torch_dtype=torch.float32, 
                                                                   #attn_implementation=config._attn_implementation
                                                                   )
        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
        self.post_init()

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
model = CustomLlavaForConditionalGeneration(configuration).to(device)

Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in CustomLlavaForConditionalGeneration is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`


In [6]:
model.config.image_token_index

50295

In [7]:
def check_module_devices(module):
    devices = {param.device for param in module.parameters()}
    return devices

In [8]:
print("Multi-Modal Projector Devices:", check_module_devices(model.multi_modal_projector))
print("Vision Tower Device:", check_module_devices(model.vision_tower))
print("Language Model Device:", check_module_devices(model.language_model))

Multi-Modal Projector Devices: {device(type='cuda', index=0)}
Vision Tower Device: {device(type='cuda', index=0)}
Language Model Device: {device(type='cuda', index=0)}


### !!! param requries grad False for LLM and vision clip

# Definign the processor

In [1]:
from transformers import CLIPImageProcessor, AutoTokenizer

image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

In [2]:
tokenizer.vocab_size

50257

In [3]:
vocab = tokenizer.get_vocab()

In [4]:
max_token_id = max(vocab.values())

In [5]:
max_token_id

50294

In [6]:
tokenizer(["<image>"])

{'input_ids': [[27, 9060, 29]], 'attention_mask': [[1, 1, 1]]}

In [7]:
tokenizer.eos_token

'<|endoftext|>'

In [8]:
image_token = "<image>"
tokenizer.add_tokens([image_token])

1

In [9]:
tokenizer(image_token)['input_ids'][0]

50295

In [17]:
tokenizer("<image>")

{'input_ids': [50295], 'attention_mask': [1]}

In [18]:
print(tokenizer(image_token))

{'input_ids': [50295], 'attention_mask': [1]}


In [11]:
tokenizer.pad_token = tokenizer.eos_token

In [12]:
tokenizer.pad_token_id

50256

In [21]:
tokenizer.padding_side = "left"

In [22]:
tokenizer(["this is short", "this is a longer text with <image>"], padding=True)

{'input_ids': [[50256, 50256, 50256, 50256, 50256, 5661, 318, 1790], [5661, 318, 257, 2392, 2420, 351, 220, 50295]], 'attention_mask': [[0, 0, 0, 0, 0, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [13]:
from transformers import LlavaProcessor
processor = LlavaProcessor(image_processor=image_processor, tokenizer=tokenizer)

In [18]:
processor.tokenizer("<image>")

{'input_ids': [50295], 'attention_mask': [1]}

# Defining the Dataset and collator fn

In [24]:
import torch
from PIL import Image


class ImageTextInstructionFollowingDataset(torch.utils.data.Dataset):
    def __init__(self, data, image_folder_path=None):
        self.data = data
        self.image_folder_path = image_folder_path

        #self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        conversation = self.data["conversations"][index]
        image_path = self.data["image"][index]
        if self.image_folder_path is not None:
            image_path = f"{self.image_folder_path}/{image_path}"
        img = Image.open(image_path)
        instruction = conversation[0]["value"]
        answer = conversation[1]["value"]

        instruction_answer = instruction + "\n\nAnswer:" + answer
        return instruction_answer, img

In [25]:
class MyCustomDataCollator:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, examples):
        instruction_answers = [item[0] for item in examples]
        images = [item[1] for item in examples]
        
        tokenizer_outputs = self.processor.tokenizer(
            instruction_answers, padding=True, truncation=True, return_tensors="pt"
        )
        input_ids = tokenizer_outputs['input_ids']
        attention_mask = tokenizer_outputs['attention_mask']

        pixel_values = self.processor.image_processor(
            #images=images, return_tensors="pt", padding=True
            images=images, return_tensors="pt"
        )["pixel_values"]

        #if torch.isnan(input_ids).any() or torch.isinf(input_ids).any():
        #    raise ValueError("NaN or Inf found in input_ids during batching")

        #if torch.isnan(attention_mask).any() or torch.isinf(attention_mask).any():
        #    raise ValueError("NaN or Inf found in attention_mask during batching")

        #if torch.isnan(pixel_values).any() or torch.isinf(pixel_values).any():
        #    raise ValueError("NaN or Inf found in pixel_values during batching")

        batch = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'pixel_values': pixel_values,
            'labels': input_ids
        }
        return batch

In [26]:
processor.tokenizer("amir is here")

{'input_ids': [321, 343, 318, 994], 'attention_mask': [1, 1, 1, 1]}

In [27]:
import pandas as pd
 
train_df = pd.read_json("/home/arsalan/Desktop/multimodal_LLM-master/multimodal_LLM-master/data/chat_train.json", lines=True)
train_dataset = ImageTextInstructionFollowingDataset(data=train_df, image_folder_path="/home/arsalan/Desktop/multimodal_LLM-master/multimodal_LLM-master/data/images")

eval_df = pd.read_json("/home/arsalan/Desktop/multimodal_LLM-master/multimodal_LLM-master/data/chat_val_5K.json", lines=True)
eval_dataset = ImageTextInstructionFollowingDataset(data=eval_df, image_folder_path="/home/arsalan/Desktop/multimodal_LLM-master/multimodal_LLM-master/data/images")

In [28]:
from torch.utils.data import DataLoader

data_collator = MyCustomDataCollator(processor)
dataloader = DataLoader(eval_dataset, batch_size=2, collate_fn=data_collator)

In [29]:
#batch = next(iter(dataloader))

In [30]:
#for key,val in batch.items():
#    batch[key] = batch[key].to("cuda:0")

In [31]:
#model.forward(**batch)

# Training 

In [32]:
from transformers import TrainingArguments
from transformers import Trainer

In [33]:
training_args = TrainingArguments(
    #report_to = 'wandb',
    output_dir="./outputs/phi_adaptor-test3",
    remove_unused_columns=False,
    save_strategy="steps",
    save_steps=100,
    learning_rate=1e-5,
    gradient_accumulation_steps=16,
    per_device_train_batch_size=4,
    #bf16=True,
    #bf16_full_eval=True,
    warmup_steps=0,
    lr_scheduler_type="linear",
    lr_scheduler_kwargs={},
    max_steps=3000,
    dataloader_num_workers=14,
    logging_steps=10,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=1000,
    label_names=["labels"],
    )



In [34]:
# Freeze language_model and vision_tower parameters
for param in model.language_model.parameters():
    param.requires_grad = False

for param in model.vision_tower.parameters():
    param.requires_grad = False

# Verify that only multi_modal_projector parameters require gradients
for name, param in model.named_parameters():
    print(f"{name}: requires_grad = {param.requires_grad}")

vision_tower.vision_model.embeddings.class_embedding: requires_grad = False
vision_tower.vision_model.embeddings.patch_embedding.weight: requires_grad = False
vision_tower.vision_model.embeddings.position_embedding.weight: requires_grad = False
vision_tower.vision_model.pre_layrnorm.weight: requires_grad = False
vision_tower.vision_model.pre_layrnorm.bias: requires_grad = False
vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight: requires_grad = False
vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias: requires_grad = False
vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight: requires_grad = False
vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias: requires_grad = False
vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight: requires_grad = False
vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias: requires_grad = False
vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight: requires_grad =

In [35]:
#for param in model.parameters():
    # Check if parameter dtype is  Float (float32)
#    if param.dtype == torch.bfloat16:
#        param.data = param.data.to(torch.float16)

In [36]:
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

trainable_params = count_trainable_parameters(model)
print(f"Number of trainable parameters: {trainable_params}")

Number of trainable parameters: 6295552


In [37]:
trainer = Trainer(
            model,
            training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
        )

max_steps is given, it will override any value given in num_train_epochs


In [38]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [39]:
trainer.train()

[2024-07-12 15:01:11,444] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/arsalan/anaconda3/envs/vlm/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mamirarsalan-rajabi[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/3000 [00:00<?, ?it/s]

{'loss': 5.0464, 'grad_norm': 6.137228965759277, 'learning_rate': 9.996666666666669e-06, 'epoch': 0.0}
{'loss': 4.9284, 'grad_norm': 5.80035924911499, 'learning_rate': 9.993333333333333e-06, 'epoch': 0.0}
{'loss': 4.9527, 'grad_norm': 5.846682071685791, 'learning_rate': 9.990000000000001e-06, 'epoch': 0.0}
{'loss': 4.9416, 'grad_norm': 14.239758491516113, 'learning_rate': 9.986666666666667e-06, 'epoch': 0.0}
{'loss': 4.9133, 'grad_norm': 4.910815238952637, 'learning_rate': 9.983333333333333e-06, 'epoch': 0.0}
{'loss': 4.7086, 'grad_norm': 6.767263412475586, 'learning_rate': 9.980000000000001e-06, 'epoch': 0.0}
{'loss': 4.8151, 'grad_norm': 6.37697696685791, 'learning_rate': 9.976666666666667e-06, 'epoch': 0.0}
{'loss': 4.892, 'grad_norm': 4.32615852355957, 'learning_rate': 9.973333333333333e-06, 'epoch': 0.0}
{'loss': 4.6973, 'grad_norm': 4.271944046020508, 'learning_rate': 9.970000000000001e-06, 'epoch': 0.0}
{'loss': 4.8359, 'grad_norm': 6.476644515991211, 'learning_rate': 9.96666666

KeyboardInterrupt: 

In [38]:
model.config.ignore_index

-100