## Load the Model

The model we will be using is GenerativeImage2Text (GIT) from Microsoft. Check out the model on [Huggingface](https://huggingface.co/microsoft/git-base). 

In [1]:
from transformers import AutoProcessor, AutoModelForCausalLM
import torch

processor = AutoProcessor.from_pretrained("microsoft/git-base")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")

if torch.cuda.is_available():
    model.to("cuda")
elif torch.backends.mps.is_available():
    model.to("mps")
else:
    model.to("cpu")
print(model.device)

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


{"timestamp":"2025-08-08T16:35:15.373659Z","level":"WARN","fields":{"message":"Status Code: 500. Retrying...","request_id":""},"filename":"/Users/runner/work/xet-core/xet-core/cas_client/src/http_client.rs","line_number":236}
{"timestamp":"2025-08-08T16:35:15.373709Z","level":"WARN","fields":{"message":"Status Code: 500. Retrying...","request_id":""},"filename":"/Users/runner/work/xet-core/xet-core/cas_client/src/http_client.rs","line_number":236}
{"timestamp":"2025-08-08T16:35:15.373715Z","level":"WARN","fields":{"message":"Retry attempt #0. Sleeping 793.817157ms before the next attempt"},"filename":"/Users/runner/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-retry-0.7.0/src/middleware.rs","line_number":171}
{"timestamp":"2025-08-08T16:35:15.373729Z","level":"WARN","fields":{"message":"Retry attempt #0. Sleeping 1.747479665s before the next attempt"},"filename":"/Users/runner/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-retry-0.7.0/src/middleware.rs","li

In [2]:
from datasets import load_dataset
dataset = load_dataset("vipulmaheshwari/GTA-Image-Captioning-Dataset")['train']

train_ds, test_ds = dataset.train_test_split(test_size=0.2).values()


Generating train split: 100%|██████████| 785/785 [00:03<00:00, 251.47 examples/s]


In [3]:
print(f"Train dataset size: {len(train_ds)}")
print(f"Test dataset size: {len(test_ds)}")

Train dataset size: 628
Test dataset size: 157


### Preprocess Data

Now, we will preprocess the data to get it ready for the model. This involves tokenizing the text and processing the images.

In [4]:
def preprocess_function(examples):
    # Process images
    image_inputs = processor(images=[x.convert("RGB") for x in examples["image"]], return_tensors="pt")
    
    # Process captions
    text_inputs = processor.tokenizer(
        text=examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    
    # Set labels, ignoring padding
    labels = text_inputs.input_ids.clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    
    return {
        "pixel_values": image_inputs.pixel_values,
        "input_ids": text_inputs.input_ids,
        "attention_mask": text_inputs.attention_mask,
        "labels": labels
    }

In [5]:
import gc
import multiprocessing

try:
	multiprocessing.set_start_method('spawn')
except RuntimeError:
	pass  # start method has already been set

processed_train_ds = train_ds.map(preprocess_function, batched=True, batch_size=4, num_proc=4, remove_columns=train_ds.column_names)
processed_test_ds = test_ds.map(preprocess_function, batched=True, batch_size=4, num_proc=4, remove_columns=test_ds.column_names)

del train_ds, test_ds
gc.collect()

Map (num_proc=4): 100%|██████████| 628/628 [00:08<00:00, 70.67 examples/s]
Map (num_proc=4): 100%|██████████| 157/157 [00:02<00:00, 64.53 examples/s]


135

In [6]:
print(processed_train_ds.column_names)

['pixel_values', 'input_ids', 'attention_mask', 'labels']


In [7]:
frozen_layers = [
    model.git.embeddings,
    model.git.image_encoder.vision_model.embeddings,
]

frozen_layers.extend(model.git.image_encoder.vision_model.encoder.layers[i].self_attn for i in range(len(model.git.image_encoder.vision_model.encoder.layers)))
frozen_layers.extend(model.git.encoder.layer[i].attention.self for i in range(len(model.git.encoder.layer)))

print(f"Freezing {len(frozen_layers)} layers")
for layer in frozen_layers:
    for param in layer.parameters():
        param.requires_grad = False

Freezing 20 layers


### Set up Training

In [8]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm
import numpy as np

# --- Training Configuration ---
num_epochs = 6
train_batch_size = 16
eval_batch_size = 16
learning_rate = 2e-4
gradient_accumulation_steps = 2

# --- DataLoaders ---
processed_train_ds.set_format(type='torch', columns=['pixel_values', 'input_ids', 'attention_mask', 'labels'])
processed_test_ds.set_format(type='torch', columns=['pixel_values', 'input_ids', 'attention_mask', 'labels'])

train_dataloader = DataLoader(processed_train_ds, shuffle=True, batch_size=train_batch_size)
eval_dataloader = DataLoader(processed_test_ds, batch_size=eval_batch_size)

# --- Optimizer and Scheduler ---
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=20, num_training_steps=num_training_steps
)

# --- Training Loop ---
best_eval_loss = float('inf')
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        inputs = {k: v.to(model.device) for k, v in batch.items()}
        
        outputs = model(**inputs)
        loss = outputs.loss
        loss = loss / gradient_accumulation_steps
        
        loss.backward()
        
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
        
        total_loss += loss.item() * gradient_accumulation_steps
        progress_bar.update(1)
        progress_bar.set_description(f"Epoch {epoch+1}, Loss: {total_loss / (step + 1):.4f}")

    # --- Evaluation Loop ---
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in eval_dataloader:
            inputs = {k: v.to(model.device) for k, v in batch.items()}
            outputs = model(**inputs)
            eval_loss += outputs.loss.item()
    
    avg_eval_loss = eval_loss / len(eval_dataloader)
    perplexity = np.exp(avg_eval_loss)
    print(f"\n--- Epoch {epoch+1} Evaluation ---")
    print(f"  Average Loss: {avg_eval_loss:.4f}")
    print(f"  Perplexity: {perplexity:.4f}")
    print("--------------------------")

print("Training complete.")

Epoch 1, Loss: 3.3519:  17%|█▋        | 40/240 [01:08<04:17,  1.29s/it]


--- Epoch 1 Evaluation ---
  Average Loss: 1.3471
  Perplexity: 3.8463
--------------------------


Epoch 2, Loss: 1.1087:  33%|███▎      | 80/240 [02:10<02:22,  1.13it/s]


--- Epoch 2 Evaluation ---
  Average Loss: 0.9903
  Perplexity: 2.6920
--------------------------


Epoch 3, Loss: 0.8276:  50%|█████     | 120/240 [02:55<01:41,  1.19it/s]


--- Epoch 3 Evaluation ---
  Average Loss: 0.9486
  Perplexity: 2.5820
--------------------------


Epoch 4, Loss: 0.7230:  67%|██████▋   | 160/240 [03:40<01:05,  1.23it/s]


--- Epoch 4 Evaluation ---
  Average Loss: 0.8774
  Perplexity: 2.4047
--------------------------


Epoch 5, Loss: 0.5845:  83%|████████▎ | 200/240 [04:25<00:33,  1.21it/s]


--- Epoch 5 Evaluation ---
  Average Loss: 0.9003
  Perplexity: 2.4604
--------------------------


Epoch 6, Loss: 0.4692: 100%|██████████| 240/240 [05:09<00:00,  1.23it/s]


--- Epoch 6 Evaluation ---
  Average Loss: 0.8842
  Perplexity: 2.4210
--------------------------
Training complete.


### Save the Model

Finally, we will save the model to a directory so we can use it later.

In [9]:
model.save_pretrained("gta-captioner")