<a href="https://colab.research.google.com/github/ajaysuseel/MiniProject_AD/blob/main/incremental_contrastive_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import json
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BlipProcessor, BlipForConditionalGeneration, BlipConfig
from peft import LoraConfig, get_peft_model
import subprocess

#GitHub Update Functions

In [2]:
def pull_latest_changes(repo_path):
    """Pull the latest changes from GitHub."""
    try:
        subprocess.run(["git", "-C", repo_path, "pull"], check=True)
        print("✅ Successfully pulled the latest changes.")
    except subprocess.CalledProcessError as e:
        print(f"⚠️ Error pulling latest changes: {e}")

def push_to_github(repo_path, file_path, commit_message="Update"):
    """Push updated file(s) back to GitHub."""
    try:
        subprocess.run(["git", "-C", repo_path, "add", file_path], check=True)
        subprocess.run(["git", "-C", repo_path, "commit", "-m", commit_message], check=True)
        subprocess.run(["git", "-C", repo_path, "push"], check=True)
        print(f"🚀 Successfully pushed {file_path} to GitHub.")
    except subprocess.CalledProcessError as e:
        print(f"⚠️ Git push error: {e}")


#Incremental Fine-Tuning Functions

In [3]:
def load_used_files(used_files_path):
    """Load the set of filenames that have been used for fine-tuning."""
    if os.path.exists(used_files_path):
        with open(used_files_path, "r") as f:
            used = set(json.load(f))
        print(f"Loaded {len(used)} used filenames from {used_files_path}.")
    else:
        used = set()
        print("No used files record found; starting fresh.")
    return used

def save_used_files(used_files, used_files_path):
    """Save the set of filenames to a JSON file."""
    with open(used_files_path, "w") as f:
        json.dump(list(used_files), f)
    print(f"Saved {len(used_files)} used filenames to {used_files_path}.")

def get_new_samples(captions_data, used_files):
    """
    Filter the new dataset samples to only include those whose filenames
    are not in the used_files set.
    """
    new_samples = [item for item in captions_data if item["filename"] not in used_files]
    print(f"Found {len(new_samples)} new samples for fine-tuning.")
    return new_samples


#Custom dataset

In [16]:
from torch.utils.data import Dataset, DataLoader
import os
from PIL import Image
import torch

class FineTuningDataset(Dataset):
    def __init__(self, data, processor, images_folder):
        """
        data: list of dictionaries with keys "filename", "pos_caption", and "neg_caption".
        images_folder: directory containing images.
        """
        self.data = data
        self.processor = processor
        self.images_folder = images_folder

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        image_path = os.path.join(self.images_folder, item["filename"])
        try:
            image = Image.open(image_path).convert("RGB")
        except Exception as e:
            print(f"Error loading image {item['filename']}: {e}")
            return None

        # Tokenize positive caption using "pos_caption"
        pos_encoding = self.processor(
            text=item["pos_caption"],
            images=image,
            return_tensors="pt",
            padding="max_length",
            truncation=True
        )
        pos_encoding = {k: v.squeeze(0) for k, v in pos_encoding.items()}

        # Tokenize negative caption using "neg_caption"
        neg_encoding = self.processor(
            text=item["neg_caption"],
            images=image,
            return_tensors="pt",
            padding="max_length",
            truncation=True
        )
        neg_encoding = {k: v.squeeze(0) for k, v in neg_encoding.items()}

        # Set labels for contrastive loss
        pos_encoding["pos_labels"] = pos_encoding["input_ids"]
        pos_encoding["neg_labels"] = neg_encoding["input_ids"]

        return pos_encoding

def create_dataloader(data, processor, images_folder, batch_size=2):
    dataset = FineTuningDataset(data, processor, images_folder)

    def collate_fn(batch):
        batch = [b for b in batch if b is not None]
        if not batch:
            return None
        keys = batch[0].keys()
        return {key: torch.stack([b[key] for b in batch]) for key in keys}

    return DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)


#Contrastive loss function

In [5]:
def contrastive_loss(image_embeds, pos_text_embeds, neg_text_embeds, margin=1.0):
    pos_sim = torch.cosine_similarity(image_embeds, pos_text_embeds, dim=-1)
    neg_sim = torch.cosine_similarity(image_embeds, neg_text_embeds, dim=-1)
    loss = torch.relu(margin - pos_sim + neg_sim).mean()
    return loss

#Loading BLIP2

In [6]:
def load_blip2_model_with_lora(base_model_name, model_save_path):
    """
    Load a previously fine-tuned model if it exists; otherwise, load the base model.
    Apply LoRA to the vision encoder's QKV modules.
    """
    from transformers import BlipProcessor, BlipForConditionalGeneration, BlipConfig
    if os.path.exists(model_save_path):
        print("Loading previously fine-tuned model...")
        processor = BlipProcessor.from_pretrained(model_save_path, ignore_mismatched_sizes=True)
        model = BlipForConditionalGeneration.from_pretrained(model_save_path, ignore_mismatched_sizes=True)
    else:
        print("No previously fine-tuned model found; loading base model...")
        config = BlipConfig.from_pretrained(base_model_name)
        processor = BlipProcessor.from_pretrained(base_model_name, ignore_mismatched_sizes=True)
        model = BlipForConditionalGeneration.from_pretrained(base_model_name, config=config, ignore_mismatched_sizes=True)
        # Apply LoRA
        target_modules = [f"vision_model.encoder.layers.{i}.self_attn.qkv" for i in range(12)]
        lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.1, target_modules=target_modules)
        model = get_peft_model(model, lora_config)
    return model, processor


#Finetuning...

In [14]:
def incremental_finetuning(new_data_json, new_images_folder, used_files_path,
                           model_save_path, base_model_name, num_epochs=3, learning_rate=5e-5, batch_size=2):
    # Pull latest changes from GitHub (optional; requires authentication configured)
    pull_latest_changes(GIT_LOCAL_PATH)

    # Load new fine-tuning data from JSON file
    try:
        with open(new_data_json, "r") as f:
            new_data = json.load(f)
        print(f"Loaded {len(new_data)} new fine-tuning samples from {new_data_json}.")
    except Exception as e:
        print(f"Error loading new fine-tuning data: {e}")
        return

    # Load used files list and filter new samples
    used_files = load_used_files(used_files_path)
    new_samples = get_new_samples(new_data, used_files)
    if not new_samples:
        print("No new samples to fine-tune on. Exiting incremental fine-tuning.")
        return

    # Load the previously fine-tuned model or base model with LoRA applied
    model, processor = load_blip2_model_with_lora(base_model_name, model_save_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    # Create DataLoader for new samples
    dataloader = create_dataloader(new_samples, processor, new_images_folder, batch_size=batch_size)
    if dataloader is None:
        print("No valid data samples found. Exiting incremental fine-tuning.")
        return

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    print(f"🚀 Starting incremental fine-tuning on {device} for {num_epochs} epochs...")
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            optimizer.zero_grad()
            pixel_values = batch["pixel_values"].to(device)
            pos_input_ids = batch["pos_labels"].to(device)
            neg_input_ids = batch["neg_labels"].to(device)
            image_embeds = model.vision_model(pixel_values).last_hidden_state.mean(dim=1)
            with torch.no_grad():
                pos_outputs = model.text_decoder(input_ids=pos_input_ids.long(), output_hidden_states=True)
                neg_outputs = model.text_decoder(input_ids=neg_input_ids.long(), output_hidden_states=True)
            pos_text_embeds = pos_outputs.hidden_states[-1].mean(dim=1)
            neg_text_embeds = neg_outputs.hidden_states[-1].mean(dim=1)
            loss = contrastive_loss(image_embeds, pos_text_embeds, neg_text_embeds)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"✅ Epoch {epoch+1} completed | Average Loss: {epoch_loss/len(dataloader):.4f}")

    # Save updated model and processor
    model.save_pretrained(model_save_path)
    processor.save_pretrained(model_save_path)
    print(f"Model saved to {model_save_path}")

    # Update used files list with new samples
    new_used_files = {item["filename"] for item in new_samples}
    used_files.update(new_used_files)
    save_used_files(used_files, used_files_path)
    push_to_github(GIT_LOCAL_PATH, used_files_path, "Updated used files after incremental fine-tuning")
    push_to_github(GIT_LOCAL_PATH, model_save_path, "Saved updated fine-tuned BLIP-2 model")

    return model, processor

#Github push and pull

In [8]:
def pull_latest_changes(repo_path):
    try:
        subprocess.run(["git", "-C", repo_path, "pull"], check=True)
        print("✅ Pulled latest changes from GitHub.")
    except subprocess.CalledProcessError as e:
        print(f"⚠️ Error pulling latest changes: {e}")

def push_to_github(repo_path, file_path, commit_message="Updated"):
    try:
        subprocess.run(["git", "-C", repo_path, "add", file_path], check=True)
        subprocess.run(["git", "-C", repo_path, "commit", "-m", commit_message], check=True)
        subprocess.run(["git", "-C", repo_path, "push"], check=True)
        print(f"🚀 Pushed {file_path} to GitHub.")
    except subprocess.CalledProcessError as e:
        print(f"⚠️ Error pushing {file_path} to GitHub: {e}")

In [21]:

BASE_MODEL_NAME = "Salesforce/blip2-flan-t5-xl"  # Base model identifier
MODEL_SAVE_PATH = "./models/finetuned_blip2"       # Directory where fine-tuned model is saved

# GitHub repository settings (adjust these paths accordingly)
GIT_REPO_URL = "https://github.com/ajaysuseel/MiniProject_AD.git"
GIT_LOCAL_PATH = "/content/MiniProject_AD"  # Local path where repo is cloned

CAPTIONS_PATH = os.path.join(GIT_LOCAL_PATH, "abhiram", "contrastive_captions.json")
USED_FILES_PATH = os.path.join(GIT_LOCAL_PATH, "abhiram", "used_files.json")
NEW_IMAGES_FOLDER = os.path.join(GIT_LOCAL_PATH, "abhiram", "images")  # All images are here

In [19]:
!git config --global user.email "ajaysuseel673@gmail.com"
!git config --global user.name "ajaysuseel"

In [22]:
if __name__ == "__main__":
    # Ensure your repository is cloned in Colab:
    if not os.path.exists(GIT_LOCAL_PATH):
        subprocess.run(["git", "clone", GIT_REPO_URL, GIT_LOCAL_PATH], check=True)
    else:
        pull_latest_changes(GIT_LOCAL_PATH)

    # Set the paths for the new fine-tuning data (e.g., images from file62 onward)
    NEW_DATA_JSON = os.path.join(GIT_LOCAL_PATH, "ajay", "contrastive_captions.json")
    # (You can update NEW_DATA_JSON to point to a different file if you split your data)
    # For example, if you have a separate JSON for new images, use that path.
    # Here we assume the JSON contains all samples; our filtering will exclude already used ones.

    # Perform incremental fine-tuning
    model, processor = incremental_finetuning(
        new_data_json=NEW_DATA_JSON,
        new_images_folder=NEW_IMAGES_FOLDER,
        used_files_path=USED_FILES_PATH,
        model_save_path=MODEL_SAVE_PATH,
        base_model_name=BASE_MODEL_NAME,
        num_epochs=3,
        learning_rate=5e-5,
        batch_size=2
    )

✅ Pulled latest changes from GitHub.
✅ Pulled latest changes from GitHub.
Loaded 10 new fine-tuning samples from /content/MiniProject_AD/ajay/contrastive_captions.json.
No used files record found; starting fresh.
Found 10 new samples for fine-tuning.
Loading previously fine-tuned model...


You are using a model of type blip-2 to instantiate a model of type blip. This is not supported for all configurations of models and can yield errors.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of BlipForConditionalGeneration were not initialized from the model checkpoint at Salesforce/blip2-flan-t5-xl and are newly initialized: ['text_decoder.bert.embeddings.LayerNorm.bias', 'text_decoder.bert.embeddings.LayerNorm.weight', 'text_decoder.bert.embeddings.position_embeddings.weight', 'text_decoder.bert.embeddings.word_embeddings.weight', 'text_decoder.bert.encoder.layer.0.attention.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.0.attention.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.0.attention.output.dense.bias', 'text_decoder.bert.encoder.layer.0.attention.output.dense.weight', 'text_decoder.bert.encoder.layer.0.attention.self.key.bias', 'text_decoder.bert.encoder.layer.0.attention.self.key.weight', 'text_decoder.bert.encoder.layer.0.attention.self.query.bias', 'text_decoder.bert.encoder.layer.0.attention.self.query.weight', 'text_decoder.bert.encoder.layer.0.attention.self.value.bias', 'text_decoder.bert.encoder.layer.0.attention.self.va

🚀 Starting incremental fine-tuning on cuda for 3 epochs...


Epoch 1/3:   0%|          | 0/5 [00:00<?, ?it/s]

Error loading image file41.jpg: [Errno 2] No such file or directory: '/content/MiniProject_AD/abhiram/images/file41.jpg'
Error loading image file49.jpg: [Errno 2] No such file or directory: '/content/MiniProject_AD/abhiram/images/file49.jpg'





TypeError: 'NoneType' object is not subscriptable

In [18]:
!git -C /content/MiniProject_AD add ajay/used_files.json
!git -C /content/MiniProject_AD commit -m "Test commit"
!git -C /content/MiniProject_AD push


Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@22cf0ddc289b.(none)')
fatal: could not read Username for 'https://github.com': No such device or address
