## Finalize dataframes for projector training


In [None]:
# Colab setup: install the packages needed later to load the quantized LLM and its tokenizer.
# NOTE(review): versions are not pinned, so a re-run may pull different releases.
!pip install transformers accelerate bitsandbytes sentencepiece

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.0


In [None]:
import pandas as pd
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from concurrent.futures import ThreadPoolExecutor
import os
from google.colab import drive
from pathlib import Path
import numpy as np

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Mount Google Drive so the CSV index and the files it points to (under /content/drive) are readable.
drive.mount('/content/drive')
# Root of the capstone data on Drive; not referenced again in the cells visible below.
BASE_DIR = Path('/content/drive/MyDrive/capstone')

Mounted at /content/drive


In [None]:
# Load the sample index; the following cells read its "caption_path" and "class" columns.
df=pd.read_csv("/content/drive/MyDrive/ProjectLabTMIT/df_index_with_clip.csv")

In [None]:
def fast_load_caption(path):
    """Read one caption file and return its text with surrounding whitespace stripped.

    This is a best-effort loader meant to be mapped over a whole column of
    paths: any path that cannot be read yields ``None`` instead of raising,
    so one bad row does not abort the batch.

    Args:
        path: Filesystem path of the caption text file.

    Returns:
        The stripped file contents, or ``None`` if the path is missing,
        unreadable, not valid UTF-8, or not a usable path value at all
        (e.g. ``None`` / NaN coming from an incomplete index row).
    """
    try:
        # Explicit encoding keeps the result independent of the host locale.
        with open(path, "r", encoding="utf-8") as f:
            return f.read().strip()
    except (OSError, TypeError, ValueError):
        # OSError: missing/unreadable file; TypeError: non-path value such as NaN;
        # ValueError: malformed path or undecodable bytes (UnicodeDecodeError).
        # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit propagate.
        return None

In [None]:
# Read every caption file concurrently (file reads on Drive are I/O-bound).
# Executor.map yields results in input order, so captions[i] belongs to row i;
# entries are None wherever fast_load_caption could not read the file.
with ThreadPoolExecutor(max_workers=32) as ex:
    captions = list(ex.map(fast_load_caption, df["caption_path"]))
df["caption"] = captions


In [None]:
def wnid_to_name(wnid):
    """Translate a WordNet id such as ``"n02510455"`` into a readable class name.

    The leading letter is dropped and the remainder is parsed as the synset
    offset; the noun synset at that offset is looked up and its first lemma
    is returned with underscores replaced by spaces.

    Args:
        wnid: WordNet id string (one prefix character followed by digits).

    Returns:
        The first lemma name of the synset, or ``None`` when ``wnid`` is not
        a parsable id or the lookup fails.
    """
    # Validate the input separately so malformed ids never reach WordNet.
    try:
        offset = int(wnid[1:])
    except (TypeError, ValueError):
        return None

    try:
        syn = wn.synset_from_pos_and_offset('n', offset)
        return syn.lemma_names()[0].replace("_", " ")
    except Exception:
        # Best-effort lookup: unknown offsets map to None. `Exception` (rather
        # than a bare except) still lets KeyboardInterrupt/SystemExit through.
        return None

In [None]:
# Derive a human-readable class name from each WordNet id; rows whose id cannot be resolved get None.
df["class_name"] = df["class"].apply(wnid_to_name)

In [None]:
# Persist the index with the new "caption" and "class_name" columns; index=False keeps the row index out of the file.
save_path = "/content/drive/MyDrive/ProjectLabTMIT/df_index_with_captions.csv"
df.to_csv(save_path, index=False)

In [None]:
# Confirm where the merged index was written and preview its first rows.
# ("\ " was an invalid escape sequence that printed a stray backslash; a newline was intended.)
print("\nFinal df saved to", save_path)
print(df.head())

In [None]:
import pandas as pd

# Reload the saved index from disk as the frame used for projector training.
df_stage2=pd.read_csv("/content/drive/MyDrive/ProjectLabTMIT/df_index_with_captions.csv")

In [None]:
# Quick look at the first rows to confirm the CSV round-tripped with the expected columns.
print(df_stage2.head())

          base_id      class  \
0  n02510455_4616  n02510455   
1  n02510455_4616  n02510455   
2  n02510455_4616  n02510455   
3  n02510455_4616  n02510455   
4  n02510455_4616  n02510455   

                                            eeg_path  \
0  /content/drive/MyDrive/capstone/images/n025104...   
1  /content/drive/MyDrive/capstone/images/n025104...   
2  /content/drive/MyDrive/capstone/images/n025104...   
3  /content/drive/MyDrive/capstone/images/n025104...   
4  /content/drive/MyDrive/capstone/images/n025104...   

                                          image_path  \
0  /content/drive/MyDrive/capstone/images/n025104...   
1  /content/drive/MyDrive/capstone/images/n025104...   
2  /content/drive/MyDrive/capstone/images/n025104...   
3  /content/drive/MyDrive/capstone/images/n025104...   
4  /content/drive/MyDrive/capstone/images/n025104...   

                                        caption_path  \
0  /content/drive/MyDrive/capstone/images/n025104...   
1  /content/drive/MyD

## Alignment check

In [None]:
# Spot-check random rows: the caption and class_name should describe the same object.
df_stage2.sample(5)


Unnamed: 0,base_id,class,eeg_path,image_path,caption_path,clip_emb_path,caption,class_name
5813,n02106662_62679,n02106662,/content/drive/MyDrive/capstone/images/n021066...,/content/drive/MyDrive/capstone/images/n021066...,/content/drive/MyDrive/capstone/images/n021066...,/content/drive/MyDrive/ProjectLabTMIT/clip_emb...,"A German Shepherd dog with its tongue out, sta...",German shepherd
1810,n03792972_4668,n03792972,/content/drive/MyDrive/capstone/images/n037929...,/content/drive/MyDrive/capstone/images/n037929...,/content/drive/MyDrive/capstone/images/n037929...,/content/drive/MyDrive/ProjectLabTMIT/clip_emb...,A rainy day at a campsite with tents and umbre...,mountain tent
11831,n02992529_3905,n02992529,/content/drive/MyDrive/capstone/images/n029925...,/content/drive/MyDrive/capstone/images/n029925...,/content/drive/MyDrive/capstone/images/n029925...,/content/drive/MyDrive/ProjectLabTMIT/clip_emb...,A black and silver cell phone with a silver an...,cellular telephone
3003,n03584829_42056,n03584829,/content/drive/MyDrive/capstone/images/n035848...,/content/drive/MyDrive/capstone/images/n035848...,/content/drive/MyDrive/capstone/images/n035848...,/content/drive/MyDrive/ProjectLabTMIT/clip_emb...,An iron with a cord and a white cord plugged i...,iron
5322,n03445777_16360,n03445777,/content/drive/MyDrive/capstone/images/n034457...,/content/drive/MyDrive/capstone/images/n034457...,/content/drive/MyDrive/capstone/images/n034457...,/content/drive/MyDrive/ProjectLabTMIT/clip_emb...,A golf ball in a hole on a green.,golf ball


In [None]:
# Load one stored CLIP embedding and print its shape to confirm the projector's input size.
path = df_stage2.iloc[0].clip_emb_path
clip_emb = np.load(path)
print(clip_emb.shape)

(512,)


## DataLoading module

In [None]:
from torch.utils.data import Dataset


In [None]:

from torch.utils.data import Dataset
import torch
import numpy as np

class Stage2Dataset(Dataset):
    """Yields ``(clip_emb, prompt_ids, caption_ids)`` triples for projector training.

    Each item is built from one row of the index frame:

    * ``clip_emb``    - the array stored at ``clip_emb_path``, flattened to 1-D float32.
    * ``prompt_ids``  - token ids of the chat-formatted request
      "Describe an image of a <class_name>." ending at the assistant turn.
    * ``caption_ids`` - token ids of the ground-truth caption.

    The prompt is rendered with the tokenizer's own chat template, the same way
    the inference code in this notebook builds its prefix, so the projector is
    trained under the prompt it will see at generation time.

    Args:
        df: Frame with ``clip_emb_path``, ``class_name`` and ``caption`` columns.
        tokenizer: Hugging Face tokenizer providing ``apply_chat_template``.
        max_len: Truncation length applied separately to prompt and caption.

    Attributes:
        num_dropped: Number of input rows discarded because a required field was missing.
    """

    # Fields every sample needs; a missing value would otherwise crash
    # __getitem__ in the middle of an epoch.
    REQUIRED_COLUMNS = ("clip_emb_path", "class_name", "caption")

    def __init__(self, df, tokenizer, max_len=256):
        usable = df.dropna(subset=list(self.REQUIRED_COLUMNS))
        self.num_dropped = len(df) - len(usable)
        self.df = usable.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Tokenize ``text`` without special tokens and return a 1-D id tensor."""
        encoded = self.tokenizer(
            text,
            add_special_tokens=False,  # the chat template already carries any control tokens
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        return encoded.input_ids.squeeze(0)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        clip_emb = torch.tensor(
            np.load(row.clip_emb_path),
            dtype=torch.float32,
        ).view(-1)

        messages = [
            {"role": "user", "content": f"Describe an image of a {row.class_name}."}
        ]
        # tokenize=False returns the rendered string; it is tokenized below with
        # the same settings as the caption.
        prefix_text = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
        )

        return clip_emb, self._encode(prefix_text), self._encode(row.caption)


## Projector module

In [None]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class Projector_V1(nn.Module):
    """Small MLP that maps an ``in_dim`` vector to an ``out_dim`` vector.

    The stages are Linear -> GELU -> LayerNorm -> Linear, held in a single
    ``nn.Sequential`` stored as ``self.layers``.
    """

    def __init__(self, in_dim=512, out_dim=2048):
        super().__init__()
        # The attribute name and stage order determine the state_dict keys
        # (layers.0.*, layers.2.*, layers.3.*) that saved checkpoints rely on.
        stages = [
            nn.Linear(in_dim, out_dim),
            nn.GELU(),
            nn.LayerNorm(out_dim),
            nn.Linear(out_dim, out_dim),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through all stages and return the projected tensor."""
        projected = self.layers(x)
        return projected


In [None]:
# Placeholder embedding width; reassigned from llm.model.embed_tokens.embedding_dim once the model is loaded.
llm_dim = 2048

## Building the prompt

In [None]:
def deepseek_chat_text(user_message: str) -> str:
    """Wrap ``user_message`` in a hand-written single-turn chat layout.

    The result is, in order: a begin-of-text marker, a user tag followed by a
    newline, the message terminated by an end-of-turn marker, and an assistant
    tag with nothing after it.
    """
    pieces = [
        "<|begin_of_text|>",
        "<|User|>\n",
        f"{user_message}<|EOT|>",
        "<|Assistant|>",
    ]
    return "".join(pieces)


In [None]:
# Eyeball the hand-written prompt layout for one example class.
print(deepseek_chat_text("Describe an image of a giant panda."))


<|begin_of_text|><|User|>
Describe an image of a giant panda.<|EOT|><|Assistant|>


In [None]:
def build_deepseek_prefix(class_name: str):
    """Render a one-turn "describe an image" request with the tokenizer's chat template.

    Relies on the module-level ``tokenizer``. The generation prompt is
    appended, so the result stops where the assistant's reply would begin and
    contains no assistant content. ``tokenize`` is left at its default; when
    run in this notebook the call returned a list of token ids, not a string.
    """
    request = f"Describe an image of a {class_name}."
    conversation = [{"role": "user", "content": request}]
    return tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,  # end the prefix at the assistant turn
        return_tensors=None,
    )


In [None]:
# Sanity check of the templated prefix; needs `tokenizer`, which is only created in the "Load LLM" section below.
print(build_deepseek_prefix("giant panda"))


[32013, 2042, 417, 274, 20926, 14244, 20391, 11, 26696, 254, 20676, 30742, 339, 8589, 2008, 11, 6908, 457, 20676, 30742, 7958, 11, 285, 340, 885, 3495, 4301, 4512, 276, 4531, 8214, 13, 1487, 4636, 2223, 13143, 4301, 11, 5411, 285, 13936, 4447, 11, 285, 746, 2159, 12, 13517, 250, 8214, 4301, 11, 340, 540, 20857, 276, 3495, 185, 13518, 3649, 3475, 25, 185, 6998, 7489, 274, 3310, 280, 245, 16361, 265, 8697, 13, 185, 13518, 21289, 25, 185]


In [None]:
def build_sequence_and_labels(llm, ids1, mm_embed, ids2):
    """Assemble the embedded sequence [prompt | projected vector | target] and its labels.

    Args:
        llm: Model exposing ``llm.model.embed_tokens`` for token-id lookup.
        ids1: Prompt token ids (flattened to 1-D here).
        mm_embed: Projected embedding; a 1-D or 2-D tensor is expanded to 3-D,
            anything else is used as given.
        ids2: Target token ids (flattened to 1-D here).

    Returns:
        ``(final_emb, labels)``: the concatenated embeddings along dim 1, and
        a 1-D label tensor holding -100 for every prompt position and for the
        single projected-vector position, followed by the target ids.
    """
    prompt_ids = ids1.view(-1)
    target_ids = ids2.view(-1)

    embed = llm.model.embed_tokens
    prompt_emb = embed(prompt_ids.unsqueeze(0))
    target_emb = embed(target_ids.unsqueeze(0))

    # Bring the projected vector to (batch, 1, dim) so it occupies one sequence slot.
    rank = mm_embed.dim()
    if rank == 1:
        mm_embed = mm_embed[None, None, :]
    elif rank == 2:
        mm_embed = mm_embed[:, None, :]

    final_emb = torch.cat((prompt_emb, mm_embed, target_emb), dim=1)

    # -100 marks positions the loss should skip: the whole prompt plus the injected slot.
    ignored = torch.full((prompt_ids.numel() + 1,), -100, device=prompt_ids.device)
    labels = torch.cat((ignored, target_ids))

    return final_emb, labels


## Load LLM

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

In [None]:
# Quantization settings passed to from_pretrained below: load the LLM weights in 8-bit.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
)

In [None]:
# Hugging Face Hub id of the LLM; used for both the tokenizer and the model.
MODEL_ID = "deepseek-ai/deepseek-coder-1.3b-instruct"

In [None]:
# NOTE(review): repeats the bnb_config assignment from two cells earlier with identical arguments; one of the two can be removed.
bnb_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)


In [None]:
# Download/load the tokenizer that matches MODEL_ID.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# Reuse the end-of-sequence token for padding and pad on the left side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [None]:
# Load the causal LM with 8-bit quantization; device_map="auto" lets the library choose device placement.
# NOTE(review): the captured output warns that `torch_dtype` is deprecated in favour of `dtype`.
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
)

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [None]:
# Read the token-embedding width from the loaded model; this is the projector's required output size.
llm_dim = llm.model.embed_tokens.embedding_dim
llm_dim

2048

In [None]:
# Report which device the model ended up on.
print("Model loaded on device:", llm.device)

Model loaded on device: cuda:0


In [None]:
# NOTE(review): same assignment as the earlier llm_dim cell; this one only adds a labelled print.
llm_dim = llm.model.embed_tokens.embedding_dim
print("LLM token embedding dimension:", llm_dim)

LLM token embedding dimension: 2048


## Training

In [None]:
# Wrap the index in the dataset and iterate it one sample at a time in random order.
# batch_size=1 avoids collating prompt/caption id tensors of different lengths.
train_dataset = Stage2Dataset(df_stage2, tokenizer)
train_loader  = DataLoader(train_dataset, batch_size=1, shuffle=True)


In [None]:
from tqdm import tqdm

In [None]:
from tqdm import tqdm
import torch

def train_stage2(projector, llm, loader, lr=2e-5, epochs=1):
    """Train ``projector`` against a frozen LLM using the LLM's own language-modelling loss.

    For every sample the CLIP embedding is projected, spliced between the
    embedded prompt and the embedded caption (see ``build_sequence_and_labels``)
    and fed to the LLM via ``inputs_embeds``. Only the projector's parameters
    are handed to the optimizer; the LLM is put in eval mode with gradients
    disabled on its parameters.

    Args:
        projector: Module mapping a CLIP embedding to the LLM embedding width.
        llm: Causal LM exposing ``llm.model.embed_tokens``.
        loader: Iterable yielding ``(clip_emb, ids1, ids2)`` with one sample per batch.
        lr: AdamW learning rate.
        epochs: Number of passes over ``loader``.

    Returns:
        None. ``projector`` is updated in place and the mean loss is printed per epoch.
    """
    # Follow the LLM's actual placement instead of assuming "cuda":
    # device_map="auto" decides where the embedding table lives.
    embed_weight = llm.model.embed_tokens.weight
    device = embed_weight.device
    llm_dtype = embed_weight.dtype

    projector = projector.to(device)
    projector.train()  # callers may have left it in eval mode after loading a checkpoint

    llm.eval()
    llm.requires_grad_(False)

    opt = torch.optim.AdamW(projector.parameters(), lr=lr)

    for ep in range(epochs):
        total_loss = 0.0
        num_steps = 0

        for clip_emb, ids1, ids2 in tqdm(loader, desc=f"Epoch {ep+1}"):
            # The projector runs in float32; its output is cast to the LLM dtype afterwards.
            clip_emb = clip_emb.to(device).float()
            ids1 = ids1.to(device)
            ids2 = ids2.to(device)

            mm = projector(clip_emb).to(llm_dtype)

            final_emb, labels = build_sequence_and_labels(llm, ids1, mm, ids2)
            final_emb = final_emb.to(llm_dtype)

            out = llm(
                input_ids=None,
                inputs_embeds=final_emb,
                labels=labels.unsqueeze(0),  # add the batch dimension the model expects
            )

            loss = out.loss
            loss.backward()

            opt.step()
            opt.zero_grad()

            total_loss += loss.item()
            num_steps += 1

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        print(f"Epoch {ep+1}: loss = {total_loss / max(num_steps, 1):.4f}")


In [None]:
# Create the projector here so this cell works on a fresh kernel: no earlier cell
# defines `proj`. Input width 512 matches the stored CLIP embeddings (shape printed
# above); output width is the LLM embedding size.
proj = Projector_V1(in_dim=512, out_dim=llm_dim)
train_stage2(proj, llm, train_loader, epochs=2, lr=2e-5)
# Checkpoint the trained weights in the working directory.
torch.save(proj.state_dict(), "projector_stage2.pt")


In [None]:
# Debug cell: pull one batch and print the shapes of the three pieces that get concatenated.
clip_emb, ids1, ids2 = next(iter(train_loader))

# Move to the GPU; squeeze(0) drops the batch dimension the DataLoader added to the id tensors.
clip_emb = clip_emb.cuda()
ids1 = ids1.cuda().squeeze(0)
ids2 = ids2.cuda().squeeze(0)

mm = proj(clip_emb)

instr_emb = llm.model.embed_tokens(ids1.unsqueeze(0))
tgt_emb   = llm.model.embed_tokens(ids2.unsqueeze(0))

print("instr_emb shape:", instr_emb.shape)
print("mm_embed shape:", mm.shape)
print("tgt_emb shape:", tgt_emb.shape)


instr_emb shape: torch.Size([1, 23, 2048])
mm_embed shape: torch.Size([1, 2048])
tgt_emb shape: torch.Size([1, 12, 2048])


## Inference

In [None]:
# Rebuild the projector with default sizes and restore the weights saved after training.
proj = Projector_V1().cuda()
proj.load_state_dict(torch.load("projector_stage2.pt"))
proj.eval()

# Load DeepSeek LLM + tokenizer (same config as training).
# NOTE(review): this repeats the tokenizer/model loading from the "Load LLM" section,
# so running the notebook top to bottom loads the model a second time.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

llm = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
)
llm.eval()


In [None]:
def build_prefix_text(class_name: str):
    """Apply the tokenizer's chat template to "Describe an image of a <class_name>.".

    Uses the module-level ``tokenizer`` and appends the generation prompt so
    the result ends at the assistant turn. The call has the same arguments as
    ``build_deepseek_prefix``, whose printed output in this notebook is a list
    of token ids - so, despite the name, do not assume a string is returned.
    """
    request = f"Describe an image of a {class_name}."
    conversation = [{"role": "user", "content": request}]
    return tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors=None,
    )


In [None]:
def run_stage2_inference(clip_emb_path, class_name, max_new_tokens=80):
    """Generate a caption from a stored CLIP embedding and a class name.

    The chat-templated request "Describe an image of a <class_name>." is
    embedded with the LLM's token embeddings, the projected CLIP vector is
    appended as one extra position, and the LLM samples a continuation.
    Uses the module-level ``proj``, ``llm`` and ``tokenizer``.

    Args:
        clip_emb_path: Path to a ``.npy`` file holding the CLIP embedding.
        class_name: Class label inserted into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded, stripped caption string (also printed).
    """
    embed_tokens = llm.model.embed_tokens
    # Follow where the modules actually live rather than assuming "cuda".
    llm_device = embed_tokens.weight.device
    llm_dtype = embed_tokens.weight.dtype
    proj_device = next(proj.parameters()).device

    # Build the chat prefix as text, then tokenize it without extra special
    # tokens (the template already contains them).
    messages = [
        {"role": "user", "content": f"Describe an image of a {class_name}."}
    ]
    prefix_text = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False
    )
    tok = tokenizer(prefix_text, return_tensors="pt", add_special_tokens=False).to(llm_device)
    ids = tok.input_ids
    attention_mask = tok.attention_mask

    # Pure inference: no autograd graph is needed for the projector or the embedding lookup.
    with torch.no_grad():
        clip_emb = torch.tensor(np.load(clip_emb_path), dtype=torch.float32).view(-1).to(proj_device)
        mm = proj(clip_emb).to(device=llm_device, dtype=llm_dtype)
        mm = mm.unsqueeze(0).unsqueeze(1)  # (dim,) -> (1, 1, dim): one extra sequence position

        instr_emb = embed_tokens(ids)
        full_embeds = torch.cat([instr_emb, mm], dim=1)

        # Extend the attention mask by one position for the projected vector.
        mm_mask = torch.ones((1, 1), dtype=torch.long, device=llm_device)
        full_mask = torch.cat([attention_mask, mm_mask], dim=1)

        out = llm.generate(
            inputs_embeds=full_embeds,
            attention_mask=full_mask,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The whole output is decoded without slicing off a prompt: generate() was
    # given embeddings only, so there are no prompt ids to strip
    # (NOTE(review): confirm against the installed transformers version).
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)

    print("\nGenerated Caption:")
    print(decoded.strip())
    return decoded.strip()


In [None]:
# Copy the projector weights to Drive so they survive the Colab session.
# Note: this rebinds `save_path`, which earlier held the CSV path.
save_path = "/content/drive/MyDrive/ProjectLabTMIT/projector_stage2.pt"
torch.save(proj.state_dict(), save_path)
print("Saved to:", save_path)


Saved to: /content/drive/MyDrive/ProjectLabTMIT/projector_stage2.pt


## Alignment check

In [None]:
# Restore the projector from the Drive checkpoint and switch it to eval mode for the check below.
proj = Projector_V1().cuda()
proj.load_state_dict(torch.load("/content/drive/MyDrive/ProjectLabTMIT/projector_stage2.pt"))
proj.eval()


Projector_V1(
  (layers): Sequential(
    (0): Linear(in_features=512, out_features=2048, bias=True)
    (1): GELU(approximate='none')
    (2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
    (3): Linear(in_features=2048, out_features=2048, bias=True)
  )
)

In [None]:
# --- Pick a random sample ---
# Qualitative check: draw one random row (no seed is set, so each run picks a different sample).
row = df_stage2.sample(1).iloc[0]

class_name = row.class_name
clip_path   = row.clip_emb_path
gt_caption  = row.caption

# Print the reference caption first so it can be compared with the generated one.
print("\n============================================")
print("CLASS:", class_name)
print("--------------------------------------------")
print("Ground Truth Caption:")
print(gt_caption)
print("--------------------------------------------")

# --- Run Stage-2 inference ---
gen_caption = run_stage2_inference(clip_path, class_name)




CLASS: pajama
--------------------------------------------
Ground Truth Caption:
A pink robe with a floral pattern displayed in a store.
--------------------------------------------

Generated Caption:
A pink robe with a floral design and a gown with a pink leather collar. The robe is displayed in a store. Image captured by a customer. @customer_experience_bot.png

A gown with a pink leather collar and a floral pattern is displayed in a store. @customer_experience_bot.png

A pink and white
