## New loss function
Loss =  λ_caption * CrossEntropy(LLM output, caption)
      + λ_clip    * CosineDistance(EEG_emb, CLIP_emb)


In [5]:
from google.colab import drive
from pathlib import Path




In [6]:
drive.mount('/content/drive')
BASE_DIR = Path('/content/drive/MyDrive/capstone')


Mounted at /content/drive


In [36]:

!pip install transformers accelerate bitsandbytes sentencepiece

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import math

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


BASE = Path("/content/drive/MyDrive/ProjectLabTMIT")
df = pd.read_csv(BASE / "df_index_with_captions.csv")

print("Loaded dataframe:", df.shape)
df.head()


Loaded dataframe: (11965, 8)


Unnamed: 0,base_id,class,eeg_path,image_path,caption_path,clip_emb_path,caption,class_name
0,n02510455_4616,n02510455,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/ProjectLabTMIT/clip_emb...,A panda bear sitting on the ground in an enclo...,giant panda
1,n02510455_4616,n02510455,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/ProjectLabTMIT/clip_emb...,A panda bear sitting on the ground in an enclo...,giant panda
2,n02510455_4616,n02510455,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/ProjectLabTMIT/clip_emb...,A panda bear sitting on the ground in an enclo...,giant panda
3,n02510455_4616,n02510455,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/ProjectLabTMIT/clip_emb...,A panda bear sitting on the ground in an enclo...,giant panda
4,n02510455_4616,n02510455,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/capstone/images/n025104...,/content/drive/MyDrive/ProjectLabTMIT/clip_emb...,A panda bear sitting on the ground in an enclo...,giant panda


In [4]:
class Stage3OptionBDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer

    def __len__(self): return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # EEG
        eeg = Image.open(row.eeg_path).convert("L").resize((440,128))
        eeg = torch.tensor(np.array(eeg)/255.0, dtype=torch.float32).unsqueeze(0)

        # CLIP vector
        clip = torch.tensor(np.load(row.clip_emb_path), dtype=torch.float32)

        # Prefix prompt
        msgs = [{"role":"user","content":f"Describe an image of a {row.class_name}."}]
        prefix = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
        ids1 = tokenizer(prefix, return_tensors="pt", add_special_tokens=False).input_ids.squeeze(0)

        # Target caption
        ids2 = tokenizer(row.caption, return_tensors="pt", add_special_tokens=False).input_ids.squeeze(0)

        return eeg, clip, ids1, ids2, row.caption


In [5]:
MODEL_ID = "deepseek-ai/deepseek-coder-1.3b-instruct"
bnb_cfg = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

llm = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb_cfg,
    torch_dtype=torch.float16
)

llm.eval()
llm.requires_grad_(False)

llm_dtype = llm.model.embed_tokens.weight.dtype
print("LLM dtype:", llm_dtype)
print("LLM embed dim:", llm.config.hidden_size)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

LLM dtype: torch.float16
LLM embed dim: 2048


In [8]:
class Projector(nn.Module):
    def __init__(self, in_dim=512, out_dim=2048):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_dim, out_dim),
            nn.GELU(),
            nn.LayerNorm(out_dim),
            nn.Linear(out_dim, out_dim),
        )
    def forward(self, x):
        return self.layers(x)

projector = Projector().cuda()
projector.load_state_dict(torch.load(BASE / "projector_stage2.pt"))
projector.eval()
print("Loaded projector.")


Loaded projector.


In [7]:
import math

class ConvLayer2D(nn.Sequential):
    def __init__(self, in_channels, out_channels, kernel, stride, padding, dilation):
        super().__init__()
        self.add_module("bn", nn.BatchNorm2d(in_channels))
        self.add_module("relu", nn.ReLU(inplace=True))
        self.add_module("conv", nn.Conv2d(
            in_channels, out_channels,
            kernel_size=kernel,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=True
        ))

class TemporalBlock(nn.Module):
    def __init__(self, in_channels, out_channels, dilation_list, kernel, stride):
        super().__init__()
        layers = []
        paddings = []
        for dil in dilation_list:
            pad = (kernel[1] * dil - 1) // 2
            paddings.append((0, pad))

        for pad, dil in zip(paddings, dilation_list):
            layers.append(
                ConvLayer2D(
                    in_channels, out_channels,
                    kernel, stride,
                    padding=pad,
                    dilation=(1, dil)
                )
            )
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        feats = [layer(x) for layer in self.layers]
        min_w = min(f.shape[-1] for f in feats)
        feats = [f[..., :min_w] for f in feats]
        return torch.cat(feats, dim=1)

class SpatialBlock(nn.Module):
    def __init__(self, in_channels, out_channels, height):
        super().__init__()
        kernel_sizes = [32, 16, 8, 4]
        layers = []
        for k_h in kernel_sizes:
            pad_h = k_h // 2
            layers.append(
                ConvLayer2D(
                    in_channels, out_channels,
                    kernel=(k_h, 1),
                    stride=(1, 1),
                    padding=(pad_h, 0),
                    dilation=1
                )
            )
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        feats = [l(x) for l in self.layers]
        min_h = min(f.shape[-2] for f in feats)
        min_w = min(f.shape[-1] for f in feats)
        feats = [f[..., :min_h, :min_w] for f in feats]
        return torch.cat(feats, dim=1)

class ResidualBlock(nn.Module):
    def __init__(self, ch):
        super().__init__()
        self.conv1 = nn.Conv2d(ch, ch, 3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(ch)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(ch, ch, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(ch)

    def forward(self, x):
        r = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + r)

class EEGEncoderV3(nn.Module):
    def __init__(self, embedding_dim=512, num_classes=40,
                 in_channels=1, height=128, width=440):
        super().__init__()

        self.temp = TemporalBlock(
            in_channels, 10,
            dilation_list=[1,2,4,8,16],
            kernel=(1,33),
            stride=(1,2)
        )

        self.spatial = SpatialBlock(10*5, 50, height)

        res_in = 50*4
        self.res_blocks = nn.ModuleList([ResidualBlock(res_in) for _ in range(4)])

        self.down = ConvLayer2D(res_in, 50, 3, 2, 1, 1)

        dummy = torch.zeros(1,1,height,width)
        with torch.no_grad():
            h = self.temp(dummy)
            h = self.spatial(h)
            for rb in self.res_blocks: h = rb(h)
            h = self.down(h)
            flat = h.view(1,-1).shape[1]

        self.embedding_proj = nn.Sequential(
            nn.Linear(flat, 1024),
            nn.GELU(),
            nn.Linear(1024, embedding_dim)
        )

        self.classifier = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        h = self.temp(x)
        h = self.spatial(h)
        for rb in self.res_blocks:
            h = rb(h)
        h = self.down(h)

        h = h.view(h.size(0), -1)
        emb = F.normalize(self.embedding_proj(h), dim=-1)
        cls = self.classifier(emb)
        return emb, cls


In [10]:
# ============================================================
# 6) LOSS FUNCTIONS
# ============================================================
def cosine_dist(a, b):
    a = F.normalize(a, dim=-1)
    b = F.normalize(b, dim=-1)
    return 1 - (a * b).sum(dim=-1)


In [9]:
encoder = EEGEncoderV3().cuda()
encoder.load_state_dict(torch.load(BASE / "checkpoints_v3/eeg_encoder_v3_epoch_10.pt"))
encoder.eval()
print("Loaded EEGEncoderV3 V3 OK.")


Loaded EEGEncoderV3 V3 OK.


## Test EEG -> Text output

In [3]:
def build_inputs_and_labels(llm, ids1, mm_embed, ids2):

    # --- ALWAYS flatten tokens ---
    ids1 = ids1.view(-1)      # (L1,)
    ids2 = ids2.view(-1)      # (L2,)

    # --- Embed text ---
    instr = llm.model.embed_tokens(ids1.unsqueeze(0))    # (1, L1, 2048)
    tgt   = llm.model.embed_tokens(ids2.unsqueeze(0))    # (1, L2, 2048)

    # --- Embed EEG feature ---
    mm = mm_embed.unsqueeze(0)     # (1, 1, 2048)

    # --- Concatenate full sequence ---
    full = torch.cat([instr, mm, tgt], dim=1)

    # --- Build labels ---
    labels = torch.cat([
        torch.full((ids1.shape[0],), -100, device=ids1.device),
        torch.full((1,), -100, device=ids1.device),
        ids2,
    ]).unsqueeze(0)

    return full, labels


In [12]:
def train_stage3_optionB(
    encoder, projector, llm, loader,
    lr=2e-5, epochs=2,
    w_caption=1.0,
    w_clip=1.0
):
    device = "cuda"

    encoder.train()
    projector.train()
    llm.eval()

    opt = torch.optim.AdamW(
        list(encoder.parameters()) + list(projector.parameters()),
        lr=lr
    )

    for ep in range(1, epochs+1):
        loop = tqdm(loader, desc=f"Epoch {ep}")
        total_loss = 0

        for eeg, clip, ids1, ids2, _ in loop:
            eeg  = eeg.to(device)
            clip = clip.to(device)
            ids1 = ids1.to(device)
            ids2 = ids2.to(device)

            # EEG → embedding
            eeg_emb, _ = encoder(eeg)         # (1,512)

            # CLIP alignment loss
            cos = F.cosine_similarity(eeg_emb, clip, dim=-1)
            loss_clip = 1 - cos.mean()

            # Project to LLM space
            mm = projector(eeg_emb).to(llm_dtype)

            # Build LLM inputs
            full_emb, labels = build_inputs_and_labels(llm, ids1, mm, ids2)
            full_emb = full_emb.to(llm_dtype)

            # LLM caption loss
            out = llm(inputs_embeds=full_emb, labels=labels)
            loss_caption = out.loss

            # Total
            loss = w_caption*loss_caption + w_clip*loss_clip

            loss.backward()
            opt.step()
            opt.zero_grad()

            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        print(f"Epoch {ep} avg loss = {total_loss/len(loader):.4f}")

        torch.save(encoder.state_dict(), BASE / f"encoder_optB_ep{ep}.pt")
        torch.save(projector.state_dict(), BASE / f"proj_optB_ep{ep}.pt")
        print("Saved epoch", ep)


In [None]:
df = pd.read_csv(BASE / "df_index_with_captions.csv")
ds = Stage3OptionBDataset(df, tokenizer)
loader = DataLoader(ds, batch_size=1, shuffle=True)

train_stage3_optionB(
    encoder, projector, llm, loader,
    lr=2e-5, epochs=2,
    w_caption=1.0, w_clip=1.0
)


In [None]:
import random
import torch
import numpy as np
from PIL import Image

In [25]:


def load_eeg_image(path):
    eeg = Image.open(path).convert("L")
    eeg = eeg.resize((440,128))
    eeg = np.array(eeg).astype(np.float32) / 255.0
    eeg = torch.tensor(eeg).unsqueeze(0).unsqueeze(0)   # (1,1,128,440)
    return eeg.cuda()


def build_prefix(class_name):
    # DeepSeek chat-style prefix
    messages = [
        {"role": "user", "content": f"Describe an image of a {class_name}."}
    ]
    prefix = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False
    )
    return prefix


def generate_stage3_caption(eeg_encoder, projector, llm, tokenizer, row, max_new_tokens=40):

    class_name = row["class_name"]
    gt_caption = row["caption"]
    eeg_path   = row["eeg_path"]

    # -----------------------------
    # 1. Load EEG → embedding
    # -----------------------------
    eeg = load_eeg_image(eeg_path)        # (1,1,128,440)

    with torch.no_grad():
        emb = eeg_encoder(eeg)[0]
        mm  = projector(emb)              # (1,2048)

    mm = mm.to(llm.model.embed_tokens.weight.dtype)
    mm = mm.unsqueeze(1)                  # (1,1,2048)

    # -----------------------------
    # 2. Build text prefix
    # -----------------------------
    prefix = build_prefix(class_name)

    tok = tokenizer(prefix, return_tensors="pt", add_special_tokens=False).to("cuda")
    ids = tok.input_ids
    mask = tok.attention_mask

    instr_emb = llm.model.embed_tokens(ids)  # (1, L, 2048)

    full_embeds = torch.cat([instr_emb, mm], dim=1)

    mm_mask = torch.ones((1,1), dtype=torch.long, device="cuda")
    full_mask = torch.cat([mask, mm_mask], dim=1)

    # -----------------------------
    # 3. Generate
    # -----------------------------
    output = llm.generate(
        inputs_embeds=full_embeds,
        attention_mask=full_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True).strip()

    # -----------------------------
    # 4. Pretty print
    # -----------------------------
    print("\n======================= RAW LLM OUTPUT =======================")
    print(decoded)
    print("==============================================================\n")

    print("============== SAMPLE COMPARISON ==============")
    print(f"Class Name: {class_name}")
    print("------------------------------------------------")
    print("Ground Truth:")
    print(gt_caption)
    print("------------------------------------------------")
    print("Generated:")
    print(decoded)
    print("================================================\n")

    return decoded


In [79]:
row = df.sample(1).iloc[0]
generate_stage3_caption(encoder, projector, llm, tokenizer, row)


'A small coral tank with a coral tankfish swimming in the ocean.\nThe tankfish is sitting on a rock'

## Test LoRA finetuned DeepSeek model

### prompt engineering and light llm finetuning - V 1

In [11]:
!pip install transformers peft bitsandbytes accelerate --quiet

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM
import random
import numpy as np


In [8]:
def load_eeg_image(path):
    eeg = Image.open(path).convert("L").resize((440, 128))
    eeg = np.array(eeg).astype(np.float32) / 255.0
    eeg = torch.tensor(eeg).unsqueeze(0).unsqueeze(0)  # (1,1,128,440)
    return eeg.cuda()


In [9]:
def unwrap(x):
    """Ensure encoder output is a tensor, not a tuple."""
    return x[0] if isinstance(x, tuple) else x


In [10]:
def build_prefix(class_name):
    prompt = (
        f"This EEG corresponds to an image containing a {class_name}.\n"
        "Write a short COCO-style caption (10–15 words).\n"
        "Keep it factual and concrete.\n"
        "Do NOT add imaginary objects.\n"
        f"Include the word '{class_name}' once.\n"
    )
    messages = [{"role": "user", "content": prompt}]
    return tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False
    )


In [5]:
BASE_MODEL = "deepseek-ai/deepseek-coder-1.3b-instruct"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token

llm = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)


In [40]:
EEG_TOKEN = "<EEG>"

if EEG_TOKEN not in tokenizer.get_vocab():
    tokenizer.add_tokens([EEG_TOKEN])
    llm.resize_token_embeddings(len(tokenizer))

EEG_ID = tokenizer.convert_tokens_to_ids(EEG_TOKEN)

print("EEG token id =", EEG_ID)


EEG token id = 32022


In [41]:
tokenizer.add_special_tokens({"additional_special_tokens": ["<eeg>"]})
llm.resize_token_embeddings(len(tokenizer))


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(32024, 2048)

In [6]:
llm.train()
llm.requires_grad_(False)

lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

llm = get_peft_model(llm, lora_cfg)
llm.print_trainable_parameters()


trainable params: 6,291,456 || all params: 1,352,763,392 || trainable%: 0.4651


In [37]:
def get_embed_tokens(llm):
    """
    DeepSeek structure:
    model.model.embed_tokens
    """
    base = llm
    if hasattr(base, "model"):
        base = base.model
    if hasattr(base, "model"):
        base = base.model

    if hasattr(base, "embed_tokens"):
        print("Found embed layer at: model.embed_tokens")
        return base.embed_tokens

    raise AttributeError(" Could not locate DeepSeek embed_tokens")


In [38]:
def make_eeg_slot(embed, eeg, encoder, projector, eeg_token, scale=0.2):
    with torch.no_grad():
        eeg_emb = encoder(eeg)
        eeg_emb = unwrap(eeg_emb)
        proj = projector(eeg_emb)          # (1, hidden)

    # embedded <eeg> token
    tok_emb = embed(eeg_token.cuda())      # (1,1,hidden)

    # scale projected EEG embedding to match token scale
    proj = proj.unsqueeze(1) * scale       # (1,1,hidden)

    return tok_emb + proj


In [39]:
class Stage3Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # 1) EEG tensor
        eeg = load_eeg_image(row.eeg_path).cuda()

        # 2) Prefix
        prefix = (
            f"This EEG corresponds to an image containing a {row.class_name}.\n"
            "Write a factual 10–15-word caption.\n"
            f"Include the word '{row.class_name}'.\n"
        )
        ids1 = tokenizer(prefix, return_tensors="pt", add_special_tokens=False).input_ids.squeeze(0)

        # 3) EEG token ID
        eeg_token = torch.tensor([tokenizer.convert_tokens_to_ids("<eeg>")], dtype=torch.long)

        # 4) Caption target
        ids2 = tokenizer(row.caption, return_tensors="pt", add_special_tokens=False).input_ids.squeeze(0)

        return eeg, ids1, eeg_token, ids2


In [31]:
def make_eeg_slot(embed, eeg, encoder, projector, eeg_token):
  with torch.no_grad():
      eeg_emb = encoder(eeg)
      proj = projector(eeg_emb)
      target_std = embed.weight.std()
      current_std = proj.std()
      scale = target_std / (current_std + 1e-6)

      proj_scaled = (proj * scale).to(embed.weight.dtype)

      # EEG slot = base token embedding + scaled projector vector
      eeg_slot = embed(eeg_token) + proj_scaled.unsqueeze(1)

  return eeg_slot


In [8]:
def get_embed_layer(llm):
    return llm.base_model.model.embed_tokens


In [32]:
def train_stage3_v4(llm, encoder, projector, df, epochs=1, lr=2e-4):

    loader = DataLoader(Stage3Dataset(df), batch_size=1, shuffle=True)
    opt = torch.optim.AdamW(llm.parameters(), lr=lr)

    embed = get_embed_tokens(llm)

    for ep in range(epochs):
        print(f"\n Stage-3 Epoch {ep+1}")

        for i, batch in enumerate(loader):

            eeg, ids1, eeg_token, ids2 = batch
            eeg = eeg.cuda()
            ids1 = ids1.cuda()
            ids2 = ids2.cuda()
            eeg_token = eeg_token.cuda()

            # ---------------------------------------
            # Convert tokens → embeddings
            # ---------------------------------------
            prefix_emb = embed(ids1)          # (1,L1,2048)
            tgt_emb    = embed(ids2)          # (1,L2,2048)

            # Insert scaled EEG slot
            eeg_slot = make_eeg_slot(embed, eeg, encoder, projector, eeg_token)

            # Full sequence
            full_emb = torch.cat([prefix_emb, eeg_slot, tgt_emb], dim=1)

            # Mask and labels
            mask = torch.ones(full_emb.size()[:-1], device="cuda")

            labels = torch.cat([
                torch.full((prefix_emb.size(1),), -100, device="cuda"),
                torch.full((1,), -100, device="cuda"),
                ids2[0]
            ], dim=0).unsqueeze(0)

            # Forward
            out = llm(inputs_embeds=full_emb, attention_mask=mask, labels=labels)
            loss = out.loss

            opt.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(llm.parameters(), 1.0)
            opt.step()

            if i % 100 == 0:
                print(f"step {i}: loss={loss.item():.4f}")

        print("Epoch done.")


In [44]:
def train_stage3_v4(llm, encoder, projector, df, epochs=3, lr=2e-4):

    loader = DataLoader(Stage3Dataset(df), batch_size=1, shuffle=True)
    embed = get_embed_tokens(llm)

    opt = torch.optim.AdamW(llm.parameters(), lr=lr)

    print("\n Starting Stage-3 Training v4.1\n")

    for ep in range(epochs):
        print(f"\n Epoch {ep+1}")

        for i, batch in enumerate(loader):
            eeg, ids1, eeg_token, ids2 = batch

            ids1 = ids1.cuda()
            ids2 = ids2.cuda()
            eeg_token = eeg_token.cuda()
            eeg = eeg.cuda()

            # 1) prefix embeddings
            prefix_emb = embed(ids1)

            # 2) EEG hidden-state slot
            eeg_slot = make_eeg_slot(embed, eeg, encoder, projector, eeg_token)

            # 3) target caption embeddings
            tgt_emb = embed(ids2)

            # 4) full sequence of embeddings
            full_emb = torch.cat([prefix_emb, eeg_slot, tgt_emb], dim=1)

            # 5) attention mask
            mask = torch.ones(full_emb.size()[:-1], device="cuda")

            # 6) labels (ignore prefix + eeg)
            labels = torch.cat([
                torch.full((prefix_emb.size(1),), -100, device="cuda"),
                torch.full((1,), -100, device="cuda"),
                ids2[0]
            ]).unsqueeze(0)

            # 7) forward
            out = llm(inputs_embeds=full_emb, attention_mask=mask, labels=labels)
            loss = out.loss

            opt.zero_grad()
            loss.backward()
            opt.step()

            if i % 50 == 0:
                print(f"step {i}: loss={loss.item():.4f}")

        print("Epoch done.")


In [42]:
def alignment_check(encoder, projector, llm, df):
    print("\n=== ALIGNMENT CHECK ===")
    embed = get_embed_tokens(llm)
    print("Embedding matrix:", embed.weight.shape)

    eeg = load_eeg_image(df.iloc[0].eeg_path).cuda()

    with torch.no_grad():
        eeg_emb = encoder(eeg)
        eeg_emb = unwrap(eeg_emb)
        proj = projector(eeg_emb)

    print("Encoder out:", eeg_emb.shape)
    print("Projector out:", proj.shape)

    # scaling check
    print("EEG emb mean/std:", eeg_emb.mean().item(), eeg_emb.std().item())
    print("Projector mean/std:", proj.mean().item(), proj.std().item())
    print("Embed mean/std:", embed.weight.mean().item(), embed.weight.std().item())

    print("\nIf projector std >> embed std, scaling is correct (we scale inside loop).")


In [None]:
alignment_check(encoder, projector, llm, df)

df_lora = df.sample(1500, random_state=42)

train_stage3_v4(
    llm=llm,
    encoder=encoder,
    projector=projector,
    df=df_lora,
    epochs=3,
    lr=2e-4
)

llm.save_pretrained("/content/stage3_v4_lora/")
tokenizer.save_pretrained("/content/stage3_v4_lora/")


In [45]:
def generate_stage3_v4(encoder, projector, llm, tokenizer, row, max_new_tokens=25):

    embed = get_embed_tokens(llm)

    eeg = load_eeg_image(row.eeg_path).cuda()
    eeg_token = torch.tensor([tokenizer.convert_tokens_to_ids("<eeg>")], dtype=torch.long).cuda()

    # EEG slot
    eeg_slot = make_eeg_slot(embed, eeg, encoder, projector, eeg_token)

    # Prefix construction
    prefix = (
        f"This EEG corresponds to an image containing a {row.class_name}.\n"
        "Write a factual caption.\n"
    )
    ids1 = tokenizer(prefix, return_tensors="pt", add_special_tokens=False).input_ids.cuda()

    # prefix embeddings
    prefix_emb = embed(ids1)

    # full prompt embeddings
    full = torch.cat([prefix_emb, eeg_slot], dim=1)
    mask = torch.ones(full.size()[:-1], device="cuda")

    out = llm.generate(
        inputs_embeds=full,
        attention_mask=mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.4,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(out[0], skip_special_tokens=True)


#### Alignment checks

In [None]:
# -----------------------------
# 1. LLM embedding layer
# -----------------------------
embed = get_embed_tokens(llm)
print("LLM embed weight:", embed.weight.shape)  # expected: (vocab_size, 2048)


# -----------------------------
# 2. EEG → encoder
# -----------------------------
sample_path = df.iloc[0].eeg_path
eeg = load_eeg_image(sample_path).cuda()

with torch.no_grad():
    eeg_emb, _ = encoder(eeg)   # <-- FIXED

print("EEG encoder output:", eeg_emb.shape)
# expected: (1, 512)


# -----------------------------
# 3. Projector
# -----------------------------
with torch.no_grad():
    proj = projector(eeg_emb)

print("Projector output:", proj.shape)
# expected: (1, 2048)


# -----------------------------
# 4. EEG slot inserted into hidden dimension
# -----------------------------
eeg_slot = proj.unsqueeze(1)  # → (1, 1, 2048)
print("EEG slot shape:", eeg_slot.shape)


# -----------------------------
# 5. Compare projector dim with LLM hidden dim
# -----------------------------
print("Projector dim:", proj.shape[-1])
print("LLM hidden dim:", embed.weight.shape[-1])

if proj.shape[-1] != embed.weight.shape[-1]:
    print(" DIMENSION MISMATCH: projector must output exactly hidden_dim.")
else:
    print(" Dimensions aligned!")


In [12]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
from PIL import Image


## test 2

In [13]:
def load_eeg_image(path):
    eeg = Image.open(path).convert("L").resize((440, 128))
    eeg = np.array(eeg).astype(np.float32) / 255.0

    # Guarantee shape is (1,1,H,W)
    eeg = torch.tensor(eeg)

    if eeg.dim() == 2:
        eeg = eeg.unsqueeze(0).unsqueeze(0)  # (1,1,H,W)
    elif eeg.dim() == 3:
        eeg = eeg.unsqueeze(0)  # (1,1,H,W)
    # if already (1,1,H,W), do nothing

    return eeg.cuda()


In [14]:
def unwrap(x):
    return x[0] if isinstance(x, tuple) else x


In [14]:
BASE = "deepseek-ai/deepseek-coder-1.3b-instruct"

tokenizer = AutoTokenizer.from_pretrained(BASE)
tokenizer.pad_token = tokenizer.eos_token

llm = AutoModelForCausalLM.from_pretrained(
    BASE,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

if "<eeg>" not in tokenizer.get_vocab():
    tokenizer.add_tokens(["<eeg>"])
    llm.resize_token_embeddings(len(tokenizer))

EEG_ID = tokenizer.convert_tokens_to_ids("<eeg>")
print("EEG token id =", EEG_ID)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

EEG token id = 32022


In [15]:
llm.train()
llm.requires_grad_(False)

lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

llm = get_peft_model(llm, lora_cfg)
llm.print_trainable_parameters()


trainable params: 3,145,728 || all params: 1,348,663,296 || trainable%: 0.2332


In [52]:
def get_embed_tokens(llm):
    m = llm
    if hasattr(m, "model"): m = m.model
    if hasattr(m, "model"): m = m.model

    if hasattr(m, "embed_tokens"):
        return m.embed_tokens

    raise AttributeError("embed_tokens not found")


In [70]:
def make_eeg_slot(embed, eeg, encoder, projector, eeg_token_id):
    with torch.no_grad():
        eeg_emb = encoder(eeg)              # maybe tuple, maybe wrong shape
        eeg_emb = unwrap(eeg_emb)           # ensure (1,512)

        proj = projector(eeg_emb)           # (1,2048)

        # normalize scale
        target_std = embed.weight.std()
        current_std = proj.std()
        scale = target_std / (current_std + 1e-6)
        proj = proj * scale                 # keep dtype float32

    # embed <eeg> token → (1, 2048)
    tok_emb = embed(eeg_token_id.cuda())    # (1, hidden)

    # build (1,1,2048)
    eeg_slot = tok_emb.unsqueeze(1) + proj.unsqueeze(1)

    # ---- FINAL FIX: enforce 3 dims ----
    eeg_slot = eeg_slot.squeeze()           # remove accidental extra dims
    eeg_slot = eeg_slot.unsqueeze(0).unsqueeze(1)
    # now exact: (1,1,2048)

    return eeg_slot.to(embed.weight.dtype)


In [71]:
class Stage3Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # ---- EEG IMAGE (correct shape) ----
        eeg = load_eeg_image(row.eeg_path)   # (1,128,440)

        # ---- PREFIX ----
        prefix = (
            f"This EEG corresponds to an image containing a {row.class_name}.\n"
            "Write a factual 10–15 word caption.\n"
            f"Include '{row.class_name}'.\n"
        )
        ids1 = tokenizer(prefix, return_tensors="pt", add_special_tokens=False).input_ids.squeeze(0)

        # ---- EEG token (<eeg>) ----
        eeg_token_id = tokenizer.convert_tokens_to_ids("<eeg>")
        eeg_tok = torch.tensor([eeg_token_id], dtype=torch.long)

        # ---- Target caption ----
        ids2 = tokenizer(row.caption, return_tensors="pt", add_special_tokens=False).input_ids.squeeze(0)

        return eeg, ids1, eeg_tok, ids2


In [72]:
def train_stage3(llm, encoder, projector, df, epochs=3, lr=2e-4):

    loader = DataLoader(Stage3Dataset(df), batch_size=1, shuffle=True)
    embed = get_embed_tokens(llm)
    opt = torch.optim.AdamW(llm.parameters(), lr=lr)

    for ep in range(epochs):
        print(f"\n Epoch {ep+1}")

        for i, (eeg, ids1, eeg_tok, ids2) in enumerate(loader):

            eeg = eeg.cuda()
            ids1 = ids1.cuda()
            ids2 = ids2.cuda()
            eeg_tok = eeg_tok.cuda()

            # prefix + target embeddings
            prefix_emb = embed(ids1)
            tgt_emb = embed(ids2)

            # EEG slot
            eeg_slot = make_eeg_slot(embed, eeg, encoder, projector, eeg_tok)

            # merge sequence
            full = torch.cat([prefix_emb, eeg_slot, tgt_emb], dim=1)
            mask = torch.ones(full.size()[:-1], device="cuda")

            # labels
            labels = torch.cat([
                torch.full((prefix_emb.size(1),), -100, device="cuda"),
                torch.full((1,), -100, device="cuda"),
                ids2[0]
            ]).unsqueeze(0)

            # forward
            out = llm(inputs_embeds=full, attention_mask=mask, labels=labels)
            loss = out.loss

            opt.zero_grad()
            loss.backward()
            opt.step()

            if i % 100 == 0:
                print(f"step {i}: loss={loss.item():.4f}")

        print("Epoch complete.")


In [None]:
df_lora = df.sample(1500, random_state=42)

train_stage3(
    llm=llm,
    encoder=encoder,
    projector=projector,
    df=df_lora,
    epochs=3,
    lr=2e-4,
)


In [83]:
def generate_stage3_caption(encoder, projector, llm, tokenizer, row, max_new_tokens=100):

    # -------------------------
    # 1. Get embedding layer
    # -------------------------
    embed = get_embed_tokens(llm)

    # -------------------------
    # 2. EEG → projector → slot
    # -------------------------
    eeg = load_eeg_image(row.eeg_path).cuda()
    eeg_token_id = torch.tensor([tokenizer.convert_tokens_to_ids("<eeg>")], device="cuda")

    # Use your final corrected version of make_eeg_slot()
    eeg_slot = make_eeg_slot(embed, eeg, encoder, projector, eeg_token_id)     # (1,1,2048)

    # -------------------------
    # 3. Prefix prompt
    # -------------------------
    prefix = (
        f"This EEG corresponds to an image containing a {row.class_name}.\n"
        "Write a factual 10–15 word caption.\n"
        f"Include the word '{row.class_name}'.\n"
    )

    prefix_ids = tokenizer(prefix, return_tensors="pt", add_special_tokens=False).input_ids.cuda()
    prefix_emb = embed(prefix_ids)    # (1, L1, 2048)

    # -------------------------
    # 4. Merge full embeddings
    # -------------------------
    full = torch.cat([prefix_emb, eeg_slot], dim=1)
    mask = torch.ones(full.size()[:-1], device="cuda")

    # -------------------------
    # 5. Generate caption
    # -------------------------
    out = llm.generate(
        inputs_embeds=full,
        attention_mask=mask,
        max_new_tokens=max_new_tokens,
        temperature=0.5,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(out[0], skip_special_tokens=True)


In [88]:
row = df.sample(1).iloc[0]

print("======== SAMPLE INFO ========")
print("Class:", row.class_name)
print("Ground Truth:", row.caption)
print("------------------------------")

pred = generate_stage3_caption(
    encoder=encoder,
    projector=projector,
    llm=llm,
    tokenizer=tokenizer,
    row=row
)

print("Generated:", pred)
print("==============================")


Class: giant panda
Ground Truth: A panda bear is laying on a tree branch, looking upwards with its eyes closed.
------------------------------
Generated: A panda bear is hanging from a tree branch, eating bamboo, in a zoo enclosure.
The image features two panda bears, one on top of the other, hanging from a branch.
The two panda bears are displayed in a zoo enclosure, with one on top of the other, hanging from a branch.
The two panda bears are displayed in a zoo enclosure, with one on top of the other, hanging from a branch.


## test 3

In [26]:
def build_prefix(class_name):
    prompt = (
        f"This EEG corresponds to an image containing a {class_name}.\n"
        "Write a short, factual COCO-style caption (10–15 words).\n"
        "Describe ONLY objects that are certainly present.\n"
        "Do NOT add imaginary objects.\n"
        f"Include the word '{class_name}' exactly once.\n"
    )
    messages = [{"role": "user", "content": prompt}]
    return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)


In [27]:
def unwrap(x):
    return x[0] if isinstance(x, tuple) else x


In [28]:
class Stage3Dataset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # EEG: (1,1,128,440)
        eeg = load_eeg_image(row.eeg_path)

        prefix = (
              f"This EEG corresponds to an image containing a {row.class_name}.\n"
              "Write a factual caption (1 short sentence).\n"
              "Do NOT hallucinate.\n"
              f"Include the word '{row.class_name}'.\n"
          )
        ids1 = tokenizer(prefix, return_tensors="pt", add_special_tokens=False).input_ids.squeeze(0)

        eeg_token_id = torch.tensor([tokenizer.convert_tokens_to_ids("<eeg>")], dtype=torch.long)

        ids2 = tokenizer(row.caption, return_tensors="pt", add_special_tokens=False).input_ids.squeeze(0)

        return eeg, ids1, eeg_token_id, ids2


In [29]:
def load_eeg_image(path):
    eeg = Image.open(path).convert("L").resize((440, 128))
    eeg = np.array(eeg).astype(np.float32) / 255.0
    eeg = torch.tensor(eeg).unsqueeze(0).unsqueeze(0)  # (1,1,128,440)
    return eeg


In [30]:
def get_embed_tokens(llm):
    """
    Final DeepSeek-1.3B + PEFT embedding resolver.
    Works for both training and inference.
    """

    # 1. PEFT wrapper → unwrap once
    if hasattr(llm, "base_model"):
        m = llm.base_model
    else:
        m = llm

    # 2. DeepSeek uses: model.model.embed_tokens
    if hasattr(m, "model") and hasattr(m.model, "model"):
        if hasattr(m.model.model, "embed_tokens"):
            return m.model.model.embed_tokens

    # 3. Extra fallback
    raise AttributeError("embed_tokens not found at expected location: base_model.model.model.embed_tokens")


In [31]:
def make_eeg_slot(embed, eeg, encoder, projector, eeg_token_id):
    with torch.no_grad():
        eeg_emb = unwrap(encoder(eeg))           # (1,512)
        proj = projector(eeg_emb)                # (1,2048)

        # scale match to LLM token distribution
        target_std = embed.weight.std()
        proj_std   = proj.std()
        scale = target_std / (proj_std + 1e-6)
        proj_scaled = (proj * scale).to(embed.weight.dtype)

        tok_emb = embed(eeg_token_id)            # (1,2048)

        return tok_emb.unsqueeze(1) + proj_scaled.unsqueeze(1)  # (1,1,2048)


In [32]:
def train_stage3(llm, encoder, projector, df, epochs=3, lr=2e-4):

    loader = DataLoader(Stage3Dataset(df), batch_size=1, shuffle=True)
    embed = get_embed_tokens(llm)
    opt = torch.optim.AdamW(llm.parameters(), lr=lr)

    print("\n🚀 Stage-3 Training — FINAL VERSION\n")

    for ep in range(epochs):
        print(f"\n Epoch {ep+1}")

        for i, batch in enumerate(loader):
            eeg, ids1, eeg_tok, ids2 = batch


            eeg = eeg.squeeze(0).cuda()
            ids1 = ids1.cuda()
            ids2 = ids2.cuda()
            eeg_tok = eeg_tok.cuda()

            # 1) prefix embeddings
            prefix_emb = embed(ids1)

            # 2) EEG slot
            eeg_slot = make_eeg_slot(embed, eeg, encoder, projector, eeg_tok)

            # 3) caption embeddings
            tgt_emb = embed(ids2)

            # 4) full sequence of embeddings
            full = torch.cat([prefix_emb, eeg_slot, tgt_emb], dim=1)

            mask = torch.ones(full.size()[:-1], device="cuda")

            labels = torch.cat([
                torch.full((prefix_emb.size(1),), -100, device="cuda"),
                torch.full((1,), -100, device="cuda"),
                ids2[0]
            ]).unsqueeze(0)

            out = llm(inputs_embeds=full, attention_mask=mask, labels=labels)
            loss = out.loss

            opt.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(llm.parameters(), 1.0)
            opt.step()

            if i % 50 == 0:
                print(f"step {i}: loss={loss.item():.4f}")

        print("Epoch done.")


In [33]:
from transformers import StoppingCriteria, StoppingCriteriaList

class StopAfterTwoPeriods(StoppingCriteria):
    def __call__(self, output_ids, scores, **kwargs):
        text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return text.count(".") >= 2

def clean_to_two_sentences(text):
    parts = text.split(".")
    parts = [p.strip() for p in parts if p.strip()]

    if len(parts) == 0:
        return text

    final = ". ".join(parts[:2]) + "."
    return final


def generate_stage3_caption(encoder, projector, llm, tokenizer, row, max_new_tokens=50):

    embed = get_embed_tokens(llm)

    # 1) EEG → embedding
    eeg = load_eeg_image(row.eeg_path).cuda()


    eeg_tok_id = torch.tensor(
        [tokenizer.convert_tokens_to_ids("<eeg>")],
        dtype=torch.long,
        device="cuda"
    )

    eeg_slot = make_eeg_slot(embed, eeg, encoder, projector, eeg_tok_id)

    # 2) Improved prefix
    prefix = (
        f"This EEG corresponds to an image containing a {row.class_name}.\n"
        "Write a factual caption in 1 or 2 short sentences.\n"
        "Describe what is likely visible without adding imaginary items.\n"
        "If unsure, keep the description simple.\n"
        f"Include the word '{row.class_name}'.\n"
    )

    prefix_ids = tokenizer(prefix, return_tensors="pt", add_special_tokens=False).input_ids.cuda()
    prefix_emb = embed(prefix_ids)

    full = torch.cat([prefix_emb, eeg_slot], dim=1)
    mask = torch.ones(full.size()[:-1], device="cuda")

    stopping = StoppingCriteriaList([StopAfterTwoPeriods()])

    out = llm.generate(
        inputs_embeds=full,
        attention_mask=mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.5,
        top_p=0.9,
        stopping_criteria=stopping,
        pad_token_id=tokenizer.eos_token_id
    )

    raw = tokenizer.decode(out[0], skip_special_tokens=True)

    return clean_to_two_sentences(raw)


In [None]:
df_lora = df.sample(1500, random_state=42)

train_stage3(
    llm=llm,
    encoder=encoder,
    projector=projector,
    df=df_lora,
    epochs=3,
    lr=2e-4
)

llm.save_pretrained("/content/drive/MyDrive/ProjectLabTMIT/stage3_v5_lora_final/")
tokenizer.save_pretrained("/content/drive/MyDrive/ProjectLabTMIT/stage3_v5_lora_final/")


In [37]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch


BASE_MODEL = "deepseek-ai/deepseek-coder-1.3b-instruct"
LORA_PATH  = "/content/drive/MyDrive/ProjectLabTMIT/stage3_v5_lora_final/"
SPECIAL = "<eeg>"


# -----------------------------
# 1) Load tokenizer
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token

if SPECIAL not in tokenizer.get_vocab():
    tokenizer.add_tokens([SPECIAL])

tokenizer.save_pretrained(LORA_PATH)


# -----------------------------
# 2) Load base LLM
# -----------------------------
llm = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="cuda",
    dtype=torch.bfloat16
)

# Match vocab size
llm.resize_token_embeddings(len(tokenizer))


# -----------------------------
# 3) Load LoRA weights
# -----------------------------
llm = PeftModel.from_pretrained(
    llm,
    LORA_PATH,
    adapter_name="default",
    is_trainable=False,
    ignore_mismatched_sizes=True
)

llm.eval()

print(">>> Inference model ready!")


>>> Inference model ready!


In [57]:
row = df.sample(1).iloc[0]

print("\n======== SAMPLE INFO ========")
print("Class:", row.class_name)
print("Ground Truth:", row.caption)
print("------------------------------")


pred = generate_stage3_caption(
    encoder=encoder,
    projector=projector,
    llm=llm,
    tokenizer=tokenizer,
    row=row
)

print("Final Generated caption from EEG:", pred)
print("==============================")



Class: reflex camera
Ground Truth: A vintage camera with a brown leather strap and case.
------------------------------
Final Generated caption from EEG: A black and white photo camera with a lens and a viewfinder. ',A vintage camera with a strap and a viewfinder.


## Run evaluation metrics

In [89]:
pip install bert-score sentence-transformers rouge-score evaluate openai-clip


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting openai-clip
  Downloading openai-clip-1.0.1.tar.gz (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from openai-clip)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownlo

In [None]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from PIL import Image
import re

# NLTK metrics
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score

# ROUGE
from rouge_score import rouge_scorer

# BERTScore
from bert_score import score as bert_score

# SBERT
from sentence_transformers import SentenceTransformer, util

# CLIP
import clip


In [99]:
# SBERT (fast)
sbert = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").cuda()

# CLIP (ViT-B/32)
clip_model, clip_preprocess = clip.load("ViT-B/32", device="cuda")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [27]:
def compute_bleu(ref, hyp):
    smoothie = SmoothingFunction().method4
    return sentence_bleu([ref.split()], hyp.split(), smoothing_function=smoothie)

def compute_meteor(ref, hyp):
    try:
        ref_tokens = nltk.word_tokenize(ref)
        hyp_tokens = nltk.word_tokenize(hyp)
        return meteor_score([ref_tokens], hyp_tokens)
    except:
        return 0.0


def compute_rouge_l(ref, hyp):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    return scorer.score(ref, hyp)["rougeL"].fmeasure

def compute_bertscore(ref, hyp):
    P, R, F1 = bert_score([hyp], [ref], lang="en", verbose=False)
    return float(F1[0])

def compute_sbert_similarity(ref, hyp):
    ref_emb = sbert.encode(ref, convert_to_tensor=True)
    hyp_emb = sbert.encode(hyp, convert_to_tensor=True)
    return float(util.cos_sim(ref_emb, hyp_emb)[0][0])


In [104]:
def clean_for_clip(text):
    text = re.sub(r"[^a-zA-Z0-9 .,?!'\"-]", "", text)
    words = text.split()
    return " ".join(words[:60])  # CLIP safe length

def compute_clip_score(caption, img_path):
    caption = clean_for_clip(caption)
    image = clip_preprocess(Image.open(img_path)).unsqueeze(0).cuda()
    text = clip.tokenize([caption]).cuda()

    with torch.no_grad():
        img_features = clip_model.encode_image(image)
        txt_features = clip_model.encode_text(text)

        img_features /= img_features.norm(dim=-1, keepdim=True)
        txt_features /= txt_features.norm(dim=-1, keepdim=True)

    sim = (img_features @ txt_features.T).item()
    return sim


In [105]:
N = 50
sample_df = df.sample(N, random_state=42).reset_index(drop=True)

results = []

for i, row in tqdm(sample_df.iterrows(), total=N):

    # --- Generate model caption ---
    pred = generate_stage3_caption(
        encoder=encoder,
        projector=projector,
        llm=llm,
        tokenizer=tokenizer,
        row=row
    )

    gt = row.caption
    img_path = row.eeg_path

    # --- Metrics ---
    bleu = compute_bleu(gt, pred)
    met = compute_meteor(gt, pred)
    rouge_l = compute_rouge_l(gt, pred)
    bert_f1 = compute_bertscore(gt, pred)
    sbert_sim = compute_sbert_similarity(gt, pred)
    clip_s = compute_clip_score(pred, img_path)

    results.append({
        "class_name": row.class_name,
        "ground_truth": gt,
        "generated_caption": pred,
        "BLEU": bleu,
        "METEOR": met,
        "ROUGE-L": rouge_l,
        "BERTScore": bert_f1,
        "SBERT-Similarity": sbert_sim,
        "CLIPScore": clip_s
    })


  0%|          | 0/50 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 1/50 [00:03<02:59,  3.67s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  4%|▍         | 2/50 [00:07<02:47,  3.49s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  6%|▌         | 3/50 [00:10<02:48,  3.59

In [107]:
print(results)


[{'class_name': 'pajama', 'ground_truth': 'A young boy is sitting on the floor, working on a toy car.', 'generated_caption': 'A young boy in a pajama outfit is sitting on the floor, working on a toy car. His feet are dirty, and he is wearing dirty clothes.', 'BLEU': 0.38242568749566935, 'METEOR': 0.0, 'ROUGE-L': 0.65, 'BERTScore': 0.9490821361541748, 'SBERT-Similarity': 0.7749506235122681, 'CLIPScore': 0.138427734375}, {'class_name': 'canoe', 'ground_truth': 'A wooden canoe on display in a room.', 'generated_caption': 'A wooden canoe on display in a room. The canoe is in full bloom and is standing on a wooden fence.', 'BLEU': 0.3288580454955831, 'METEOR': 0.0, 'ROUGE-L': 0.5517241379310345, 'BERTScore': 0.950850248336792, 'SBERT-Similarity': 0.8390108346939087, 'CLIPScore': 0.1824951171875}, {'class_name': 'German shepherd', 'ground_truth': 'A German Shepherd dog running through the snow.', 'generated_caption': 'A large black and tan dog is curled up in a ball on a couch. The dog is la