<a href="https://colab.research.google.com/github/andhavarapu77/Capstone/blob/main/NanoVLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the nanoVLM repository
!git clone https://github.com/huggingface/nanoVLM.git
%cd nanoVLM

# Install required dependencies
!pip install torch torchvision torchaudio \
  transformers datasets pillow scikit-learn tqdm matplotlib huggingface_hub

Cloning into 'nanoVLM'...
remote: Enumerating objects: 610, done.[K
remote: Counting objects: 100% (366/366), done.[K
remote: Compressing objects: 100% (137/137), done.[K
remote: Total 610 (delta 301), reused 234 (delta 229), pack-reused 244 (from 2)[K
Receiving objects: 100% (610/610), 13.25 MiB | 28.86 MiB/s, done.
Resolving deltas: 100% (384/384), done.
/content/nanoVLM/nanoVLM/nanoVLM
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata

In [None]:
import kagglehub

# Kaggle dataset handles for the two image collections used in this notebook.
LIE_DETECTION_HANDLE = 'devvratmathur/micro-expression-dataset-for-lie-detection'
MICRO_DATA_HANDLE = 'mohammadabuayyash1/micro-data'

# Download each dataset; the returned values are kept under the *_path names
# that the dataset-building cell passes to collect_jsonl_from_folder.
devvratmathur_micro_expression_dataset_for_lie_detection_path = kagglehub.dataset_download(
    LIE_DETECTION_HANDLE
)
mohammadabuayyash1_micro_data_path = kagglehub.dataset_download(
    MICRO_DATA_HANDLE
)

print('Data source import complete.')

Downloading from https://www.kaggle.com/api/v1/datasets/download/devvratmathur/micro-expression-dataset-for-lie-detection?dataset_version_number=3...


100%|██████████| 7.73G/7.73G [01:24<00:00, 97.9MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/mohammadabuayyash1/micro-data?dataset_version_number=1...


100%|██████████| 7.72G/7.72G [01:22<00:00, 101MB/s]

Extracting files...





Data source import complete.


In [None]:

import os
import json
from pathlib import Path

output_jsonl = 'combined_dataset.jsonl'

def collect_jsonl_from_folder(folder_path):
    """Build image/caption records for every labelled image under ``folder_path``.

    The folder is searched recursively for ``.jpg``, ``.jpeg`` and ``.png``
    files (extension compared case-insensitively). The label is taken from the
    name of the directory that directly contains the file: a name containing
    "lie" yields the lie caption, otherwise a name containing "truth" yields
    the truth caption, and files in any other directory are skipped.

    Args:
        folder_path: Root directory to search (``str`` or ``Path``).

    Returns:
        A list of ``{"image": <absolute path>, "text": <caption>}`` dicts,
        ordered by sorted file path.
    """
    image_suffixes = {".jpg", ".jpeg", ".png"}
    # Captions are spelled exactly like the candidate prompts used by the
    # inference cell, so training and inference text agree.
    captions = {
        "lie": "The person is telling a lie.",
        "truth": "The person is telling the truth.",
    }

    entries = []
    # A glob such as "*.[jp][pn]g" misses ".jpeg" and upper-case extensions
    # (and accepts ".jng"/".ppg"), so filter on the suffix instead.
    # sorted() keeps the record order reproducible across filesystems.
    for file in sorted(Path(folder_path).rglob("*")):
        if not file.is_file() or file.suffix.lower() not in image_suffixes:
            continue

        label_folder = file.parent.name.lower()
        if "lie" in label_folder:
            label = "lie"
        elif "truth" in label_folder:
            label = "truth"
        else:
            continue  # unlabelled directory

        entries.append({
            "image": str(file.resolve()),  # ensure absolute path
            "text": captions[label],
        })
    return entries

# Gather records from both downloaded datasets, in download order
dataset_roots = (
    devvratmathur_micro_expression_dataset_for_lie_detection_path,
    mohammadabuayyash1_micro_data_path,
)
entries = [
    record
    for root in dataset_roots
    for record in collect_jsonl_from_folder(root)
]

# Write one JSON object per line; nothing is written when no records were found
if not entries:
    print("❌ No valid entries found. Check dataset structure.")
else:
    with open(output_jsonl, 'w') as f:
        f.writelines(json.dumps(item) + '\n' for item in entries)
    print(f"✅ Prepared {len(entries)} entries in {output_jsonl}")

✅ Prepared 23472 entries in combined_dataset.jsonl


In [None]:
from datasets import Dataset
import json

# Parse the JSONL file by hand: each line holds one JSON object
with open("combined_dataset.jsonl", "r") as f:
    data = [json.loads(line) for line in f]

# Wrap the parsed records (a list of dicts) in a Hugging Face Dataset
dataset = Dataset.from_list(data)

print(dataset)

Dataset({
    features: ['image', 'text'],
    num_rows: 23472
})


In [None]:
from transformers import CLIPProcessor, CLIPModel, TrainingArguments, Trainer
from datasets import Dataset
from PIL import Image
import torch

# 1. Load processor and model (adjust path if fine-tuning from checkpoint).
# CLIPProcessor/CLIPModel need a CLIP checkpoint in transformers format. The
# id used before, "hf-hub:nanoVLM/nanovlm-base", failed with
# "OSError: Can't load image processor".
model_name = "openai/clip-vit-base-patch32"  # Replace with your model path if needed
processor = CLIPProcessor.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name)

# 2. Preprocessing function
def preprocess(example):
    """Convert one ``{"image": path, "text": caption}`` record into model inputs.

    Args:
        example: Mapping with an "image" file path and a "text" caption.

    Returns:
        Dict with "pixel_values", "input_ids" and "attention_mask" tensors,
        each with the leading batch dimension of size 1 removed.
    """
    # Close the file handle as soon as the pixels have been decoded.
    with Image.open(example["image"]) as raw_image:
        image = raw_image.convert("RGB")
    inputs = processor(
        text=example["text"],
        images=image,
        return_tensors="pt",
        # Pad every caption to the same length so examples can be stacked;
        # padding=True on a single caption pads nothing.
        padding="max_length",
        truncation=True,
    )
    return {
        "pixel_values": inputs["pixel_values"].squeeze(0),
        "input_ids": inputs["input_ids"].squeeze(0),
        "attention_mask": inputs["attention_mask"].squeeze(0)
    }

# 3. Define PyTorch Dataset
class NanoVLMDataset(torch.utils.data.Dataset):
    """Serves preprocessed examples from a dataset of image-path/caption records.

    Preprocessing happens lazily in ``__getitem__`` instead of through
    ``dataset.map``, so the pixel tensors of the whole dataset are never
    materialised (and stored as nested lists) up front.
    """
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset
    def __len__(self):
        return len(self.dataset)
    def __getitem__(self, idx):
        return preprocess(self.dataset[idx])

train_dataset = NanoVLMDataset(dataset)

# 4. Batch collation
def collate_fn(examples):
    """Stack per-example tensors into a batch and request the loss from CLIP."""
    batch = {
        key: torch.stack([example[key] for example in examples])
        for key in ("pixel_values", "input_ids", "attention_mask")
    }
    # Trainer needs forward() to return a loss; CLIPModel only computes its
    # contrastive loss when return_loss=True is passed.
    batch["return_loss"] = True
    return batch

# NOTE(review): the contrastive loss pairs the i-th image with the i-th caption
# and treats all other captions in the batch as negatives. This dataset has only
# two distinct captions, so batches will usually contain repeated captions —
# consider a two-prompt classification loss instead; confirm before relying on
# the results.

# 5. Training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_nanovlm",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=20,
    save_steps=100,
    save_total_limit=2,
    remove_unused_columns=False,
    report_to="none"
)

# 6. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=collate_fn,
    tokenizer=processor,
)

# 7. Train
trainer.train()

# 8. Save fine-tuned model and processor
model.save_pretrained("./fine_tuned_nanovlm")
processor.save_pretrained("./fine_tuned_nanovlm")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


OSError: Can't load image processor for 'hf-hub:nanoVLM/nanovlm-base'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'hf-hub:nanoVLM/nanovlm-base' is the correct path to a directory containing a preprocessor_config.json file

In [None]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Restore the fine-tuned model and its processor from the training output folder
model_path = "./fine_tuned_nanovlm"
model = CLIPModel.from_pretrained(model_path)
processor = CLIPProcessor.from_pretrained(model_path)

# Image to classify
image_path = "/content/IMG-20250523-WA0019.jpg"  # Replace with your image

# Candidate captions the image is scored against
texts = ["The person is telling a lie.", "The person is telling the truth."]

# Read the image as RGB and build the joint text/image inputs
image = Image.open(image_path).convert("RGB")
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)

# Forward pass with gradient tracking disabled
with torch.no_grad():
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # one row per image, one column per caption
    probs = torch.softmax(logits_per_image, dim=1)  # normalise across the captions

# Report the probability assigned to each caption
for caption, probability in zip(texts, probs[0]):
    print(f"{caption} → {probability.item():.4f}")

# The caption with the highest probability is the prediction
best_index = int(probs.argmax())
predicted_text = texts[best_index]
print(f"\n✅ Predicted: {predicted_text}")