In [3]:
# Core ML + model libraries
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu # cpu only
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu


In [1]:
# ONNX + quantization
!pip install --upgrade onnx==1.16.1 onnxruntime==1.21.1

# Intel Neural Compressor (optional, for later phases)
!pip install neural-compressor

# Utilities
!pip install matplotlib tqdm



In [2]:
# Torch check
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())  # should be False on CPU-only

# Transformers + HuggingFace check
from transformers import CLIPProcessor, CLIPModel
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
print("Transformers & CLIP loaded")

# Datasets check
from datasets import load_dataset
dataset = load_dataset("embedding-data/flickr30k-captions", split="train[:1%]")
print("Flickr30K sample")

# ONNX + ONNXRuntime check
import onnx, onnxruntime
print("ONNX version:", onnx.__version__)
print("ONNXRuntime version:", onnxruntime.__version__)

# INC check
import neural_compressor
print("INC version:", neural_compressor.__version__)

print("All core libraries installed and working!")

Torch version: 2.8.0+cu126
CUDA available: False


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Transformers & CLIP loaded


README.md: 0.00B [00:00, ?B/s]

flickr30k_captions.jsonl.gz:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Flickr30K sample
ONNX version: 1.16.1
ONNXRuntime version: 1.21.1
INC version: 3.5
All core libraries installed and working!


# Install MobileCLIP

In [4]:
!pip install open_clip_torch timm huggingface_hub
# Apple repo (for reparameterize_model)
!pip install git+https://github.com/apple/ml-mobileclip.git


Collecting open_clip_torch
  Downloading open_clip_torch-3.2.0-py3-none-any.whl.metadata (32 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading open_clip_torch-3.2.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, open_clip_torch
Successfully installed ftfy-6.3.1 open_clip_torch-3.2.0
Collecting git+https://github.com/apple/ml-mobileclip.git
  Cloning https://github.com/apple/ml-mobileclip.git to /tmp/pip-req-build-y43lqr82
  Running command git clone --filter=blob:none --quiet https://github.com/apple/ml-mobileclip.git /tmp/pip-req-build-y43lqr82
  Resolved https://github.com/apple/ml-mobileclip.git to commit e0b9bce7

# Download MobileCLIP2-S2 Checkpoint

In [5]:
from huggingface_hub import hf_hub_download

# Download the MobileCLIP2-S2 checkpoint (.pt file)
ckpt_path = hf_hub_download(repo_id="apple/MobileCLIP2-S2", filename="mobileclip2_s2.pt")
print("Downloaded checkpoint:", ckpt_path)


mobileclip2_s2.pt:   0%|          | 0.00/398M [00:00<?, ?B/s]

Downloaded checkpoint: /root/.cache/huggingface/hub/models--apple--MobileCLIP2-S2/snapshots/0e79fb4c583a53cfaa3f58677703fa0740df1150/mobileclip2_s2.pt


# Load the Model onto CPU

In [6]:
import torch, open_clip
from PIL import Image
from mobileclip.modules.common.mobileone import reparameterize_model

device = "cpu"

# Architecture string: "MobileCLIP2-S2"
model, _, preprocess = open_clip.create_model_and_transforms(
    "MobileCLIP2-S2", pretrained=ckpt_path
)
model = reparameterize_model(model.eval()).to(device)

tokenizer = open_clip.get_tokenizer("MobileCLIP2-S2")

print("MobileCLIP2-S2 loaded on CPU")




MobileCLIP2-S2 loaded on CPU


# Record Baseline Model Size

In [10]:
import os

model_size_mb = os.path.getsize(ckpt_path) / 1e6
print(f"Model size (FP32): {model_size_mb:.2f} MB")


Model size (FP32): 398.07 MB


# Build a Mini Flickr30K Dataset

In [25]:
import pandas as pd
from datasets import Dataset
from PIL import Image
import os, ast

# Paths
base_dir = "/content/drive/MyDrive/capstone" # from google drive
img_dir = os.path.join(base_dir, "flickr30k-images")
csv_path = os.path.join(base_dir, "flickr_annotations_30k.csv")

# Load csv
df = pd.read_csv(csv_path)

# Get captions
def extract_caption(raw_str):
    try:
        captions = ast.literal_eval(raw_str)
        return captions[0] if isinstance(captions, list) and len(captions) > 0 else None
    except Exception:
        return None

df["caption"] = df["raw"].apply(extract_caption)

# Drop empty captions or filenames
df = df.dropna(subset=["filename", "caption"])
df = df.sample(500, random_state=42).reset_index(drop=True)

# Build dataset
samples = []
missing = 0

for _, row in df.iterrows():
    img_path = os.path.join(img_dir, row["filename"].strip())
    if os.path.exists(img_path):
        samples.append({
            "image": Image.open(img_path).convert("RGB"),
            "caption": row["caption"].strip()
        })
    else:
        missing += 1

print(f"Found {len(samples)} images, skipped {missing} missing ones")

flickr30k_ds = Dataset.from_list(samples)
print("Dataset length:", len(flickr30k_ds))
print(flickr30k_ds[0])


Found 500 images, skipped 0 missing ones
Dataset length: 500
{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=500x258 at 0x7C887ECAFB30>, 'caption': 'A man in black leather pants is holding something in his hand over the water, while standing on a bridge.'}


# Compute Recall@1 Function

In [26]:
import torch
from tqdm import tqdm

def compute_recall1(ds, model, preprocess, tokenizer, device="cpu", N=50, batch_size=16):
    """
    Computes Recall@1 (image→text) on first N samples of dataset.
    Each sample must have: {"image": PIL.Image, "caption": str}.
    """

    subset = ds.select(range(min(N, len(ds))))

    # Extract images and captions
    images = [r["image"] for r in subset]
    captions = [r["caption"] for r in subset]

    # Compute image embeddings
    img_embs = []
    for i in tqdm(range(0, len(images), batch_size), desc="Encoding images"):
        batch_imgs = [preprocess(im).unsqueeze(0) for im in images[i:i+batch_size]]
        batch_imgs = torch.cat(batch_imgs).to(device)
        with torch.no_grad():
            emb = model.encode_image(batch_imgs)
        emb = emb / emb.norm(dim=-1, keepdim=True)
        img_embs.append(emb.cpu())
    img_embs = torch.cat(img_embs, dim=0)  # [N, d]

    # Compute text embeddings
    txt_embs = []
    for i in tqdm(range(0, len(captions), batch_size), desc="Encoding captions"):
        batch_txt = tokenizer(captions[i:i+batch_size]).to(device)
        with torch.no_grad():
            emb = model.encode_text(batch_txt)
        emb = emb / emb.norm(dim=-1, keepdim=True)
        txt_embs.append(emb.cpu())
    txt_embs = torch.cat(txt_embs, dim=0)  # [N, d]

    # Similarity matrix
    sims = img_embs @ txt_embs.T  # [N, N]

    # For each image, check if its best caption is its own
    pred = sims.argmax(dim=1)
    correct = (pred == torch.arange(len(subset))).sum().item()

    recall1 = 100.0 * correct / len(subset)
    return recall1


In [27]:
recall1 = compute_recall1(
    flickr30k_ds,
    model,
    preprocess,
    tokenizer,
    device="cpu",
    N=500,          # or smaller if slow
    batch_size=16
)
print(f"\nRecall@1 on Flickr30k subset: {recall1:.2f}%")


Encoding images: 100%|██████████| 32/32 [03:57<00:00,  7.44s/it]
Encoding captions: 100%|██████████| 32/32 [00:57<00:00,  1.80s/it]


Recall@1 on Flickr30k subset: 0.20%





# Measure Inference Latency

In [28]:
import time
from tqdm import tqdm

def measure_inference_time(model, preprocess, tokenizer, sample, repeats=50):
    image, caption = sample["image"], sample["caption"]
    img = preprocess(image).unsqueeze(0)
    txt = tokenizer([caption])

    # Warm-up
    _ = model.encode_image(img)
    _ = model.encode_text(txt)

    # Timed runs
    start = time.time()
    for _ in range(repeats):
        with torch.no_grad():
            _ = model.encode_image(img)
            _ = model.encode_text(txt)
    end = time.time()

    avg_time = (end - start) / repeats
    return avg_time

latency = measure_inference_time(model, preprocess, tokenizer, flickr30k_ds[0])
print(f"Average CPU inference latency per sample: {latency*1000:.2f} ms")


Average CPU inference latency per sample: 614.10 ms


# Export Model to ONNX

In [29]:
import torch.onnx as onnx_export

# Dummy inputs (single image & text)
img = preprocess(flickr30k_ds[0]["image"]).unsqueeze(0)
txt = tokenizer([flickr30k_ds[0]["caption"]])

onnx_path = "mobileclip2_fp32.onnx"
torch.onnx.export(
    model,
    (img, txt),
    onnx_path,
    input_names=["image", "text"],
    output_names=["img_emb", "txt_emb"],
    opset_version=17,
)

print("Exported to ONNX:", onnx_path)


  torch.onnx.export(


Exported to ONNX: mobileclip2_fp32.onnx


# Quantize only Linear layers

In [31]:
from onnxruntime.quantization import quantize_dynamic, QuantType
import os

quantized_path = "mobileclip2_int8_linear.onnx"

# Quantize only MatMul (linear) layers to avoid ConvInteger ops
quantize_dynamic(
    model_input=onnx_path,
    model_output=quantized_path,
    weight_type=QuantType.QInt8,
    op_types_to_quantize=["MatMul"]
)

print("Quantized model (Linear-only) saved:", quantized_path)

# Compare sizes
fp32_size = os.path.getsize(onnx_path)/1e6
int8_size = os.path.getsize(quantized_path)/1e6
print(f"FP32 = {fp32_size:.1f} MB -> INT8 (linear-only) = {int8_size:.1f} MB")




Quantized model (Linear-only) saved: mobileclip2_int8_linear.onnx
FP32 = 397.0 MB -> INT8 (linear-only) = 261.5 MB


# Apply INT8 Quantization


In [35]:
import onnxruntime as ort
import numpy as np
import time
from PIL import Image

# Load the quantized model
session = ort.InferenceSession(quantized_path, providers=["CPUExecutionProvider"])
print("ONNX INT8 model loaded successfully!")

# Prepare a test sample
sample = flickr30k_ds[0]
img = preprocess(sample["image"]).unsqueeze(0).numpy()
txt = tokenizer([sample["caption"]]).numpy()

# Warm-up
_ = session.run(None, {"image": img, "text": txt})

# Measure latency
start = time.time()
for _ in range(20):
    _ = session.run(None, {"image": img, "text": txt})
end = time.time()

int8_latency = (end - start) / 20
print(f"INT8 (linear-only) inference latency: {int8_latency*1000:.2f} ms/sample")


ONNX INT8 model loaded successfully!
INT8 (linear-only) inference latency: 466.30 ms/sample


# Alignment Cosine Drift

In [38]:
import torch
import numpy as np
from tqdm import tqdm

# pick a small subset for drift measurement
subset = flickr30k_ds.select(range(50))  # keep small for speed

fp32_cos = []
int8_cos = []

# Load ONNX INT8 session (already created above)
session = ort.InferenceSession(quantized_path, providers=["CPUExecutionProvider"])

for sample in tqdm(subset, desc="Computing cosine drift"):
    image, caption = sample["image"], sample["caption"]

    # FP32 embeddings
    with torch.no_grad():
        img_fp = model.encode_image(preprocess(image).unsqueeze(0))
        txt_fp = model.encode_text(tokenizer([caption]))
        img_fp = img_fp / img_fp.norm(dim=-1, keepdim=True)
        txt_fp = txt_fp / txt_fp.norm(dim=-1, keepdim=True)
        cos_fp = torch.nn.functional.cosine_similarity(img_fp, txt_fp).item()

    # INT8 embeddings (ONNX)
    img_np = preprocess(image).unsqueeze(0).numpy()
    txt_np = tokenizer([caption]).numpy()
    out_int8 = session.run(None, {"image": img_np, "text": txt_np})
    img_int8, txt_int8 = out_int8[0], out_int8[1]
    cos_int8 = float(np.dot(img_int8, txt_int8.T) /
                     (np.linalg.norm(img_int8) * np.linalg.norm(txt_int8)))

    fp32_cos.append(cos_fp)
    int8_cos.append(cos_int8)

# compute drift metrics
cos_diffs = np.abs(np.array(fp32_cos) - np.array(int8_cos))
mean_cosine_drift = cos_diffs.mean()
pct_pairs_drift_gt_002 = (cos_diffs > 0.02).mean() * 100

print(f"Mean cosine drift: {mean_cosine_drift:.4f}")
print(f"% of pairs drift > 0.02: {pct_pairs_drift_gt_002:.1f}%")


  cos_int8 = float(np.dot(img_int8, txt_int8.T) /
Computing cosine drift: 100%|██████████| 50/50 [01:04<00:00,  1.29s/it]

Mean cosine drift: 0.0228
% of pairs drift > 0.02: 42.0%





# Comparison Table


In [39]:
import platform, psutil, onnxruntime, torch, pandas as pd

# hardware / software crumbs
print("CPU:", platform.processor())
print("Logical cores:", psutil.cpu_count(logical=True))
print("PyTorch version:", torch.__version__)
print("ONNXRuntime version:", onnxruntime.__version__)
print("Dataset subset size:", len(flickr30k_ds))

# Build tidy results DataFrame
results = pd.DataFrame([
    {
        "Model": "FP32",
        "Size_MB": 398.07,
        "Latency_ms_per_sample": 614.10,
        "Throughput_sps": round(1000/614.10, 2),
        "Recall@1_pct": 0.20,
        "MeanCosineDrift": 0.0,  # baseline
    },
    {
        "Model": "INT8 (linear-only)",
        "Size_MB": 261.50,
        "Latency_ms_per_sample": 466.30,
        "Throughput_sps": round(1000/466.30, 2),
        "Recall@1_pct": None,
        "MeanCosineDrift": mean_cosine_drift,
    }
])

print("\nFinal Phase 1 Results")
display(results)


CPU: x86_64
Logical cores: 2
PyTorch version: 2.8.0+cu126
ONNXRuntime version: 1.21.1
Dataset subset size: 500

Final Phase 1 Results


Unnamed: 0,Model,Size_MB,Latency_ms_per_sample,Throughput_sps,Recall@1_pct,MeanCosineDrift
0,FP32,398.07,614.1,1.63,0.2,0.0
1,INT8 (linear-only),261.5,466.3,2.14,,0.022756
