In [19]:
!pip install -q transformers accelerate pillow sentencepiece torch


In [1]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image

model_id = "HuggingFaceTB/SmolVLM-Instruct"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/429 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/92.0 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]



config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/4.49G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

In [3]:
from google.colab import files

uploaded = files.upload()
image_paths = list(uploaded.keys())


Saving 20240203_174153.jpg to 20240203_174153.jpg
Saving 20240203_181132.jpg to 20240203_181132.jpg
Saving 20240203_183128(0).jpg to 20240203_183128(0).jpg
Saving 20240203_184135.jpg to 20240203_184135.jpg
Saving 20240209_170845.jpg to 20240209_170845.jpg
Saving 20240726_151131.jpg to 20240726_151131.jpg
Saving 20240726_151404.jpg to 20240726_151404.jpg
Saving 20240726_155924.jpg to 20240726_155924.jpg
Saving 20240726_161550.jpg to 20240726_161550.jpg
Saving 20240726_163140.jpg to 20240726_163140.jpg
Saving 20240726_193414.jpg to 20240726_193414.jpg
Saving 20240727_152106.jpg to 20240727_152106.jpg
Saving different.jpg to different.jpg
Saving image.jpg to image.jpg
Saving IMG-20240212-WA0272.jpg to IMG-20240212-WA0272.jpg
Saving IMG-20240212-WA0616.jpg to IMG-20240212-WA0616.jpg
Saving sample.jpg to sample.jpg
Saving similar.jpg to similar.jpg
Saving sss.jpg to sss.jpg


In [37]:
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {
                "type": "text",
                "text": """
Analyze the image and extract metadata.

IMPORTANT:
- Do NOT reuse or copy any example values
- Do NOT hallucinate generic captions
- Every value must be derived from THIS image only
- If unsure, use null or empty lists
- Output ONLY valid JSON
- Do NOT add explanations

Populate this schema using the image:

{
  "caption": null,
  "objects": [],
  "actions": [],
  "expressions": [],
  "posture": null,
  "scene": null,
  "time_of_day": null,
  "text_in_image": [],
  "mood": null
}
"""
            }
        ]
    }
]

prompt = processor.apply_chat_template(
    messages,
    add_generation_prompt=True
)


In [38]:
def extract_first_json(text):
    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON object found")

    brace_count = 0
    for i in range(start, len(text)):
        if text[i] == "{":
            brace_count += 1
        elif text[i] == "}":
            brace_count -= 1
            if brace_count == 0:
                return text[start:i+1]

    raise ValueError("Incomplete JSON object")


In [39]:
import json
from PIL import Image
import torch

def generate_metadata(image_path):
    image = Image.open(image_path).convert("RGB")

    inputs = processor(
        images=image,
        text=prompt,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=300,
            do_sample=False
        )

    decoded = processor.batch_decode(output, skip_special_tokens=True)[0]

    json_text = extract_first_json(decoded)
    return json.loads(json_text)


In [40]:
meta = generate_metadata(image_paths[0])
print(meta)


{'caption': None, 'objects': [], 'actions': [], 'expressions': [], 'posture': None, 'scene': None, 'time_of_day': None, 'text_in_image': [], 'mood': None}


In [15]:
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

embed_model_id = "sentence-transformers/all-MiniLM-L6-v2"

tok = AutoTokenizer.from_pretrained(embed_model_id)
emb_model = AutoModel.from_pretrained(embed_model_id).to("cuda")

def embed_text(text):
    inputs = tok(text, return_tensors="pt", truncation=True).to("cuda")
    with torch.no_grad():
        output = emb_model(**inputs).last_hidden_state.mean(dim=1)
    return F.normalize(output, dim=1)[0]

embeddings = {}

for k, v in metadata.items():
    embeddings[k] = embed_text(v)



In [16]:
def search(query, top_k=5):
    q_emb = embed_text(query)

    scores = []
    for path, emb in embeddings.items():
        score = torch.dot(q_emb, emb).item()
        scores.append((path, score))

    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k]

from IPython.display import display
from PIL import Image

def show_search_results(query, top_k=5):
    results = search(query, top_k)

    print(f"\n🔍 QUERY: {query}\n" + "-"*50)

    for path, score in results:
        print(f"\n📷 {path}  |  score: {score:.3f}")
        print("📝 Metadata:")
        print(metadata[path][:300], "...\n")  # show first part only

        img = Image.open(path)
        display(img.resize((300, 300)))


show_search_results("sunset on the train tracks")
show_search_results("vintage car")
show_search_results("bikes")
show_search_results("group of friends outdoors")
show_search_results("sunset on the beach")
show_search_results("mountains")
show_search_results("solo photo outdoors")


Output hidden; open in https://colab.research.google.com to view.