# Dependencies

## Installation

In [1]:
!pip install python-crfsuite flashtext fasttext-wheel scikit-learn numpy gensim nltk
!pip install faiss-cpu transformers torch accelerate bitsandbytes Pillow datasets unsloth
!pip install multilingual-clip
!pip install git+https://github.com/openai/CLIP.git
!git clone https://github.com/roshan-research/hazm.git libs/hazm
!pip install datasets
!pip install gdown
!pip install faiss-cpu

Collecting faiss-cpu
  Using cached faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting bitsandbytes
  Using cached bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting unsloth
  Downloading unsloth-2025.9.2-py3-none-any.whl.metadata (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting unsloth_zoo>=2025.9.3 (from unsloth)
  Downloading unsloth_zoo-2025.9.3-py3-none-any.whl.metadata (9.5 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.31-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.19.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth)
  Downloading trl-0.22.2-py3-none-an

## Imports

In [None]:
import json
import sys
import os

hazm_parent_dir = os.path.join("/content", "libs", "hazm")

if hazm_parent_dir not in sys.path:
    sys.path.insert(0, hazm_parent_dir)

In [None]:
import clip
import faiss
import gdown
import numpy as np
import pickle
import torch


from datasets import load_dataset
from hazm import Normalizer
from multilingual_clip import pt_multilingual_clip
from PIL import Image
from tqdm import tqdm
from transformers import AutoProcessor, AutoTokenizer, TextStreamer
from unsloth import FastLanguageModel


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Download Data

In [None]:
gdown.download(
    url="https://drive.google.com/file/d/1RL4FDbE2zMqauAjoa6FTPjhhMTArQ4O_/view?usp=drive_link",
    fuzzy=True,
)  # images.zip
gdown.download(
    url="https://drive.google.com/file/d/1ccp_4FQwPIJFjMNke_SJ2SrMnkzGNvkF/view?usp=drive_link",
    fuzzy=True,
)  # dataset.json
gdown.download(
    url="https://drive.google.com/file/d/1DA1q8wA6af9O4gfgKZhHSWk41_ZiZ7xX/view?usp=drive_link",
    fuzzy=True,
)  # ImageTextTest_merged.json
gdown.download(
    url="https://drive.google.com/file/d/1cobe1fIiovAwz2TaLC9gXyOCDp_KdLrV/view?usp=drive_link",
    fuzzy=True,
)  # TextTest_merged.json
gdown.download(
    url="https://drive.google.com/file/d/1ANhsTyzzEdZC8dhah2V807t-ySv6ZFba/view?usp=drive_link",
    fuzzy=True,
)  # image_inference.zip

Downloading...
From (original): https://drive.google.com/uc?id=1RL4FDbE2zMqauAjoa6FTPjhhMTArQ4O_
From (redirected): https://drive.google.com/uc?id=1RL4FDbE2zMqauAjoa6FTPjhhMTArQ4O_&confirm=t&uuid=e9854ef4-23f7-4965-a3d9-e1daba904484
To: /content/images.zip
100%|██████████| 188M/188M [00:05<00:00, 31.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ccp_4FQwPIJFjMNke_SJ2SrMnkzGNvkF
To: /content/dataset.json
100%|██████████| 853k/853k [00:00<00:00, 99.9MB/s]


'dataset.json'

In [5]:
!unzip images.zip

Archive:  images.zip
   creating: images/
  inflating: images/999.png          
  inflating: images/7.png            
  inflating: images/842.png          
  inflating: images/841.png          
  inflating: images/843.png          
  inflating: images/846.png          
  inflating: images/845.png          
  inflating: images/844.png          
  inflating: images/501.png          
  inflating: images/503.png          
  inflating: images/502.png          
  inflating: images/1098.png         
  inflating: images/868.png          
  inflating: images/1076.png         
  inflating: images/1077.png         
  inflating: images/1078.png         
  inflating: images/1065.png         
  inflating: images/1064.png         
  inflating: images/955.png          
  inflating: images/769.png          
  inflating: images/1060.png         
  inflating: images/612.png          
  inflating: images/611.png          
  inflating: images/529.png          
  inflating: images/1071.png         
  inflat

# Configs

In [None]:
TEXT_MODEL_NAME = "M-CLIP/XLM-Roberta-Large-Vit-B-32"
INDEX_FILE = "tourist_attractions_fused_fine_tuned.index"
MAPPING_FILE = "index_to_data_mapping_fused_fine_tuned.pkl"
DATASET_TOURISM = "alisharifi/tourist-attractions-text-image"
LVM_MODEL_NAME = "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit"

# Load Models

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
clip_text_model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(
    TEXT_MODEL_NAME
).to(device)
clip_tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_NAME)
clip_image_model, image_preprocess = clip.load("ViT-B/32", device=device)

config.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

100%|███████████████████████████████████████| 338M/338M [00:07<00:00, 48.7MiB/s]


In [9]:
dataset = load_dataset(DATASET_TOURISM)

README.md:   0%|          | 0.00/313 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/215k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1089 [00:00<?, ? examples/s]

# Retrieval

In [None]:
def cuda_tokenzier(txt, padding=True, return_tensors="pt"):
    # TODO: Find a better way
    return clip_tokenizer(
        txt,
        padding="max_length",
        return_tensors=return_tensors,
        max_length=512,
        truncation=True,
    ).to(device)

In [None]:
all_fused_embeddings = []
index_to_data_map = {}
idx = 0

for item in tqdm(dataset["train"]):
    text = item["text"]
    image_address = item["image_address"]

    image = Image.open(image_address).convert("RGB")
    image = image_preprocess(image).unsqueeze(0).to(device)

    with torch.no_grad():
        image_features = clip_image_model.encode_image(image)

    with torch.no_grad():
        text_features = clip_text_model.forward(text, cuda_tokenzier)

    fused_embedding = text_features  # (image_features + text_features) / 2.0
    fused_embedding = fused_embedding / torch.linalg.norm(
        fused_embedding, dim=-1, keepdim=True
    )

    all_fused_embeddings.append(fused_embedding.squeeze().cpu().numpy())
    index_to_data_map[idx] = item
    idx += 1

100%|██████████| 1089/1089 [02:18<00:00,  7.85it/s]


In [13]:
embeddings_matrix = np.array(all_fused_embeddings, dtype=np.float32)
embedding_dimension = embeddings_matrix.shape[1]
index = faiss.IndexFlatIP(embedding_dimension)

print(f"FAISS index created with dimension {embedding_dimension}.")

index.add(embeddings_matrix)
print(f"Added {index.ntotal} fused vectors to the index.")

FAISS index created with dimension 512.
Added 1089 fused vectors to the index.


In [None]:
print(f"Saving FAISS index to {INDEX_FILE}...")
faiss.write_index(index, INDEX_FILE)

print(f"Saving mapping file to {MAPPING_FILE}...")
with open(MAPPING_FILE, "wb") as f_out:
    pickle.dump(index_to_data_map, f_out)

print("\n--- Indexing with Fused Embeddings Complete! ---")
print(f"FAISS index and data mapping have been saved successfully.")

Saving FAISS index to tourist_attractions_fused_fine_tuned.index...
Saving mapping file to index_to_data_mapping_fused_fine_tuned.pkl...

--- Indexing with Fused Embeddings Complete! ---
FAISS index and data mapping have been saved successfully.


In [None]:
# Load FAISS index and the mapping
index = faiss.read_index(INDEX_FILE)
with open(MAPPING_FILE, "rb") as f_in:
    index_to_data = pickle.load(f_in)

In [16]:
# --- 2. Retrieval Function (Updated for normalized vectors) ---
def retrieve_relevant_items(query_embedding, k=3):
    """Searches the FAISS index and returns the original data items."""
    # Ensure the query embedding is also normalized before search
    query_embedding_normalized = query_embedding / np.linalg.norm(query_embedding)

    # Reshape for FAISS search
    query_vector = np.array([query_embedding_normalized], dtype=np.float32)

    distances, indices = index.search(query_vector, k)

    retrieved_items = []
    for i in range(k):
        item_index = indices[0][i]
        retrieved_data = index_to_data[item_index]
        retrieved_items.append(retrieved_data)

    return retrieved_items

In [None]:
lvm_model, lvm_tokenizer = FastLanguageModel.from_pretrained(
    model_name=LVM_MODEL_NAME,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
    trust_remote_code=True,  # Keep this from the previous fix
)
lvm_processor = AutoProcessor.from_pretrained(LVM_MODEL_NAME)

Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.9.2: Fast Llava_Next patching. Transformers: 4.56.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Llava_Next does not support SDPA - switching to fast eager.


model.safetensors:   0%|          | 0.00/4.30G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def generate_response(query_text, retrieved_items, query_image=None):
    """Constructs the prompt and generates a response from the LVM based on query type."""

    all_images_for_lvm = []
    context_text = "اطلاعات مرتبط بازیابی شده به شرح زیر است:\n\n"
    retrieved_images = []

    if query_image is None:
        prompt_instruction = (
            "شما یک دستیار متخصص گردشگری ایران هستید."
            "با استفاده از متن و تصاویر ارائه شده، اطلاعات کاملی درباره مکان‌های مرتبط با جستجوی کاربر ارائه دهید."
            "تصاویر بازیابی شده را شرح دهید و ارتباط آنها را با سوال کاربر توضیح دهید."
            # "با استفاده از متن و تصاویر ارائه شده فقط شماره یا متن گزینه‌ی صحیح را از بین گزینه‌های داده شده انتخاب کن.",
            # "هیچ توضیح اضافه‌ای نده، فقط پاسخ درست را بازگردان."
        )

        for i, item in enumerate(retrieved_items):
            context_text += f"مورد بازیابی شده {i+1}:\n"
            context_text += f"توضیحات: {item['text']}\n"
            try:
                context_text += f"تصویر: {os.path.basename(item['image_address'])}\n\n"
                all_images_for_lvm.append(Image.open(item["image_address"]))
            except FileNotFoundError:
                print(
                    f"Warning: Could not find image for context: {item['image_address']}"
                )
                context_text += "تصویر: یافت نشد\n\n"

        image_placeholders = "".join(["<image>\n"] * len(all_images_for_lvm))

        prompt_body = f"""{image_placeholders}{prompt_instruction}

--- اطلاعات بازیابی شده ---
{context_text}
--- پایان اطلاعات بازیابی شده ---

سوال کاربر: {query_text}
"""

    else:
        prompt_instruction = (
            "شما یک دستیار متخصص گردشگری ایران هستید."
            "با استفاده از تصویر ارائه شده و متن سوال کاربر، همچنین اطلاعات مرتبط بازیابی شده، به سوال کاربر پاسخ دهید."
            "تصویر اصلی را شرح دهید و با استفاده از اطلاعات بازیابی شده جزئیات بیشتری ارائه دهید."
            # "با استفاده از متن و تصاویر ارائه شده فقط شماره یا متن گزینه‌ی صحیح را از بین گزینه‌های داده شده انتخاب کن.",
            # "هیچ توضیح اضافه‌ای نده، فقط پاسخ درست را بازگردان."
        )

        all_images_for_lvm.append(query_image)

        retrieved_images = []
        for i, item in enumerate(retrieved_items):
            context_text += f"مورد بازیابی شده {i+1}:\n"
            context_text += f"توضیحات: {item['text']}\n"
            try:
                retrieved_images.append(Image.open(item["image_address"]))
                context_text += f"تصویر: {os.path.basename(item['image_address'])}\n\n"
                all_images_for_lvm.append(
                    Image.open(item["image_address"])
                )  # Add retrieved images

            except FileNotFoundError:
                print(
                    f"Warning: Could not find image for context: {item['image_address']}"
                )
                context_text += "تصویر: یافت نشد\n\n"

        image_placeholders = "".join(["<image>\n"] * len(all_images_for_lvm))

        prompt_body = f"""{image_placeholders}{prompt_instruction}

--- اطلاعات بازیابی شده ---
{context_text}
--- پایان اطلاعات بازیابی شده ---

سوال کاربر: {query_text}
"""
    final_prompt = f"USER: {prompt_body}\nASSISTANT:"

    inputs = lvm_processor(
        text=final_prompt, images=all_images_for_lvm, return_tensors="pt"
    ).to("cuda")

    streamer = TextStreamer(lvm_tokenizer, skip_prompt=True, skip_special_tokens=True)
    _ = lvm_model.generate(
        **inputs,
        streamer=streamer,
        max_new_tokens=1024,
        use_cache=True,
        do_sample=True,
        top_k=50,
        temperature=0.7,
    )

In [19]:
def normalize_farsi_text(text):
    """
    Normalizes a given Farsi text using the Hazm library.
    """
    normalizer = Normalizer()
    return normalizer.normalize(text)

In [None]:
def multimodal_rag_pipeline(query, query_type="text"):
    """
    The main pipeline function.
    Handles text or combined text+image queries.
    """
    print("\n" + "=" * 50)
    print(f"Processing new {query_type} query...")

    query_embedding = None
    query_text_for_prompt = ""
    query_image_for_prompt = None

    if query_type == "text":
        print(f'Query: "{query}"')
        query_text_for_prompt = normalize_farsi_text(query)

        with torch.no_grad():
            query_embedding = clip_text_model.forward(query, cuda_tokenzier)[0]

    elif query_type == "text+image":
        if not isinstance(query, dict) or "text" not in query or "image" not in query:
            raise ValueError(
                "For 'text+image' query type, 'query' must be a dictionary with 'text' and 'image' keys."
            )

        print(f"Query: Text=\"{query['text']}\", Image=[Image Object]")
        query_text_for_prompt = normalize_farsi_text(query["text"])
        query_image_for_prompt = query["image"]

        with torch.no_grad():
            text_embedding = clip_text_model.forward(
                query_text_for_prompt, cuda_tokenzier
            )[0]
            image = image_preprocess(query_image_for_prompt).unsqueeze(0).to(device)
            image_embedding = clip_image_model.encode_image(image)

        query_embedding = (text_embedding + image_embedding) / 2.0

    else:
        raise ValueError("Invalid query_type. Must be 'text' or 'text+image'.")

    query_embedding = query_embedding.squeeze().cpu().numpy()

    retrieved_items = retrieve_relevant_items(query_embedding)
    print(f"\nRetrieved {len(retrieved_items)} relevant items:")
    for item in retrieved_items:
        print(f"- {item['text']}")

    print("\n--- Generating Response ---")
    generate_response(
        query_text=query_text_for_prompt,
        retrieved_items=retrieved_items,
        query_image=query_image_for_prompt,
    )
    print("\n" + "=" * 50)

# Queries

## Helper Functions

In [None]:
def build_prompt(item, typee="text"):
    prompt = (
        f"{item['question']}"
        # f"گزینه‌ها:\n" +
        # "\n".join([f"{i+1}) {opt}" for i, opt in enumerate(item["options"])])
    )

    if typee == "text+hint" and "hint" in item:
        prompt += f"\nراهنمایی: {item['hint']}"

    # prompt += "\n\nفقط شماره گزینه صحیح را جواب بده."
    return prompt


def chunk_list(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i : i + n]


def evaluate_item(item, typee="text"):
    prompt = build_prompt(item, typee)

    if typee == "text" or typee == "text+hint":
        print("Question", item["id"], ":")
        multimodal_rag_pipeline(prompt, query_type="text")

    elif typee == "text+image":
        image_path = os.path.join("image_inference", f"{item['id']}.jpg")
        if not os.path.exists(image_path):
            print(f"Image not found: {image_path}")
            return

        image = load_image(image_path, size=(224, 224))
        print("Question", item["id"], ":")
        multimodal_rag_pipeline(
            query={"text": prompt, "image": image}, query_type="text+image"
        )
    else:
        raise ValueError("typee must be 'text' or 'text+image'")


def evaluate_batch(batch, typee="text"):
    for item in batch:
        evaluate_item(item, typee)


def load_image(path_or_url, size=(224, 224)):
    image = Image.open(path_or_url).convert("RGB")
    image = image.resize(size, Image.Resampling.LANCZOS)
    return image

## Text Queries

In [27]:
with open("TextTest_merged.json", "r", encoding="utf-8") as f:
    qa_list = json.load(f)

batches = list(chunk_list(qa_list, 10))

In [None]:
for batch in batches:
    evaluate_batch(batch, typee="text")

## Text & Hint Queries

In [None]:
with open("ImageTextTest_merged.json", "r", encoding="utf-8") as f:
    qa_list = json.load(f)

batches = list(chunk_list(qa_list, 10))

In [None]:
for batch in batches:
    evaluate_batch(batch, typee="text+hint")

## Text & Image Queries

In [None]:
with open("ImageTextTest_merged.json", "r", encoding="utf-8") as f:
    qa_list = json.load(f)

batches = list(chunk_list(qa_list, 10))

In [None]:
for batch in batches:
    evaluate_batch(batch, typee="text+image")