In [2]:
!pip install hazm

Collecting hazm
  Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)
Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)
  Downloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting flashtext<3.0,>=2.7 (from hazm)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... done
Collecting gensim<5.0.0,>=4.3.1 (from hazm)
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy==1.24.3 (from hazm)
  Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm)
  Downloading pybind11-3.0.0-py3-none-any.whl.metadata (10.0 kB)
Collecting scipy<1.14.0,>=1.7.0

In [31]:
import json
import numpy as np
from hazm import Normalizer, stopwords_list
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
# Corpus: parsed JSON whose entries are later read via their 'context' key.
with open("aggregate.json", encoding="utf-8") as corpus_file:
    data = json.load(corpus_file)

# Evaluation set: entries are later read via 'id', 'question' and 'answer'.
with open("Test_data.json", encoding="utf-8") as eval_file:
    eval_data = json.load(eval_file)

In [35]:
# Shared hazm resources, created once and reused by later cells:
#   stop_words -> handed to TfidfVectorizer(stop_words=...)
#   normalizer -> used by preprocess() on contexts and questions
stop_words = stopwords_list()
normalizer = Normalizer()

# Translation table built once, instead of being rebuilt on every call.
# It maps two Unicode digit blocks onto ASCII '0'-'9':
#   U+06F0-U+06F9  Extended Arabic-Indic digits (the Persian forms)
#   U+0660-U+0669  Arabic-Indic digits (look-alikes produced by Arabic keyboards)
_DIGIT_TRANSLATION_TABLE = str.maketrans({
    chr(zero_code_point + value): str(value)
    for zero_code_point in (0x06F0, 0x0660)
    for value in range(10)
})


def convert_persian_digits(text):
    """Replace Persian (and Arabic-Indic) digits in ``text`` with ASCII digits.

    Args:
        text: The string to convert.

    Returns:
        A new string in which every character in U+06F0-U+06F9 or
        U+0660-U+0669 is replaced by the corresponding character '0'-'9'.
        All other characters, including existing ASCII digits, are unchanged.
    """
    return text.translate(_DIGIT_TRANSLATION_TABLE)

def preprocess(text):
    """Return ``text`` normalized by the shared hazm normalizer, with Persian digits mapped to ASCII.

    The digit conversion runs after normalization, so it applies to whatever
    digits the normalizer leaves in the text.
    NOTE(review): hazm's Normalizer presumably emits Persian digits by default,
    which would be why the conversion comes second - confirm against hazm docs.
    """
    normalized = normalizer.normalize(text)
    return convert_persian_digits(normalized)

In [33]:
# Index: fit a TF-IDF model on the preprocessed context of every corpus entry.
contexts = [preprocess(entry['context']) for entry in data]

vectorizer = TfidfVectorizer(stop_words=stop_words)
context_vectors = vectorizer.fit_transform(contexts)

# Retrieval: for each evaluation question, keep the three most similar contexts.
results = []

for eval_item in eval_data:
    q_id = eval_item['id']
    # Questions go through the same preprocessing as the indexed contexts.
    question = preprocess(eval_item['question'])

    q_vec = vectorizer.transform([question])
    # Similarity of this single question against every indexed context.
    similarities = cosine_similarity(q_vec, context_vectors).flatten()
    # Ascending argsort, reversed and truncated -> best three indices first.
    top_indices = np.argsort(similarities)[::-1][:3]

    # Report the original (un-preprocessed) context text alongside its score.
    top_results = [
        {
            "context": data[idx]["context"],
            "similarity": float(similarities[idx])
        }
        for idx in top_indices
    ]

    results.append({
        "id": q_id,
        "question": eval_item["question"],
        "answer": eval_item["answer"],
        "retrieved": top_results
    })

In [34]:
# Persist the retrieval results; ensure_ascii=False keeps non-ASCII text
# readable in the output file instead of \uXXXX escapes.
with open("tfidf_eval_results.json", "w", encoding="utf-8") as out_file:
    json.dump(results, out_file, ensure_ascii=False, indent=2)