In [None]:
!pip install datasets evaluate cohere faiss-cpu



In [None]:
!pip install clean-text[emoji] nltk

Collecting clean-text[emoji]
  Downloading clean_text-0.6.0-py3-none-any.whl.metadata (6.6 kB)
[0mCollecting emoji<2.0.0,>=1.0.0 (from clean-text[emoji])
  Downloading emoji-1.7.0.tar.gz (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy<7.0,>=6.0 (from clean-text[emoji])
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171031 sha256=615cd1b56b5cf5ad39c8fa9ea383ef633cf5eb2a623d1f11bfc6b239210f96e3
  Stored in directory: /root/.ca

In [None]:
!pip install langchain==0.2.5 faiss-cpu==1.8.0 cohere==5.5.8 langchain-community==0.2.5 rank_bm25==0.2.2 sentence-transformers==3.0.1
!pip install llama-cpp-python==0.2.78  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124

Looking in indexes: https://pypi.org/simple, https://abetlen.github.io/llama-cpp-python/whl/cu124


In [None]:
from datasets import load_dataset

# Load the `squad_v2` dataset. The cell output below shows it exposes a `train` and a
# `validation` split; later cells only use `validation`.
dataset = load_dataset("squad_v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
dataset.shape

{'train': (130319, 5), 'validation': (11873, 5)}

In [None]:
print(dataset["validation"][0])

{'id': '56ddde6b9a695914005b9628', 'title': 'Normans', 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.', 'question': 'In what country is Normandy located?', 'answers': {'text': ['France', 'France', 'France', 'France'], 'answer_start': [159, 159, 159, 159]}}


In [None]:
# Preview the first ten validation rows: question, first gold answer, start of the context.
validation_data = dataset["validation"].select(range(10))

for example in validation_data:
    gold_texts = example["answers"]["text"]
    # Rows with an empty answer list are shown with a blank answer.
    reference = gold_texts[0] if gold_texts else ""
    question, context = example["question"], example["context"]

    print(
        f"Question: {question}",
        f"Answer: {reference}",
        f"Context: {context[:100]}...",  # only the first 100 characters, to keep the preview short
        "-" * 50,
        sep="\n",
    )


Question: In what country is Normandy located?
Answer: France
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th a...
--------------------------------------------------
Question: When were the Normans in Normandy?
Answer: 10th and 11th centuries
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th a...
--------------------------------------------------
Question: From which countries did the Norse originate?
Answer: Denmark, Iceland and Norway
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th a...
--------------------------------------------------
Question: Who was the Norse leader?
Answer: Rollo
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th a...
--------------------------------------------------
Question: What century did the Normans first gain their separat

In [None]:
from langchain.schema import Document

# Wrap every validation row in a LangChain Document: the passage is the page content,
# the question and its gold-answer dict travel along as metadata.
# NOTE(review): one Document is created per row, so rows sharing the same context
# (the preview output above shows several) end up as duplicate passages.
documents = [
    Document(
        page_content=row["context"],
        metadata={"question": row["question"], "answer": row["answers"]},
    )
    for row in validation_data
]


In [None]:
from cleantext import clean

def clean_context(text):
    """Normalise a context passage with `cleantext.clean`.

    Args:
        text: Raw passage text.

    Returns:
        Whatever `clean` returns for `text` under the options below.
    """
    options = dict(
        fix_unicode=True,        # repair odd unicode characters
        to_ascii=True,           # drop / convert non-ASCII characters
        lower=True,              # lowercase the text
        no_line_breaks=True,     # remove line breaks
        no_urls=True,            # replace URLs with the token below
        no_emails=True,          # replace e-mail addresses with the token below
        no_phone_numbers=True,   # replace phone numbers with the token below
        no_numbers=False,        # set to True to strip numbers as well
        no_punct=False,          # set to True to strip punctuation as well
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_punct="",
        lang="en",
    )
    return clean(text, **options)




In [None]:
from transformers import pipeline

# Summarisation pipeline used to shorten very long contexts.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_context(text, max_length=120, min_length=30):
    """Summarise `text` with the module-level `summarizer` pipeline.

    Args:
        text: Passage to summarise.
        max_length: Upper bound on the generated summary length, forwarded to the pipeline.
        min_length: Lower bound on the generated summary length, forwarded to the pipeline.

    Returns:
        The `summary_text` of the first (only) result returned by the pipeline.
    """
    summary = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Truncate inputs that exceed the model's maximum input length instead of
        # failing on them; this function is only called for long passages.
        truncation=True,
    )
    return summary[0]['summary_text']

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [None]:
def preprocess_context_pipeline(text, max_words=200):
    """Clean a passage and, if it is still long, replace it with a summary.

    Args:
        text: Raw passage text.
        max_words: Whitespace-separated word count above which the cleaned text is
            summarised. Defaults to 200, the threshold previously hard-coded here.

    Returns:
        The cleaned text, or its summary when it has more than `max_words` words.
    """
    cleaned = clean_context(text)

    # Only pay for summarisation when the passage is long.
    if len(cleaned.split()) > max_words:
        cleaned = summarize_context(cleaned)

    return cleaned

In [None]:
# Run every passage through the clean/summarise pipeline, keeping its metadata as is.
# NOTE(review): in the visible cells the vector store is built from `documents`,
# not from `cleaned_docs` — confirm which set is meant to be indexed.
cleaned_docs = [
    Document(
        page_content=preprocess_context_pipeline(doc.page_content),
        metadata=doc.metadata,
    )
    for doc in documents
]

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Embedding model that maps passages and queries to numerical vectors for retrieval.
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

  embedding_model = HuggingFaceEmbeddings(


In [None]:
from langchain.vectorstores import FAISS

# Build the local FAISS vector store over the raw `documents`, embedded with `embedding_model`.
db = FAISS.from_documents(documents=documents, embedding=embedding_model)


  return forward_call(*args, **kwargs)


### The RAG Prompt


In [None]:
from langchain import LlamaCpp

# Local GGUF model served through llama.cpp.
# The file must exist at this path — adjust `model_path` for your system.
llm = LlamaCpp(
    model_path="Phi-3-mini-4k-instruct-q4.gguf",
    n_ctx=2048,        # context window size, in tokens
    max_tokens=500,    # cap on tokens generated per call
    n_gpu_layers=-1,   # -1 requests offloading all layers to the GPU
    seed=42,           # fixed seed
    verbose=False,
)

In [None]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA

# Prompt: answer as tersely as possible using only the retrieved context.
# The <|user|> / <|end|> / <|assistant|> markers wrap the instruction as a single chat turn
# (presumably the chat format of the Phi-3 model loaded above — confirm).
template = (
    "<|user|>\n"
    "Based ONLY on the information below, answer the following question with the shortest, most direct possible response (ideally a single word or phrase). Do NOT repeat the question, do NOT include explanations, and do NOT say anything other than the answer itself.\n"
    "\n"
    "Context:\n"
    "{context}\n"
    "\n"
    "Question:\n"
    "{question}<|end|>\n"
    "<|assistant|>"
)

prompt = PromptTemplate(input_variables=["context", "question"], template=template)

# RAG pipeline: fetch the two nearest passages and "stuff" them into the prompt.
retriever = db.as_retriever(search_kwargs={'k': 2})
rag = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type='stuff',
    chain_type_kwargs={"prompt": prompt},
    verbose=True,
)

In [None]:
rag.invoke('In what country is Normandy located?')



[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)



[1m> Finished chain.[0m


{'query': 'In what country is Normandy located?', 'result': ' France'}

In [None]:
from tqdm import tqdm
validation_data = dataset["validation"]

# Evaluate on the first 10 validation rows only.
validation_data_small = validation_data.select(range(10))

# Recorded as the prediction when generation raises. It must not be the empty string:
# an empty reference marks an unanswerable question, so an empty prediction would be
# scored as a correct "no answer" instead of as a failure.
GENERATION_FAILED = "[generation failed]"

predictions = []
references = []
failed_questions = []  # questions whose generation raised, kept for inspection

for example in tqdm(validation_data_small, desc="Evaluating"):
    question = example["question"]
    # First gold answer (ground truth); empty string when the question is unanswerable.
    gold_texts = example["answers"]["text"]
    reference = gold_texts[0] if gold_texts else ""

    try:
        response = rag.invoke(question)
        # The chain returns a dict; take its 'result' field, otherwise use the response as is.
        prediction = response['result'] if isinstance(response, dict) else response
    except Exception as e:
        print(f"Error for question: {question}\n{e}")
        failed_questions.append(question)
        prediction = GENERATION_FAILED

    predictions.append(prediction)
    references.append(reference)

print("Evaluation finished.")
if failed_questions:
    print(f"{len(failed_questions)} question(s) failed and were recorded as {GENERATION_FAILED!r}.")


  return forward_call(*args, **kwargs)




[1m> Entering new RetrievalQA chain...[0m


Evaluating:  10%|█         | 1/10 [00:00<00:07,  1.25it/s]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  20%|██        | 2/10 [00:07<00:31,  3.98s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  30%|███       | 3/10 [00:12<00:33,  4.81s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  40%|████      | 4/10 [00:15<00:24,  4.09s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  50%|█████     | 5/10 [00:23<00:26,  5.28s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  60%|██████    | 6/10 [00:30<00:24,  6.06s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  70%|███████   | 7/10 [00:34<00:16,  5.45s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  80%|████████  | 8/10 [00:38<00:09,  4.94s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  90%|█████████ | 9/10 [00:43<00:05,  5.01s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating: 100%|██████████| 10/10 [02:54<00:00, 17.44s/it]


[1m> Finished chain.[0m
Evaluation finished.





In [None]:
import evaluate

# Load the `squad` metric; its `f1` and `exact_match` results are reported further down.
# NOTE(review): the data is `squad_v2`, whose unanswerable questions are represented here by
# empty references — confirm this metric scores those the way you intend.
squad_metric = evaluate.load("squad")


In [None]:
# فرض: predictions و references دو لیست متنی هستن
# مثل: predictions = ['Normandy is located in France.'], references = ['France']

# اگه شناسه‌ای نداری می‌تونی idهای ساختگی بدی
formatted_predictions = [
    {"id": str(i), "prediction_text": pred.strip()} for i, pred in enumerate(predictions)
]

formatted_references = [
    {"id": str(i), "answers": {"text": [ref.strip()], "answer_start": [0]}} for i, ref in enumerate(references)
]


In [None]:
results = squad_metric.compute(
    predictions=formatted_predictions,
    references=formatted_references
)

print(f"F1 Score: {results['f1']:.2f}")
print(f"Exact Match (EM): {results['exact_match']:.2f}")


F1 Score: 44.29
Exact Match (EM): 20.00


In [None]:
predictions

[' France',
 ' Tenths-eleventh centuries',
 ' Denmark, Iceland, Norway',
 ' Rollo',
 ' First half of the 10th century',
 ' Normans',
 ' Region of\n\nFrance',
 ' Rollo',
 ' Not specified in context.',
 ' William']

In [None]:
references

['France',
 '10th and 11th centuries',
 'Denmark, Iceland and Norway',
 'Rollo',
 '10th century',
 '',
 '',
 '',
 '',
 'William the Conqueror']

In [None]:
def recall_at_k(query, answer, k=5):
    """Check whether `answer` appears in any of the top-`k` passages retrieved for `query`.

    Args:
        query: Question text sent to the vector store `db`.
        answer: Gold answer string; matched case-insensitively as a substring.
        k: Number of passages to retrieve.

    Returns:
        True if at least one retrieved passage contains `answer`, else False.
    """
    retrieved = db.similarity_search(query, k)
    needle = answer.lower()
    return any(needle in doc.page_content.lower() for doc in retrieved)

# Recall over the evaluation subset. Unanswerable questions (no gold answer) are skipped:
# an empty answer is a substring of every passage and would always count as a hit.
answerable = [ex for ex in validation_data_small if ex["answers"]["text"]]
# Each question is checked against its OWN first gold answer.
recall_scores = [recall_at_k(ex["question"], ex["answers"]["text"][0]) for ex in answerable]
recall_at_5 = sum(recall_scores) / len(recall_scores) if recall_scores else 0.0
print(f"Recall@5: {recall_at_5:.2f}")

Recall@5: 0.10


In [None]:
from sentence_transformers import SentenceTransformer, util


# Load the sentence-embedding model used to compare answers.
model = SentenceTransformer('all-MiniLM-L6-v2')


def _pair_similarity(ref, pred):
    """Cosine similarity between a reference and a prediction, with empty-string rules."""
    ref_empty, pred_empty = not ref.strip(), not pred.strip()
    # Both empty: treated as a perfect match.
    if ref_empty and pred_empty:
        return 1.0
    # Exactly one empty: no similarity.
    if ref_empty or pred_empty:
        return 0.0
    return util.cos_sim(model.encode(ref), model.encode(pred))[0][0].item()


# Similarity of every (reference, prediction) pair.
similarities = [_pair_similarity(ref, pred) for ref, pred in zip(references, predictions)]

# Mean over all pairs.
average_similarity = sum(similarities) / len(similarities)
print(f"Semantic Similarity (Average): {average_similarity:.4f}")


  return forward_call(*args, **kwargs)


Semantic Similarity (Average): 0.5496


# Handling unanswerable questions: "Not present in context"

In [None]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA

# Same terse-answer prompt as before, extended with an explicit abstention phrase for
# questions whose answer is not in the retrieved context.
template = (
    "<|user|>\n"
    'Based ONLY on the information below, answer the following question with the shortest, most direct possible response (ideally a single word or phrase). Do NOT repeat the question, do NOT include explanations, and do NOT say anything other than the answer itself.If the answer is not found in the context, respond with exactly: "Not present in context"\n'
    "\n"
    "Context:\n"
    "{context}\n"
    "\n"
    "Question:\n"
    "{question}<|end|>\n"
    "<|assistant|>"
)

prompt = PromptTemplate(input_variables=["context", "question"], template=template)

# RAG pipeline rebuilt around the new prompt: two nearest passages, "stuff" chain.
retriever = db.as_retriever(search_kwargs={'k': 2})
rag = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type='stuff',
    chain_type_kwargs={"prompt": prompt},
    verbose=True,
)

In [None]:
rag.invoke('In what country is Normandy located?')



[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)



[1m> Finished chain.[0m


{'query': 'In what country is Normandy located?', 'result': ' France'}

In [None]:
rag.invoke('When did the Frankish identity emerge?')



[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)



[1m> Finished chain.[0m


{'query': 'When did the Frankish identity emerge?',
 'result': ' Not present in context'}

In [None]:
from tqdm import tqdm
validation_data = dataset["validation"]

# Evaluate on the first 10 validation rows only.
validation_data_small = validation_data.select(range(10))

# Recorded as the prediction when generation raises. It must not be the empty string:
# an empty prediction means "no answer" and would be scored as a correct abstention
# on unanswerable questions instead of as a failure.
GENERATION_FAILED = "[generation failed]"

predictions = []
references = []
failed_questions = []  # questions whose generation raised, kept for inspection

for example in tqdm(validation_data_small, desc="Evaluating"):
    question = example["question"]
    # First gold answer (ground truth); empty string when the question is unanswerable.
    gold_texts = example["answers"]["text"]
    reference = gold_texts[0] if gold_texts else ""

    try:
        response = rag.invoke(question)
        # The chain returns a dict; take its 'result' field, otherwise use the response as is.
        prediction = response['result'] if isinstance(response, dict) else response
    except Exception as e:
        print(f"Error for question: {question}\n{e}")
        failed_questions.append(question)
        prediction = GENERATION_FAILED

    predictions.append(prediction)
    references.append(reference)

print("Evaluation finished.")
if failed_questions:
    print(f"{len(failed_questions)} question(s) failed and were recorded as {GENERATION_FAILED!r}.")


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]



[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  10%|█         | 1/10 [00:02<00:24,  2.74s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  20%|██        | 2/10 [00:08<00:36,  4.57s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  30%|███       | 3/10 [00:14<00:37,  5.31s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  40%|████      | 4/10 [00:17<00:26,  4.34s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  50%|█████     | 5/10 [00:23<00:24,  4.82s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  60%|██████    | 6/10 [00:29<00:21,  5.45s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  70%|███████   | 7/10 [00:33<00:14,  4.97s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  80%|████████  | 8/10 [00:37<00:09,  4.61s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating:  90%|█████████ | 9/10 [00:41<00:04,  4.42s/it]


[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m


  return forward_call(*args, **kwargs)
Evaluating: 100%|██████████| 10/10 [02:49<00:00, 16.99s/it]


[1m> Finished chain.[0m
Evaluation finished.





In [None]:
import evaluate

# Load the `squad` metric again (an earlier cell already binds `squad_metric` the same way).
# NOTE(review): the data is `squad_v2`, whose unanswerable questions are represented here by
# empty references — confirm this metric scores those the way you intend.
squad_metric = evaluate.load("squad")

In [None]:
def normalize_prediction(pred):
    """Lower-case a prediction and map abstention phrases to the empty string.

    Unanswerable questions have an empty reference, so an abstention has to become ""
    to be comparable with it. The abstention check ignores surrounding quotes and
    punctuation, so variants such as '"Not present in context."' are recognised too.

    Args:
        pred: Raw model output.

    Returns:
        "" if `pred` is one of the known abstention phrases, otherwise `pred`
        stripped of surrounding whitespace and lower-cased.
    """
    pred = pred.strip().lower()
    # Compare without wrapping quotes / trailing punctuation the model may add.
    core = pred.strip(" \t\n\"'`.!?:;,")
    if core in ("not present in context", "not found in context", "not in context"):
        return ""  # the reference is empty as well
    return pred
predictions = [normalize_prediction(p) for p in predictions]

In [None]:
formatted_predictions = [
    {"id": str(i), "prediction_text": pred.strip()} for i, pred in enumerate(predictions)
]

formatted_references = [
    {"id": str(i), "answers": {"text": [ref.strip()], "answer_start": [0]}} for i, ref in enumerate(references)
]

In [None]:
results = squad_metric.compute(
    predictions=formatted_predictions,
    references=formatted_references
)

print(f"F1 Score: {results['f1']:.2f}")
print(f"Exact Match (EM): {results['exact_match']:.2f}")

F1 Score: 51.90
Exact Match (EM): 60.00


In [None]:
predictions

['france',
 '10th-11th centuries',
 'denmark, iceland, norway',
 'rollo',
 '10th century',
 'normans',
 '',
 'rollo',
 '',
 'william the conqueror']

In [None]:
references

['France',
 '10th and 11th centuries',
 'Denmark, Iceland and Norway',
 'Rollo',
 '10th century',
 '',
 '',
 '',
 '',
 'William the Conqueror']

In [None]:
def recall_at_k(query, answer, k=5):
    """Check whether `answer` appears in any of the top-`k` passages retrieved for `query`.

    Args:
        query: Question text sent to the vector store `db`.
        answer: Gold answer string; matched case-insensitively as a substring.
        k: Number of passages to retrieve.

    Returns:
        True if at least one retrieved passage contains `answer`, else False.
    """
    retrieved = db.similarity_search(query, k)
    needle = answer.lower()
    return any(needle in doc.page_content.lower() for doc in retrieved)

# Recall over the evaluation subset. Unanswerable questions (no gold answer) are skipped:
# an empty answer is a substring of every passage and would always count as a hit.
answerable = [ex for ex in validation_data_small if ex["answers"]["text"]]
# Each question is checked against its OWN first gold answer.
recall_scores = [recall_at_k(ex["question"], ex["answers"]["text"][0]) for ex in answerable]
recall_at_5 = sum(recall_scores) / len(recall_scores) if recall_scores else 0.0
print(f"Recall@5: {recall_at_5:.2f}")

Recall@5: 0.10


  return forward_call(*args, **kwargs)


In [None]:
from sentence_transformers import SentenceTransformer, util


# Load the sentence-embedding model used to compare answers.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Similarity of every (reference, prediction) pair.
similarities = []
for ref, pred in zip(references, predictions):
    has_ref, has_pred = bool(ref.strip()), bool(pred.strip())
    if has_ref and has_pred:
        sim = util.cos_sim(model.encode(ref), model.encode(pred))[0][0].item()
    else:
        # Both empty counts as a perfect match; exactly one empty counts as no similarity.
        sim = 0.0 if (has_ref or has_pred) else 1.0

    similarities.append(sim)

# Mean over all pairs.
average_similarity = sum(similarities) / len(similarities)
print(f"Semantic Similarity (Average): {average_similarity:.4f}")


Semantic Similarity (Average): 0.7953
