In [6]:
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from dotenv import load_dotenv
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
)
import json
import os

from tqdm.auto import tqdm
import time
import torch
import pandas as pd

  from tqdm.autonotebook import tqdm, trange


In [7]:
load_dotenv()


HUGGINGFACE_API = os.getenv("HUGGINGFACE_API")
model_transformer = "all-MiniLM-L6-v2"

The file `RAG_EVALUATION.ipynb` is a Jupyter Notebook that contains code to evaluate a Retrieval-Augmented Generation (RAG) system. Below is a summary of the main sections and functions of the notebook:

1. **Module Import**:
    - Various libraries are imported, including `sentence_transformers`, `elasticsearch`, `dotenv`, `transformers`, `json`, `os`, `tqdm`, `time`, `torch`, and `pandas`.

2. **Environment Variable Loading**:
    - An environment variable `HUGGINGFACE_API` is loaded, and the model `model_transformer` is defined as `"all-MiniLM-L6-v2"`.

3. **Utility Functions**:
    - `read_json(file_path)`: Reads a JSON file and returns its content.
    - `load_mode(model_name)`: Loads a `SentenceTransformer` model.
    - `fetch_documents()`: Retrieves documents from a specific folder.
    - `setup_elasticsearch(index_name, model, url_es)`: Sets up an Elasticsearch index.
    - `index_documents(es_client, documents, model, index_name)`: Indexes documents in Elasticsearch.
    - `init_elasticsearch(model_name, index_name)`: Initializes Elasticsearch with a model and an index.

4. **Elasticsearch Initialization**:
    - `init_elasticsearch` is called to set up and index documents in Elasticsearch.

5. **Loading Verification Data**:
    - A CSV file with verification data is loaded and converted into a list of dictionaries.

6. **Loading Generation Models**:
    - `load_model_generation(name_hf_model)`: Loads a text generation model from Hugging Face.

7. **KNN Search in Elasticsearch**:
    - `elastic_search_knn(field, vector, index_name)`: Performs a KNN search in Elasticsearch.

8. **Prompt Construction and Response Generation**:
    - `build_prompt(query, search_results, template)`: Constructs a prompt for text generation.
    - `llm(prompt, pipe_generation)`: Generates a response using a language model.

9. **RAG Function**:
    - `rag(query, model, pipe_generation, template)`: Performs a RAG query and generates a response.

10. **RAG System Evaluation**:
     - `evalation_rag(ground_truth)`: Evaluates the relevance of the responses generated by the RAG system.

11. **Running Evaluations**:
     - Evaluations of the RAG system are performed using different text generation models.

The notebook is designed to evaluate the relevance of the responses generated by a RAG system in the context of housing policies, using Elasticsearch for document retrieval and language models for response generation.
1. **Module Import**:
    - Various libraries are imported, including `sentence_transformers`, `elasticsearch`, `dotenv`, `transformers`, `json`, `os`, `tqdm`, `time`, `torch`, and `pandas`.

2. **Environment Variable Loading**:
    - An environment variable `HUGGINGFACE_API` is loaded, and the model `model_transformer` is defined as `"all-MiniLM-L6-v2"`.

3. **Utility Functions**:
    - `read_json(file_path)`: Reads a JSON file and returns its content.
    - `load_mode(model_name)`: Loads a `SentenceTransformer` model.
    - `fetch_documents()`: Retrieves documents from a specific folder.
    - `setup_elasticsearch(index_name, model, url_es)`: Sets up an Elasticsearch index.
    - `index_documents(es_client, documents, model, index_name)`: Indexes documents in Elasticsearch.
    - `init_elasticsearch(model_name, index_name)`: Initializes Elasticsearch with a model and an index.

4. **Elasticsearch Initialization**:
    - `init_elasticsearch` is called to set up and index documents in Elasticsearch.

5. **Loading Verification Data**:
    - A CSV file with verification data is loaded and converted into a list of dictionaries.

6. **Loading Generation Models**:
    - `load_model_generation(name_hf_model)`: Loads a text generation model from Hugging Face.

7. **KNN Search in Elasticsearch**:
    - `elastic_search_knn(field, vector, index_name)`: Performs a KNN search in Elasticsearch.

8. **Prompt Construction and Response Generation**:
    - `build_prompt(query, search_results, template)`: Constructs a prompt for text generation.
    - `llm(prompt, pipe_generation)`: Generates a response using a language model.

9. **RAG Function**:
    - `rag(query, model, pipe_generation, template)`: Performs a RAG query and generates a response.

10. **RAG System Evaluation**:
     - `evalation_rag(ground_truth)`: Evaluates the relevance of the responses generated by the RAG system.

11. **Running Evaluations**:
     - Evaluations of the RAG system are performed using different text generation models.

The notebook is designed to evaluate the relevance of the responses generated by a RAG system in the context of housing policies, using Elasticsearch for document retrieval and language models for response generation.


In [8]:
def read_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data


def load_mode(model_name):
    print(f"Loading model: {model_name}")
    return SentenceTransformer(model_name)


def fetch_documents():
    print("Fetching documents...")

    directory_path = "../json_data"

    # List all files in the directory
    files = os.listdir(directory_path)

    documents = []
    for file in files:
        print(f"Reading file: {file}")
        data = read_json(f"{directory_path}/{file}")
        documents.extend(data)
        print(f"Fetched {len(documents)} documents")
    return documents


def setup_elasticsearch(index_name, model, url_es="http://localhost:9200"):
    print("Setting up Elasticsearch...")
    es_client = Elasticsearch(url_es)

    index_settings = {
        "settings": {"number_of_shards": 1, "number_of_replicas": 0},
        "mappings": {
            "properties": {
                "doc_id": {"type": "keyword"},
                "page_num": {"type": "integer"},
                "chunk_id": {"type": "keyword"},
                "text": {"type": "text"},
                "text_vector": {
                    "type": "dense_vector",
                    "dims": model.get_sentence_embedding_dimension(),
                    "index": True,
                    "similarity": "cosine",
                },
            }
        },
    }

    es_client.indices.delete(index=index_name, ignore_unavailable=True)
    es_client.indices.create(index=index_name, body=index_settings)
    print(f"Elasticsearch index '{index_name}' created")
    return es_client


def index_documents(es_client, documents, model, index_name):
    print("Indexing documents...")
    for doc in tqdm(documents):
        doc["text_vector"] = model.encode(doc["text"]).tolist()
        es_client.index(index=index_name, document=doc)
    print(f"Indexed {len(documents)} documents")


def init_elasticsearch(model_name, index_name):
    model = load_mode(model_name)
    documents = fetch_documents()
    es_client = setup_elasticsearch(index_name, model)
    index_documents(es_client, documents, model, index_name)

In [9]:
init_elasticsearch(model_transformer, "esearchtext_model_all-minilm-l6-v2")

Loading model: all-MiniLM-L6-v2
Fetching documents...
Reading file: Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M.json
Fetched 60 documents
Reading file: How-do-local-governments-respond-to-central-mandate-in-affo_2024_Journal-of-.json
Fetched 113 documents
Reading file: Inclusive-cities--Less-crime-requires-more-lo_2024_Journal-of-Urban-Manageme.json
Fetched 118 documents
Reading file: sideris_gonzales_ong.json
Fetched 171 documents
Reading file: The_High_Cost_of_Free_Parking.json
Fetched 190 documents
Setting up Elasticsearch...
Elasticsearch index 'esearchtext_model_all-minilm-l6-v2' created
Indexing documents...


100%|██████████| 190/190 [00:09<00:00, 19.39it/s]

Indexed 190 documents





In [10]:
ground_truth = pd.read_csv("../data_output/ground-truth-retrieval.csv")
ground_truth = ground_truth.to_dict(orient="records")
ground_truth = ground_truth[:25]

In [11]:
def load_model_generation(name_hf_model):
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    model = AutoModelForCausalLM.from_pretrained(
        name_hf_model,
        device_map=device,
        torch_dtype="auto",
        trust_remote_code=True,
        token=HUGGINGFACE_API,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        name_hf_model,
        token=HUGGINGFACE_API,
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )
    return pipe

In [12]:
def elastic_search_knn(
    field,
    vector,
    # course,
    index_name,
):
    es_client = Elasticsearch("http://localhost:9200")

    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        # "filter": {"term": {"course": course}},
    }

    search_query = {
        "knn": knn,
        "_source": ["doc_id", "page_num", "chunk_id", "text"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in es_results["hits"]["hits"]]

In [13]:
def build_prompt(query, search_results, template):
    prompt_template = """
{template}

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    context = "\n\n".join(
        [f"doc_id: {doc['doc_id']}\nanswer: {doc['text']}" for doc in search_results]
    )
    return prompt_template.format(
        question=query, context=context, template=template
    ).strip()


def llm(prompt, pipe_generation):
    # return {"answer": "test", "time": 0.0}
    start_time = time.time()
    messages = [
        {"role": "user", "content": prompt},
    ]

    eos_token_id = pipe_generation.tokenizer.eos_token_id

    generation_args = {
        "max_new_tokens": 500,
        "return_full_text": False,
        # "temperature": 0.0,
        "do_sample": False,
        "pad_token_id": eos_token_id,
    }

    output = pipe_generation(messages, **generation_args)

    answer = output[0]["generated_text"].strip()

    end_time = time.time()
    response_time = end_time - start_time

    return {"answer": answer, "time": response_time}

In [14]:
def rag(query, model, pipe_generation, template):
    search_results = elastic_search_knn(
        "text_vector", model.encode(query), "esearchtext_model_all-minilm-l6-v2"
    )
    prompt = build_prompt(query, search_results, template)
    return llm(prompt, pipe_generation)

In [15]:
model = load_mode(model_transformer)
pipe_generation = load_model_generation("meta-llama/Llama-3.2-1B-Instruct")
template = """As a housing policy expert advising policymakers, answer the QUESTION below using only the verified information provided in the CONTEXT. 
Maintain a neutral, factual tone, and avoid assumptions or extrapolations beyond the CONTEXT. 
Structure your response with a brief summary of pros and cons to support balanced decision-making, and keep the response not more that 30 words."""

for row in tqdm(ground_truth):
    row["gen_answer"] = rag(row["question"], model, pipe_generation, template)["answer"]

Loading model: all-MiniLM-L6-v2


 40%|████      | 10/25 [02:01<03:41, 14.76s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 25/25 [04:40<00:00, 11.21s/it]


In [29]:
ground_truth[:5]

[{'id': 'Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M_1_1',
  'question': 'How do you think the concept of cityphobia can be used to inform policy decisions aimed at reducing urban poverty and inequality, particularly in areas with high levels of social exclusion?',
  'gen_answer': "**Summary of Pros and Cons:**\n\nTo inform policy decisions aimed at reducing urban poverty and inequality, using the concept of cityphobia can be beneficial. On the one hand, cityphobia can highlight the importance of love and attachment to one's living environment, which can lead to more inclusive and sustainable urban development. On the other hand, cityphobia can also be used to justify the need for more commercial housing and social services, potentially exacerbating existing inequalities.\n\n**Balanced Decision-Making:**\n\n* Cityphobia can emphasize the importance of love and attachment to one's living environment, leading to more inclusive and sustainable urban develo

In [30]:
def evalation_rag(ground_truth):
    def build_prompt_template(question, gen_answer):
        prompt_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system. 
You task is to analyse the relevance of de answer to the question and context provided.
The answer try to repond like a housing policy expert.
Based on the relevance of the answer, you have to classify it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

QUESTION: {question}

GENERATED ANSWER:
{gen_answer}

Return the output in a well-formed JSON format without code blocks.

{{
    RELEVANCE: "NON_RELEVANT" | "RELEVANT" | "HIGHLY_RELEVANT"
    Explanation: "[Provide a brief explanation of your decision]"
}}
""".strip()

        return prompt_template.format(question=question, gen_answer=gen_answer).strip()

    status_list = []
    for ground_truth_row in tqdm(ground_truth):
        prompt = build_prompt_template(
            ground_truth_row["question"], ground_truth_row["gen_answer"]
        )
        llm_output = llm(prompt, pipe_generation)["answer"]
        try:
            evaluation_result = json.loads(llm_output)
        except json.JSONDecodeError:
            evaluation_result = {
                "RELEVANCE": "ERROR",
                "Explanation": "failed to parse JSON",
            }

        if (
            "RELEVANCE" not in evaluation_result
            or "Explanation" not in evaluation_result
        ):
            evaluation_result = {"RELEVANCE": "ERROR", "Explanation": "bad JSON format"}

        status_list.append(llm_output)

    full_load_json = []

    for i in tqdm(status_list):
        load_json = json.loads(i)

        full_load_json.append(load_json)

    relevance_df = pd.DataFrame(full_load_json)
    return relevance_df.value_counts("RELEVANCE") / relevance_df.shape[0] * 100

In [31]:
print("Evaluation of the RAG system whit meta-llama/Llama-3.2-1B-Instruct")
evalation_rag(ground_truth)

Evaluation of the RAG system whit meta-llama/Llama-3.2-1B-Instruct


100%|██████████| 25/25 [01:32<00:00,  3.68s/it]
100%|██████████| 25/25 [00:00<?, ?it/s]


RELEVANCE
NON_RELEVANT    100.0
Name: count, dtype: float64

In [None]:
model = load_mode(model_transformer)
pipe_generation = load_model_generation("meta-llama/Llama-3.2-1B-Instruct")
template = """Answer the QUESTION below using only the verified information provided in the CONTEXT. 
Maintain a neutral, factual tone, and avoid assumptions or extrapolations beyond the CONTEXT. 
Structure your response with a brief summary of pros and cons to support balanced decision-making, and keep the response not more that 30 words."""

for row in tqdm(ground_truth):
    row["gen_answer"] = rag(row["question"], model, pipe_generation, template)["answer"]

In [None]:
ground_truth[:5]

In [1]:
print("Evaluation of the RAG system whit facebook/bart-large-cnn")
evalation_rag(ground_truth)

100%|██████████| 25/25 [00:00<00:00, 24989.90it/s]


RELEVANCE
NON_RELEVANT    100.0
Name: count, dtype: float64

: 