In [None]:
import os
import re
from contextlib import AbstractContextManager
from itertools import groupby
from typing import Dict, List

import weaviate
from openai import OpenAI
from weaviate.classes.query import Filter, HybridFusion, MetadataQuery
import torch
from transformers import AutoModel, AutoTokenizer


class ModelManager:
    def __init__(self, model_path):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModel.from_pretrained(model_path, add_pooling_layer=True)

    def get_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            embeddings = self.model(**inputs)[0][:, 0]
        return torch.nn.functional.normalize(embeddings, p=2, dim=1).squeeze(0).tolist()

# Initialize models 
base_dir = os.getcwd()
embedding_path = os.path.join(base_dir, "embeddings")
# For embedding model
model_manager = ModelManager(embedding_path)  
# For classification, query transformation, generation, and validation
client = OpenAI()


class RetrievalDecisionModule:
    def __init__(self, client=client, model="gpt-4o-mini", temperature=0.0, max_tokens=5):
        self.client = client
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens

    def classify_if_retrieval_needed(self, user_input, context_str):
        prompt = f"""
        You are an assistant helping decide whether a user message needs document retrieval or not.

        Instructions:
        - If the user's message is purely conversational (e.g. "hi", "thanks", "that's helpful") or can be answered from previous chat messages or common knowledge (e.g. general facts), respond: **"no"**.
        - If the message requires external knowledge, document retrieval, or detailed information not provided in the chat history, respond: **"yes"**.
        - If the user's message is a simple factual question (e.g., "What is the capital of France?" or "How many days are in a week?"), respond: **"no"**.
        
        Chat History:
        {context_str}

        User Input:
        {user_input}

        Does this query need document retrieval? (Answer only "yes" or "no")
        """

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt.strip()}],
                temperature=self.temperature,
                max_tokens=self.max_tokens
            )
            return response.choices[0].message.content.strip().lower() == "yes"
        except Exception:
            return True


class QueryTransformationModule:
    def __init__(
        self,
        client=client,
        refine_model="gpt-4o-mini",
        hyde_model="gpt-4o-mini-search-preview",
        refine_temperature=0.3,
        hyde_temperature=0.7,
        refine_max_tokens=300,
        hyde_max_tokens=512
    ):
        self.client = client
        self.refine_model = refine_model
        self.hyde_model = hyde_model
        self.refine_temperature = refine_temperature
        self.hyde_temperature = hyde_temperature
        self.refine_max_tokens = refine_max_tokens
        self.hyde_max_tokens = hyde_max_tokens

    def refine_query_with_history(self, new_query, context_str):
        prompt = f"""
        You're legal query refiner helping create effective search queries for Quezon City documents. Consider both:
        1. The new user query
        2. Relevant context from chat history (if applicable)

        Your task:
        - Refine the query into a standalone, clear, **affirmative sentence** (not a question), in English, suitable for document search.

        Chat History (most recent first): {context_str}

        New Query: {new_query}

        Refined Search Query (respond ONLY with the refined query in ENGLISH):
        """

        try:
            response = self.client.chat.completions.create(
                model=self.refine_model,
                messages=[{"role": "user", "content": prompt.strip()}],
                temperature=self.refine_temperature,
                max_tokens=self.refine_max_tokens
            )
            return response.choices[0].message.content.strip()
        except Exception:
            return new_query

    def generate_hypothetical_document(self, query: str) -> str:
        prompt = f"""
        You are a legal research assistant for Quezon City. Based on the query below, generate an answer that could plausibly address it.

        - The content should be realistic and relevant.
        - Translate to English if needed.

        Query: "{query}"

        Hypothetical legal document (1 paragraph):
        """

        try:
            response = self.client.chat.completions.create(
                model=self.hyde_model,
                messages=[
                    {"role": "user", "content": prompt.strip()}
                ],
                max_tokens=self.hyde_max_tokens
            )
            return response.choices[0].message.content.strip()
        except Exception:
            return "Error generating hypothetical document"


class DocumentRetrievalModule(AbstractContextManager):
    def __init__(self, host="localhost", collection_name="BAAI", alpha=0.5, context_window=1):
        self.host = host
        self.collection_name = collection_name
        self.alpha = alpha
        self.context_window = context_window
        self.client = None

    def __enter__(self):
        self.client = weaviate.connect_to_local(host=self.host)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.client:
            self.client.close()

    def search_documents(self, query_text: str, max_results: int) -> List[Dict]:
        try:
            self.collection = self.client.collections.get(self.collection_name)
            response = self.collection.query.hybrid(
                query=query_text,
                vector=model_manager.get_embedding(query_text),
                alpha=self.alpha,
                fusion_type=HybridFusion.RELATIVE_SCORE,
                limit=max_results,
                return_properties=["text", "source", "category", "chunk_index"],
                return_metadata=MetadataQuery(score=True)
            )

            results = []
            seen = set()

            for obj in response.objects:
                key = (obj.properties["source"], obj.properties["category"], obj.properties["chunk_index"])
                if key not in seen:
                    seen.add(key)
                    results.append({
                        "text": obj.properties["text"],
                        "source": obj.properties["source"],
                        "category": obj.properties["category"],
                        "chunk_index": obj.properties["chunk_index"],
                        "score": obj.metadata.score
                    })

            merged_results = self.expand_documents(results)
            return merged_results

        except Exception as e:
            return {"error": "Error searching for documents", "details": str(e)}

    def expand_documents(self, results: List[Dict]) -> List[Dict]:
        merged_results = []
        if results:
            expanded_chunks = self.expand_document_search(results)
            seen = {(chunk['source'], chunk['category'], chunk['chunk_index']) for chunk in results}

            for chunk in expanded_chunks:
                if (chunk['source'], chunk['category'], chunk['chunk_index']) not in seen:
                    results.append({
                        "text": chunk["text"],
                        "source": chunk["source"],
                        "category": chunk["category"],
                        "chunk_index": chunk["chunk_index"],
                        "score": 0})

            category_order = {
                'Introduction': 0,
                'Preamble': 1,
                'Operative': 2,
                'Signature': 3,
                'Uncategorized': 4
            }

            sorted_items = sorted(
                results,
                key=lambda x: (
                    x['source'].lower(),
                    int(category_order.get(x['category'], 4)),
                    int(x['chunk_index'])
                )
            )

            for (source, category), group in groupby(sorted_items, key=lambda x: (x['source'], x['category'])):
                group = list(group)
                group.sort(key=lambda x: x['chunk_index'])

                merged = [group[0]]
                for item in group[1:]:
                    last = merged[-1]

                    if item['chunk_index'] == last['chunk_index'] + 1:
                        last['text'] += item['text']
                        last['score'] = max(last['score'], item['score'])
                        last['chunk_index'] = item['chunk_index']
                    else:
                        merged.append(item)
                merged_results.extend(merged)
        return sorted(merged_results, key=lambda x: x['score'], reverse=True)

    def expand_document_search(self, initial_results: List[Dict]) -> List[Dict]:
        expanded_chunks = []
        doc_sources = set()

        for chunk in reversed(initial_results):
            doc_sources.add((chunk['source'], chunk['category'], chunk['chunk_index']))

        try:
            for source, category, index in doc_sources:
                filters = (
                    Filter.by_property("source").equal(source)
                    & Filter.by_property("category").equal(category)
                    & Filter.by_property("chunk_index").greater_than(index - self.context_window - 1)
                    & Filter.by_property("chunk_index").less_than(index + self.context_window + 1)
                )

                response = self.collection.query.fetch_objects(
                    filters=filters,
                    return_properties=["text", "source", "category", "chunk_index"]
                )

                for obj in response.objects:
                    expanded_chunks.append({
                        "text": obj.properties["text"],
                        "source": obj.properties["source"],
                        "category": obj.properties["category"],
                        "chunk_index": obj.properties["chunk_index"],
                        "score": 0
                    })
            return expanded_chunks

        except Exception as e:
            return {"error": "Error expanding document search", "details": str(e)}


class ResponseGeneratorModule:
    def __init__(
        self,
        client=client,
        generation_model_with_retrieval="gpt-4o",
        generation_model_without_retrieval="gpt-4o-mini-search-preview",
        generation_temperature=0.1,
        max_tokens=512
    ):
        self.client = client
        self.generation_model_with_retrieval = generation_model_with_retrieval
        self.generation_model_without_retrieval = generation_model_without_retrieval
        self.generation_temperature = generation_temperature
        self.max_tokens = max_tokens

    def conversation_without_retrieval(self, user_input, context_str=None):
        prompt = f"""
        You are a Quezon City Legal Provider. Answer the query using your internal knowledge or the provided conversation history if applicable.

        Conversation history:
        {context_str if context_str else 'No previous conversation history.'}

        User query:
        {user_input}

        Please answer clearly and accurately.  
        Note: If the user query is in English, provide the answer in English. If the query is in Filipino, provide the answer in Filipino.
        """

        try:
            response = self.client.chat.completions.create(
                model=self.generation_model_without_retrieval,
                messages=[{"role": "user", "content": prompt.strip()}],
                temperature=self.generation_temperature,
                max_tokens=self.max_tokens
            )
            return response.choices[0].message.content.strip()
        except Exception:
            return "Error generating response"


    def generate_response(self, query, context_docs, context_sources):
        context_str = "\n\n".join(
            [f"Document {index + 1}:\n{doc['text']}" for index, doc in enumerate(context_docs)]
        )
        
        prompt = f"""
        You are a legal AI assistant helping users find information from ordinances and resolutions. 
        Answer the query **strictly using the provided context below**. 

        If you use any context in your answer, you must clearly indicate which document(s) you used **using the format: "Document X"** (e.g. Document 1, Document 2).

        Query: {query}

        Context: {context_str if context_docs else 'No relevant documents found.'}

        Note: If the user query is in English, provide the answer in English. If the query is in Filipino, provide the answer in Filipino.
        """

        try:
            response = self.client.chat.completions.create(
                model=self.generation_model_with_retrieval,
                messages=[{"role": "user", "content": prompt.strip()}],
                temperature=self.generation_temperature,
                max_tokens=self.max_tokens
            )

            generated_answer = response.choices[0].message.content.strip()

            doc_numbers = list(set(re.findall(r"document\s+(\d+)", generated_answer, re.IGNORECASE)))
            relevant_sources = [context_sources[int(num) - 1] for num in doc_numbers if num.isdigit()]
            relevant_contexts = [context_docs[int(num) - 1] for num in doc_numbers if num.isdigit()]
            generated_answer = re.sub(
                r"(\(?\s*(See\s+)?(Sources?:\s*)?(Document\s+\d+[,\s]*)+(and\s+)?(Document\s+\d+)?\s*\)?)", 
                "", 
                generated_answer, 
                flags=re.IGNORECASE
            ).strip()

            return generated_answer, relevant_sources, relevant_contexts
        except Exception:
            return "Error generating response", [], []


class AnswerValidationAgent:
    def __init__(
        self,
        client=client,
        model="gpt-4o-mini",
        temperature=0.0,
        max_tokens=512,
        max_attempts=3
    ):
        self.client = client
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.max_attempts = max_attempts
        self.current_attempt = 0
        self.search_params = {'max_results': 7, 'alpha': 0.5}

    def validate_answer(self, answer, context_docs, query):
        if not context_docs:
            return False

        context_str = "\n\n".join(doc['text'] for doc in context_docs)
        prompt = f"""
        Legal Answer Validation - Strict Check:

        Evaluate the answer based on the provided documents according to these criteria:
        - Correctness: Does the answer accurately reflect information from the documents?
        - Completeness: Does the answer include all critical and relevant information?
        - Honesty: Does the answer avoid making claims not supported by the documents?

        If the answer fails any of these criteria, or if critical information is missing, or if unsupported claims are made, consider it invalid.

        Documents:
        {context_str}

        Query: {query}
        Answer: {answer}

        Respond ONLY with one word: 'valid' or 'invalid'. No explanations.
        """

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt.strip()}],
                temperature=self.temperature,
                max_tokens=self.max_tokens
            )
            content = response.choices[0].message.content.strip().lower()
            return 'valid' in content and 'invalid' not in content
        except Exception:
            return False


In [5]:
import pandas as pd
import ast
import numpy as np

def evaluate_ir_metrics_table(df, ks=[3, 5, 7], col_names=None):
    def evaluate_column(col_name, k):
        precision_list = []
        recall_list = []
        rr_list = []

        for idx, row in df.iterrows():
            gt_source = row["Source"].strip()
            gt_category = row["Category"].strip()
            gt_pair = (gt_source, gt_category)

            retrieved = ast.literal_eval(row[col_name])

            top_k = retrieved[:k]
            retrieved_pairs = [(doc[0].strip(), doc[1].strip()) for doc in top_k]

            relevance = [1 if pair == gt_pair else 0 for pair in retrieved_pairs]

            precision = sum(relevance) / k
            precision_list.append(precision)

            recall = 1 if gt_pair in retrieved_pairs else 0
            recall_list.append(recall)

            if gt_pair in retrieved_pairs:
                rank = retrieved_pairs.index(gt_pair)
                rr = 1 / (rank + 1)
            else:
                rr = 0
            rr_list.append(rr)

        return {
            "Precision": np.mean(precision_list),
            "Recall": np.mean(recall_list),
            "MRR": np.mean(rr_list)
        }

    rows = []
    for col_name in col_names:
        for k in ks:
            metrics = evaluate_column(col_name, k)
            rows.append({
                "Column": col_name,
                "k": k,
                **metrics
            })

    return pd.DataFrame(rows)

1. What embedding model to use?

In [55]:
import pandas as pd
import json
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

class ModelManager:
    def __init__(self, model_path, use_pooling=True):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        if use_pooling:
            self.model = AutoModel.from_pretrained(model_path)
        else:
            self.model = AutoModel.from_pretrained(model_path, add_pooling_layer=False)

    def get_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            embeddings = self.model(**inputs)[0][:, 0] 
        return torch.nn.functional.normalize(embeddings, p=2, dim=1).squeeze(0).tolist()

embedding_models = {
    "BAAI": {
        "path": r"C:\Users\Alister\Desktop\AI Classes\Capstone\Models2\Embeddings\BAAI\bge-base-en-v1.5",
        "use_pooling": True
    },
    "LegalDocument": {
        "path": r"C:\Users\Alister\Desktop\AI Classes\Capstone\Models2\Embeddings\Snowflake\snowflake-arctic-embed-m",
        "use_pooling": False
    },
    "multilingual": {
        "path": r"C:\Users\Alister\Desktop\AI Classes\Capstone\Models2\Embeddings\intfloat\multilingual-e5-large-instruct",
        "use_pooling": True
    }
}

df = pd.read_csv("generated_qa.csv")

for collection_name, settings in embedding_models.items():
    model_manager = ModelManager(settings["path"], use_pooling=settings["use_pooling"])
    retrieved_docs_list = []
    
    print(f"Processing collection: {collection_name}")
    
    for question in tqdm(df["Question_EN"], desc=f"Retrieving docs for {collection_name}"):
        with DocumentRetrievalModule(host="localhost", collection_name=collection_name, alpha=0.5) as searcher:
            try:
                context_docs = searcher.search_documents(question, max_results=7)
                sources = [(doc["source"], doc["category"], doc["score"]) for doc in context_docs]
            except Exception:
                sources = []
        retrieved_docs_list.append(sources)

    df[f"{collection_name}_documents_retrieved"] = [json.dumps(docs) for docs in retrieved_docs_list]

df.to_csv("generated_qa_with_docs_all_collections.csv", index=False)

Processing collection: BAAI


Retrieving docs for BAAI: 100%|██████████| 150/150 [02:21<00:00,  1.06it/s]


Processing collection: LegalDocument


Retrieving docs for LegalDocument: 100%|██████████| 150/150 [02:23<00:00,  1.05it/s]


Processing collection: multilingual


Retrieving docs for multilingual: 100%|██████████| 150/150 [02:41<00:00,  1.08s/it]


In [60]:
df = pd.read_csv("generated_qa_with_docs_all_collections.csv")
cols_to_eval = ["BAAI_documents_retrieved", "snowflake_documents_retrieved", "multilingual_documents_retrieved"]
ks = [3, 5, 7]

results_df = evaluate_ir_metrics_table(df, ks=ks, col_names=cols_to_eval)
results_df

Unnamed: 0,Column,k,Precision,Recall,MRR
0,BAAI_documents_retrieved,3,0.306667,0.88,0.707778
1,BAAI_documents_retrieved,5,0.198667,0.926667,0.718111
2,BAAI_documents_retrieved,7,0.145714,0.953333,0.722238
3,snowflake_documents_retrieved,3,0.24,0.713333,0.465556
4,snowflake_documents_retrieved,5,0.162667,0.806667,0.486889
5,snowflake_documents_retrieved,7,0.128571,0.88,0.498
6,multilingual_documents_retrieved,3,0.291111,0.84,0.667778
7,multilingual_documents_retrieved,5,0.189333,0.9,0.681444
8,multilingual_documents_retrieved,7,0.139048,0.926667,0.68573


2. multilinguel embedding model vs. translated and english-only embedding model?

In [61]:
import pandas as pd
import json
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm


class ModelManager:
    def __init__(self, model_path, use_pooling=True):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        if use_pooling:
            self.model = AutoModel.from_pretrained(model_path)
        else:
            self.model = AutoModel.from_pretrained(model_path, add_pooling_layer=False)

    def get_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            embeddings = self.model(**inputs)[0][:, 0] 
        return torch.nn.functional.normalize(embeddings, p=2, dim=1).squeeze(0).tolist()

embedding_models = {
    "BAAI": {
        "path": r"C:\Users\Alister\Desktop\AI Classes\Capstone\Models2\Embeddings\BAAI\bge-base-en-v1.5",
        "use_pooling": True
    },
    "LegalDocument": {
        "path": r"C:\Users\Alister\Desktop\AI Classes\Capstone\Models2\Embeddings\Snowflake\snowflake-arctic-embed-m",
        "use_pooling": False
    },
    "multilingual": {
        "path": r"C:\Users\Alister\Desktop\AI Classes\Capstone\Models2\Embeddings\intfloat\multilingual-e5-large-instruct",
        "use_pooling": True
    }
}

df = pd.read_csv("generated_qa.csv")

query_transformation_module = QueryTransformationModule()

for collection_name, settings in embedding_models.items():
    model_manager = ModelManager(settings["path"], use_pooling=settings["use_pooling"])
    retrieved_docs_list = []
    
    print(f"Processing collection: {collection_name}")
    
    for idx, row in tqdm(df.iterrows(), total=len(df), desc=f"Retrieving docs for {collection_name}"):
        if collection_name == "multilingual":
            query = row["Question_TL"]
        else:
            user_input = row["Question_TL"]
            query = query_transformation_module.refine_query_with_history(user_input, [])
        
        with DocumentRetrievalModule(host="localhost", collection_name=collection_name, alpha=0.5) as searcher:
            try:
                context_docs = searcher.search_documents(query, max_results=7)
                sources = [(doc["source"], doc["category"], doc["score"]) for doc in context_docs]
            except Exception as e:
                print(f"Error searching documents for query '{query}': {e}")
                sources = []
        retrieved_docs_list.append(sources)

    df[f"{collection_name}_documents_retrieved"] = [json.dumps(docs) for docs in retrieved_docs_list]

df.to_csv("generated_qa_expt_2.csv", index=False)

Processing collection: BAAI


Retrieving docs for BAAI: 100%|██████████| 150/150 [04:50<00:00,  1.94s/it]


Processing collection: LegalDocument


            Please make sure to close the connection using `client.close()`.
Retrieving docs for LegalDocument: 100%|██████████| 150/150 [05:07<00:00,  2.05s/it]


Processing collection: multilingual


Retrieving docs for multilingual: 100%|██████████| 150/150 [02:45<00:00,  1.11s/it]


In [62]:
df = pd.read_csv("generated_qa_expt_2.csv")
cols_to_eval = ["BAAI_documents_retrieved", "snowflake_documents_retrieved", "multilingual_documents_retrieved"]
ks = [3, 5, 7]

results_df = evaluate_ir_metrics_table(df, ks=ks, col_names=cols_to_eval)
results_df

Unnamed: 0,Column,k,Precision,Recall,MRR
0,BAAI_documents_retrieved,3,0.306667,0.886667,0.69
1,BAAI_documents_retrieved,5,0.193333,0.92,0.698
2,BAAI_documents_retrieved,7,0.141905,0.94,0.701333
3,snowflake_documents_retrieved,3,0.226667,0.68,0.443333
4,snowflake_documents_retrieved,5,0.162667,0.8,0.470667
5,snowflake_documents_retrieved,7,0.124762,0.86,0.479714
6,multilingual_documents_retrieved,3,0.208889,0.606667,0.437778
7,multilingual_documents_retrieved,5,0.14,0.673333,0.453444
8,multilingual_documents_retrieved,7,0.108571,0.726667,0.461698


3. Hybrid weight to use

In [69]:
import pandas as pd
import json
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm


COLLECTION_NAME = "BAAI"
ALPHAS = [0.3, 0.5, 0.7]

class ModelManager:
    def __init__(self, model_path):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModel.from_pretrained(model_path, add_pooling_layer=True)

    def get_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            embeddings = self.model(**inputs)[0][:, 0]
        return torch.nn.functional.normalize(embeddings, p=2, dim=1).squeeze(0).tolist()

base_dir = os.getcwd()
embedding_path = os.path.join(base_dir, "embeddings")
model_manager = ModelManager(embedding_path)  
client = OpenAI()


df = pd.read_csv("generated_qa.csv")

for ALPHA in ALPHAS:
    retrieved_docs_list = []
    for question in tqdm(df["Question_EN"], desc=f"Retrieving docs for {COLLECTION_NAME} (alpha={ALPHA})"):
        with DocumentRetrievalModule(host="localhost", collection_name=COLLECTION_NAME, alpha=ALPHA) as searcher:
            context_docs = searcher.search_documents(question, max_results=7)
            sources = [(doc["source"], doc["category"], doc["score"]) for doc in context_docs]
     
        retrieved_docs_list.append(sources)

    df[f"{COLLECTION_NAME}_documents_retrieved_{ALPHA}"] = [json.dumps(docs) for docs in retrieved_docs_list]

df.to_csv("generated_qa_expt_3.csv", index=False)

Retrieving docs for BAAI (alpha=0.3): 100%|██████████| 150/150 [02:43<00:00,  1.09s/it]
Retrieving docs for BAAI (alpha=0.5): 100%|██████████| 150/150 [03:39<00:00,  1.46s/it]
Retrieving docs for BAAI (alpha=0.7): 100%|██████████| 150/150 [02:36<00:00,  1.04s/it]


In [70]:
df = pd.read_csv("generated_qa_expt_3.csv")
cols_to_eval = ["BAAI_documents_retrieved_0.3", "BAAI_documents_retrieved_0.5", "BAAI_documents_retrieved_0.7"]
ks = [3, 5, 7]

results_df = evaluate_ir_metrics_table(df, ks=ks, col_names=cols_to_eval)
results_df

Unnamed: 0,Column,k,Precision,Recall,MRR
0,BAAI_documents_retrieved_0.3,3,0.3,0.866667,0.697778
1,BAAI_documents_retrieved_0.3,5,0.194667,0.906667,0.707444
2,BAAI_documents_retrieved_0.3,7,0.144762,0.94,0.712683
3,BAAI_documents_retrieved_0.5,3,0.306667,0.88,0.707778
4,BAAI_documents_retrieved_0.5,5,0.198667,0.926667,0.718111
5,BAAI_documents_retrieved_0.5,7,0.145714,0.953333,0.722238
6,BAAI_documents_retrieved_0.7,3,0.295556,0.866667,0.685556
7,BAAI_documents_retrieved_0.7,5,0.197333,0.926667,0.699222
8,BAAI_documents_retrieved_0.7,7,0.145714,0.953333,0.70319


4. Query Transformation

In [80]:
import pandas as pd
import json
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

query_transformations = ["refine_query", "HyDE"]
models = ["gpt-4o", "gpt-4o-mini"]

class ModelManager:
    def __init__(self, model_path):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModel.from_pretrained(model_path, add_pooling_layer=True)

    def get_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            embeddings = self.model(**inputs)[0][:, 0]
        return torch.nn.functional.normalize(embeddings, p=2, dim=1).squeeze(0).tolist()

base_dir = os.getcwd()
embedding_path = os.path.join(base_dir, "embeddings")
model_manager = ModelManager(embedding_path)  
client = OpenAI()


df = pd.read_csv("generated_qa.csv")

for model_name in models:
    query_transformation_module = QueryTransformationModule(
        refine_model=model_name, 
        hyde_model=model_name
    )
    
    for query_transformation in query_transformations:
        retrieved_docs_list = []
        refined_queries = []

        for question in tqdm(df["Question_EN"], desc=f"{query_transformation} ({model_name})"):
            if query_transformation == "refine_query":
                refined_query = query_transformation_module.refine_query_with_history(question, context_str="")
            else:  # HyDE
                refined_query = query_transformation_module.generate_hypothetical_document(question)

            refined_queries.append(refined_query)

            with DocumentRetrievalModule(host="localhost", collection_name="BAAI", alpha=0.5) as searcher:
                try:
                    context_docs = searcher.search_documents(refined_query, max_results=7)
                    sources = [(doc["source"], doc["category"], doc["score"]) for doc in context_docs]
                except Exception:
                    sources = []

            retrieved_docs_list.append(sources)

        refined_col_name = f"{query_transformation}_refined_queries_{model_name}"
        docs_col_name = f"{query_transformation}_retrieved_docs_{model_name}"
        
        df[refined_col_name] = refined_queries
        df[docs_col_name] = [json.dumps(docs) for docs in retrieved_docs_list]

df.to_csv("generated_qa_expt_4.csv", index=False)

refine_query (gpt-4o): 100%|██████████| 150/150 [05:06<00:00,  2.05s/it]
HyDE (gpt-4o): 100%|██████████| 150/150 [14:02<00:00,  5.62s/it]
refine_query (gpt-4o-mini): 100%|██████████| 150/150 [05:47<00:00,  2.32s/it]
HyDE (gpt-4o-mini): 100%|██████████| 150/150 [16:02<00:00,  6.42s/it]


In [81]:
df = pd.read_csv("generated_qa_expt_4.csv")
cols_to_eval = ["refine_query_retrieved_docs_gpt-4o", "HyDE_retrieved_docs_gpt-4o", "refine_query_retrieved_docs_gpt-4o-mini", "HyDE_retrieved_docs_gpt-4o-mini"]
ks = [3, 5, 7]

results_df = evaluate_ir_metrics_table(df, ks=ks, col_names=cols_to_eval)
results_df

Unnamed: 0,Column,k,Precision,Recall,MRR
0,refine_query_retrieved_docs_gpt-4o,3,0.304444,0.886667,0.702222
1,refine_query_retrieved_docs_gpt-4o,5,0.194667,0.933333,0.712556
2,refine_query_retrieved_docs_gpt-4o,7,0.141905,0.946667,0.714619
3,HyDE_retrieved_docs_gpt-4o,3,0.28,0.826667,0.69
4,HyDE_retrieved_docs_gpt-4o,5,0.192,0.92,0.710333
5,HyDE_retrieved_docs_gpt-4o,7,0.140952,0.946667,0.714619
6,refine_query_retrieved_docs_gpt-4o-mini,3,0.304444,0.893333,0.713333
7,refine_query_retrieved_docs_gpt-4o-mini,5,0.196,0.933333,0.722
8,refine_query_retrieved_docs_gpt-4o-mini,7,0.14381,0.953333,0.725016
9,HyDE_retrieved_docs_gpt-4o-mini,3,0.286667,0.833333,0.673333


5. Overall Evaluation using english and filipino queries

In [15]:
import pandas as pd
import json
from tqdm import tqdm

df = pd.read_csv("generated_qa.csv")
models = ["gpt-4o", "gpt-4o-mini"]
query_transformation_module = QueryTransformationModule()
retrieval_decision_module = RetrievalDecisionModule()
agent = AnswerValidationAgent()

class ModelManager:
    def __init__(self, model_path):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModel.from_pretrained(model_path, add_pooling_layer=True)

    def get_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            embeddings = self.model(**inputs)[0][:, 0]
        return torch.nn.functional.normalize(embeddings, p=2, dim=1).squeeze(0).tolist()

base_dir = os.getcwd()
embedding_path = os.path.join(base_dir, "embeddings")
model_manager = ModelManager(embedding_path)  
client = OpenAI()


for model_name in models:
    response_generator_module = ResponseGeneratorModule(generation_model_with_retrieval=model_name)

    for col in [
        "Answer", "Source", "Category",
        "final_response", "final_sources", "final_context", "final_valid",
        "trial_1_query_used", "trial_1_response", "trial_1_sources", "trial_1_verified_context", "trial_1_context", "trial_1_valid",
        "trial_2_query_used", "trial_2_response", "trial_2_sources", "trial_2_verified_context", "trial_2_context", "trial_2_valid",
        "trial_3_query_used", "trial_3_response", "trial_3_sources", "trial_3_verified_context", "trial_3_context", "trial_3_valid"
    ]:
        df[f"{col}_{model_name}"] = None

    for i, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing with {model_name}"):
        user_input = row["Question_EN"]

        refined_query = query_transformation_module.refine_query_with_history(user_input, context_str="")
        trial_data = []
        final_response = ""
        final_sources = []
        final_context = []
        final_valid = False

        with DocumentRetrievalModule(host="localhost", collection_name="BAAI", alpha=0.5) as searcher:
            agent = AnswerValidationAgent()
            while agent.current_attempt < agent.max_attempts:
                trial_query = refined_query
                if agent.current_attempt > 0:
                    trial_query = query_transformation_module.generate_hypothetical_document(refined_query)

                context_docs = searcher.search_documents(trial_query, max_results=7)

                sources = [(doc["source"], doc["category"], doc["chunk_index"]) for doc in context_docs]
                context_content = [doc["text"] for doc in context_docs]

                response, relevant_sources, relevant_contexts = response_generator_module.generate_response(user_input, context_docs, sources)

                is_valid = agent.validate_answer(response, relevant_contexts, user_input)

                trial_data.append({
                    "query": trial_query,
                    "response": response,
                    "sources": relevant_sources,
                    "context": context_content,
                    "verified_context": relevant_contexts,
                    "is_valid": is_valid
                })

                if is_valid and relevant_sources:
                    final_response = response
                    final_sources = relevant_sources
                    final_context = context_content
                    final_valid = True
                    break

                agent.current_attempt += 1

        df.at[i, f"Answer_{model_name}"] = final_response
        df.at[i, f"final_response_{model_name}"] = final_response
        df.at[i, f"final_sources_{model_name}"] = json.dumps(final_sources)
        df.at[i, f"final_context_{model_name}"] = json.dumps(final_context)
        df.at[i, f"final_valid_{model_name}"] = final_valid

        if final_sources:
            df.at[i, f"Source_{model_name}"] = final_sources[0][0] if final_sources else None
            df.at[i, f"Category_{model_name}"] = final_sources[0][1] if final_sources else None

        for j, trial in enumerate(trial_data):
            trial_num = j + 1
            df.at[i, f"trial_{trial_num}_query_used_{model_name}"] = trial["query"]
            df.at[i, f"trial_{trial_num}_response_{model_name}"] = trial["response"]
            df.at[i, f"trial_{trial_num}_sources_{model_name}"] = json.dumps(trial["sources"])
            df.at[i, f"trial_{trial_num}_verified_context_{model_name}"] = json.dumps(trial["verified_context"])
            df.at[i, f"trial_{trial_num}_context_{model_name}"] = json.dumps(trial["context"])
            df.at[i, f"trial_{trial_num}_valid_{model_name}"] = trial["is_valid"]

df.to_csv("generated_qa_expt_5_final_expt.csv", index=False)

Processing with gpt-4o: 100%|██████████| 150/150 [18:37<00:00,  7.45s/it]
Processing with gpt-4o-mini: 100%|██████████| 150/150 [15:43<00:00,  6.29s/it]


In [18]:
import pandas as pd
import json
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_similarity, answer_correctness
from datasets import Dataset
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

df = pd.read_csv("generated_qa_expt_5_final_expt.csv")

model = "gpt-4o"

questions = []
answers = []
contexts = []
ground_truths = []

for _, row in df.iterrows():
    questions.append(row.get("Question_EN", ""))

    gen_answer = row.get(f"final_response_{model}", "")
    answers.append(gen_answer if isinstance(gen_answer, str) else "")

    try:
        context = json.loads(row.get(f"final_context_{model}", "[]"))
    except:
        context = []
    contexts.append(context)

    # Ground truth answer in English
    ground_truths.append(row.get("Answer", ""))

# Prepare dataset for RAGAS
data_samples = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": ground_truths,
}

dataset = Dataset.from_dict(data_samples)

# Evaluate
score = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, answer_similarity, answer_correctness]
)

print(f"RAGAS evaluation scores for model {model}:")
print(score)

Evaluating:  10%|█         | 62/600 [00:45<08:25,  1.06it/s]No statements were generated from the answer.
Evaluating:  17%|█▋        | 101/600 [01:09<03:04,  2.70it/s]No statements were generated from the answer.
Evaluating:  52%|█████▏    | 310/600 [04:06<10:02,  2.08s/it]No statements were generated from the answer.
Evaluating:  57%|█████▊    | 345/600 [04:33<02:11,  1.94it/s]No statements were generated from the answer.
Evaluating:  79%|███████▉  | 476/600 [06:37<01:55,  1.07it/s]Exception raised in Job[532]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-D3BrhOxEyoKAj6MidNYs4AOE on tokens per min (TPM): Limit 200000, Used 198675, Requested 6994. Please try again in 1.7s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Evaluating:  89%|████████▉ | 536/600 [07:37<01:22,  1.30s/it]Exception raised in Job[412]: RateLimitError(Error cod

RAGAS evaluation scores for model gpt-4o:
{'faithfulness': 0.9218, 'answer_relevancy': 0.9525, 'answer_similarity': 0.9546, 'answer_correctness': 0.7787}


In [19]:
import pandas as pd
import json
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_similarity, answer_correctness
from datasets import Dataset
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

df = pd.read_csv("generated_qa_expt_5_final_expt.csv")

model = "gpt-4o-mini"

questions = []
answers = []
contexts = []
ground_truths = []

for _, row in df.iterrows():
    questions.append(row["Question_EN"])

    # Generated answer for this model (final_response)
    gen_answer = row.get(f"final_response_{model}", "")
    answers.append(gen_answer if isinstance(gen_answer, str) else "")

    # Context (load JSON if possible)
    try:
        context = json.loads(row.get(f"final_context_{model}", "[]"))
    except:
        context = []
    contexts.append(context)

    # Ground truth answer (original reference)
    ground_truths.append(row.get("Answer", ""))

# Prepare dataset for RAGAS
data_samples = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": ground_truths,
}

dataset = Dataset.from_dict(data_samples)

# Evaluate
score = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, answer_similarity, answer_correctness]
)

print(f"RAGAS evaluation scores for model {model}:")
print(score)

Evaluating:  38%|███▊      | 228/600 [02:43<04:50,  1.28it/s]No statements were generated from the answer.
Evaluating:  55%|█████▌    | 330/600 [04:23<02:47,  1.61it/s]Exception raised in Job[413]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-D3BrhOxEyoKAj6MidNYs4AOE on tokens per min (TPM): Limit 200000, Used 198858, Requested 5897. Please try again in 1.426s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Evaluating:  69%|██████▉   | 414/600 [05:39<02:38,  1.17it/s]Exception raised in Job[153]: TimeoutError()
Evaluating:  88%|████████▊ | 526/600 [07:41<00:54,  1.36it/s]Exception raised in Job[240]: TimeoutError()
Evaluating:  89%|████████▉ | 535/600 [07:48<00:48,  1.34it/s]No statements were generated from the answer.
Evaluating:  90%|█████████ | 540/600 [07:53<00:51,  1.16it/s]Exception raised in Job[329]: RateLimitError(Error co

RAGAS evaluation scores for model gpt-4o-mini:
{'faithfulness': 0.8921, 'answer_relevancy': 0.9454, 'answer_similarity': 0.9580, 'answer_correctness': 0.7858}


In [20]:
import pandas as pd
import json
import os
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel
from openai import OpenAI

df = pd.read_csv("generated_qa.csv")
models = ["gpt-4o", "gpt-4o-mini"]
query_transformation_module = QueryTransformationModule()
retrieval_decision_module = RetrievalDecisionModule()
agent = AnswerValidationAgent()

class ModelManager:
    def __init__(self, model_path):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModel.from_pretrained(model_path, add_pooling_layer=True)

    def get_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            embeddings = self.model(**inputs)[0][:, 0]
        return torch.nn.functional.normalize(embeddings, p=2, dim=1).squeeze(0).tolist()

base_dir = os.getcwd()
embedding_path = os.path.join(base_dir, "embeddings")
model_manager = ModelManager(embedding_path)  
client = OpenAI()

def translate_to_english(text):
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful translator from Tagalog to English."},
            {"role": "user", "content": f"Translate this text to English:\n\n{text}"}
        ],
        temperature=0.2,
    )
    return response.choices[0].message.content.strip()

for model_name in models:
    response_generator_module = ResponseGeneratorModule(generation_model_with_retrieval=model_name)

    for col in [
        "Answer", "Source", "Category",
        "final_response", "final_sources", "final_context", "final_valid",
        "trial_1_query_used", "trial_1_response", "trial_1_sources", "trial_1_verified_context", "trial_1_context", "trial_1_valid",
        "trial_2_query_used", "trial_2_response", "trial_2_sources", "trial_2_verified_context", "trial_2_context", "trial_2_valid",
        "trial_3_query_used", "trial_3_response", "trial_3_sources", "trial_3_verified_context", "trial_3_context", "trial_3_valid"
    ]:
        df[f"{col}_{model_name}"] = None

    for i, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing with {model_name}"):
        user_input = row["Question_TL"] 

        refined_query = query_transformation_module.refine_query_with_history(user_input, context_str="")
        trial_data = []
        final_response = ""
        final_sources = []
        final_context = []
        final_valid = False

        with DocumentRetrievalModule(host="localhost", collection_name="BAAI", alpha=0.5) as searcher:
            agent = AnswerValidationAgent()
            while agent.current_attempt < agent.max_attempts:
                trial_query = refined_query
                if agent.current_attempt > 0:
                    trial_query = query_transformation_module.generate_hypothetical_document(refined_query)

                context_docs = searcher.search_documents(trial_query, max_results=7)

                sources = [(doc["source"], doc["category"], doc["chunk_index"]) for doc in context_docs]
                context_content = [doc["text"] for doc in context_docs]

                response, relevant_sources, relevant_contexts = response_generator_module.generate_response(user_input, context_docs, sources)

                is_valid = agent.validate_answer(response, relevant_contexts, user_input)

                trial_data.append({
                    "query": trial_query,
                    "response": response,
                    "sources": relevant_sources,
                    "context": context_content,
                    "verified_context": relevant_contexts,
                    "is_valid": is_valid
                })

                if is_valid and relevant_sources:
                    final_response = response
                    final_sources = relevant_sources
                    final_context = context_content
                    final_valid = True
                    break

                agent.current_attempt += 1

        # Translate final response to English
        final_answer_en = translate_to_english(final_response) if final_response else ""

        df.at[i, f"Answer_{model_name}"] = final_response  # Tagalog answer
        df.at[i, f"final_response_{model_name}"] = final_response  # Tagalog final response
        df.at[i, f"final_answer_en_{model_name}"] = final_answer_en  # English translation of final answer
        df.at[i, f"final_sources_{model_name}"] = json.dumps(final_sources)
        df.at[i, f"final_context_{model_name}"] = json.dumps(final_context)
        df.at[i, f"final_valid_{model_name}"] = final_valid

        if final_sources:
            df.at[i, f"Source_{model_name}"] = final_sources[0][0] if final_sources else None
            df.at[i, f"Category_{model_name}"] = final_sources[0][1] if final_sources else None

        for j, trial in enumerate(trial_data):
            trial_num = j + 1
            df.at[i, f"trial_{trial_num}_query_used_{model_name}"] = trial["query"]
            df.at[i, f"trial_{trial_num}_response_{model_name}"] = trial["response"]
            df.at[i, f"trial_{trial_num}_sources_{model_name}"] = json.dumps(trial["sources"])
            df.at[i, f"trial_{trial_num}_verified_context_{model_name}"] = json.dumps(trial["verified_context"])
            df.at[i, f"trial_{trial_num}_context_{model_name}"] = json.dumps(trial["context"])
            df.at[i, f"trial_{trial_num}_valid_{model_name}"] = trial["is_valid"]

df.to_csv("generated_qa_expt_5_tagalog_final_expt.csv", index=False)

Processing with gpt-4o: 100%|██████████| 150/150 [27:30<00:00, 11.01s/it]
Processing with gpt-4o-mini: 100%|██████████| 150/150 [35:31<00:00, 14.21s/it]


In [21]:
import pandas as pd
import json
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_similarity, answer_correctness
from datasets import Dataset

df = pd.read_csv("generated_qa_expt_5_tagalog_final_expt.csv")

model = "gpt-4o-mini"

questions = []
answers = []
contexts = []
ground_truths = []

for _, row in df.iterrows():
    questions.append(row.get("Question_EN", ""))

    gen_answer = row.get(f"final_answer_en_{model}", "")
    answers.append(gen_answer if isinstance(gen_answer, str) else "")

    try:
        context = json.loads(row.get(f"final_context_{model}", "[]"))
    except:
        context = []
    contexts.append(context)

    # Ground truth answer in English
    ground_truths.append(row.get("Answer", ""))

# Prepare dataset for RAGAS
data_samples = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": ground_truths,
}

dataset = Dataset.from_dict(data_samples)

# Evaluate
score = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, answer_similarity, answer_correctness]
)

print(f"RAGAS evaluation scores for model {model} for Tagalog questions:")
print(score)

Evaluating:  12%|█▏        | 69/600 [00:58<09:54,  1.12s/it]No statements were generated from the answer.
Evaluating:  46%|████▌     | 274/600 [03:32<04:40,  1.16it/s]No statements were generated from the answer.
Evaluating:  55%|█████▌    | 332/600 [04:41<03:41,  1.21it/s]Exception raised in Job[316]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-D3BrhOxEyoKAj6MidNYs4AOE on tokens per min (TPM): Limit 200000, Used 195496, Requested 9197. Please try again in 1.407s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Evaluating:  58%|█████▊    | 348/600 [04:58<05:41,  1.36s/it]Exception raised in Job[137]: TimeoutError()
Evaluating:  61%|██████    | 364/600 [05:04<02:03,  1.91it/s]No statements were generated from the answer.
Evaluating:  87%|████████▋ | 524/600 [07:42<02:12,  1.74s/it]Exception raised in Job[232]: TimeoutError()
Evaluati

RAGAS evaluation scores for model gpt-4o-mini for Tagalog questions:
{'faithfulness': 0.8648, 'answer_relevancy': 0.9366, 'answer_similarity': 0.9509, 'answer_correctness': 0.7405}


In [22]:
import pandas as pd
import json
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_similarity, answer_correctness
from datasets import Dataset

df = pd.read_csv("generated_qa_expt_5_tagalog_final_expt.csv")

model = "gpt-4o"

questions = []
answers = []
contexts = []
ground_truths = []

for _, row in df.iterrows():
    questions.append(row.get("Question_EN", ""))

    gen_answer = row.get(f"final_answer_en_{model}", "")
    answers.append(gen_answer if isinstance(gen_answer, str) else "")

    try:
        context = json.loads(row.get(f"final_context_{model}", "[]"))
    except:
        context = []
    contexts.append(context)

    # Ground truth answer in English
    ground_truths.append(row.get("Answer", ""))

# Prepare dataset for RAGAS
data_samples = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": ground_truths,
}

dataset = Dataset.from_dict(data_samples)

# Evaluate
score = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, answer_similarity, answer_correctness]
)

print(f"RAGAS evaluation scores for model {model} for Tagalog questions:")
print(score)

Evaluating:  10%|▉         | 58/600 [00:36<02:26,  3.69it/s]No statements were generated from the answer.
Evaluating:  28%|██▊       | 166/600 [01:41<05:55,  1.22it/s]No statements were generated from the answer.
Evaluating:  66%|██████▌   | 395/600 [05:24<02:50,  1.20it/s]Exception raised in Job[233]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-D3BrhOxEyoKAj6MidNYs4AOE on tokens per min (TPM): Limit 200000, Used 197888, Requested 9681. Please try again in 2.27s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Evaluating:  66%|██████▋   | 399/600 [05:33<06:36,  1.97s/it]Exception raised in Job[316]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-D3BrhOxEyoKAj6MidNYs4AOE on tokens per min (TPM): Limit 200000, Used 193982, Requested 9194. Please try again in 952ms. Visit 

RAGAS evaluation scores for model gpt-4o for Tagalog questions:
{'faithfulness': 0.9213, 'answer_relevancy': 0.9518, 'answer_similarity': 0.9555, 'answer_correctness': 0.7664}


In [25]:
class ResponseGeneratorWithoutRetrievalModule:
    def __init__(
        self,
        client=client,
        generation_model_without_retrieval="gpt-4o-mini",
        generation_temperature=0.1,
        max_tokens=512
    ):
        self.client = client
        self.generation_model_without_retrieval = generation_model_without_retrieval
        self.generation_temperature = generation_temperature
        self.max_tokens = max_tokens

    def conversation_without_retrieval(self, user_input, context_str=None):
        prompt = f"""
        You are a Quezon City Legal Provider. Answer the query using your internal knowledge.

        User query:
        {user_input}

        Please answer clearly and accurately.  
        Note: Provide the answer in English.
        """

        try:
            response = self.client.chat.completions.create(
                model=self.generation_model_without_retrieval,
                messages=[{"role": "user", "content": prompt.strip()}],
                max_tokens=self.max_tokens
            )
            return response.choices[0].message.content.strip()
        except Exception:
            return "Error generating response"    

In [32]:
import pandas as pd
import json
from tqdm import tqdm

df = pd.read_csv("generated_qa.csv")
models = ["gpt-4o-mini"]

response_generator_module = ResponseGeneratorWithoutRetrievalModule()

client = OpenAI()

for model_name in models:
    response_generator_module = ResponseGeneratorWithoutRetrievalModule(generation_model_without_retrieval=model_name)

    for i, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing with {model_name}"):
        user_input = row["Question_EN"]

        generated_response = response_generator_module.conversation_without_retrieval(user_input)
        df.at[i, f"generated_response_{model_name}"] = generated_response

df.to_csv("generated_qa_expt_5_english_no_retrieval.csv", index=False)

Processing with gpt-4o-mini: 100%|██████████| 150/150 [15:02<00:00,  6.01s/it]


In [34]:
import pandas as pd
import json
from ragas import evaluate
from ragas.metrics import answer_relevancy, answer_similarity, answer_correctness
from datasets import Dataset

df = pd.read_csv("generated_qa_expt_5_english_no_retrieval.csv")

model = "gpt-4o-mini"

questions = []
answers = []
ground_truths = []

for _, row in df.iterrows():
    questions.append(row.get("Question_EN", ""))

    gen_answer = row.get(f"generated_response_{model}", "")
    answers.append(gen_answer if isinstance(gen_answer, str) else "")

    # Ground truth answer in English
    ground_truths.append(row.get("Answer", ""))

# Prepare dataset for RAGAS
data_samples = {
    "question": questions,
    "answer": answers,
    "ground_truth": ground_truths,
}

dataset = Dataset.from_dict(data_samples)

# Evaluate
score = evaluate(
    dataset,
    metrics=[answer_similarity, answer_correctness]
)

print(f"RAGAS evaluation scores for no RAG:")
print(score)

Evaluating: 100%|██████████| 300/300 [05:38<00:00,  1.13s/it]


RAGAS evaluation scores for no RAG:
{'answer_similarity': 0.9324, 'answer_correctness': 0.6024}


In [35]:
import pandas as pd
import json
from tqdm import tqdm

df = pd.read_csv("generated_qa.csv")
models = ["gpt-4o"]

response_generator_module = ResponseGeneratorWithoutRetrievalModule()

client = OpenAI()

for model_name in models:
    response_generator_module = ResponseGeneratorWithoutRetrievalModule(generation_model_without_retrieval=model_name)

    for i, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing with {model_name}"):
        user_input = row["Question_EN"]

        generated_response = response_generator_module.conversation_without_retrieval(user_input)
        df.at[i, f"generated_response_{model_name}"] = generated_response

df.to_csv("generated_qa_expt_5_english_no_retrieval_4o.csv", index=False)

Processing with gpt-4o: 100%|██████████| 150/150 [18:25<00:00,  7.37s/it]


In [36]:
import pandas as pd
import json
from ragas import evaluate
from ragas.metrics import answer_relevancy, answer_similarity, answer_correctness
from datasets import Dataset

df = pd.read_csv("generated_qa_expt_5_english_no_retrieval_4o.csv")

model = "gpt-4o"

questions = []
answers = []
ground_truths = []

for _, row in df.iterrows():
    questions.append(row.get("Question_EN", ""))

    gen_answer = row.get(f"generated_response_{model}", "")
    answers.append(gen_answer if isinstance(gen_answer, str) else "")

    # Ground truth answer in English
    ground_truths.append(row.get("Answer", ""))

# Prepare dataset for RAGAS
data_samples = {
    "question": questions,
    "answer": answers,
    "ground_truth": ground_truths,
}

dataset = Dataset.from_dict(data_samples)

# Evaluate
score = evaluate(
    dataset,
    metrics=[answer_similarity, answer_correctness]
)

print(f"RAGAS evaluation scores for no RAG:")
print(score)

Evaluating: 100%|██████████| 300/300 [05:12<00:00,  1.04s/it]


RAGAS evaluation scores for no RAG:
{'answer_similarity': 0.9316, 'answer_correctness': 0.6073}


In [37]:
class ResponseGeneratorWithoutRetrievalModule:
    def __init__(
        self,
        client=client,
        generation_model_without_retrieval="gpt-4o-mini",
        max_tokens=512
    ):
        self.client = client
        self.generation_model_without_retrieval = generation_model_without_retrieval
        self.max_tokens = max_tokens

    def conversation_without_retrieval(self, user_input, context_str=None):
        prompt = f"""
        You are a Quezon City Legal Provider. Answer the query using your internal knowledge.

        User query:
        {user_input}

        Please answer clearly and accurately.  
        Note: Provide the answer in English.
        """

        try:
            response = self.client.chat.completions.create(
                model=self.generation_model_without_retrieval,
                messages=[{"role": "user", "content": prompt.strip()}],
                max_tokens=self.max_tokens
            )
            return response.choices[0].message.content.strip()
        except Exception:
            return "Error generating response"    

In [41]:
import pandas as pd
import json
from tqdm import tqdm

df = pd.read_csv("generated_qa.csv")
models = ["gpt-4o-mini-search-preview"]

response_generator_module = ResponseGeneratorWithoutRetrievalModule()

client = OpenAI()

for model_name in models:
    response_generator_module = ResponseGeneratorWithoutRetrievalModule(generation_model_without_retrieval=model_name)

    for i, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing with {model_name}"):
        user_input = row["Question_EN"]

        generated_response = response_generator_module.conversation_without_retrieval(user_input)
        df.at[i, f"generated_response_{model_name}"] = generated_response

df.to_csv("generated_qa_expt_5_english_no_retrieval_web_search.csv", index=False)

Processing with gpt-4o-mini-search-preview: 100%|██████████| 150/150 [10:18<00:00,  4.13s/it]


In [None]:
import pandas as pd
import json
from ragas import evaluate
from ragas.metrics import answer_relevancy, answer_similarity, answer_correctness
from datasets import Dataset

df = pd.read_csv("generated_qa_expt_5_english_no_retrieval_web_search.csv")

model = "gpt-4o-mini-search-preview"

questions = []
answers = []
ground_truths = []

for _, row in df.iterrows():
    questions.append(row.get("Question_EN", ""))

    gen_answer = row.get(f"generated_response_{model}", "")
    answers.append(gen_answer if isinstance(gen_answer, str) else "")

    # Ground truth answer in English
    ground_truths.append(row.get("Answer", ""))

# Prepare dataset for RAGAS
data_samples = {
    "question": questions,
    "answer": answers,
    "ground_truth": ground_truths,
}

dataset = Dataset.from_dict(data_samples)

# Evaluate
score = evaluate(
    dataset,
    metrics=[answer_similarity, answer_correctness]
)

print(f"RAGAS evaluation scores for no RAG (web search):")
print(score)