In [1]:
# Experiment settings. The whole mapping is handed to WikiRAG as `config_dict`;
# the `llm` and `quantized` entries are also read when the stand-alone model
# is loaded later in the notebook.
config = dict(
    llm="microsoft/Phi-3.5-mini-instruct",
    embedding_model="BAAI/bge-base-en-v1.5",
    index_path="./wiki_index/chroma.sqlite3",
    chunk_size=256,
    chunk_overlap=32,
    similarity_k=5,
    similarity_cutoff=0.5,
    n_articles=3,
    quantized=True,
)

In [None]:
%env API_KEY="your_api_key_here"
%env CX="your_cx_here"
%env HF_TOKEN="your_hf_token_here"

In [2]:
import json

# Parallel accumulators for the evaluation questions and their reference
# answers; they are filled from `data` by the next cell.
questions = []
ground_truths = []

# Read the evaluation set. The encoding is given explicitly because JSON text is
# UTF-8 and open()'s default depends on the platform locale.
with open('eval_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Keep only the entries that carry both a question and a reference answer.
# Both lists are rebuilt from `data` (instead of appended to), so re-running
# this cell cannot duplicate the dataset.
qa_pairs = [
    (entry['question'], entry['answer'])
    for entry in data
    if 'question' in entry and 'answer' in entry
]
questions = [question for question, _ in qa_pairs]
ground_truths = [answer for _, answer in qa_pairs]

In [None]:
# Retrieval-augmented engine under evaluation, configured from the notebook-wide
# `config` mapping.
from rag import WikiRAG

engine = WikiRAG(
    config_dict=config,
)

In [None]:
import os

import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from transformers import BitsAndBytesConfig

# Stand-alone copy of the model named in config['llm'], used as the
# no-retrieval baseline. The Hugging Face token comes from the environment.
kwargs = {"token": os.environ['HF_TOKEN']}

if config['quantized']:
    # 4-bit NF4 weights with double quantisation and float16 compute; in this
    # mode no explicit torch_dtype is passed to the model.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )
    kwargs["quantization_config"] = quantization_config
else:
    # Unquantised load: request bfloat16 weights.
    kwargs["torch_dtype"] = torch.bfloat16

# The same checkpoint name is used for both the model and its tokenizer.
llm = HuggingFaceLLM(
    model_name=config['llm'],
    tokenizer_name=config['llm'],
    model_kwargs=kwargs,
    tokenizer_kwargs={"token": os.environ['HF_TOKEN']},
)

In [None]:
# For every evaluation question collect the bare-LLM answer, the RAG answer and
# the contexts the RAG engine retrieved. The three lists stay index-aligned
# with `questions`.
llm_answers = []
rag_answers = []
contexts = []

for question in questions:
    # `complete` returns a response object; str() yields its generated text,
    # which is what the evaluation dataset needs.
    llm_answer = str(llm.complete(question))
    # Unpack into a separate name so the `contexts` accumulator is not
    # overwritten by the per-question result.
    # NOTE(review): assumes engine.query returns (answer, retrieved_contexts)
    # with the answer as text and the contexts as a list of strings. Confirm
    # against rag.WikiRAG.
    rag_answer, question_contexts = engine.query(question)
    llm_answers.append(llm_answer)
    rag_answers.append(rag_answer)
    contexts.append(question_contexts)

In [None]:
# 1. Evaluate the bare LLM's answers (answer correctness, answer relevancy).
from datasets import Dataset 
from ragas.metrics import faithfulness, answer_correctness, answer_relevancy, context_recall,context_precision
from ragas import evaluate

llm_samples = {
    'question': questions,
    # The LLM returns response objects; the dataset needs plain text.
    # str() is a no-op for values that are already strings.
    'answer': [str(answer) for answer in llm_answers],
    'ground_truth': ground_truths,
}

llm_dataset = Dataset.from_dict(llm_samples)

# Faithfulness scores an answer against retrieved contexts. This baseline has
# no `contexts` column, so the metric is not applicable here and is only
# computed for the RAG run.
# NOTE(review): some ragas releases also require `contexts` for
# answer_relevancy. Confirm against the installed version.
llm_score = evaluate(llm_dataset, metrics=[answer_correctness, answer_relevancy])

In [None]:
# 2. Evaluate the RAG pipeline: its own answers plus the retrieved contexts,
# scored on generation and retrieval metrics.
rag_samples = {
    'question': questions,
    'answer': rag_answers,
    'ground_truth': ground_truths,
    # ragas looks up retrieved passages under the plural key `contexts`.
    # NOTE(review): each entry is assumed to be a list of strings. Confirm
    # against what engine.query returns.
    'contexts': contexts,
}

rag_dataset = Dataset.from_dict(rag_samples)

# Score the RAG dataset built above, not the LLM-only baseline.
rag_score = evaluate(
    rag_dataset,
    metrics=[
        faithfulness,
        answer_correctness,
        answer_relevancy,
        context_precision,
        context_recall,
    ],
)