In [1]:

import pandas as pd
import tiktoken
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index import VectorStoreIndex, SimpleWebPageReader, ServiceContext
from llama_index.embeddings import OpenAIEmbedding
from llama_index.evaluation import FaithfulnessEvaluator, \
    RelevancyEvaluator
from llama_index.llms import OpenAI
from llama_index.node_parser import SimpleNodeParser
from llama_index.prompts import PromptTemplate
from llama_index.query_engine import CustomQueryEngine
from llama_index.response.schema import Response
from llama_index.response_synthesizers import (
    get_response_synthesizer,
    BaseSynthesizer,
)
from llama_index.retrievers import VectorIndexRetriever, BaseRetriever
from tqdm.auto import tqdm


In [2]:
import getpass
import os

# Never paste a real key into the notebook source. Reuse OPENAI_API_KEY when it is
# already exported in the environment; otherwise prompt for it interactively
# (getpass does not echo what is typed).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [3]:
def num_tokens_from_string(string: str, model_name: str) -> int:
    """Count the tokens `string` encodes to under the tiktoken encoding for `model_name`."""
    tokenizer = tiktoken.encoding_for_model(model_name)
    return len(tokenizer.encode(string))

In [4]:
def get_documents(url: str):
    """Load the page at `url` through SimpleWebPageReader (with html_to_text enabled).

    Returns whatever the reader's `load_data` returns for the single-URL list.
    """
    page_reader = SimpleWebPageReader(html_to_text=True)
    return page_reader.load_data([url])

In [5]:
def get_embedding():
    """Build a HuggingFaceEmbeddings wrapper around all-mpnet-base-v2.

    The model is placed on the CPU and asked not to normalize its embeddings.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": False},
    )

In [6]:
# Fetch the web page that the rest of the notebook indexes and queries.
documents = get_documents("https://www.pg.unicamp.br/norma/31594/0")
# NOTE(review): embedding_model is not referenced again in the visible cells — the
# service context is built with OpenAIEmbedding() instead. Confirm which is intended.
embedding_model = get_embedding()

Our RAG pipeline uses `gpt-3.5-turbo` to index and query our documents.

In [7]:
# Generation model: gpt-3.5-turbo at a low temperature.
gpt35_llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Documents are chunked by LangChain's recursive character splitter.
node_parser = SimpleNodeParser.from_defaults(
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=500,
    ),
)

# Bundle the LLM, the OpenAI embedding model and the node parser so the index
# and the evaluators can share one configuration.
gpt35_service_context = ServiceContext.from_defaults(
    llm=gpt35_llm,
    embed_model=OpenAIEmbedding(),
    node_parser=node_parser,
)

### Configuring a Query Engine

In [8]:
# build index (documents are parsed and embedded through the shared service context)
index = VectorStoreIndex.from_documents(
    documents,
    service_context=gpt35_service_context,
    show_progress=True,
)

# configure retriever: keep the 4 most similar nodes for each query
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=4,
)

# configure response synthesizer
# Pass the service context explicitly so answers are generated by the gpt35_llm
# configured for this notebook rather than by whatever the library default is.
response_synthesizer = get_response_synthesizer(
    service_context=gpt35_service_context,
    response_mode="compact",
)

Parsing documents into nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/338 [00:00<?, ?it/s]

In [9]:
# Portuguese question-answering prompt. The text substituted for {context_str} is
# shown between dashed rules, then the model is told to answer the question in
# {query_str} from that information alone, with no prior knowledge.
text_qa_template = PromptTemplate(
    "As informações dos documentos estão apresentadas abaixo.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Dadas as informações dos documentos e nenhum conhecimento prévio, "
    "responda a seguinte pergunta.\n"
    "Pergunta: {query_str}\n"
    "Resposta: "
)

# Portuguese refine prompt. It restates the question ({query_str}) and the current
# answer ({existing_answer}), shows additional context ({context_msg}), and asks
# for an improved answer — or ONLY the original answer if the context does not help.
refine_template = PromptTemplate(
    "A pergunta original é a seguinte: {query_str}\n"
    "Fornecemos uma resposta existente: {existing_answer}\n"
    "Temos a oportunidade de refinar a resposta existente (somente se necessário) com mais contexto abaixo.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "Dado o novo contexto, refine a resposta original para melhor responder à pergunta. "
    "Se o contexto não for útil, retorne APENAS a resposta original.\n"

    "Resposta: "
)

In [10]:
# Swap the synthesizer's QA and refine prompts for the Portuguese templates.
response_synthesizer.update_prompts(
    {
        "text_qa_template": text_qa_template,
        "refine_template": refine_template,
    }
)

### Evaluation

Our pipeline will evaluate both the response quality and hallucination. In the evaluation process we will set up `gpt-3.5-turbo` as our LLM.

In [4]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load evaluation datasets that come from a trusted source.
# The context manager closes the file handle as soon as the data is loaded.
with open("../eval_data/eval_dataset.pkl", "rb") as dataset_file:
    qa_pairs = pickle.load(dataset_file)

In [5]:
# Transpose the loaded pairs into two parallel tuples.
# Assumes qa_pairs is a non-empty iterable of (question, answer) 2-tuples — TODO confirm;
# an empty dataset would make this two-name unpacking raise ValueError.
questions, answers = zip(*qa_pairs)

In [13]:
# Tabulate the evaluation set: one row per question/answer pair.
qa_df = pd.DataFrame({"questions": questions, "answers": answers})

print(qa_df.shape)
qa_df.head()

(24, 2)


Unnamed: 0,questions,answers
0,Qual é o total de vagas oferecidas para o Vest...,"Para o ano de 2024, a Universidade Estadual de..."
1,Quais são as condições para que um candidato p...,Os candidatos que podem participar do Programa...
2,Quais são as datas e o formato das provas para...,A 1ª fase do VU 2024 será realizada no dia 29 ...
3,Qual é a consequência para um candidato que ob...,O candidato que não realizar ou obtiver nota 0...
4,Qual é o procedimento para a classificação e c...,"Para cada curso, até duas provas são considera..."


In [14]:
class UnicampQueryEngine(CustomQueryEngine):
    """RAG query engine: retrieve nodes for a query, then synthesize a response from them.

    Both collaborators are declared as annotated fields and are supplied as
    keyword arguments when the engine is constructed.
    """

    # Finds the nodes relevant to a query string.
    retriever: BaseRetriever
    # Turns the query string plus the retrieved nodes into a response object.
    response_synthesizer: BaseSynthesizer

    def custom_query(self, query_str: str):
        """Answer `query_str` synchronously.

        Returns the response object produced by the synthesizer for the
        retrieved nodes.
        """
        nodes = self.retriever.retrieve(query_str)
        return self.response_synthesizer.synthesize(query_str, nodes)

    async def acustom_query(self, query_str: str):
        """Answer `query_str` without blocking the event loop.

        Uses the retriever's and synthesizer's async entry points so that
        several queries gathered together can actually overlap, instead of
        running the blocking synchronous path one after another.
        """
        nodes = await self.retriever.aretrieve(query_str)
        return await self.response_synthesizer.asynthesize(query_str, nodes)

In [15]:
# Wire the retriever and the prompt-customized synthesizer into the custom engine.
query_engine = UnicampQueryEngine(retriever=retriever, response_synthesizer=response_synthesizer)

In [16]:
# Smoke test: send a single question straight through custom_query and print the answer.
query_str = "Quem é Maria Luiza Moretti?"

response = query_engine.custom_query(query_str)
print(response)

Maria Luiza Moretti é a Reitora em exercício da Unicamp.


In [31]:
import time
import asyncio
import nest_asyncio

# Patch asyncio to allow nested event-loop use, so the asyncio.run() call in
# async_evaluate_query_engine works from inside the notebook's own running loop.
nest_asyncio.apply()


async def run_query(query_engine, q):
    """Run one query through `query_engine.acustom_query`, tolerating failures.

    Args:
        query_engine: Object exposing an awaitable `acustom_query(q)`.
        q: The question to ask.

    Returns:
        The engine's response, or a placeholder `Response` whose text is
        "Error, query failed." when the query raises an ordinary exception.
    """
    try:
        return await query_engine.acustom_query(q)
    except Exception as exc:
        # Only ordinary errors are absorbed: a bare `except:` would also swallow
        # task cancellation and KeyboardInterrupt. Report the cause so a failed
        # query is visible instead of silently lowering the evaluation score.
        print(f"Query failed for {q!r}: {exc!r}")
        return Response(response="Error, query failed.")


def async_evaluate_query_engine(evaluator, query_engine, questions, batch_size=10):
    """Query the engine in batches and score every response with `evaluator`.

    Args:
        evaluator: A `FaithfulnessEvaluator` (hallucination check) or a
            `RelevancyEvaluator` (answer-quality check).
        query_engine: Engine passed to `run_query` for every question.
        questions: Sequence of question strings (must support `len` and slicing).
        batch_size: Number of questions dispatched together per batch.

    Returns:
        `(total_correct, all_results)`: `all_results` holds one 0/1 score per
        question in input order, and `total_correct` is their sum.

    Raises:
        TypeError: If `evaluator` is neither of the supported evaluator types.
    """
    # Fail before spending any API calls: an unsupported evaluator would otherwise
    # run every query and then report a misleading score of 0.
    if not isinstance(evaluator, (FaithfulnessEvaluator, RelevancyEvaluator)):
        raise TypeError(
            "evaluator must be a FaithfulnessEvaluator or a RelevancyEvaluator, "
            f"got {type(evaluator).__name__}"
        )

    async def _run_batch(batch):
        # asyncio.run() expects a coroutine, while gather() returns a future;
        # awaiting gather inside a coroutine works with or without nest_asyncio.
        return await asyncio.gather(*(run_query(query_engine, q) for q in batch))

    # Ceiling division: `len // batch_size + 1` over-counts by one whenever the
    # number of questions is an exact multiple of batch_size.
    total_batches = -(-len(questions) // batch_size)

    total_correct = 0
    all_results = []
    for i in tqdm(range(0, len(questions), batch_size)):
        batch_qs = questions[i:i + batch_size]

        responses = asyncio.run(_run_batch(batch_qs))
        print(f"finished batch {(i // batch_size) + 1} out of {total_batches}")

        # eval for hallucination
        if isinstance(evaluator, FaithfulnessEvaluator):
            print("Use FaithfulnessEvaluator")
            for response in responses:
                # A response counts as correct when the evaluator's feedback contains "YES".
                feedback = evaluator.evaluate_response(response=response).feedback
                eval_result = 1 if "YES" in feedback else 0
                total_correct += eval_result
                all_results.append(eval_result)

        # eval for answer quality
        else:
            print("Use RelevancyEvaluator")
            for question, response in zip(batch_qs, responses):
                context_list = response.source_nodes
                feedback = evaluator.evaluate_response(
                    query=question,
                    response=response,
                    context=context_list,
                ).feedback
                eval_result = 1 if "YES" in feedback else 0
                total_correct += eval_result
                all_results.append(eval_result)

        # helps avoid rate limits
        time.sleep(1)

    return total_correct, all_results


In [21]:
# Faithfulness (hallucination) pass over the whole question set.
faithfulness_evaluator = FaithfulnessEvaluator(
    service_context=gpt35_service_context,
)
total_correct, all_results = async_evaluate_query_engine(
    faithfulness_evaluator,
    query_engine,
    questions,
    batch_size=20,
)

  0%|          | 0/2 [00:00<?, ?it/s]

finished batch 1 out of 2
Use FaithfulnessEvaluator
finished batch 2 out of 2
Use FaithfulnessEvaluator


In [22]:
# Reports the faithfulness tally; total_correct is rebound by the relevancy run further down.
print(f"Faithfulness:  Scored {total_correct} out of {len(questions)} questions correctly.")

Faithfulness:  Scored 22 out of 24 questions correctly.


In [32]:
# Relevancy (answer quality) pass over the whole question set.
relevancy_evaluator = RelevancyEvaluator(
    service_context=gpt35_service_context,
)
total_correct, all_results = async_evaluate_query_engine(
    relevancy_evaluator,
    query_engine,
    questions,
    batch_size=20,
)

  0%|          | 0/2 [00:00<?, ?it/s]

finished batch 1 out of 2
Use RelevancyEvaluator
finished batch 2 out of 2
Use RelevancyEvaluator


In [33]:
# total_correct was rebound by the relevancy run above, so this reports relevancy only.
print(f"Relevancy:  Scored {total_correct} out of {len(questions)} questions correctly.")

Relevancy:  Scored 21 out of 24 questions correctly.
