# Tutorial: Building Your Own Retrieval-Augmented Generation System

Retrieval-augmented generation is a promising avenue for combining the semantic understanding capabilities of LLMs with the factual accuracy of direct source materials. This section presents a practical, hands-on example of building and augmenting a RAG system using a popular open-source RAG application. This tutorial is far from an exhaustive exploration of RAG capabilities, but it aims to give an overview of a few critical techniques while introducing the use of LlamaIndex.

Requirements:

1.   An OpenAI account with an API key.

## Installation and Imports

In [None]:
%pip install -U -q llama-index-core \
                   llama-index-finetuning \
                   llama-index-embeddings-openai \
                   llama-index-embeddings-huggingface \
                   llama-index-llms-openai \
                   llama-index-embeddings-adapter \
                   llama-index-postprocessor-cohere-rerank \
                   llama-index-llms-gradient

%pip install openai==1.12.0 -q -U
%pip install sentence_transformers -q -U
%pip install datasets -q -U
%pip install accelerate -q -U

In [None]:
import os
import torch
import nest_asyncio
import getpass
from google.colab import drive
import locale
locale.getpreferredencoding = lambda: "UTF-8"

from tqdm import tqdm
import pandas as pd

# Turn on async
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop and later calls need to nest inside it -- confirm for your environment.
nest_asyncio.apply()

# Enter OpenAI API key
# getpass prompts without echoing the input; the key is stored in this process's
# environment under OPENAI_API_KEY.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key: ")

In [None]:
from tqdm import tqdm
import pandas as pd
import torch

from llama_index.core import (
    Document,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
    PromptTemplate,
    get_response_synthesizer
)

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import LLMRerank
from llama_index.core.response_synthesizers import ResponseMode

from llama_index.core.evaluation import (
    EmbeddingQAFinetuneDataset,
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    RetrieverEvaluator
)

from llama_index.finetuning import generate_qa_embedding_pairs, SentenceTransformersFinetuneEngine
from sklearn.model_selection import train_test_split

from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path

## Basic RAG system

**NOTE**: Before continuing, copy openai_tos.zip from the chapter 7 tutorial repository into your working directory.

In [None]:
## Unzip the openai TOS.
!unzip ./openai_tos.zip
directory = './openai_tos/'
doc_names = sorted(os.listdir(directory))

## Collect each TOS doc into a llama_index Document object
documents = []
for i, doc_name in enumerate(doc_names):
  document = open(directory+doc_name).read()
  d = Document(
      text=document,
      metadata = {"file": doc_name, "name": doc_name.split('_')[1].split('.')[0].replace('-',' ')}
  )
  documents.append(d)

In [None]:
## Display the loaded Document objects to check their text and metadata.
documents

In [None]:
# Define a text chunking procedure
# (chunk_size=128 with chunk_overlap=8 between consecutive chunks)
text_chunker = SentenceSplitter(chunk_size=128, chunk_overlap=8)

# Split the documents into nodes
nodes = text_chunker.get_nodes_from_documents(documents)

# Load a model for embedding the text
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [None]:
## Directory the base-model vector index is persisted to / loaded from.
output_dir = './bge-small-en-v1.5_openai-tos_vectors/'

In [None]:
## Generate indexing vectors

## Manual toggle: keep `if True` to embed `nodes` and build the index from
## scratch; change it to `if False` to reload what was persisted in output_dir.
if True:
    index = VectorStoreIndex(
        nodes,
        embed_model=embed_model,
        show_progress=True
    )

    ## Save embeddings with a storage context.
    index.storage_context.persist(persist_dir = output_dir)

## Load embeds from storage context. Requires setting the same storage_context as when generated.
else:
    storage_context = StorageContext.from_defaults(persist_dir = output_dir)
    index = load_index_from_storage(
        storage_context=storage_context,
        embed_model=embed_model
    )

In [None]:
## Create a simple query engine and compare to GPT responses

def gpt_and_rag_answers(query,query_engine,llm_engine):
    """Answer `query` with a bare LLM and with a RAG query engine, and print both.

    Args:
        query: The question text sent to both engines.
        query_engine: Object exposing `.query(query)`; produces the RAG answer.
        llm_engine: Object exposing `.complete(query)`; produces the answer
            without any retrieved context.

    Returns:
        A `(llm_response, rag_response)` tuple with the two raw responses.
    """
    llm_response = llm_engine.complete(query)
    rag_response = query_engine.query(query)

    # Label the first answer generically: callers pass different models
    # (GPT-3.5 and GPT-4), so a hard-coded model name would be misleading.
    print(f'############\nLLM response:\n{llm_response}\n\n############\nRAG response:\n{rag_response}')
    return llm_response, rag_response

## Create a GPT-3.5 object for API calls (temperature 0.0)
llm_gpt3 = OpenAI(model='gpt-3.5-turbo-0125',temperature=0.0)
## Basic RAG query engine over the index, generating answers with the same LLM
query_engine = index.as_query_engine(llm=llm_gpt3)

In [None]:
## Compare the bare GPT-3.5 answer with the basic RAG answer for one question.
query = 'Who owns the content created by OpenAI programs?'
gpt, rag = gpt_and_rag_answers(query, query_engine, llm_gpt3)

## Improved RAG Querying

In [None]:
## Create improved querying engine

## Set number of nodes for use in generation
TopK = 5

## Improve the prompting template to give more verbose answers.
## Adjacent string literals are concatenated verbatim, so every sentence needs
## its own trailing separator (the first one was previously missing, yielding
## "...around the world.Always answer...").
qa_prompt_tmpl = PromptTemplate(
    "You are an expert Q&A system that is trusted around the world.\n"
    "Always answer the query using the provided context information, and not prior knowledge.\n"
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)

###################

## Define the larger-k retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=TopK,
)

## Set the form of context consolidation
response_synthesizer = get_response_synthesizer(response_mode=ResponseMode.SIMPLE_SUMMARIZE)

## Build the improved query engine and set the template to the new one.
custom_query_engine = RetrieverQueryEngine.from_args(
    retriever,
    response_synthesizer=response_synthesizer,
)
custom_query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

###################

## Create a GPT-4 generator.
llm_gpt4 = OpenAI(model='gpt-4',temperature=0.0)

In [None]:
## Run the query again with the improved engine.
## Same question as before, so the two RAG answers can be compared directly.

query = 'Who owns the content created by OpenAI programs?'
gpt, rag = gpt_and_rag_answers(query, custom_query_engine, llm_gpt3)

## Re-ranking

In [None]:
## Create a querying engine with re-ranking

## Set the re-ranker parameters
## Rerank_TopK: number of candidates fetched by vector similarity.
## Rerank_TopRRK: number of nodes kept after re-ranking (passed as top_n).
Rerank_TopK = 20
Rerank_TopRRK = 5

###################

## Define the re-ranking retriever.
retriever_rr = VectorIndexRetriever(
    index=index,
    similarity_top_k=Rerank_TopK,
)

## Set the form of context consolidation
response_synthesizer_rr = get_response_synthesizer(response_mode=ResponseMode.SIMPLE_SUMMARIZE)

## Build the re-ranking query engine and set the template to the new one.
## NOTE(review): no llm is passed to LLMRerank, so it presumably falls back to
## the library's default LLM -- confirm.
custom_query_engine_rerank = RetrieverQueryEngine.from_args(
    retriever_rr,
    response_synthesizer=response_synthesizer_rr,
    node_postprocessors=[
        LLMRerank(
            choice_batch_size=5,
            top_n=Rerank_TopRRK,
        )],
)
custom_query_engine_rerank.update_prompts(
     {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

In [None]:
## Test a more complex question to show benefit of re-ranking. First with just the improved engine.
## The bare-LLM answer comes from GPT-4 here.

query = 'Will OpenAI give me any sort of reward if I find an issue with their software?'

gpt, rag = gpt_and_rag_answers(query, custom_query_engine, llm_gpt4)

In [None]:
## Next, using the re-ranking engine.
## The RAG answer is kept in rag_rr so the previous cell's `rag` is not overwritten.

query = 'Will OpenAI give me any sort of reward if I find an issue with their software?'

gpt, rag_rr = gpt_and_rag_answers(query, custom_query_engine_rerank, llm_gpt4)

## Fine-tuning

In [None]:
## Create train and validation splits with the nodes.
## 80/20 split; random_state is fixed so the split is reproducible.
train_nodes, val_nodes = train_test_split(pd.Series(nodes), test_size=0.20, random_state=7)
print(len(train_nodes), len(val_nodes))

In [None]:
## Generate Q&A pairs from the training set. This operates by looking at each
## node in the train set, and using GPT to generate a question answered by that
## node.

json_locs = './bge-small-en-v1.5_openai-tos_qa-embedding-pairs/'
## Create the output directory idempotently, so re-running the cell does not
## fail when the directory already exists.
os.makedirs(json_locs, exist_ok=True)

## Manual toggle: keep `if True` to generate fresh pairs (this calls the OpenAI
## API); change it to `if False` to reload previously saved datasets.
if True:
    ## Both splits use the same question-generating LLM, so build it once.
    qa_llm = OpenAI(model='gpt-3.5-turbo',temperature=0.0)

    train_dataset = generate_qa_embedding_pairs(
        train_nodes,
        llm=qa_llm,
    )
    val_dataset = generate_qa_embedding_pairs(
        val_nodes,
        llm=qa_llm,
    )

    train_dataset.save_json(json_locs+"train_dataset.json")
    val_dataset.save_json(json_locs+"val_dataset.json")

else:
    train_dataset = EmbeddingQAFinetuneDataset.from_json(json_locs+"train_dataset.json")
    val_dataset = EmbeddingQAFinetuneDataset.from_json(json_locs+"val_dataset.json")

In [None]:
## Look at validation set queries and find an interesting example. Copy/paste a
## query ID into the next cell to see the Q&A.
## The displayed value is the dataset's `queries` entry, read through __dict__.
val_dataset.__dict__['queries']

In [None]:
## Hard-coded example ID: replace it with a query ID copied from the previous
## cell's output, otherwise the lookups below fail on a dataset that does not
## contain this ID.
query_id = '0155e8ac-0174-4336-abcc-0ed302230b13'
## First relevant document listed for the query.
corpus_id = val_dataset.__dict__['relevant_docs'][query_id][0]

sample_chunk = val_dataset.__dict__['corpus'][corpus_id]
sample_query = val_dataset.__dict__['queries'][query_id]

print(f'Sample text node:\n------------------\n{sample_chunk}')
print(f'\n\nQuery based on node:\n---------------------\n{sample_query}')

In [None]:
## Fine-tune the model with the training set. This will direct the
## model to find the node that each question was based off of.

## Directory the fine-tuned model is written to / loaded from.
model_ft_path = './bge-small-en-v1.5_openai-tos_finetuned-model'

## Manual toggle: keep `if True` to run fine-tuning; change it to `if False` to
## load an already fine-tuned model from model_ft_path.
if True:
    finetune_engine = SentenceTransformersFinetuneEngine(
        model_id = "BAAI/bge-small-en-v1.5",
        dataset = train_dataset,
        val_dataset = val_dataset,
        epochs = 4,
        model_output_path = model_ft_path
    )
    finetune_engine.finetune()
    finetuned_embedding_model = finetune_engine.get_finetuned_model()

else:
    finetuned_embedding_model = HuggingFaceEmbedding(
        model_name = model_ft_path
    )

In [None]:
## Create a function for evaluation performance of the fine-tuned RAG model.
## This produces many metrics, but we will look at 'cosine_accuracy@3', which
## determines what % of the questions from the validation set, when put through
## RAG retrieval, return the document they were generated from within the top 3.

def evaluate_st(dataset, model_id, name):
    """Run a sentence-transformers information-retrieval evaluation.

    Args:
        dataset: Object exposing `queries`, `corpus` and `relevant_docs`.
        model_id: Identifier or path handed to `SentenceTransformer`.
        name: Name given to the evaluator.

    Returns:
        Whatever the evaluator returns when called on the loaded model; its
        output files go to the "results/" directory, created if missing.
    """
    results_dir = "results/"

    ir_evaluator = InformationRetrievalEvaluator(
        dataset.queries,
        dataset.corpus,
        dataset.relevant_docs,
        name=name,
    )
    st_model = SentenceTransformer(model_id)

    ## Make sure the output directory exists before the evaluator writes to it.
    Path(results_dir).mkdir(parents=True, exist_ok=True)
    return ir_evaluator(st_model, output_path=results_dir)

In [None]:
## First the base model
## The result key is the evaluator name followed by '_cosine_accuracy@3'.
base_score = evaluate_st(val_dataset, "BAAI/bge-small-en-v1.5", name="bge")['bge_cosine_accuracy@3']
## Next the fine-tuned model
ft_score = evaluate_st(val_dataset, model_ft_path, name="finetuned")['finetuned_cosine_accuracy@3']

print(f'Base model accuracy@3 = {base_score}\nFine-tuned model accuracy@3 = {ft_score}')

In [None]:
## Now with the updated model, we can re-embed the documents and create
## our fine-tuned RAG system.

## NOTE(review): this rebinds `output_dir`, which the base-index cell also uses,
## and points at '../data/' while the other artifacts live under './' --
## confirm this location is intended.
output_dir = '../data/bge-small-en-v1.5_openai-tos_vectors-finetuned'

## Manual toggle: keep `if True` to re-embed with the fine-tuned model; change
## it to `if False` to reload previously persisted vectors.
if True:
    ## Generate new vector-index with the fine-tuned embedding model
    index_ft = VectorStoreIndex(
        nodes,
        embed_model = finetuned_embedding_model,
        show_progress=True
    )

    ## Save embeddings with a storage context.
    index_ft.storage_context.persist(persist_dir = output_dir)

else: ## Load vectors
    storage_context = StorageContext.from_defaults(persist_dir = output_dir)
    index_ft = load_index_from_storage(
        storage_context=storage_context,
        embed_model=finetuned_embedding_model
    )

In [None]:
## Query engine with default settings over the fine-tuned index.
ft_query_engine = index_ft.as_query_engine()

def ft_gpt_and_rag_answers(query,query_engine,llm_engine):
    """Print and return the bare-LLM answer and the RAG answer for one query.

    Args:
        query: Question text sent to both engines.
        query_engine: Object exposing `.query(query)` (the RAG side).
        llm_engine: Object exposing `.complete(query)` (the LLM-only side).

    Returns:
        Tuple `(llm_response, rag_response)`.
    """
    ## The LLM is asked first, then the RAG engine.
    plain_answer = llm_engine.complete(query)
    retrieved_answer = query_engine.query(query)

    report = f'############\nLLM response:\n{plain_answer}\n\n############\nRAG response:\n{retrieved_answer}'
    print(report)

    answers = (plain_answer, retrieved_answer)
    return answers

In [None]:
## Create fine-tuned RAG systems using the original improvements and with
## the re-ranker.

## Number of nodes retrieved for generation by the non-reranking engine.
TopK = 5

## Adjacent string literals are concatenated verbatim, so every sentence needs
## its own trailing separator (the first one was previously missing, yielding
## "...around the world.Always answer...").
qa_prompt_tmpl = PromptTemplate(
    "You are an expert Q&A system that is trusted around the world.\n"
    "Always answer the query using the provided context information, and not prior knowledge.\n"
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)

###################

## Top-k retriever over the fine-tuned index.
retriever_ft = VectorIndexRetriever(
    index=index_ft,
    similarity_top_k=TopK,
)

response_synthesizer_ft = get_response_synthesizer(response_mode=ResponseMode.SIMPLE_SUMMARIZE)

custom_query_engine_ft = RetrieverQueryEngine.from_args(
    retriever_ft,
    response_synthesizer=response_synthesizer_ft,
)
custom_query_engine_ft.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

###################

## Re-ranking variant: fetch Rerank_TopK candidates by vector similarity, then
## keep Rerank_TopRRK of them after LLM re-ranking.
Rerank_TopK = 20
Rerank_TopRRK = 5

retriever_rr_ft = VectorIndexRetriever(
    index=index_ft,
    similarity_top_k=Rerank_TopK,
)

response_synthesizer_rr_ft = get_response_synthesizer(response_mode=ResponseMode.SIMPLE_SUMMARIZE)

custom_query_engine_rerank_ft = RetrieverQueryEngine.from_args(
    retriever_rr_ft,
    response_synthesizer=response_synthesizer_rr_ft,
    node_postprocessors=[
        LLMRerank(
            choice_batch_size=5,
            top_n=Rerank_TopRRK,
        )],
)
custom_query_engine_rerank_ft.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

In [None]:
## Answer from the improved engine built on the base (non-fine-tuned) index.
query = 'How does ChatGPT deal with inappropriate questions?'

custom_query_engine.query(query).response

In [None]:
## Same query against the engine built on the fine-tuned index.
custom_query_engine_ft.query(query).response

## Evaluation

### Context Relevance -- Is the retrieved context relevant to the query?

In [None]:
## Define a function that wraps the RetrieverEvaluator function from llama_index
def run_context_relevance_eval(index, queries, expected_ids, rerank=False):
    """Score retrieval quality (MRR and hit rate) for a list of queries.

    Args:
        index: Vector index to retrieve from.
        queries: Iterable of query strings.
        expected_ids: Iterable, parallel to `queries`, holding the node ID each
            query is expected to retrieve.
        rerank: If True, retrieve 20 candidates and let an LLM re-ranker keep
            the best 2 before scoring; otherwise score the plain top-2.

    Returns:
        A DataFrame with one row per query and the columns `query`,
        `expected_ids`, `retrieved_ids`, `mrr` and `hit_rate`. The summed MRR
        and the number of hits are also printed.
    """
    evaluator_kwargs = {}
    if rerank:
        retriever = VectorIndexRetriever(
            index=index,
            similarity_top_k=20,
        )
        ## The retriever does not apply node post-processors itself, so the
        ## re-ranker is attached to the evaluator, which runs it on the
        ## retrieved nodes before computing the metrics.
        evaluator_kwargs["node_postprocessors"] = [
            LLMRerank(
                choice_batch_size=5,
                top_n=2,
            )
        ]
    else:
        retriever = index.as_retriever(similarity_top_k=2)

    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        ["mrr", "hit_rate"], retriever=retriever, **evaluator_kwargs
    )

    context_eval_results = [
        retriever_evaluator.evaluate(query=q, expected_ids=[cid])
        for cid, q in zip(expected_ids, queries)
    ]

    eval_df = pd.DataFrame({
        'query': [cer.query for cer in context_eval_results],
        'expected_ids': [cer.expected_ids for cer in context_eval_results],
        'retrieved_ids': [cer.retrieved_ids for cer in context_eval_results],
        'mrr': [cer.metric_dict['mrr'].score for cer in context_eval_results],
        'hit_rate': [cer.metric_dict['hit_rate'].score for cer in context_eval_results]
    })

    print('Total MRR = ',eval_df.mrr.sum(),'/ ',len(queries))
    print('# Hits = ',eval_df.hit_rate.sum(),'/ ',len(queries))
    return eval_df

## First let's test on our sample Q&A pair
## Hard-coded example ID: replace it with a query ID present in your own
## val_dataset, otherwise the lookups below fail.
query_id = '0155e8ac-0174-4336-abcc-0ed302230b13'
corpus_id = val_dataset.__dict__['relevant_docs'][query_id][0]
sample_chunk = val_dataset.__dict__['corpus'][corpus_id]
sample_query = val_dataset.__dict__['queries'][query_id]

print('Query:\n', sample_query)
## Single-query evaluation against the base index, without re-ranking.
results = run_context_relevance_eval(index,[sample_query,],[corpus_id,])

print('Expected ID = ',results.iloc[0].expected_ids)
print('Retrieved IDs = ',results.iloc[0].retrieved_ids)
## NOTE(review): `index` was built from all nodes while val_dataset was generated
## from val_nodes only, so this corpus lookup presumably raises KeyError when the
## top hit is a training node -- confirm.
print('Top doc:\n',val_dataset.__dict__['corpus'][results.iloc[0].retrieved_ids[0]])

In [None]:
## Flatten the val_dataset Q&A pairs into parallel lists linking each query
## to the document it was built from (the first relevant doc of each query).
query_ids = list(val_dataset.__dict__['queries'].keys())
corpus_ids = [val_dataset.__dict__['relevant_docs'][qid][0] for qid in query_ids]
queries = [val_dataset.__dict__['queries'][qid] for qid in query_ids]
expected_texts = [val_dataset.__dict__['corpus'][cid] for cid in corpus_ids]
## Create a label mapping from non-finetuned document indexing to fine-tuned doc
## indexing, so we know which document to expect when running the FT model.
## The mapping pairs the two key lists by position.
index_ids = list(index.__dict__['_index_struct'].nodes_dict.keys())
index_ft_ids = list(index_ft.__dict__['_index_struct'].nodes_dict.keys())
## Fail loudly on a size mismatch instead of building a partial mapping.
if len(index_ids) != len(index_ft_ids):
    raise ValueError(
        f'Index size mismatch: {len(index_ids)} base nodes vs '
        f'{len(index_ft_ids)} fine-tuned nodes'
    )
ft_corpus_ids = dict(zip(index_ids, index_ft_ids))

keys_df = pd.DataFrame({
    'query_ids': query_ids,
    'corpus_ids': corpus_ids,
    'ft_corpus_ids': [ft_corpus_ids[cid] for cid in corpus_ids],
    'queries': queries,
    'expected_texts': expected_texts
})
## Evaluate on a reproducible sample of at most 50 pairs; capping at the frame
## length avoids a ValueError when fewer than 50 pairs were generated.
keys_df = keys_df.sample(n=min(50, len(keys_df)), random_state=7)

In [None]:
## Context-relevance evaluation of the base index on the sampled queries (rerank=True).
results = run_context_relevance_eval(index,keys_df['queries'],keys_df['corpus_ids'],rerank=True)

In [None]:
## Same evaluation on the fine-tuned index, using the IDs mapped to that index.
results_ft = run_context_relevance_eval(index_ft,keys_df['queries'],keys_df['ft_corpus_ids'],rerank=True)

### Answer Relevance -- Is the generated answer relevant to the query?

In [None]:
## Define a function that wraps the RelevancyEvaluator function from llama_index

def run_answer_relevance_eval(index,queries,rerank=False):
    """Judge, per query, whether the generated answer is relevant to the query.

    Args:
        index: Vector index the query engine is built from.
        queries: Iterable of query strings.
        rerank: If True, retrieve 20 candidates and let an LLM re-ranker keep
            the best 2 before generation; otherwise use the default engine.

    Returns:
        List with one evaluation result per query, in input order. The number
        of passing results is also printed.
    """
    if rerank:
        query_engine = index.as_query_engine(
            llm=llm_gpt4,
            ## Widen the candidate pool so the re-ranker has something to
            ## choose from; re-ranking only the default top-k down to top_n=2
            ## would leave the retrieved set essentially unchanged.
            similarity_top_k=20,
            node_postprocessors=[
                LLMRerank(choice_batch_size=5, top_n=2),
            ],
        )
    else:
        query_engine = index.as_query_engine(llm=llm_gpt4)
    ans_relev_evaluator = RelevancyEvaluator(llm=llm_gpt4)

    answer_eval_results = []
    for query in tqdm(queries):
        response = query_engine.query(query)
        ans_relev_eval = ans_relev_evaluator.evaluate_response(query=query, response=response)
        answer_eval_results.append(ans_relev_eval)

    ## Number of answers judged relevant.
    print(sum(aer.passing for aer in answer_eval_results))
    return answer_eval_results

## Try the answer-relevance evaluator on a single hand-written query first.
query = "How can individuals request corrections for factually inaccurate information about themselves in ChatGPT output?"
results = run_answer_relevance_eval(index,[query,],rerank=True)

print(results[0].response)
print(str(results[0].passing))

In [None]:
## Answer-relevance evaluation of the base index on the sampled queries.
results = run_answer_relevance_eval(index,keys_df['queries'],rerank=True)

In [None]:
## Same evaluation on the fine-tuned index.
results_ft = run_answer_relevance_eval(index_ft,keys_df['queries'],rerank=True)

### Groundedness -- Is the response supported by the context?

In [None]:
## Define a function that wraps the FaithfulnessEvaluator function from llama_index


def run_groundedness_eval(index,queries,rerank=False):
    """Judge, per query, whether the generated answer is supported by its context.

    Args:
        index: Vector index the query engine is built from.
        queries: Iterable of query strings.
        rerank: If True, retrieve 20 candidates and let an LLM re-ranker keep
            the best 2 before generation; otherwise use the default engine.

    Returns:
        List with one evaluation result per query, in input order. The number
        of passing results is also printed.
    """
    if rerank:
        query_engine = index.as_query_engine(
            llm=llm_gpt4,
            ## Widen the candidate pool so the re-ranker has something to
            ## choose from; re-ranking only the default top-k down to top_n=2
            ## would leave the retrieved set essentially unchanged.
            similarity_top_k=20,
            node_postprocessors=[
                LLMRerank(choice_batch_size=5, top_n=2),
            ],
        )
    else:
        query_engine = index.as_query_engine(llm=llm_gpt4)
    grnd_eval = FaithfulnessEvaluator(llm=llm_gpt4)

    ground_eval_results = []
    for query in tqdm(queries):
        response = query_engine.query(query)
        ## Use the evaluator created above (the previous name,
        ## `faithfulness_eval`, was never defined and raised NameError).
        ground_eval = grnd_eval.evaluate_response(response=response)
        ground_eval_results.append(ground_eval)

    ## Number of answers judged to be grounded in their context.
    print(sum(ger.passing for ger in ground_eval_results))
    return ground_eval_results

## Try the groundedness evaluator (not the answer-relevance one) on a single
## hand-written query first.
query = "How can individuals request corrections for factually inaccurate information about themselves in ChatGPT output?"
results = run_groundedness_eval(index,[query,],rerank=True)

print('Contexts: ',results[0].contexts)
print('\nResponse: ',results[0].response)
print('\n',str(results[0].passing))

In [None]:
## Groundedness evaluation of the base index on the sampled queries.
results = run_groundedness_eval(index,keys_df['queries'],rerank=True)

In [None]:
## Same evaluation on the fine-tuned index.
results_ft = run_groundedness_eval(index_ft,keys_df['queries'],rerank=True)

In [None]:
## Create dataframes of the output to examine results

def _groundedness_frame(eval_results):
    """Tabulate evaluation results next to the sampled queries in keys_df."""
    columns = {'query': keys_df['queries']}
    columns['contexts'] = [res.contexts for res in eval_results]
    columns['response'] = [res.response for res in eval_results]
    ## `passing` is stored as text.
    columns['pass'] = [str(res.passing) for res in eval_results]
    columns['score'] = [res.score for res in eval_results]
    return pd.DataFrame(columns)

## One frame for the base index, one for the fine-tuned index.
grnd_df = _groundedness_frame(results)
grnd_df_ft = _groundedness_frame(results_ft)