**Step 0: Imports, constants, and API Keys!**

In [1]:
!pip install -q langchain==0.2.16 langchain_core==0.2.38 langchain_community==0.2.16 pymupdf openai 
!pip install -q langchain_openai==0.1.23 langchain-qdrant qdrant_client asyncio ragas==0.1.14 pandas
!pip install -q transformers sentence-transformers langchain_huggingface


In [None]:
# Metrics applied by every ragas evaluation cell in this notebook.
from ragas.metrics import faithfulness
from ragas.metrics import answer_relevancy
from ragas.metrics import context_precision
from ragas.metrics import context_recall

RAGAS_METRICS = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
]


In [2]:
import os
import openai
from getpass import getpass

# Prompt for the OpenAI key without echoing it, then store the same value on the
# openai module and in the OPENAI_API_KEY environment variable.
openai.api_key = os.environ["OPENAI_API_KEY"] = getpass("OpenAI API Key: ")

**Step 1: Download and chunk the data**

We are going to use the following docs as our knowledge base:
1. Blueprint for an AI Bill of Rights: Making Automated Systems Work for the American People (PDF)
2. National Institute of Standards and Technology (NIST) Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile (NIST AI 600-1, PDF)

Let's start with a simple fixed chunking strategy as a baseline, and later evaluate parent-doc retrieval if we have time

In [3]:
import importlib
import utils
from vars import PDFS, CHUNK_SIZE, OVERLAP

importlib.reload(utils)
for pdf in PDFS:
    chunks = await utils.load_and_chunk_pdf(pdf,CHUNK_SIZE,OVERLAP)


  from .autonotebook import tqdm as notebook_tqdm


Loading https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf...
Chunking...
Loading https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf...
Chunking...


**Step 2: Construct and test baseline RAG Pipeline**

In [4]:
from vars import BASELINE_CHAT_MODEL, BASELINE_EMBEDDING_MODEL

importlib.reload(utils)
rag_chain = await utils.vanilla_openai_rag_chain(texts=chunks, 
                                            openai_key=openai.api_key, 
                                            embedding_model=BASELINE_EMBEDDING_MODEL,
                                            chat_model=BASELINE_CHAT_MODEL)

created qdrant client
created embeddings
populated vector db
created chain


In [5]:
from pprint import pprint
response = await rag_chain.ainvoke({"input":"What are some key risks associated with modern LLMs?"})
pprint(response)

{'context': [Document(metadata={'_id': 'fa369ee3fbb5442d89cbc2cb81c8c414', '_collection_name': 'default'}, page_content='with greater ease and scale than other technologies. LLMs have been reported to generate dangerous or \nviolent recommendations, and some models have generated actionable instructions for dangerous or \n \n \n9 Confabulations of falsehoods are most commonly a problem for text-based outputs; for audio, image, or video \ncontent, creative generation of non-factual content can be a desired behavior.  \n10 For example, legal confabulations have been shown to be pervasive in current state-of-the-art LLMs. See also, \ne.g.,'),
             Document(metadata={'_id': '3b69051d246044dca5186263d701224b', '_collection_name': 'default'}, page_content='development, production, or use of CBRN weapons or other dangerous materials or agents. While \nrelevant biological and chemical threat knowledge and information is often publicly accessible, LLMs \ncould facilitate its analysis or

**Step 3: Evaluate baseline RAG system**

This assumes that gen_synthetic_data.ipynb has already been run to generate some test data!

In [6]:
from vars import N_EVAL_QUESTIONS
import utils
importlib.reload(utils)

#  Load the dataset and run the RAG pipeline
response_dataset = await utils.gen_rag_responses(rag_chain)
response_dataset.save_to_disk(f"baseline_response_dataset_{N_EVAL_QUESTIONS}")

read test questions
generating responses


Processing Questions: 100%|██████████| 30/30 [01:39<00:00,  3.33s/it]
Saving the dataset (1/1 shards): 100%|██████████| 30/30 [00:00<00:00, 3530.26 examples/s]


In [None]:
# Score the baseline responses with ragas
from datasets import load_from_disk
from langchain_openai.chat_models import ChatOpenAI
from vars import N_EVAL_QUESTIONS, EVALUATION_MODEL

from ragas import evaluate
from ragas.run_config import RunConfig

# To reuse previously saved responses instead of regenerating them, uncomment:
#response_dataset = load_from_disk(f"baseline_response_dataset_{N_EVAL_QUESTIONS}")

results = evaluate(
    response_dataset,
    RAGAS_METRICS,
    #run_config=RunConfig(max_workers=2), # uncomment if we need to slow it down to avoid rate limit errors
    llm=ChatOpenAI(model_name=EVALUATION_MODEL),
)

In [10]:
import pandas as pd

# Print the aggregate scores, then write them to disk as a one-row CSV.
print(results)

results_df = pd.DataFrame(data=[results])
results_df.to_csv("baseline_ragas_evaluation_results.csv", index=False)

{'faithfulness': 0.8833, 'answer_relevancy': 0.9104, 'context_precision': 0.7444, 'context_recall': 0.9389}


**Step 4: Evaluate the same system, but with the fine-tuned embedding model we uploaded to Hugging Face**

This assumes that we have successfully run the notebook in the `fine_tuning_arctic` directory.

In [3]:

from langchain_huggingface import HuggingFaceEmbeddings
from vars import HF_USERNAME, FT_MODEL_NAME

# Load the fine-tuned embedding model, addressed as "<username>/<model name>".
hf_embeddings = HuggingFaceEmbeddings(
    model_name="{}/{}".format(HF_USERNAME, FT_MODEL_NAME)
)


  from .autonotebook import tqdm as notebook_tqdm
  hf_embeddings = HuggingFaceEmbeddings(model_name=f"{HF_USERNAME}/{FT_MODEL_NAME}")
Some weights of BertModel were not initialized from the model checkpoint at achapman/finetuned_arctic_ai_risk and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from vars import BASELINE_CHAT_MODEL
import utils
importlib.reload(utils)

hf_rag_chain = await utils.vanilla_rag_chain_hf_embeddings(texts=chunks, 
                                                            openai_key=openai.api_key, 
                                                            embeddings = hf_embeddings,
                                                            chat_model=BASELINE_CHAT_MODEL,
                                                            collection_name = "hf_collection")

created qdrant client
populated vector db
created chain


In [13]:
# Test the chain
from pprint import pprint
response = await hf_rag_chain.ainvoke({"input":"What are some key risks associated with modern LLMs?"})
pprint(response)

{'context': [Document(metadata={'_id': '281e18d90ed04674a1a011582a2d12e6', '_collection_name': 'hf_collection'}, page_content='with greater ease and scale than other technologies. LLMs have been reported to generate dangerous or \nviolent recommendations, and some models have generated actionable instructions for dangerous or \n \n \n9 Confabulations of falsehoods are most commonly a problem for text-based outputs; for audio, image, or video \ncontent, creative generation of non-factual content can be a desired behavior.  \n10 For example, legal confabulations have been shown to be pervasive in current state-of-the-art LLMs. See also, \ne.g.,'),
             Document(metadata={'_id': '4f867f2288a8472eaffa3a9ba6db846a', '_collection_name': 'hf_collection'}, page_content='(as well as in tables in Appendix B) to relevant Trustworthy AI Characteristics identiﬁed in the AI RMF.  \n \n \n5 These risks can be further categorized by organizations depending on their unique approaches to risk de

In [15]:
# Load the dataset and run the RAG pipeline
hf_response_dataset = await utils.gen_rag_responses(hf_rag_chain)
hf_response_dataset.save_to_disk(f"finetuned_response_dataset_{N_EVAL_QUESTIONS}")

read test questions
generating responses


Processing Questions: 100%|██████████| 30/30 [01:58<00:00,  3.95s/it]
Saving the dataset (1/1 shards): 100%|██████████| 30/30 [00:00<00:00, 5528.03 examples/s]


In [16]:
# Score the fine-tuned-embedding responses with ragas.
# The imports are repeated here, as in the other evaluation cells, so this cell does
# not depend on the baseline evaluation cell having been run first.
from datasets import load_from_disk
from langchain_openai.chat_models import ChatOpenAI
from vars import N_EVAL_QUESTIONS, EVALUATION_MODEL

from ragas import evaluate
from ragas.run_config import RunConfig

import pandas as pd

# uncomment this line to load the responses from disk
# (it must assign to hf_response_dataset, the variable evaluated below)
#hf_response_dataset = load_from_disk(f"finetuned_response_dataset_{N_EVAL_QUESTIONS}")

hf_results = evaluate(hf_response_dataset,
                   RAGAS_METRICS,
                   #run_config=RunConfig(max_workers=2), # uncomment if we need to slow it down to avoid rate limit errors
                   llm=ChatOpenAI(model_name=EVALUATION_MODEL))

# Check out the results, save them to disk as a one-row CSV
print(hf_results)

results_df = pd.DataFrame([hf_results])
results_df.to_csv("finetuned_ragas_evaluation_results.csv", index=False)

Evaluating: 100%|██████████| 120/120 [01:40<00:00,  1.20it/s]


{'faithfulness': 0.9285, 'answer_relevancy': 0.8746, 'context_precision': 0.7009, 'context_recall': 0.9833}


**Step 5: Do we do better if we just throw more money at this by upgrading to OpenAI's "best" models?**

In [5]:
# Create and test the chain
from vars import TE3_LARGE, TE3_VECTOR_LENGTH, GPT_4O
importlib.reload(utils)

expensive_rag_chain = await utils.vanilla_openai_rag_chain(texts=chunks, 
                                            openai_key=openai.api_key, 
                                            embedding_model=TE3_LARGE,
                                            chat_model=GPT_4O,
                                            vector_size=TE3_VECTOR_LENGTH)

from pprint import pprint
response = await expensive_rag_chain.ainvoke({"input":"What are some key risks associated with modern LLMs?"})
pprint(response)

created qdrant client
created embeddings
populated vector db
created chain
{'context': [Document(metadata={'_id': 'dd33382b341344dda0e2640411393a7f', '_collection_name': 'default'}, page_content='with greater ease and scale than other technologies. LLMs have been reported to generate dangerous or \nviolent recommendations, and some models have generated actionable instructions for dangerous or \n \n \n9 Confabulations of falsehoods are most commonly a problem for text-based outputs; for audio, image, or video \ncontent, creative generation of non-factual content can be a desired behavior.  \n10 For example, legal confabulations have been shown to be pervasive in current state-of-the-art LLMs. See also, \ne.g.,'),
             Document(metadata={'_id': '447169b053f040cd8ef3e19638f9d421', '_collection_name': 'default'}, page_content='information reports could cause doctors to make incorrect diagnoses and/or recommend the wrong \ntreatments. Risks of confabulated content may be especially

In [6]:
from vars import N_EVAL_QUESTIONS
import utils
importlib.reload(utils)

#  Load the dataset and run the RAG pipeline on our test data
expensive_response_dataset = await utils.gen_rag_responses(expensive_rag_chain)
expensive_response_dataset.save_to_disk(f"expensive_response_dataset_{N_EVAL_QUESTIONS}")

read test questions
generating responses


Processing Questions: 100%|██████████| 30/30 [02:34<00:00,  5.17s/it]
Saving the dataset (1/1 shards): 100%|██████████| 30/30 [00:00<00:00, 4716.23 examples/s]


In [7]:
# Use ragas to evaluate the "expensive" pipeline's responses
from datasets import load_from_disk
from langchain_openai.chat_models import ChatOpenAI
from vars import N_EVAL_QUESTIONS, EVALUATION_MODEL

from ragas import evaluate
from ragas.run_config import RunConfig

import pandas as pd

# uncomment this line to load the responses from disk
# (it must assign to expensive_response_dataset, the variable evaluated below)
#expensive_response_dataset = load_from_disk(f"expensive_response_dataset_{N_EVAL_QUESTIONS}")

# NOTE: this rebinds `results` / `results_df`, which the baseline evaluation also uses.
results = evaluate(expensive_response_dataset,
                   RAGAS_METRICS,
                   #run_config=RunConfig(max_workers=2), # uncomment if we need to slow it down to avoid rate limit errors
                   llm=ChatOpenAI(model_name=EVALUATION_MODEL))


# Check out the results, save them to disk as a one-row CSV
print(results)
results_df = pd.DataFrame([results])
results_df.to_csv("expensive_ragas_evaluation_results.csv", index=False)

Evaluating: 100%|██████████| 120/120 [01:45<00:00,  1.14it/s]


{'faithfulness': 0.9190, 'answer_relevancy': 0.9504, 'context_precision': 0.7556, 'context_recall': 1.0000}


**Step 6: Do something fancier and see if it helps**
For our final effort, we are going to test out:
- Cleaning up the docs by removing extraneous content
- Using parent-doc retrieval (still somewhat naive: parent documents are split by page). To improve on this, we could load the docs with `unstructured` and chunk them by section.
- Using our fine-tuned embedding model along with GPT-4o for the final pipeline


In [35]:
# Set up a dict with some info about our docs
from vars import GPT_4O, PDF_DICT
import importlib, utils
importlib.reload(utils)

# Construct the chain. This assumes our hf_embeddings that we loaded earlier are still in the environment
fancy_rag_chain = await utils.fancy_rag_chain(PDF_DICT,
                                              openai_key=openai.api_key, 
                                              embeddings = hf_embeddings,
                                              chat_model=GPT_4O,
                                              collection_name = "hf_collection_fancy")


In [44]:
# Test it
from pprint import pprint
response = await fancy_rag_chain.ainvoke({"input":"What are some key risks associated with modern LLMs?"})
pprint(response)

{'context': [Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 9, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': "D:20240805141702-04'00'", 'modDate': "D:20240805143048-04'00'", 'trapped': ''}, page_content=' \n6 \n2.2. Confabulation \n“Confabulation” refers to a phenomenon in which GAI systems generate and conﬁdently present \nerroneous or false content in response to prompts. Confabulations also include generated outputs that \ndiverge from the prompts or other input or that contradict previously generated statements in the same \ncontext. These phenomena are colloquially also referre

In [45]:
from vars import N_EVAL_QUESTIONS
import utils
importlib.reload(utils)

#  Load the dataset and run the RAG pipeline on our test data
fancy_response_dataset = await utils.gen_rag_responses(fancy_rag_chain)
fancy_response_dataset.save_to_disk(f"fancy_response_dataset_{N_EVAL_QUESTIONS}")

read test questions
generating responses


Processing Questions: 100%|██████████| 30/30 [02:20<00:00,  4.70s/it]
Saving the dataset (1/1 shards): 100%|██████████| 30/30 [00:00<00:00, 6189.64 examples/s]


In [46]:
# Use ragas to evaluate the fancy pipeline's responses
from datasets import load_from_disk
from langchain_openai.chat_models import ChatOpenAI
from vars import N_EVAL_QUESTIONS, EVALUATION_MODEL

from ragas import evaluate
from ragas.run_config import RunConfig

import pandas as pd

# uncomment this line to load the responses from disk
# (it must assign to fancy_response_dataset, the variable evaluated below)
#fancy_response_dataset = load_from_disk(f"fancy_response_dataset_{N_EVAL_QUESTIONS}")

# NOTE: this rebinds `results` / `results_df`, which the earlier evaluation cells also use.
results = evaluate(fancy_response_dataset,
                   RAGAS_METRICS,
                   #run_config=RunConfig(max_workers=2), # uncomment if we need to slow it down to avoid rate limit errors
                   llm=ChatOpenAI(model_name=EVALUATION_MODEL))


# Check out the results, save them to disk as a one-row CSV
print(results)
results_df = pd.DataFrame([results])
results_df.to_csv("fancy_ragas_evaluation_results.csv", index=False)

Evaluating: 100%|██████████| 120/120 [01:18<00:00,  1.53it/s]


{'faithfulness': 0.9036, 'answer_relevancy': 0.9092, 'context_precision': 0.7417, 'context_recall': 0.9889}
