In [1]:
!pip install -q langchain-core==0.2.40 langchain-openai==0.1.25 langchain-huggingface==0.0.3

In [2]:
!pip install -qU pymupdf ragas

In [3]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key: ")

Enter Your OpenAI API Key: ··········


In [4]:
from langchain_community.document_loaders import PyMuPDFLoader
bill_docs = PyMuPDFLoader('Blueprint-for-an-AI-Bill-of-Rights.pdf').load()
nist_docs = PyMuPDFLoader('NIST_report.pdf').load()

In [5]:
training_documents_loaded = bill_docs + nist_docs

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 750,
    chunk_overlap  = 20,
    length_function = len
)

training_documents = text_splitter.split_documents(training_documents_loaded)

In [7]:
import uuid

id_set = set()

for document in training_documents:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["id"] = id

In [8]:
len(training_documents)

603

In [9]:
training_split_documents = training_documents[:400]
val_split_documents = training_documents[400:500]
test_split_documents = training_documents[500:]

### Construct a fine-tuned dataset

In [10]:
from langchain_openai import ChatOpenAI

qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [11]:
from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [12]:
question_generation_chain = qa_prompt_template | qa_chat_model

In [13]:
import asyncio
import uuid
from tqdm import tqdm

async def process_document(document, n_questions):
    questions_generated = await question_generation_chain.ainvoke({"context": document.page_content, "n_questions": n_questions})

    doc_questions = {}
    doc_relevant_docs = {}

    for question in questions_generated.content.split("\n"):
        question_id = str(uuid.uuid4())
        doc_questions[question_id] = "".join(question.split(".")[1:]).strip()
        doc_relevant_docs[question_id] = [document.metadata["id"]]

    return doc_questions, doc_relevant_docs

async def create_questions(documents, n_questions):
    tasks = [process_document(doc, n_questions) for doc in documents]

    questions = {}
    relevant_docs = {}

    for task in tqdm(asyncio.as_completed(tasks), total=len(documents), desc="Processing documents"):
        doc_questions, doc_relevant_docs = await task
        questions.update(doc_questions)
        relevant_docs.update(doc_relevant_docs)

    return questions, relevant_docs

In [14]:
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)

Processing documents: 100%|██████████| 400/400 [00:20<00:00, 19.80it/s]


In [15]:
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)

Processing documents: 100%|██████████| 103/103 [00:05<00:00, 20.44it/s]


In [16]:
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)

Processing documents: 100%|██████████| 100/100 [00:03<00:00, 25.34it/s]


In [17]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_split_documents}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)
val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in val_split_documents}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)
train_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : train_corpus
}

with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

#### Fine-tuning snowflake-arctic-embed-m

In [18]:
import json

def get_jsonl_data(filename: str) -> dict:
    with open(filename, 'r') as file:
        data = {}
        for line in file:
            json_obj = json.loads(line)
            data.update(json_obj)
    return data

train_dataset = get_jsonl_data('training_dataset.jsonl')
val_dataset = get_jsonl_data('val_dataset.jsonl')
test_dataset = get_jsonl_data('test_dataset.jsonl')

In [19]:
!pip install -qU sentence_transformers \
    datasets \
    pyarrow \
    accelerate

In [20]:
from sentence_transformers import SentenceTransformer

model_id = "Snowflake/snowflake-arctic-embed-m"
model = SentenceTransformer(model_id)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [21]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [22]:
BATCH_SIZE = 20

In [23]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [24]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [25]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [26]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [27]:
EPOCHS = 5

In [28]:
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50,
)

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100,Dot Accuracy@1,Dot Accuracy@3,Dot Accuracy@5,Dot Accuracy@10,Dot Precision@1,Dot Precision@3,Dot Precision@5,Dot Precision@10,Dot Recall@1,Dot Recall@3,Dot Recall@5,Dot Recall@10,Dot Ndcg@10,Dot Mrr@10,Dot Map@100
40,No log,No log,0.79,0.92,0.945,0.975,0.79,0.306667,0.189,0.0975,0.79,0.92,0.945,0.975,0.885868,0.856817,0.858193,0.79,0.92,0.945,0.975,0.79,0.306667,0.189,0.0975,0.79,0.92,0.945,0.975,0.885868,0.856817,0.858193
50,No log,No log,0.795,0.925,0.95,0.975,0.795,0.308333,0.19,0.0975,0.795,0.925,0.95,0.975,0.892066,0.864681,0.866148,0.795,0.925,0.95,0.975,0.795,0.308333,0.19,0.0975,0.795,0.925,0.95,0.975,0.892066,0.864681,0.866148
80,No log,No log,0.78,0.93,0.955,0.975,0.78,0.31,0.191,0.0975,0.78,0.93,0.955,0.975,0.884384,0.854437,0.855905,0.78,0.93,0.955,0.975,0.78,0.31,0.191,0.0975,0.78,0.93,0.955,0.975,0.884384,0.854437,0.855905
100,No log,No log,0.79,0.935,0.96,0.975,0.79,0.311667,0.192,0.0975,0.79,0.935,0.96,0.975,0.889197,0.860792,0.862249,0.79,0.935,0.96,0.975,0.79,0.311667,0.192,0.0975,0.79,0.935,0.96,0.975,0.889197,0.860792,0.862249
120,No log,No log,0.8,0.93,0.955,0.97,0.8,0.31,0.191,0.097,0.8,0.93,0.955,0.97,0.891525,0.865472,0.867328,0.8,0.93,0.955,0.97,0.8,0.31,0.191,0.097,0.8,0.93,0.955,0.97,0.891525,0.865472,0.867328
150,No log,No log,0.78,0.935,0.955,0.97,0.78,0.311667,0.191,0.097,0.78,0.935,0.955,0.97,0.883082,0.854145,0.856015,0.78,0.935,0.955,0.97,0.78,0.311667,0.191,0.097,0.78,0.935,0.955,0.97,0.883082,0.854145,0.856015
160,No log,No log,0.77,0.93,0.955,0.965,0.77,0.31,0.191,0.0965,0.77,0.93,0.955,0.965,0.876534,0.846964,0.84922,0.77,0.93,0.955,0.965,0.77,0.31,0.191,0.0965,0.77,0.93,0.955,0.965,0.876534,0.846964,0.84922
200,No log,No log,0.785,0.93,0.955,0.96,0.785,0.31,0.191,0.096,0.785,0.93,0.955,0.96,0.880624,0.853964,0.856571,0.785,0.93,0.955,0.96,0.785,0.31,0.191,0.096,0.785,0.93,0.955,0.96,0.880624,0.853964,0.856571


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

#### Evaluating our retriever

In [29]:
import pandas as pd

from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document

In [30]:
def evaluate_openai(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
  corpus = dataset['corpus']
  questions = dataset['questions']
  relevant_docs = dataset['relevant_contexts']
  documents = [Document(page_content=content, metadata={"id": doc_id}) for doc_id, content in corpus.items()]
  vectorstore = FAISS.from_documents(documents, embed_model)

  retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

  eval_results = []
  for id, question in tqdm(questions.items()):
    retrieved_nodes = retriever.invoke(question)
    retrieved_ids = [node.metadata["id"] for node in retrieved_nodes]
    expected_id = relevant_docs[id][0]
    is_hit = expected_id in retrieved_ids
    eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})

  return eval_results

#### Snowflake-arctic-embed-m (Fine-tuned)

In [31]:
from langchain_huggingface import HuggingFaceEmbeddings

finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned_arctic")
finetune_results = evaluate_openai(test_dataset, finetune_embeddings)

Some weights of BertModel were not initialized from the model checkpoint at finetuned_arctic and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 206/206 [00:02<00:00, 88.76it/s]


In [32]:
finetune_results_df = pd.DataFrame(finetune_results)
finetune_hit_rate = finetune_results_df["is_hit"].mean()
finetune_hit_rate

0.9757281553398058

#### Rebuild RAG Chain with the Fine-tuned model

In [33]:
finetune_vectorstore = FAISS.from_documents(training_documents, finetune_embeddings)
finetune_retriever = finetune_vectorstore.as_retriever(search_kwargs={"k": 6})

In [34]:
from langchain_core.prompts import ChatPromptTemplate

RAG_PROMPT = """\
Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.

Context:
{context}

Question:
{question}

Answer:
"""

rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT)

In [35]:
rag_llm =  ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [36]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

finetune_rag_chain = (
    {"context": itemgetter("question") | finetune_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [37]:
finetune_rag_chain.invoke({"question" : "How can we ethically control AI?"})["response"]

'Ethical control of AI can be achieved through several approaches, including:\n\n1. **Establishing Principles and Frameworks**: Organizations and governments can develop ethical principles and frameworks for the responsible use of AI. For example, the Organization for Economic Co-operation and Development (OECD) has proposed recommendations for trustworthy AI.\n\n2. **Legislation and Regulation**: Implementing laws and regulations that specifically address the ethical use of AI technologies can help ensure accountability and protect rights.\n\n3. **Incorporating Ethical Design**: Companies can integrate ethical considerations into the design and development of AI systems, ensuring that these systems are built with safety, security, and effectiveness in mind.\n\n4. **Oversight and Governance**: Establishing oversight bodies, such as the AI Advancement Council in the Department of Energy, can help monitor and guide the ethical development and deployment of AI systems.\n\n5. **Research an

#### RAGAS

In [38]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI

generator_llm = ChatOpenAI(model="gpt-4o-mini")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings()

In [39]:
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

In [40]:
testset = generator.generate_with_langchain_docs(
    test_split_documents,
    test_size=20,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
    raise_exceptions=True
    )

embedding nodes:   0%|          | 0/206 [00:00<?, ?it/s]



Generating:   0%|          | 0/20 [00:00<?, ?it/s]

In [41]:
testset.to_pandas().head()

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What factors contribute to the completeness of...,"[intersecting groups; Completeness, representa...",Factors that contribute to the completeness of...,simple,"[{'source': 'NIST_report.pdf', 'file_path': 'N...",True
1,What role do participatory engagement methods ...,[50 \nParticipatory Engagement Methods \nOn an...,Participatory engagement methods play a role i...,simple,"[{'source': 'NIST_report.pdf', 'file_path': 'N...",True
2,What measures can be taken to establish accept...,[48 \n• Data protection \n• Data retention \n...,Establishing acceptable use policies and guida...,simple,"[{'source': 'NIST_report.pdf', 'file_path': 'N...",True
3,What evidence is there regarding the bias of G...,"[Kleinberg, J. et al. (May 2021) Algorithmic m...",The evidence regarding the bias of GPT detecto...,simple,"[{'source': 'NIST_report.pdf', 'file_path': 'N...",True
4,What is the role of AI Actors in addressing th...,"[performance, feedback received, and improveme...",AI Actors play a role in addressing the genera...,simple,"[{'source': 'NIST_report.pdf', 'file_path': 'N...",True


In [42]:
from datasets import Dataset

def generate_answers(chain, testset):
  answers = []
  contexts = []
  questions = testset.to_pandas()["question"].values.tolist()
  ground_truths = testset.to_pandas()["ground_truth"].values.tolist()

  for question in tqdm(questions):
    answer = chain.invoke({"question" : question})
    answers.append(answer["response"])
    contexts.append([context.page_content for context in answer["context"]])

  return Dataset.from_dict({
      "question" : questions,
      "answer" : answers,
      "contexts" : contexts,
      "ground_truth" : ground_truths
  })

In [43]:
finetune_dataset = generate_answers(finetune_rag_chain, testset)

100%|██████████| 20/20 [00:44<00:00,  2.22s/it]


In [44]:
from ragas.metrics import (
    context_recall,
    context_precision,
)
from ragas import evaluate

In [45]:
result = evaluate(
    finetune_dataset,
    metrics=[
        context_precision,
        context_recall,
    ],
)

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

In [46]:
result

{'context_precision': 0.6691, 'context_recall': 0.9542}

In [47]:
result.to_pandas().head()

Unnamed: 0,question,contexts,answer,ground_truth,context_precision,context_recall
0,What factors contribute to the completeness of...,"[intersecting groups; Completeness, representa...",Factors that contribute to the completeness of...,Factors that contribute to the completeness of...,0.7,1.0
1,What role do participatory engagement methods ...,[50 \nParticipatory Engagement Methods \nOn an...,Participatory engagement methods play a crucia...,Participatory engagement methods play a role i...,0.804167,1.0
2,What measures can be taken to establish accept...,[48 \n• Data protection \n• Data retention \n...,To establish acceptable use policies in human-...,Establishing acceptable use policies and guida...,1.0,1.0
3,What evidence is there regarding the bias of G...,"[Wu, K. et al. (2024) How well do LLMs cite re...",The evidence regarding the bias of GPT detecto...,The evidence regarding the bias of GPT detecto...,0.5,1.0
4,What is the role of AI Actors in addressing th...,"[Enhanced \n2.11. \nObscene, Degrading, and/o...",AI Actors play a crucial role in addressing th...,AI Actors play a role in addressing the genera...,0.325,1.0
