# Install dependencies

In [1]:
!pip install -qU pymupdf

In [2]:
!pip install -qU ragas

# Set enviroment values

In [3]:
import os
import openai
from getpass import getpass

openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

Please provide your OpenAI Key:  ········


# Load document

In [4]:
from langchain_community.document_loaders import PyMuPDFLoader

loader = PyMuPDFLoader(
    r"..\data\raw\Seattle.pdf",
)

documents = loader.load()

In [5]:
documents[0]

Document(page_content=' \nPort of Seattle \nv03.22.16 \nConstruction Safety Manual \n', metadata={'source': '..\\data\\raw\\Seattle.pdf', 'file_path': '..\\data\\raw\\Seattle.pdf', 'page': 0, 'total_pages': 372, 'format': 'PDF 1.6', 'title': '', 'author': 'Michelle Chatman', 'subject': '', 'keywords': '', 'creator': 'Adobe Acrobat Pro DC 15.8.20082', 'producer': 'Adobe Acrobat Pro DC 15.8.20082', 'creationDate': "D:20160322131244-07'00'", 'modDate': "D:20240323123936+01'00'", 'trapped': ''})

# Document chunks

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap = 50
)

documents = text_splitter.split_documents(documents)

In [7]:
len(documents)

1392

# Load embedding model

In [8]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)

# Synthetic Dataset Generation using Ragas

In [23]:
eval_documents = documents

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 400
)

eval_documents = text_splitter.split_documents(eval_documents)

In [28]:
len(eval_documents)

1392

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

generator = TestsetGenerator.with_openai()

testset = generator.generate_with_langchain_docs(documents, test_size=10, distributions={simple: 0.25, reasoning: 0.25, multi_context: 0.5},raise_exceptions=False)
     

In [None]:
testset.test_data[0]

# Generate the responses to the test questions

In [40]:
test_df = testset.to_pandas()

In [41]:
test_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,How can safety pamphlets be used to inform con...,[160 \n \nFig.21.1. Steps in chemical manageme...,Safety pamphlets can be used as a communicatio...,simple,"[{'source': '..\data\raw\Delhi.pdf', 'file_pat...",True
1,How does safety training enhance workers' safe...,[43 \n \nChapter 7 \nTRAINING AND EDUCATION \...,Safety training enhances workers' safety compe...,simple,"[{'source': '..\data\raw\Delhi.pdf', 'file_pat...",True
2,What methods should be used to protect workers...,[157 \n \nwhere fibrous structures and scar ti...,The methods that should be used to protect wor...,reasoning,"[{'source': '..\data\raw\Delhi.pdf', 'file_pat...",True
3,What measures are recommended for safe handlin...,[160 \n \nFig.21.1. Steps in chemical manageme...,The measures recommended for safe handling and...,multi_context,"[{'source': '..\data\raw\Delhi.pdf', 'file_pat...",True
4,How can various methods aid in identifying haz...,[needed. These meetings will remind workers of...,Various methods such as total site health and ...,multi_context,"[{'source': '..\data\raw\Delhi.pdf', 'file_pat...",True
5,How can technology and guidelines help reduce ...,[using cloud and mobile technology is making i...,"Technology, such as cloud and mobile technolog...",multi_context,"[{'source': '..\data\raw\Delhi.pdf', 'file_pat...",True
6,What are the risks of arc/flashover hazards an...,[ Identification of multiple voltage sources ...,Arcing faults in electrical equipment are mult...,multi_context,"[{'source': '..\data\raw\Delhi.pdf', 'file_pat...",True
7,What is the purpose of root cause analysis in ...,[methods used are: \na. Root cause analysis \n...,The purpose of root cause analysis in incident...,multi_context,"[{'source': '..\data\raw\Delhi.pdf', 'file_pat...",True
8,What are some avoidable risks to employee safe...,[maintenance. \n \nSome examples include the f...,Some avoidable risks to employee safety throug...,reasoning,"[{'source': '..\data\raw\Delhi.pdf', 'file_pat...",True


In [44]:
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()

In [48]:
answers = []
contexts = []

for question in test_questions:
  response = retrieval_augmented_qa_chain.invoke({"question" : question})
  answers.append(response["response"].content)
  contexts.append([context.page_content for context in response["context"]])

In [49]:
from datasets import Dataset

response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [50]:
response_dataset[0]

{'question': 'How can safety pamphlets be used to inform construction workers about hazardous chemicals?',
 'answer': 'Safety pamphlets can be used to inform construction workers about hazardous chemicals by providing them with important information and guidelines on handling and storing these chemicals safely.',
 'contexts': ['160 \n \nFig.21.1. Steps in chemical management \nUsing proper communication mediums like supply of SDS (Safety Data Sheet), Safety pamphlets, \ndisplay notices etc., keep everyone informed about hazardous chemicals, so information is \naccessible to everyone. \nIt is the responsibility of safety teams on construction sites to ensure the safe practices. It is \nrequired to include provisions and procedures for handling and storing hazardous chemicals and \nmaterials. Specific best practices include: \na. Provide Correct Training \nConstruction workers must be aware of the hazardous materials they will be likely to handle and',
  '21.4 Chemical Management for the

# Evaluation

In [51]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [52]:
results = evaluate(response_dataset, metrics)

Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

In [53]:
results

{'faithfulness': 0.9120, 'answer_relevancy': 0.9825, 'context_recall': 0.9630, 'context_precision': 0.9259, 'answer_correctness': 0.6149}

In [54]:
results_df = results.to_pandas()
results_df

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,How can safety pamphlets be used to inform con...,Safety pamphlets can be used to inform constru...,[160 \n \nFig.21.1. Steps in chemical manageme...,Safety pamphlets can be used as a communicatio...,1.0,1.0,1.0,1.0,0.497016
1,How does safety training enhance workers' safe...,Safety training enhances workers' safety compe...,[43 \n \nChapter 7 \nTRAINING AND EDUCATION \...,Safety training enhances workers' safety compe...,1.0,1.0,1.0,0.805556,0.622839
2,What methods should be used to protect workers...,i. Limit exposure.\nii. Substitute less hazard...,[157 \n \nwhere fibrous structures and scar ti...,The methods that should be used to protect wor...,1.0,0.948765,1.0,0.805556,0.984617
3,What measures are recommended for safe handlin...,The recommended measures for safe handling and...,[160 \n \nFig.21.1. Steps in chemical manageme...,The measures recommended for safe handling and...,0.875,0.972037,1.0,1.0,0.365432
4,How can various methods aid in identifying haz...,Various methods can aid in identifying hazards...,[25 \n \nshould develop the workers confidence...,Various methods such as total site health and ...,1.0,0.983844,1.0,1.0,0.299227
5,How can technology and guidelines help reduce ...,Technology and guidelines can help reduce work...,[formulate and reviewed regularly in the saf...,"Technology, such as cloud and mobile technolog...",0.333333,1.0,1.0,0.916667,0.706485
6,What are the risks of arc/flashover hazards an...,The risks of arc/flashover hazards include sev...,[ It takes place due to phase to phase and ph...,Arcing faults in electrical equipment are mult...,1.0,0.992027,0.666667,0.805556,0.359766
7,What is the purpose of root cause analysis in ...,The purpose of root cause analysis in incident...,[c. Source of initiation of event \nd. What sa...,The purpose of root cause analysis in incident...,1.0,0.945813,1.0,1.0,0.849732
8,What are some avoidable risks to employee safe...,Some avoidable risks to employee safety throug...,[maintenance. \n \nSome examples include the f...,Some avoidable risks to employee safety throug...,1.0,1.0,1.0,1.0,0.849197
