## CREATING GOLDEN TESTSET ##

In [1]:
%pip install -qU langsmith langchain-core==0.2.40 langchain-community langchain-openai langchain-qdrant

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install -qU pymupdf ragas

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyMuPDFLoader
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langsmith import Client
import getpass
from uuid import uuid4

# Load environment variables
load_dotenv()

# Set up API keys
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangChain API Key:")
os.environ["LANGCHAIN_PROJECT"] = f"AI_Ethics_Framework_Analysis_{uuid4().hex[:8]}"
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load documents
documents = PyMuPDFLoader(file_path="https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf").load()


In [5]:
from langchain_community.document_loaders import PyMuPDFLoader

pdf_links = [
    "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf",
    "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf"
]

documents = []
for link in pdf_links:
    loader = PyMuPDFLoader(file_path=link)
    documents.extend(loader.load())
# Set up LLMs and embeddings
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings()

# Create TestsetGenerator
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Define distributions
distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

# Generate testset
testset = generator.generate_with_langchain_docs(documents, 20, distributions, with_debugging_logs=True)


embedding nodes:   4%|▍         | 11/284 [00:00<00:16, 16.08it/s][ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Ongoing monitoring', 'Periodic review', 'Organizational roles and responsibilities', 'Content provenance', 'Incident monitoring']}
embedding nodes:   5%|▍         | 14/284 [00:00<00:14, 19.08it/s][ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Automated systems', 'Disparity assessment', 'Disparity mitigation', 'Algorithmic discrimination', 'Ongoing monitoring']}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Racial equity', 'Supreme Court Decision', 'Automated society', 'Privacy protection', 'Crime prediction software']}
embedding nodes:   6%|▌         | 17/284 [00:00<00:12, 21.58it/s][ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['GAI incidents', 'AI Actors', 'Incident reporting', 'Documentation practices', 'Information sharing']}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Automated systems', 'Algorithmic discrimination', '

In [6]:
from datetime import datetime

if testset is not None:
    # Create LangSmith dataset with a timestamp
    try:
        client = Client()
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        dataset_name = f"AI Ethics Framework Testset_{timestamp}"
        dataset = client.create_dataset(
            dataset_name=dataset_name,
            description="Questions about AI Ethics Frameworks"
        )

        # Add examples to dataset
        for test in testset.to_pandas().iterrows():
            client.create_example(
                inputs={
                    "question": test[1]["question"]
                },
                outputs={
                    "answer": test[1]["ground_truth"]
                },
                metadata={
                    "context": test[0]
                },
                dataset_id=dataset.id
            )

        print(f"Synthetic dataset generated and uploaded to LangSmith as '{dataset_name}'.")
    except Exception as e:
        print(f"Error uploading to LangSmith: {str(e)}")
else:
    print("Failed to generate testset. Cannot upload to LangSmith.")

Synthetic dataset generated and uploaded to LangSmith as 'AI Ethics Framework Testset_20240919_103614'.


## LOOK AT WEEK 4 DAY 2 FOR GUIDANCE ON EDITING BELOW CODE ##


In [7]:
%pip install -qU qdrant-client pymupdf pandas

Note: you may need to restart the kernel to use updated packages.


In [8]:
import os
import openai
from getpass import getpass

openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

In [None]:
PDF_LINK = "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"

loader = PyMuPDFLoader(PDF_LINK)

documents = loader.load()

In [9]:
pdf_links = [
    "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf",
    "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf"
]

documents = []
for link in pdf_links:
    loader = PyMuPDFLoader(file_path=link)
    documents.extend(loader.load())

In [None]:
documents[0].metadata

{'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf',
 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf',
 'page': 0,
 'total_pages': 64,
 'format': 'PDF 1.6',
 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile',
 'author': 'National Institute of Standards and Technology',
 'subject': '',
 'keywords': '',
 'creator': 'Acrobat PDFMaker 24 for Word',
 'producer': 'Adobe PDF Library 24.2.159',
 'creationDate': "D:20240805141702-04'00'",
 'modDate': "D:20240805143048-04'00'",
 'trapped': ''}

In [10]:
# Find the index where the second document starts
second_doc_start = next(i for i, doc in enumerate(documents) if doc.metadata['source'] != documents[0].metadata['source'])

print("Metadata for the first document:")
print(documents[0].metadata)

print("\nMetadata for the second document:")
print(documents[second_doc_start].metadata)

Metadata for the first document:
{'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 0, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': "D:20240805141702-04'00'", 'modDate': "D:20240805143048-04'00'", 'trapped': ''}

Metadata for the second document:
{'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 0, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'cre

In [11]:
#Chunking the docs 

from langchain.text_splitter import RecursiveCharacterTextSplitter

CHUNK_SIZE = 500
CHUNK_OVERLAP = 40

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    is_separator_regex=False
)

documents = text_splitter.split_documents(documents)

len(documents)

909

In [12]:
#Establishing embedding model to evaluate 
from langchain_openai import OpenAIEmbeddings

EMBEDDING_MODEL = "text-embedding-ada-002"

embeddings = OpenAIEmbeddings(
    model=EMBEDDING_MODEL,
    openai_api_key=openai.api_key
)

In [13]:
#Setting up the vector store

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

LOCATION = ":memory:"
COLLECTION_NAME = "AI_Ethics_Framework"
VECTOR_SIZE = 1536

In [14]:
#Creating the collection in Qdrant

qdrant_client = QdrantClient(location=LOCATION)

qdrant_client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
)

qdrant_vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=COLLECTION_NAME,
    embedding=embeddings
)

qdrant_vector_store.add_documents(documents)

['baf78acdfa114abd87b15abfd26bf62c',
 'e4e58eae86014ce3b9475e8fb6a63a1d',
 '6cd54a4774ee4706aec17907d0ee1a1d',
 'e7efe529eae8414f8535d46fa6510f2a',
 '5489fc780a1e479ba4dd5dbc273bb632',
 '0e535db2d41c4eb8851efdc7a144e1cc',
 'e2fde5029dcd4b498c89761afe9c3ad6',
 '8bbd8857aaa1417b859c6bf36b6ff1dc',
 '54b40c8055c04894bbf7ce0da3010eaa',
 '30376a09e62142a79b03d089e87836be',
 '26ebec3c14614368bcf68a56235790fa',
 'ff20a198f82f4c5785e33e3f801ce24e',
 '3659272848a542b1bf77bd5f70203b6d',
 '9af72f5fa50a4d77ac40ece4caed7c27',
 '8414f52b3c914e8e91e294f191b9ecc7',
 'db2a3d673b8f4ffca095e5b7b7758da8',
 '5ad9659f6cdc41ee972b1ad52a5073d3',
 '4d00aeac88074bd291e433a2b4cadaf3',
 '58a10602d54640559c695b98112de5d6',
 '5b2558290d9744f49a40ef689645f14e',
 '266f94af1bef4052961840fbf4d127a4',
 '9ee8b2c9177b4bbab2f2d96e91cfd9b2',
 '7af0f6f7abea453ea89b4d2c7df067d2',
 'e399052b679c48c882d46f419475dd0d',
 'fde9c44d3877411398793826cf77f006',
 'bb65149d62e047a4b953a45b5c3464b5',
 'da3a13a03d204510bfd721c8a6f0d7c9',
 

In [15]:
#Creating the retriever

retriever = qdrant_vector_store.as_retriever()

In [16]:
#Test retriever

retrieved_documents = retriever.invoke("What is Artificial Intelligence?")

for doc in retrieved_documents:
  print(doc)

page_content='generation of artificially intelligent partners.95 The National Science Foundation’s program on Fairness in 
Artificial Intelligence also includes a specific interest in research foundations for explainable AI.96
45' metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 44, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': "D:20220920133035-04'00'", 'modDate': "D:20221003104118-04'00'", 'trapped': '', '_id': '6d503f2d124d493dbb4a5bf68887736e', '_collection_name': 'AI_Ethics_Framework'}
page_content='Artiﬁcial Intelligence. https://www.whitehouse.gov/brieﬁng-room/presidential-
actions/2023/10/30/executive-order-on-the-safe-secure

In [17]:
from langchain.prompts import ChatPromptTemplate

template = """
You are an expert AI assistant with deep knowledge of business, technology, and entrepreneurship. Your task is to provide accurate, insightful answers based solely on the given context. Follow these guidelines:

1. Analyze the question carefully to understand the core information being sought.
2. Thoroughly examine the provided context, identifying key relevant information.
3. Formulate a clear, concise answer that directly addresses the question.
4. Use specific details and examples from the context to support your answer.
5. If the context doesn't contain sufficient information to fully answer the question, state this clearly and say,'I don't know'.
6. Do not introduce any information not present in the context.
7. If asked for an opinion or recommendation, base it strictly on insights from the context.
8. Use a confident, authoritative tone while maintaining accuracy.
9. If you cannot provide a clear answer to the question, reply with "I don't know".

Question:
{question}

Context:
{context}

Answer:
"""

prompt = ChatPromptTemplate.from_template(template)

In [19]:
#Instantiating RAG chain
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

primary_qa_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

retrieval_augmented_qa_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

In [21]:
question = "What is Artificial Intelligence?"

result = retrieval_augmented_qa_chain.invoke({"question" : question})

print(result["response"].content)

I don't know.


In [22]:
testset_df = testset.to_pandas()
testset_df.to_csv("testset.csv")

In [23]:
import pandas as pd

test_df = pd.read_csv("testset.csv")

In [24]:
test_df

Unnamed: 0.1,Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,0,What role does human oversight play in ensurin...,"[' \n \n \n \n \n \n \nHUMAN ALTERNATIVES, \nC...",Human oversight plays a crucial role in ensuri...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
1,1,What are the risks associated with confabulati...,[' \n6 \n2.2. Confabulation \n“Confabulation” ...,Risks from confabulations in GAI systems inclu...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
2,2,How can digital transparency mechanisms like p...,[' \n51 \ngeneral public participants. For exa...,Digital transparency mechanisms like provenanc...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
3,3,"How can organizations measure performance, cap...",[' \n49 \nearly lifecycle TEVV approaches are ...,"Organizations can measure performance, capabil...",simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
4,4,What is the role of responsible AI ventures in...,[' \nENDNOTES\n12. Expectations about reportin...,Responsible AI ventures play a crucial role in...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
5,5,What are some examples of enforcement actions ...,"["" \n65. See, e.g., Scott Ikeda. Major Data Br...",Some examples of enforcement actions taken by ...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
6,6,Who are some of the stakeholders that submitte...,['APPENDIX\nSummaries of Additional Engagement...,The stakeholders that submitted responses to t...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
7,7,How can the principles of data privacy be impl...,[' \n \n \n \n \n \n \n \n \n \n \n \n \n \n \...,Real-life examples of how data privacy princip...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
8,8,How can Generative AI systems potentially faci...,[' \n10 \nGAI systems can ease the unintention...,Generative AI systems can potentially facilita...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
9,9,How do advertisement delivery systems reinforc...,[' \n \n \nWHY THIS PRINCIPLE IS IMPORTANT\nT...,Advertisement delivery systems reinforce racia...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True


In [25]:
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()

In [26]:
answers = []
contexts = []

for question in test_questions:
  response = retrieval_augmented_qa_chain.invoke({"question" : question})
  answers.append(response["response"].content)
  contexts.append([context.page_content for context in response["context"]])

In [27]:
from datasets import Dataset

response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [28]:
response_dataset[0]

{'question': 'What role does human oversight play in ensuring the ethical use of automated systems within sensitive domains?',
 'answer': "Human oversight plays a crucial role in ensuring the ethical use of automated systems within sensitive domains, as highlighted in the context provided. These domains, such as healthcare and public benefits, require additional protections due to the potential for unfair, inaccurate, or dangerous outcomes if automated systems are used without adequate safeguards.\n\nExtensive human oversight is essential to monitor and evaluate the performance of these systems, ensuring that they operate safely and effectively. This oversight includes implementing fallback systems that allow for timely human consideration and remedy when automated decisions are made. Furthermore, there should be mechanisms in place for individuals to opt out of automated systems in favor of human alternatives, where appropriate.\n\nAdditionally, ongoing training and assessment of huma

In [29]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [30]:
results = evaluate(response_dataset, metrics)

Evaluating: 100%|██████████| 95/95 [01:08<00:00,  1.38it/s]


In [31]:
results

{'faithfulness': 0.8091, 'answer_relevancy': 0.9646, 'context_recall': 0.6860, 'context_precision': 0.8582, 'answer_correctness': 0.7387}

In [32]:
results_df = results.to_pandas()
results_df

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What role does human oversight play in ensurin...,[diagnose disease. Absent appropriate safeguar...,Human oversight plays a crucial role in ensuri...,Human oversight plays a crucial role in ensuri...,1.0,0.983343,0.333333,1.0,0.830999
1,What are the risks associated with confabulati...,"[many real-world applications, such as in heal...",The risks associated with confabulations in Ge...,Risks from confabulations in GAI systems inclu...,0.9,0.940505,1.0,1.0,0.80899
2,How can digital transparency mechanisms like p...,[A.1.6. Content Provenance \nOverview \nGAI te...,"Digital transparency mechanisms, such as prove...",Digital transparency mechanisms like provenanc...,0.846154,0.925938,0.75,1.0,0.647351
3,"How can organizations measure performance, cap...","[materials (SBOMs), application of service lev...","Organizations can measure performance, capabil...","Organizations can measure performance, capabil...",1.0,0.898284,0.25,0.5,0.628032
4,What is the role of responsible AI ventures in...,[and implement measures to prevent similar one...,Responsible AI ventures play a crucial role in...,Responsible AI ventures play a crucial role in...,0.055556,0.9576,0.333333,0.416667,0.985336
5,What are some examples of enforcement actions ...,"[unions-1658603\n68. See, e.g., enforcement ac...",Some examples of enforcement actions taken by ...,Some examples of enforcement actions taken by ...,1.0,0.997466,1.0,1.0,0.822652
6,Who are some of the stakeholders that submitte...,"[Biometric Technologies. Accessed Apr. 19, 202...",Some of the stakeholders that submitted respon...,The stakeholders that submitted responses to t...,0.5,0.963086,1.0,0.833333,0.241504
7,How can the principles of data privacy be impl...,[DATA PRIVACY \nHOW THESE PRINCIPLES CAN MOVE ...,The principles of data privacy can be implemen...,Real-life examples of how data privacy princip...,0.705882,0.968523,0.25,0.916667,0.674507
8,How can Generative AI systems potentially faci...,[Disinformation and misinformation – both of w...,Generative AI (GAI) systems can facilitate the...,Generative AI systems can potentially facilita...,0.730769,0.992055,1.0,1.0,0.57368
9,How do advertisement delivery systems reinforc...,[ering ads in ways that reinforce racial and g...,Advertisement delivery systems reinforce racia...,Advertisement delivery systems reinforce racia...,1.0,1.0,1.0,1.0,0.694556


In [35]:
# Now, convert the results DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(results_df)

def format_for_hf(example):
    return {
        'input_text': f"Question: {example['question']}\nContext: {example['contexts']}",
        'target_text': example['ground_truth'],
        'faithfulness': example['faithfulness'],
        'answer_relevancy': example['answer_relevancy'],
        'context_recall': example['context_recall'],
        'context_precision': example['context_precision'],
        'answer_correctness': example['answer_correctness']
    }

formatted_dataset = dataset.map(format_for_hf)

# Optional: Filter low-scoring entries
def filter_low_scores(example):
    return example['faithfulness'] > 0.5 and example['answer_correctness'] > 0.5

filtered_dataset = formatted_dataset.filter(filter_low_scores)

# Split the dataset
train_test = filtered_dataset.train_test_split(test_size=0.2)

# Save the datasets
train_test['train'].save_to_disk("/Users/annatucker/Workspace/AIE4Midterm/train_dataset")
train_test['test'].save_to_disk("/Users/annatucker/Workspace/AIE4Midterm/test_dataset")


# Print some statistics
print(f"Train dataset size: {len(train_test['train'])}")
print(f"Test dataset size: {len(train_test['test'])}")
print(f"Sample entry:\n{train_test['train'][0]}")

Map: 100%|██████████| 19/19 [00:00<00:00, 2937.19 examples/s]
Filter: 100%|██████████| 19/19 [00:00<00:00, 6974.60 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 13/13 [00:00<00:00, 3533.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4/4 [00:00<00:00, 1311.44 examples/s]

Train dataset size: 13
Test dataset size: 4
Sample entry:
{'question': 'How can privacy be ensured in automated system design?', 'contexts': ['Privacy by design and by default. Automated systems should be designed and built with privacy protect\xad\ned by default. Privacy risks should be assessed throughout the development life cycle, including privacy risks \nfrom reidentification, and appropriate technical and policy mitigation measures should be implemented. This \nincludes potential harms to those who are not users of the automated system, but who may be harmed by', 'Demonstrate the safety and effectiveness of the system \nIndependent evaluation. Automated systems should be designed to allow for independent evaluation (e.g., \nvia application programming interfaces). Independent evaluators, such as researchers, journalists, ethics \nreview boards, inspectors general, and third-party auditors, should be given access to the system and samples \nof associated data, in a manner consist


