In [4]:
# !pip install -r requirements_doc.txt 

Collecting anyio==4.5.0 (from -r requirements_doc.txt (line 5))
  Using cached anyio-4.5.0-py3-none-any.whl.metadata (4.7 kB)
Collecting dataclasses-json==0.6.7 (from -r requirements_doc.txt (line 10))
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting grpcio-tools==1.66.1 (from -r requirements_doc.txt (line 14))
  Using cached grpcio_tools-1.66.1-cp311-cp311-macosx_10_9_universal2.whl.metadata (5.3 kB)
Collecting httpcore==1.0.5 (from -r requirements_doc.txt (line 18))
  Using cached httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting httpx==0.27.2 (from -r requirements_doc.txt (line 19))
  Using cached httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain==0.3.0 (from -r requirements_doc.txt (line 25))
  Using cached langchain-0.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community==0.3.0 (from -r requirements_doc.txt (line 26))
  Using cached langchain_community-0.3.0-py3-none-any.whl.metadata (2.8 kB)
Collecting

In [1]:
import re
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings

In [19]:
from langchain_community.vectorstores import Qdrant

In [2]:
import os
import getpass

# Prompt only when the key is not already configured, so the notebook can be
# re-run unattended in environments that already export OPENAI_API_KEY.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key:")

## Load documents

In [3]:
import fitz  # PyMuPDF
import os

def extract_images_and_tables_from_pdf(pdf_path, output_dir):
    """
    Save the images and tables found in a PDF and describe each one as a record.

    Images are written to ``<output_dir>/images`` and tables to
    ``<output_dir>/tables``; both directories are created when missing.

    Parameters:
    - pdf_path: Path of the PDF to open with PyMuPDF (``fitz``).
    - output_dir: Directory that receives the ``images`` and ``tables`` folders.

    Returns:
    - extracted_content: List of dictionaries, one per saved image or table, with
      the keys ``content_id``, ``content_type``, ``hierarchy_level``,
      ``page_number`` (1-based), ``bbox``, ``text``, ``image_path``,
      ``table_path`` and ``associated_text_id``.
    """
    # Ensure the output directories for images and tables exist
    image_dir = os.path.join(output_dir, 'images')
    table_dir = os.path.join(output_dir, 'tables')
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(table_dir, exist_ok=True)

    extracted_content = []

    # The context manager closes the PDF even if extraction fails part-way.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            # Extract images
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_ext = base_image["ext"]
                image_path = f"{image_dir}/image_{page_num + 1}_{img_index}.{image_ext}"

                # Save the image
                with open(image_path, "wb") as img_file:
                    img_file.write(base_image["image"])

                # A get_images() entry starts (xref, smask, width, height, bpc,
                # colorspace, ...), so slicing it yields image properties, not a
                # position. Ask the page for the placement rectangle instead.
                try:
                    bbox = tuple(page.get_image_bbox(img))
                except ValueError:
                    # NOTE(review): assumed to be what PyMuPDF raises when the
                    # entry cannot be located on the page -- confirm for the
                    # installed version.
                    bbox = None

                # Store extracted image metadata
                extracted_content.append({
                    'content_id': f'regulation_{xref}_image_{img_index}',
                    'content_type': 'Image',
                    'hierarchy_level': 'Figure',
                    'page_number': page_num + 1,
                    'bbox': bbox,
                    'text': None,
                    'image_path': image_path,
                    'table_path': None,
                    'associated_text_id': None  # Modify if you have a way to link this with text
                })

            # Extract tables if applicable
            tables = extract_tables_from_page(page)
            for table_index, table in enumerate(tables):
                table_path = f"{table_dir}/table_{page_num + 1}_{table_index}.txt"
                # Explicit encoding so the saved text does not depend on the platform default.
                with open(table_path, "w", encoding="utf-8") as table_file:
                    table_file.write(table)  # Save the table as text

                # Store extracted table metadata
                extracted_content.append({
                    'content_id': f'regulation_{page_num + 1}_table_{table_index}',
                    'content_type': 'Table',
                    'hierarchy_level': 'Table',
                    'page_number': page_num + 1,
                    'bbox': None,
                    'text': None,
                    'image_path': None,
                    'table_path': table_path,
                    'associated_text_id': None
                })

    return extracted_content

def extract_tables_from_page(page):
    """
    Placeholder table extractor: currently reports no tables for any page.

    Parameters:
    - page: The page to inspect (ignored until real extraction is implemented).

    Returns:
    - A new empty list on every call.
    """
    # TODO: add table extraction logic here (regex or a table extraction tool).
    return list()

In [4]:
# Remote locations of the two source PDFs read by the PyMuPDFLoader calls.
# White House "Blueprint for an AI Bill of Rights"
wh_link = 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf'
# NIST AI 600-1
nist_link = 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf'

In [5]:
from langchain_community.document_loaders import PyMuPDFLoader

# Load each PDF as a list of documents. Later cells index these lists by
# position and read `.page_content`, i.e. they treat each entry as one page.
wh_documents = PyMuPDFLoader(file_path=wh_link).load()
# NOTE: the misspelled name `nist_docments` is kept because later cells reference it.
nist_docments = PyMuPDFLoader(file_path=nist_link).load()

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Generic fixed-size splitter: 1000-character chunks with 100 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 100,
    length_function = len,
)

# The stray `wh_split_chunks[2].page_content` expression was dropped: it was not
# the last expression of the cell, so it displayed nothing and could only raise
# IndexError when fewer than three chunks were produced.
wh_split_chunks = text_splitter.split_documents(wh_documents)

nist_split_chunks = text_splitter.split_documents(nist_docments)

### 1. Section-based chunking

In [6]:

def chunk_pdf_with_sections(doc, sections):
    """
    Chunk a document page by page and prefix each chunk with its section header.

    Parameters:
    - doc: Indexable sequence of page objects that expose a `page_content`
      attribute.
    - sections: Dictionary mapping section names to iterables of 0-based
      indices into `doc`.

    Returns:
    - section_chunks: List of dictionaries, one per listed page, with the keys
      "section" (section name), "page_num" (1-based page number) and "content"
      (the text "SECTION: <name>", a newline, then the page text).

    Raises:
    - IndexError: if a listed index is outside `doc`.
    """
    return [
        {
            "section": section,
            # Stored 1-based for readers; the indices in `sections` are 0-based.
            "page_num": page_num + 1,
            "content": f"SECTION: {section}\n{doc[page_num].page_content}",
        }
        for section, pages in sections.items()
        for page_num in pages
    ]

In [7]:
# Concatenate the text of every page from 0-based index `start_page` onward
def extract_full_text_from_pages(doc, start_page=4):
    """
    Join the `page_content` of each page from index `start_page` to the end.

    Parameters:
    - doc: Indexable, sized sequence of pages exposing `page_content`.
    - start_page: 0-based index of the first page to include (default 4).

    Returns:
    - The page texts concatenated in order with no separator; an empty string
      when `start_page` is at or past the end of `doc`.
    """
    page_indices = range(start_page, len(doc))
    return "".join(doc[idx].page_content for idx in page_indices)

In [8]:
# Section headers to look for in the text extracted from the NIST PDF.
# The splitting cell escapes each string with re.escape and splits the text on
# it, so every entry must match the extracted text character for character
# (including the "\n" characters).
# NOTE(review): the page notes on each line are the author's and have not been
# checked against the PDF.
nist_sections = [
    "\n1. \nIntroduction",  # NOTE(review): the original page note here was self-contradictory; confirm where this section starts and ends
    "\n2. \nOverview of Risks Unique to or Exacerbated by GAI",  # Starts at page 5, ends before page 8
    "\n2.1. CBRN Information or Capabilities",  # Page 8 only
    "\n2.2. Confabulation",  # Page 9 only
    "\n2.3. Dangerous, Violent, or Hateful Content",  # Page 9 & 10 only
    "\n2.4. Data Privacy",  # Page 10 & 11 only
    "\n2.5. Environmental Impacts",  # Page 11 only
    "\n2.6. Harmful Bias and Homogenization",  # Page 11 & 12
    "\n2.7. Human-Al Configuration",  # Page 12 only. NOTE(review): "Al" appears to be A + lowercase L rather than "AI"; confirm it matches the extracted text, otherwise this header never splits
    "\n2.8. Information Integrity",  # Page 12 & 13 only
    "\n2.9. Information Security",  # Page 13 & 14 only
    "\n2.10. \nIntellectual Property",  # Page 14 only
    "\n2.11. \nObscene, Degrading, and/or Abusive Content",  # Page 14 & 15 only
    "\n2.12. \nValue Chain and Component Integration",  # Page 15 only
    "\n3. Suggested Actions to Manage GAI Risks",  # Starts at page 15 to 49
    "Appendix A. Primary GAI Considerations",  # Starts at page 50 to 56
    "Appendix B. References" # 57 to 63
]

In [9]:
# Section title -> 0-based positions, within the loaded White House document
# list, of the pages that make up the section. Each range is half-open
# (start inclusive, stop exclusive).
wh_sections = {
    "USING THIS TECHNICAL COMPANION": [*range(13, 14)],
    "SAFE AND EFFECTIVE SYSTEMS": [*range(14, 22)],
    "ALGORITHMIC DISCRIMINATION PROTECTIONS": [*range(22, 29)],
    "DATA PRIVACY": [*range(29, 39)],
    "NOTICE AND EXPLANATION": [*range(39, 45)],
    "HUMAN ALTERNATIVES, CONSIDERATION, AND FALLBACK": [*range(45, 52)],
    "APPENDIX: EXAMPLES OF AUTOMATED SYSTEMS": [*range(52, 54)],
    "LISTENING TO THE AMERICAN PEOPLE": [*range(54, 62)],
    "ENDNOTES": [*range(62, 73)],
}

In [10]:
# Concatenate the NIST page text from index 4 onward, then cut it at every
# known section header.
full_text = extract_full_text_from_pages(nist_docments, start_page=4)
# NOTE(review): `chunks` is not read anywhere in the visible notebook; kept in
# case another cell relies on the name.
chunks = {}
# Escape each header so characters such as "." are matched literally.
pattern = "|".join([re.escape(section) for section in nist_sections])
# Split text based on the pattern. The capture group keeps the matched headers,
# so the result is [preamble, header_1, body_1, header_2, body_2, ...].
sections = re.split(f'({pattern})', full_text)
# Pair every header with the text that follows it; the preamble is discarded.
# A header string that occurs more than once has its bodies concatenated, so a
# later occurrence no longer silently replaces the earlier section text.
nist_chunks = {}
for header, body in zip(sections[1::2], sections[2::2]):
    nist_chunks[header] = nist_chunks.get(header, "") + body

In [11]:
# One chunk dictionary per page listed in wh_sections, each tagged with its section title.
wh_chunks = chunk_pdf_with_sections(wh_documents, wh_sections)

### Task 1: Review the two PDFs and decide how best to chunk up the data with a single strategy to optimally answer the variety of questions you expect to receive from people.

#### Deliverables

* Describe the default chunking strategy that you will use. 

     * After reviewing the PDFs, I wanted to preserve the sections (headers) of the documents. To do this, I located the sections via the table of contents in each PDF, then used functions that match a regex pattern or page ranges to associate each section with its content.

* Articulate a chunking strategy that you would also like to test out.

     * The alternative strategy will be section-based chunks plus semantic chunking. 

* Describe how and why you made these decisions

     * In order to maintain the sections or headers of the documents, I located the sections via the table of contents in each PDF and encoded them as Python structures (a dict or a list). I then used functions that match a regex pattern (nist_docs) or page numbers (wh_paper) to associate each section with its content. Because there are only two source documents, I was able to use static rules (regex or page numbers) that work for this sample. These approaches would be difficult to replicate if new documents were provided. In the future, detecting headers automatically and chunking around sections may be viable.

### Chunking Strategy 1: Section-based chunks

#### Format into Document object

In [12]:
from langchain_core.documents import Document

In [13]:
def process_nist_chunks(nist_chunks):
    """
    Wrap every NIST section in a Document.

    Parameters:
    - nist_chunks: Dictionary mapping a section name to that section's text.

    Returns:
    - List of Document objects, in dictionary order, whose `page_content` is the
      section text and whose metadata is {'section': <section name>}.
    """
    return [
        Document(page_content=body, metadata={'section': title})
        for title, body in nist_chunks.items()
    ]

def process_wh_chunks(wh_chunks):
    """
    Wrap every White House chunk dictionary in a Document.

    Parameters:
    - wh_chunks: List of dictionaries that each carry a 'section' and a
      'content' entry; any other entries are ignored.

    Returns:
    - List of Document objects, in input order, whose `page_content` is the
      chunk's 'content' and whose metadata is {'section': <chunk's 'section'>}.
    """
    return [
        Document(page_content=entry['content'], metadata={'section': entry['section']})
        for entry in wh_chunks
    ]

In [14]:
nist_docs = process_nist_chunks(nist_chunks)

In [15]:
wh_docs= process_wh_chunks(wh_chunks)

#### Combine wh_docs and nist_docs document objects into one section-based Document

In [16]:
# Section-based corpus: the White House documents followed by the NIST documents.
full_docs = wh_docs + nist_docs

In [22]:
from  langchain.schema import Document
import json
from typing import Iterable

def save_docs_to_jsonl(array:Iterable[Document], file_path:str)->None:
    """
    Write documents to a JSON Lines file, one serialized document per line.

    Parameters:
    - array: Documents to serialize with their `json()` method.
    - file_path: Destination path; an existing file is overwritten.
    """
    # Explicit UTF-8 so the file reads back the same way on another machine,
    # whatever the platform default encoding is.
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')

def load_docs_from_jsonl(file_path)->Iterable[Document]:
    """
    Read documents back from a JSON Lines file written by save_docs_to_jsonl.

    Parameters:
    - file_path: Path of the JSON Lines file.

    Returns:
    - List of Document objects, one per non-blank line, in file order.
    """
    array = []
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Tolerate blank lines (e.g. a trailing newline added by an editor).
            if not line.strip():
                continue
            data = json.loads(line)
            array.append(Document(**data))
    return array
    


### Chunking Strategy 2: Section (page or topic) + Semantic Chunking (Sentence-level for semantic similarity chunk points)

In [44]:
#base_text_splitter = SemanticChunker(OpenAIEmbeddings())
#sem_documents = base_text_splitter.split_documents(full_docs) # 2 min 

In [91]:
len(full_docs)

75

In [90]:
len(sem_documents)

228

In [129]:
## used to transfer file to google colab for ft artic embed model
#save_docs_to_jsonl(sem_documents,'sem_documents.jsonl')

In [23]:
# Reload the semantically chunked documents from disk; the SemanticChunker cell
# that would produce them is commented out in this notebook.
sem_documents = load_docs_from_jsonl('sem_documents.jsonl')

## Embedding chunks

In [95]:
from langchain_openai import OpenAIEmbeddings

# Embedding model passed to the vector store behind `base_retriever`.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Embedding model passed to the vector stores behind `retriever` and `alt_retriever`.
embeddings_lg = OpenAIEmbeddings(model="text-embedding-3-large")

In [96]:
from langchain_community.vectorstores import Qdrant

# Index the section-based documents in an in-memory Qdrant collection using the
# `embeddings` model, and expose the store as `base_retriever`.
vectorstore = Qdrant.from_documents(
    documents=full_docs,
    embedding=embeddings,
    location=":memory:",
    collection_name="ai_policy"
)
base_retriever = vectorstore.as_retriever()

In [34]:
from langchain_community.vectorstores import Qdrant

# Same section-based documents, indexed with the `embeddings_lg` model.
# Note: this rebinds the name `vectorstore`; a retriever created from the
# previous object keeps its own reference and is not affected by the rebinding.
vectorstore = Qdrant.from_documents(
    documents=full_docs,
    embedding=embeddings_lg,
    location=":memory:",
    collection_name="ai_policy"
)
retriever = vectorstore.as_retriever()

In [24]:
# Alternative strategy: index the semantically chunked documents with the
# `embeddings_lg` model in a separate in-memory collection.
vectorstore_1 = Qdrant.from_documents(
    documents=sem_documents,
    embedding=embeddings_lg,
    location=":memory:",
    collection_name="alt_ai_policy"
)
alt_retriever = vectorstore_1.as_retriever()

### RAG_chain

In [35]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

## Generation LLM
llm = ChatOpenAI(model="gpt-4o")

In [36]:
from langchain.prompts import ChatPromptTemplate

# Prompt text that tells the model to answer from the supplied context only;
# it has two placeholders, {context} and {question}.
RAG_PROMPT = """\
Given a provided context and question, you must answer the question based only on context.

Context: {context}
Question: {question}
"""

# Build the chat prompt template from the raw string above.
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

### LCEL chain for each different method

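#### * base_org_retrieval_augmented_qa_chain / base_retrieval_augmented_qa_chain: with section-based chunking only (text-embedding-3-small and text-embedding-3-large embeddings, respectively)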

#### * retrieval_augmented_qa_chain: with section chunks + SemanticChunker at sentence-level processing

In [98]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain.schema import StrOutputParser

In [100]:
# Section-based baseline that retrieves through `base_retriever`.
# Invoke with {"question": ...}; the result has the keys "response" and "context".
base_org_retrieval_augmented_qa_chain = (
    # "context": the question piped into the retriever; "question": passed through as-is.
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    # Re-assigns "context" to its own value before the final step.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response": prompt piped into the LLM; "context": kept alongside the answer.
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

In [37]:
# Section-based baseline that retrieves through `retriever`.
# Invoke with {"question": ...}; the result has the keys "response" and "context".
base_retrieval_augmented_qa_chain = (
    # "context": the question piped into the retriever; "question": passed through as-is.
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    # Re-assigns "context" to its own value before the final step.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response": prompt piped into the LLM; "context": kept alongside the answer.
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

In [29]:
retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into alt_retriever
    #              (the retriever built over the semantically chunked documents)
    {"context": itemgetter("question") | alt_retriever, "question": itemgetter("question")}
    # "context"  : re-assigned to its own value via RunnablePassthrough.assign, so the
    #              "context" and "question" keys are handed on to the next step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

In [135]:
## adds context and response keys to result object
retrieval_augmented_qa_chain.invoke({"question" : "What is the AI framework all about?"})['response']

AIMessage(content='The AI framework appears to be the "AI Risk Management Framework" developed by the National Institute of Standards and Technology (NIST). It includes several chapters and appendices detailing various aspects of AI risks and trustworthiness, profiles, descriptions of AI actor tasks, and comparisons of AI risks to traditional software risks. The framework also features a playbook and a glossary of terms related to trustworthy AI. The framework aims to address and manage risks associated with AI systems comprehensively.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 93, 'prompt_tokens': 1891, 'total_tokens': 1984, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_e375328146', 'finish_reason': 'stop', 'logprobs': None}, id='run-f6b34af4-7bcc-4873-85d1-3febe8b4aa0c-0', usage_metadata={'input_tokens': 1891, 'output_tokens': 93, 'total_tokens': 1984})

# Task 2

### Build an end-to-end RAG application using an industry-standard open-source stack and your choice of commercial off-the-shelf models

Build a prototype, deploy it to a Hugging Face Space, and include the public URL link to your Space. Create a short (< 2 min) Loom video demonstrating some initial testing inputs and outputs.

 * https://huggingface.co/spaces/aaromosshf2424/midterm_aie4

* Loom video link https://www.loom.com/share/701153f9d7234883aad5df9a5621a6ca?sid=d3b24367-41d1-4155-a06c-f73376801ab2

How did you choose your stack, and why did you select each tool the way you did?

* Docker to containerize the model & app. Why: Self-sustained version of the model and requirements that can be published.

* Chainlit was used to allow for interaction between user, their query, and model functionality.

* Qdrant is the vectorDb to store embeddings that are created from the doc chunks

* Embedding model: OpenAI's "text-embedding-3-large" is used to embed the document chunks

* HuggingFace was used to host model (Open API)

* OpenAI gpt-4o is the generation LLM; it is called through the API with the user query and the retrieved context chunks

* Langchain was used for orchestration across OpenAI API, Qdrant, etc

## Task 3: Creating a Golden Test Data Set

Generate a synthetic test data set and baseline an initial evaluation

#### pip install ragas

In [36]:
! pip install ragas

Collecting ragas
  Using cached ragas-0.1.20-py3-none-any.whl.metadata (5.5 kB)
Collecting datasets (from ragas)
  Using cached datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pysbd>=0.3.4 (from ragas)
  Using cached pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting appdirs (from ragas)
  Using cached appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting filelock (from datasets->ragas)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting pyarrow>=15.0.0 (from datasets->ragas)
  Using cached pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->ragas)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets->ragas)
  Using cached pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting xxhash (from datasets->ragas)
  Using cached xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from 

In [38]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Models handed to the test-set generator as its generator LLM and critic LLM.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
# Dedicated name: rebinding the notebook-level `embeddings` here would silently
# change the embedding model used by the vector-store cell if it is re-run later.
generator_embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    generator_embeddings
)

# Share of generated questions per evolution type; the weights sum to 1.0.
distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

In [47]:
# Generate the synthetic test set from the section-based documents.
# NOTE(review): the positional 60 is presumably the number of test samples (the
# results tables later report count 60) -- confirm against the ragas signature.
testset = generator.generate_with_langchain_docs(full_docs, 60, distributions, with_debugging_logs=True)

embedding nodes:   0%|          | 0/218 [00:00<?, ?it/s]

embedding nodes:   8%|▊         | 17/218 [00:00<00:05, 33.54it/s][ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Data Privacy', 'AI Deployment', 'Human subjects', 'Information Security', 'Confabulation']}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Data Privacy', 'AI Deployment', 'Human subjects', 'Information Security', 'Confabulation']}
embedding nodes:  10%|█         | 22/218 [00:00<00:05, 33.01it/s][ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Demographic group membership', 'Digital divide', 'Hate speech filtering', 'Gender bias evaluation', 'Environmental impact']}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Demographic group membership', 'Digital divide', 'Hate speech filtering', 'Gender bias evaluation', 'Environmental impact']}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Equal Opportunities', 'Civil Justice', 'AI systems', 'Privacy law', 'Surveillance']}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Equal Opp

In [48]:
test_df = testset.to_pandas()

In [49]:
# Pull the question and reference-answer columns out as plain Python lists.
test_questions, test_groundtruths = (
    test_df[column].to_numpy().tolist() for column in ("question", "ground_truth")
)

#### Baseline

In [None]:
# Run every test question through the small-embedding baseline chain and keep
# the answer text plus the text of each retrieved document. (~3.5 mins)
answers, contexts = [], []

for query in test_questions:
    result = base_org_retrieval_augmented_qa_chain.invoke({"question": query})
    answers.append(result["response"].content)
    contexts.append([retrieved.page_content for retrieved in result["context"]])

In [101]:
from datasets import Dataset

# One row per test question; `answers` and `contexts` hold whatever the most
# recently executed generation loop produced.
response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [104]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

# Metrics passed to evaluate(); five scores are computed per test question.
metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [105]:
base_results = evaluate(response_dataset, metrics) # 3 mins

Evaluating: 100%|██████████| 300/300 [03:46<00:00,  1.32it/s]


In [106]:
base_org_results_df = base_results.to_pandas()
base_org_results_df.head(10)

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,How should entities allow for independent eval...,[SECTION: ALGORITHMIC DISCRIMINATION PROTECTIO...,Entities should allow for independent evaluati...,Entities should allow for independent evaluati...,0.769231,0.989957,1.0,1.0,0.413849
1,What organizations and individuals submitted r...,[SECTION: LISTENING TO THE AMERICAN PEOPLE\nAP...,The Request For Information (RFI) on the use a...,The organizations and individuals that submitt...,0.75,0.96694,1.0,1.0,0.242236
2,What role do domain-specific standards play in...,[SECTION: SAFE AND EFFECTIVE SYSTEMS\n \n \n \...,Domain-specific standards play a crucial role ...,Domain-specific standards play a crucial role ...,0.85,0.96975,1.0,1.0,0.552311
3,How does the Equal Credit Opportunity Act requ...,[SECTION: NOTICE AND EXPLANATION\n \n \n \n \n...,The Equal Credit Opportunity Act (ECOA) requir...,The Equal Credit Opportunity Act requires lend...,1.0,0.918922,1.0,1.0,0.991845
4,How are fairness and bias evaluated in the con...,[ \nBias exists in many forms and can become i...,Fairness and bias in the context of GAI system...,Fairness and bias in GAI systems are evaluated...,1.0,0.992961,1.0,1.0,0.515009
5,"How do generative tasks, such as text summariz...","[ \nTraining, maintaining, and operating (runn...","Generative tasks in GAI systems, such as text ...","Generative tasks, such as text summarization, ...",1.0,0.956932,1.0,1.0,0.704696
6,What is the importance of establishing policie...,[ \nThe following primary considerations were ...,Establishing policies and procedures in the co...,Establishing policies and procedures in the co...,1.0,0.986777,1.0,1.0,0.862679
7,What was President Biden's response to the Sup...,[SECTION: ENDNOTES\n \n \n \n \nENDNOTES\n1.Th...,President Biden made remarks on the Supreme Co...,The answer to given question is not present in...,1.0,0.956853,1.0,0.0,0.183731
8,How do intellectual property risks from GAI sy...,[ \nIntellectual property risks from GAI syste...,Intellectual property risks from GAI systems p...,Intellectual property risks from GAI systems m...,1.0,0.950035,1.0,1.0,0.994962
9,What is the purpose of a fallback system in th...,"[SECTION: HUMAN ALTERNATIVES, CONSIDERATION, A...",The purpose of a fallback system in the contex...,The purpose of a fallback system in the contex...,0.769231,0.994518,1.0,0.805556,0.77092


### Section Chunking & OpenAI Embedding Small 

In [107]:
# Median per metric. The 4: offset skips the four text columns shown in the
# table above (question, contexts, answer, ground_truth).
base_org_results_df.iloc[:,4:].median()

faithfulness          1.000000
answer_relevancy      0.956892
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.696299
dtype: float64

#### chunk strategy 2 rag chain

In [50]:
# Run every test question through the section + semantic chunking chain and
# keep the answer text plus the text of each retrieved document. (~3.5 mins)
answers, contexts = [], []

for query in test_questions:
    result = retrieval_augmented_qa_chain.invoke({"question": query})
    answers.append(result["response"].content)
    contexts.append([retrieved.page_content for retrieved in result["context"]])

In [51]:
from datasets import Dataset

# Dataset for the section + semantic chunking chain. This rebinds
# `response_dataset`, which an earlier cell used for the baseline run.
response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

#### base response dataset

In [52]:
# Run every test question through the large-embedding section-based chain and
# keep the answer text plus the text of each retrieved document. (~4 mins)
answers, contexts = [], []

for query in test_questions:
    result = base_retrieval_augmented_qa_chain.invoke({"question": query})
    answers.append(result["response"].content)
    contexts.append([retrieved.page_content for retrieved in result["context"]])

In [53]:
from datasets import Dataset

# Dataset for the large-embedding section-based chain, built from the
# `answers` and `contexts` filled by the generation loop in the previous cell.
base_response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

### Evaluate our Pipeline with Ragas

In [103]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

# Same five metrics as the baseline evaluation cell, redefined here so this
# section can be run without it.
metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [55]:
results = evaluate(response_dataset, metrics) # 3 mins

Evaluating:   0%|          | 0/300 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 300/300 [02:40<00:00,  1.87it/s]


In [56]:
results_df = results.to_pandas()
results_df.head(10)

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,How should entities allow for independent eval...,[SECTION: ALGORITHMIC DISCRIMINATION PROTECTIO...,Entities should allow for independent evaluati...,Entities should allow for independent evaluati...,1.0,0.948951,1.0,1.0,0.995871
1,What organizations and individuals submitted r...,[SECTION: LISTENING TO THE AMERICAN PEOPLE\nAP...,The organizations and individuals who submitte...,The organizations and individuals that submitt...,0.0,0.965681,1.0,0.916667,0.245349
2,What role do domain-specific standards play in...,[SECTION: SAFE AND EFFECTIVE SYSTEMS\n \n \n \...,Domain-specific standards play a crucial role ...,Domain-specific standards play a crucial role ...,0.857143,0.96975,1.0,1.0,0.825669
3,How does the Equal Credit Opportunity Act requ...,[Sarah Ammermann. Adverse Action Notice Requir...,The Equal Credit Opportunity Act requires lend...,The Equal Credit Opportunity Act requires lend...,1.0,0.946984,1.0,1.0,0.994053
4,How are fairness and bias evaluated in the con...,[ \nBias exists in many forms and can become i...,Fairness and bias in the context of GAI (Gener...,Fairness and bias in GAI systems are evaluated...,1.0,0.916413,1.0,0.916667,0.767938
5,"How do generative tasks, such as text summariz...","[ \nTraining, maintaining, and operating (runn...","Generative tasks, such as text summarization, ...","Generative tasks, such as text summarization, ...",1.0,0.963591,1.0,1.0,0.847724
6,What is the importance of establishing policie...,[• \nGAI Risks: Tags linking suggested actions...,Establishing policies and procedures is crucia...,Establishing policies and procedures in the co...,1.0,0.986777,0.666667,1.0,0.258585
7,What was President Biden's response to the Sup...,[SECTION: ENDNOTES\n \n \n \n \nENDNOTES\n1.Th...,The context provided does not include the spec...,The answer to given question is not present in...,1.0,0.0,1.0,0.0,0.190501
8,How do intellectual property risks from GAI sy...,[ \nIntellectual property risks from GAI syste...,Intellectual property risks from GAI systems a...,Intellectual property risks from GAI systems m...,0.833333,0.950035,1.0,1.0,0.747417
9,What is the purpose of a fallback system in th...,[Using both AI and human agents is viewed as k...,The purpose of a fallback system in the contex...,The purpose of a fallback system in the contex...,0.833333,0.994518,1.0,1.0,0.698745


In [57]:
results_df.iloc[:,4:].describe()

Unnamed: 0,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
count,60.0,60.0,60.0,60.0,60.0
mean,0.952975,0.884611,0.937222,0.86713,0.658824
std,0.143122,0.270374,0.200455,0.306222,0.257477
min,0.0,0.0,0.0,0.0,0.178728
25%,1.0,0.940647,1.0,0.979167,0.504729
50%,1.0,0.96333,1.0,1.0,0.699359
75%,1.0,0.987995,1.0,1.0,0.868115
max,1.0,1.0,1.0,1.0,0.999095


In [58]:
results_df.iloc[:,4:].median()

faithfulness          1.000000
answer_relevancy      0.963330
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.699359
dtype: float64

#### base chunking rag eval

In [59]:
# Evaluate the section-based chain that retrieves with the large embedding model.
# Note: this rebinds `base_results`, which an earlier cell used for the
# small-embedding baseline run.
base_results = evaluate(base_response_dataset, metrics)
base_results_df = base_results.to_pandas()
base_results_df.head(10)

Evaluating:   0%|          | 0/300 [00:00<?, ?it/s]

Evaluating:  52%|█████▏    | 156/300 [01:20<01:17,  1.85it/s]No statements were generated from the answer.
Evaluating: 100%|██████████| 300/300 [02:51<00:00,  1.75it/s]


Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,How should entities allow for independent eval...,[SECTION: ALGORITHMIC DISCRIMINATION PROTECTIO...,Entities should allow for independent evaluati...,Entities should allow for independent evaluati...,1.0,0.953252,1.0,1.0,0.486086
1,What organizations and individuals submitted r...,[SECTION: LISTENING TO THE AMERICAN PEOPLE\nAP...,The organizations and individuals that submitt...,The organizations and individuals that submitt...,,0.96694,1.0,1.0,0.994629
2,What role do domain-specific standards play in...,[SECTION: SAFE AND EFFECTIVE SYSTEMS\n \n \n \...,Domain-specific standards play a crucial role ...,Domain-specific standards play a crucial role ...,1.0,0.96975,1.0,1.0,0.491018
3,How does the Equal Credit Opportunity Act requ...,[SECTION: NOTICE AND EXPLANATION\n \n \n \n \n...,The Equal Credit Opportunity Act requires lend...,The Equal Credit Opportunity Act requires lend...,1.0,0.931939,1.0,1.0,0.997151
4,How are fairness and bias evaluated in the con...,[ \nBias exists in many forms and can become i...,"In the context of GAI (General AI) systems, fa...",Fairness and bias in GAI systems are evaluated...,0.944444,0.937613,0.4,0.805556,0.533597
5,"How do generative tasks, such as text summariz...","[ \nTraining, maintaining, and operating (runn...","Generative tasks, such as text summarization, ...","Generative tasks, such as text summarization, ...",1.0,0.963591,1.0,1.0,0.847789
6,What is the importance of establishing policie...,[ \nGAI value chains involve many third-party ...,Establishing policies and procedures in the co...,Establishing policies and procedures in the co...,1.0,0.988152,0.666667,1.0,0.341341
7,What was President Biden's response to the Sup...,[SECTION: ENDNOTES\n \n \n \n \nENDNOTES\n1.Th...,President Biden's response to the Supreme Cour...,The answer to given question is not present in...,1.0,0.944351,1.0,0.0,0.182324
8,How do intellectual property risks from GAI sy...,[ \nIntellectual property risks from GAI syste...,Intellectual property risks from GAI systems a...,Intellectual property risks from GAI systems m...,0.833333,0.986026,1.0,1.0,0.697944
9,What is the purpose of a fallback system in th...,"[SECTION: HUMAN ALTERNATIVES, CONSIDERATION, A...",The purpose of a fallback system in the contex...,The purpose of a fallback system in the contex...,0.833333,0.994518,1.0,1.0,0.548957


In [60]:
base_results_df.iloc[:,4:].median()

faithfulness          1.000000
answer_relevancy      0.964068
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.673044
dtype: float64

In [61]:
results_df.iloc[:,4:].median()

faithfulness          1.000000
answer_relevancy      0.963330
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.699359
dtype: float64

Check out the specific metrics we'll be using in the Ragas documentation:

- [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html)
- [Answer Relevancy](https://docs.ragas.io/en/stable/concepts/metrics/answer_relevance.html)
- [Context Precision](https://docs.ragas.io/en/stable/concepts/metrics/context_precision.html)
- [Context Recall](https://docs.ragas.io/en/stable/concepts/metrics/context_recall.html)
- [Answer Correctness](https://docs.ragas.io/en/stable/concepts/metrics/answer_correctness.html)

#### Spot check one

In [78]:
def insert_line_breaks(text):
    """Print ``text`` with one sentence per line.

    Every occurrence of a period followed by a single space is rewritten as a
    period followed by a newline. The result is printed; nothing is returned.
    """
    # Splitting on ". " and re-joining with "." + newline is equivalent to
    # replacing each ". " with "." + newline.
    pieces = text.split('. ')
    print('.\n'.join(pieces))

In [79]:
# Spot check: print the generated answer for the row at position 8 of `results_df`.
insert_line_breaks(results_df.iloc[8]['answer']) ## answer lacks the SLA point that the ground truth contains

Vendor contracts can be reviewed by avoiding arbitrary or capricious termination clauses for critical GAI technologies or vendor services and by ensuring non-standard terms that may amplify or defer liability in unexpected ways are not included.
Additionally, contracts should be scrutinized to prevent clauses that could contribute to unauthorized data collection by vendors or third-parties, such as secondary data use.


In [80]:
# Print the ground truth for the same row (position 8) to compare with the generated answer.
insert_line_breaks(results_df.iloc[8]['ground_truth'])

Review vendor contracts to avoid arbitrary or capricious termination of critical GAI technologies or vendor services and non-standard terms that may amplify or defer liability in unexpected ways and/or contribute to unauthorized data collection by vendors or third-parties.
Consider clear assignment of liability and responsibility for incidents, GAI system changes over time (e.g., fine-tuning, drift, decay); Request notification and disclosure for serious incidents arising from third-party data and systems; Service Level Agreements (SLAs) in vendor contracts that address incident response, response times, and availability of critical support.


### Assess your pipeline using the RAGAS framework including key metrics faithfulness, answer relevancy, context precision, and context recall.  Provide a table of your output results.

### What conclusions can you draw about performance and effectiveness of your pipeline with this information?

#### * Answer: Faithfulness, context precision, and context recall all have a median of 1.0, which suggests the pipeline retrieves the correct context and stays faithful to the source. Answer correctness (0.747) is noticeably lower, which suggests the problem lies not in the retrieval step but in how the final answer is generated or interpreted. Increasing the complexity or number of steps in the generation LLM could boost performance. 

 _______________________________

## Task 4: Fine-Tuning Open-Source Embeddings: In Colab

### Generate synthetic fine-tuning data and complete fine-tuning of the open-source embedding model

aie4_midterm/Task4Midterm_Fine_tuning_Embedding_Models_for_RAG.ipynb

#### Deliverables

 #### * Swap out your existing embedding model for the new fine-tuned version.  Provide a link to your fine-tuned embedding model on the Hugging Face Hub.

##### * https://huggingface.co/northstaranlyticsma24/artic_ft_midterm

#### * How did you choose the embedding model for this application?

##### * 

pip install langchain_huggingface

or I got around the error by simply getting back to LangChain 2.4.

## Task 5: Assessing Performance

#### Test out new ft embedding model performance on RAG

In [62]:
# Load the fine-tuned embedding model from the Hugging Face Hub.
# NOTE(review): the recorded run printed a warning pointing at the constructor
# line below — likely LangChain's deprecation notice for this import path. The
# notebook's own notes mention `pip install langchain_huggingface`; confirm and
# migrate the import when the environment allows it.
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
huggingface_embeddings = HuggingFaceEmbeddings(model_name="northstaranlyticsma24/artic_ft_midterm") ## slow on first run: ~5 min model download

  huggingface_embeddings = HuggingFaceEmbeddings(model_name="northstaranlyticsma24/artic_ft_midterm") ## 5 mins download yikes...
Some weights of BertModel were not initialized from the model checkpoint at northstaranlyticsma24/artic_ft_midterm and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
# Embed `sem_documents` with the fine-tuned model and index them in a Qdrant
# collection opened with location=":memory:" (Qdrant's in-memory mode).
# NOTE(review): `sem_documents` is defined outside this section — presumably the
# section + semantic chunks; confirm.
vectorstore_ft = Qdrant.from_documents(
    documents=sem_documents,
    embedding=huggingface_embeddings,
    location=":memory:",
    collection_name="ft_snow_ai_policy"
)
# No search arguments are passed, so the retriever uses its defaults.
ft_emb_retriever = vectorstore_ft.as_retriever()

In [64]:
# RAG chain that retrieves with the fine-tuned-embedding retriever and answers
# with `rag_prompt | llm`. Input: {"question": ...}. Output: {"response", "context"}.
ft_retrieval_augmented_qa_chain = (

    # Retrieve context for the question; pass the question through unchanged.
    {"context": itemgetter("question") | ft_emb_retriever, "question": itemgetter("question")}

    # Re-assigns "context" to its existing value.
    # NOTE(review): looks like a no-op — confirm before removing.
    | RunnablePassthrough.assign(context=itemgetter("context"))

    # Generate the response and return it alongside the retrieved context.
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

In [65]:
# Run every test question through the fine-tuned-embedding chain, keeping the
# generated answer text and the page content of each retrieved document.
# Recorded runtime: about 3.5 minutes.
answers, contexts = [], []

for question in test_questions:
    response = ft_retrieval_augmented_qa_chain.invoke({"question": question})
    answers.append(response["response"].content)
    contexts.append([doc.page_content for doc in response["context"]])

In [66]:
from datasets import Dataset

# Bundle the questions, generated answers, retrieved contexts and ground truths
# into the dataset that is scored with `evaluate` below.
response_dataset_1 = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

### Evaluate our Pipeline with Ragas

In [67]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

# Metrics passed to every `evaluate` call in this section; the order here matches
# the order of the score columns in the result tables shown below.
metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [68]:
# Score the fine-tuned-embedding responses on every metric in `metrics`.
# The recorded run logged one "No statements were generated" and one
# "Failed to parse output" message, so individual scores can be missing (NaN).
results = evaluate(response_dataset_1, metrics)

Evaluating:   0%|          | 0/300 [00:00<?, ?it/s]

Evaluating:  11%|█         | 33/300 [00:15<02:14,  1.98it/s]No statements were generated from the answer.
Evaluating:  24%|██▍       | 72/300 [00:36<02:20,  1.62it/s]Failed to parse output. Returning None.
Evaluating: 100%|██████████| 300/300 [02:52<00:00,  1.74it/s]


In [69]:
# Per-question scores for the fine-tuned-embedding run as a DataFrame; show the first 10 rows.
results_df_ft = results.to_pandas()
results_df_ft.head(10)

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,How should entities allow for independent eval...,[SECTION: ALGORITHMIC DISCRIMINATION PROTECTIO...,Entities should allow for independent evaluati...,Entities should allow for independent evaluati...,1.0,0.972576,1.0,1.0,0.997969
1,What organizations and individuals submitted r...,[Dacus. Synopsis of Responses to OSTP’s Reques...,The organizations and individuals who submitte...,The organizations and individuals that submitt...,,0.958498,1.0,1.0,0.99702
2,What role do domain-specific standards play in...,[SECTION: SAFE AND EFFECTIVE SYSTEMS\n \n \n \...,Domain-specific standards play a critical role...,Domain-specific standards play a crucial role ...,1.0,0.96975,1.0,1.0,0.87956
3,How does the Equal Credit Opportunity Act requ...,[Sarah Ammermann. Adverse Action Notice Requir...,The Equal Credit Opportunity Act (ECOA) requir...,The Equal Credit Opportunity Act requires lend...,1.0,0.934886,1.0,1.0,0.786187
4,How are fairness and bias evaluated in the con...,[ \nBias exists in many forms and can become i...,Fairness and bias in the context of GAI (Gener...,Fairness and bias in GAI systems are evaluated...,1.0,0.94175,0.4,0.75,0.408798
5,"How do generative tasks, such as text summariz...","[ \nTraining, maintaining, and operating (runn...","Generative tasks, such as text summarization, ...","Generative tasks, such as text summarization, ...",1.0,0.963591,1.0,1.0,0.847789
6,What is the importance of establishing policie...,[Address general risks associated with a lack ...,Establishing policies and procedures is crucia...,Establishing policies and procedures in the co...,1.0,0.926897,1.0,1.0,0.722662
7,What was President Biden's response to the Sup...,[SECTION: ENDNOTES\n \n \n \n \nENDNOTES\n1.Th...,President Biden's response to the Supreme Cour...,The answer to given question is not present in...,1.0,1.0,1.0,0.0,0.182001
8,How do intellectual property risks from GAI sy...,[ \nIntellectual property risks from GAI syste...,Intellectual property risks from GAI systems a...,Intellectual property risks from GAI systems m...,0.857143,0.950035,1.0,0.916667,0.747258
9,What is the purpose of a fallback system in th...,[Using both AI and human agents is viewed as k...,The purpose of a fallback system in the contex...,The purpose of a fallback system in the contex...,1.0,0.994518,1.0,1.0,0.749775


# RAGAS full eval results (eval = 60)

#### Base Chunk RAG 

In [70]:
# Base-chunk run: per-metric median (columns from position 4 onward are the score columns).
base_results_df.iloc[:,4:].median()

faithfulness          1.000000
answer_relevancy      0.964068
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.673044
dtype: float64

In [71]:
# Base-chunk run: per-metric mean (columns from position 4 onward are the score columns).
base_results_df.iloc[:,4:].mean()

faithfulness          0.945561
answer_relevancy      0.851933
context_recall        0.922778
context_precision     0.857870
answer_correctness    0.648622
dtype: float64

#### Section + Semantic Chunk 

In [73]:
# Section + semantic-chunk run: per-metric median (columns from position 4 onward are the score columns).
results_df.iloc[:,4:].median()

faithfulness          1.000000
answer_relevancy      0.963330
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.699359
dtype: float64

In [72]:
# Section + semantic-chunk run: per-metric mean (columns from position 4 onward are the score columns).
results_df.iloc[:,4:].mean()

faithfulness          0.952975
answer_relevancy      0.884611
context_recall        0.937222
context_precision     0.867130
answer_correctness    0.658824
dtype: float64

#### Section + SF Artic

In [91]:
# Per-metric mean for `results_df_ft2` (columns from position 4 onward are the score columns).
# FIXME: `results_df_ft2` is only created further down, in the Task 5b cell that runs
# `results_df_ft2 = results2.to_pandas()`. On a fresh top-to-bottom run this cell
# raises NameError unless that cell has been executed first.
results_df_ft2.iloc[:,4:].mean()

faithfulness          0.944031
answer_relevancy      0.938613
context_recall        0.958750
context_precision     0.865278
answer_correctness    0.682602
dtype: float64

In [92]:
# Per-metric median for `results_df_ft2` (columns from position 4 onward are the score columns).
# FIXME: `results_df_ft2` is only created further down, in the Task 5b cell that runs
# `results_df_ft2 = results2.to_pandas()`. On a fresh top-to-bottom run this cell
# raises NameError unless that cell has been executed first.
results_df_ft2.iloc[:,4:].median()

faithfulness          1.000000
answer_relevancy      0.956892
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.719604
dtype: float64

#### Section & Semantic Chunk + Ft Artic Emb model 

In [94]:
# Fine-tuned-embedding run on semantic chunks: per-metric median (columns from position 4 onward).
results_df_ft.iloc[:,4:].median()

faithfulness          1.000000
answer_relevancy      0.966723
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.747258
dtype: float64

In [93]:
# Fine-tuned-embedding run on semantic chunks: per-metric mean (columns from position 4 onward).
results_df_ft.iloc[:,4:].mean()

faithfulness          0.965303
answer_relevancy      0.887470
context_recall        0.865556
context_precision     0.898148
answer_correctness    0.679844
dtype: float64

## Task 5b: Simple fixes based on RAGAS performance 

#### Add Bigger Generation LLM in chain (GPT4-Turbo vs 4o)

Context scores, faithfulness, and answer relevancy were great to perfect on the small sample. However, the lower answer correctness score indicates that the generation model had great information but left detail off the table. Let's prompt for more detail and use 'GPT4-Turbo' instead of 4o in the RAG chain. 

 Obviously, one would need to increase stats power with increased sample size to really make jumps in performance and evaluation-led decisions.

In [80]:
## Generation LLM for the Task 5b chains: GPT-4 Turbo, used in place of the `llm` in the earlier chain.
llm_4t = ChatOpenAI(model="gpt-4-turbo")

In [81]:
# Prompt for the Task 5b chains. It restricts the model to the retrieved context
# and asks for fuller, multi-sentence answers; `{context}` and `{question}` are
# filled in by the chain at invocation time.
# Fix: the last instruction previously read "setence" instead of "sentence".
RAG_PROMPT_2 = """\
Given a provided context and question, you must answer the question based only on context. 
When using context to answer the question, be precise and use the contexts to their fullest. Response should be more than one sentence.

Context: {context}
Question: {question}
"""

rag_prompt2 = ChatPromptTemplate.from_template(RAG_PROMPT_2)

In [82]:
# Same retriever as the earlier fine-tuned chain, but answering with
# `rag_prompt2 | llm_4t`.
# NOTE(review): this rebinds `ft_retrieval_augmented_qa_chain`, which an earlier
# cell already defined with `rag_prompt | llm`. Any cell re-run after this one
# silently uses this version; a distinct name would avoid that hidden state.
ft_retrieval_augmented_qa_chain = (

    # Retrieve context for the question; pass the question through unchanged.
    {"context": itemgetter("question") | ft_emb_retriever, "question": itemgetter("question")}

    # Re-assigns "context" to its existing value.
    # NOTE(review): looks like a no-op — confirm before removing.
    | RunnablePassthrough.assign(context=itemgetter("context"))

    # Generate the response and return it alongside the retrieved context.
    | {"response": rag_prompt2 | llm_4t, "context": itemgetter("context")}
)

#### Section + FT embedding 

In [83]:
# Embed `full_docs` with the fine-tuned model and index them in a separate
# Qdrant collection opened with location=":memory:" (Qdrant's in-memory mode).
# NOTE(review): `full_docs` is defined outside this section — presumably the
# section-level documents without semantic chunking; confirm.
vectorstore_ft_sec = Qdrant.from_documents(
    documents=full_docs,
    embedding=huggingface_embeddings,
    location=":memory:",
    collection_name="sec_ft_snow_ai_policy"
)
# No search arguments are passed, so the retriever uses its defaults.
sec_ft_emb_retriever = vectorstore_ft_sec.as_retriever()

In [84]:
# RAG chain that retrieves from the `full_docs` index and answers with
# `rag_prompt2 | llm_4t`. Input: {"question": ...}. Output: {"response", "context"}.
sec_ft_retrieval_augmented_qa_chain = (

    # Retrieve context for the question; pass the question through unchanged.
    {"context": itemgetter("question") | sec_ft_emb_retriever, "question": itemgetter("question")}

    # Re-assigns "context" to its existing value.
    # NOTE(review): looks like a no-op — confirm before removing.
    | RunnablePassthrough.assign(context=itemgetter("context"))

    # Generate the response and return it alongside the retrieved context.
    | {"response": rag_prompt2 | llm_4t, "context": itemgetter("context")}
)

In [85]:
# Run every test question through the section-level fine-tuned chain, keeping
# the generated answer text and the page content of each retrieved document.
# Recorded runtime: about 10 minutes.
answers, contexts = [], []

for question in test_questions:
    response = sec_ft_retrieval_augmented_qa_chain.invoke({"question": question})
    answers.append(response["response"].content)
    contexts.append([doc.page_content for doc in response["context"]])

In [86]:
from datasets import Dataset  # repeated import; an earlier cell in this notebook already imports it

# Bundle the questions, the answers/contexts collected from the section-level
# fine-tuned chain, and the ground truths into an evaluation dataset.
response_dataset_2 = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})
# Score with the shared `metrics` list, convert to a DataFrame, and show the first 10 rows.
results2 = evaluate(response_dataset_2, metrics)
results_df_ft2 = results2.to_pandas()
results_df_ft2.head(10)

Evaluating:  91%|█████████▏| 274/300 [02:50<00:11,  2.29it/s]Failed to parse output. Returning None.
Evaluating: 100%|██████████| 300/300 [03:19<00:00,  1.51it/s]


Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,How should entities allow for independent eval...,[SECTION: ALGORITHMIC DISCRIMINATION PROTECTIO...,Entities should allow for independent evaluati...,Entities should allow for independent evaluati...,1.0,0.989957,1.0,1.0,0.413849
1,What organizations and individuals submitted r...,[SECTION: LISTENING TO THE AMERICAN PEOPLE\nAP...,The Request For Information (RFI) on the use a...,The organizations and individuals that submitt...,0.5,0.96694,1.0,1.0,0.242236
2,What role do domain-specific standards play in...,[SECTION: SAFE AND EFFECTIVE SYSTEMS\n \n \n \...,Domain-specific standards play a crucial role ...,Domain-specific standards play a crucial role ...,1.0,0.96975,1.0,1.0,0.552311
3,How does the Equal Credit Opportunity Act requ...,[SECTION: NOTICE AND EXPLANATION\n \n \n \n \n...,The Equal Credit Opportunity Act (ECOA) requir...,The Equal Credit Opportunity Act requires lend...,1.0,0.918922,1.0,1.0,0.991845
4,How are fairness and bias evaluated in the con...,[ \nBias exists in many forms and can become i...,Fairness and bias in the context of GAI system...,Fairness and bias in GAI systems are evaluated...,1.0,0.992961,1.0,1.0,0.506988
5,"How do generative tasks, such as text summariz...","[ \nTraining, maintaining, and operating (runn...","Generative tasks in GAI systems, such as text ...","Generative tasks, such as text summarization, ...",1.0,0.956932,1.0,1.0,0.473927
6,What is the importance of establishing policie...,[ \nThe following primary considerations were ...,Establishing policies and procedures in the co...,Establishing policies and procedures in the co...,1.0,0.986777,1.0,1.0,0.339822
7,What was President Biden's response to the Sup...,[SECTION: ENDNOTES\n \n \n \n \nENDNOTES\n1.Th...,President Biden made remarks on the Supreme Co...,The answer to given question is not present in...,0.75,0.956853,1.0,0.0,0.183731
8,How do intellectual property risks from GAI sy...,[ \nIntellectual property risks from GAI syste...,Intellectual property risks from GAI systems p...,Intellectual property risks from GAI systems m...,0.909091,0.950035,1.0,0.916667,0.932462
9,What is the purpose of a fallback system in th...,"[SECTION: HUMAN ALTERNATIVES, CONSIDERATION, A...",The purpose of a fallback system in the contex...,The purpose of a fallback system in the contex...,0.923077,0.994518,1.0,0.805556,0.719604


In [87]:
# Questions whose answer_correctness score fell below 0.6 in the section-level fine-tuned run.
results_df_ft2.loc[results_df_ft2['answer_correctness'] < .6, 'question'].to_list()

['How should entities allow for independent evaluation of data policies in automated systems?',
 'What organizations and individuals submitted responses to the Request For Information (RFI) issued by OSTP on the use and governance of biometric technologies?',
 'What role do domain-specific standards play in ensuring the safety and effectiveness of automated systems?',
 'How are fairness and bias evaluated in the context of GAI systems?',
 'How do generative tasks, such as text summarization, compare in terms of energy and carbon intensity to discriminative or non-generative tasks in GAI systems?',
 'What is the importance of establishing policies and procedures in the context of GAI risk measurement and oversight functions?',
 "What was President Biden's response to the Supreme Court Decision to Overturn Roe v. Wade?",
 "What information does the ACLU of New York provide about New York's temporary ban on facial recognition in schools?",
 'What tasks are AI Actors responsible for in mon

In [88]:
# Ad-hoc spot-check question.
# NOTE(review): this calls `ft_retrieval_augmented_qa_chain` (rebuilt above with
# rag_prompt2 + llm_4t), not `sec_ft_retrieval_augmented_qa_chain` — confirm
# that this is the chain intended for this section.
response = ft_retrieval_augmented_qa_chain.invoke({"question" : 'Who contributed ideas for the AI Bill of Rights? Tell me the names of stakeholders'})

In [89]:
# Show the text of the model's reply to the ad-hoc question.
response["response"].content

"The stakeholders who contributed ideas for the development of the Blueprint for an AI Bill of Rights included a variety of entities from the private sector and civil society. Some of the notable participants mentioned in the context are Adobe, American Civil Liberties Union (ACLU), The Aspen Commission on Information Disorder, The Awood Center, The Australian Human Rights Commission, Biometrics Institute, The Brookings Institute, BSA | The Software Alliance, Cantellus Group, Center for American Progress, Center for Democracy and Technology, Center on Privacy and Technology at Georgetown Law, Christiana Care, Color of Change, Coworker, Data Robot, Data Trust Alliance, Data and Society Research Institute, Deepmind, EdSAFE AI Alliance, Electronic Privacy Information Center (EPIC), Encode Justice, Equal AI, Google, Hitachi's AI Policy Committee, The Innocence Project, Institute of Electrical and Electronics Engineers (IEEE), Intuit, Lawyers Committee for Civil Rights Under Law, Legal Aid 

In [90]:
# Per-metric median for the section-level fine-tuned run (columns from position 4 onward are the score columns).
results_df_ft2.iloc[:,4:].median()

faithfulness          1.000000
answer_relevancy      0.956892
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.719604
dtype: float64

### 

## Task 5: Assessing Performance

synthesis on final report

## Task 6: Managing Your Boss and User Expectations

You are the SVP of Technology.  Given the work done by your team so far, you're now sitting down with the AI Solutions Engineer.  You have tasked the solutions engineer to test out the new application with at least 50 different internal stakeholders over the next month.

What is the story that you will give to the CEO to tell the whole company at the launch next month?

There appears to be important information not included in our build, for instance, the 270-day update on the 2023 executive order on Safe, Secure, and Trustworthy AI.  How might you incorporate relevant White House briefing information into future versions? 