In [None]:
# pip install -r requirements_doc.txt 

In [11]:
import re
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings

In [12]:
import os
import getpass

# Prompt for the key only when it is not already present in the environment, so
# "Restart Kernel -> Run All" can run unattended when the key has been exported.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key:")

## Load documents

In [19]:
import fitz  # PyMuPDF
import os

def extract_images_and_tables_from_pdf(pdf_path, output_dir):
    """Save every embedded image and extracted table of a PDF and describe them.

    Images are written to ``<output_dir>/images`` and tables to
    ``<output_dir>/tables``; both directories are created when missing.

    Parameters:
    - pdf_path: Path of the PDF; passed to ``fitz.open``.
    - output_dir: Directory under which the ``images`` and ``tables`` folders live.

    Returns:
    - List of metadata dicts, one per saved image or table, with the keys
      ``content_id``, ``content_type``, ``hierarchy_level``, ``page_number``
      (1-based), ``bbox``, ``text``, ``image_path``, ``table_path`` and
      ``associated_text_id``. For images ``bbox`` is the ``(x0, y0, x1, y1)``
      placement on the page, or ``None`` when it cannot be resolved; for tables
      it is always ``None``.
    """
    # Ensure the output directories for images and tables exist
    image_dir = os.path.join(output_dir, 'images')
    table_dir = os.path.join(output_dir, 'tables')
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(table_dir, exist_ok=True)

    extracted_content = []

    # Open the PDF; the try/finally guarantees it is closed even if extraction fails
    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            images = page.get_images(full=True)

            # Extract images
            for img_index, img in enumerate(images):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_path = f"{image_dir}/image_{page_num + 1}_{img_index}.{image_ext}"

                # Save the image
                with open(image_path, "wb") as img_file:
                    img_file.write(image_bytes)

                # Entries of get_images(full=True) begin with
                # (xref, smask, width, height, bpc, colorspace, ...), so slicing the
                # entry yields image properties, not a position. Ask the page instead.
                # NOTE(review): ValueError is what PyMuPDF raises for unresolved items
                # in the versions checked — confirm against the installed version.
                try:
                    bbox = tuple(page.get_image_bbox(img))
                except ValueError:
                    bbox = None

                # Store extracted image metadata
                extracted_content.append({
                    'content_id': f'regulation_{xref}_image_{img_index}',
                    'content_type': 'Image',
                    'hierarchy_level': 'Figure',
                    'page_number': page_num + 1,
                    'bbox': bbox,
                    'text': None,
                    'image_path': image_path,
                    'table_path': None,
                    'associated_text_id': None  # Modify if you have a way to link this with text
                })

            # Extract tables if applicable
            tables = extract_tables_from_page(page)
            for table_index, table in enumerate(tables):
                table_path = f"{table_dir}/table_{page_num + 1}_{table_index}.txt"
                # Explicit UTF-8 so the output does not depend on the platform default
                with open(table_path, "w", encoding="utf-8") as table_file:
                    table_file.write(table)  # Save the table as text

                # Store extracted table metadata
                extracted_content.append({
                    'content_id': f'regulation_{page_num + 1}_table_{table_index}',
                    'content_type': 'Table',
                    'hierarchy_level': 'Table',
                    'page_number': page_num + 1,
                    'bbox': None,
                    'text': None,
                    'image_path': None,
                    'table_path': table_path,
                    'associated_text_id': None
                })
    finally:
        doc.close()

    return extracted_content

def extract_tables_from_page(page):
    """Placeholder table extractor: currently reports no tables for any page.

    Parameters:
    - page: The page to inspect (not used yet).

    Returns:
    - A new, empty list. Once implemented, this should hold one entry per table
      found on the page (via regex or a table extraction tool).
    """
    # TODO: add table extraction logic here
    return []

In [17]:
# Source PDFs for the knowledge base:
# the White House "Blueprint for an AI Bill of Rights" ...
wh_link = 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf'
# ... and the NIST AI 600-1 publication.
nist_link = 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf'

In [18]:
from langchain_community.document_loaders import PyMuPDFLoader

# Load both source PDFs. The results are indexed by position and read through
# `.page_content` later in this notebook.
# NOTE(review): presumably one Document per PDF page — confirm against the loader docs.
wh_documents = PyMuPDFLoader(file_path=wh_link).load()
# NOTE: `nist_docments` (sic) is the spelling later cells rely on; rename everywhere or nowhere.
nist_docments = PyMuPDFLoader(file_path=nist_link).load()

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Baseline fixed-size splitter: chunk size 1000 and overlap 100, both measured with len()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 100,
    length_function = len,
)

wh_split_chunks = text_splitter.split_documents(wh_documents)

# Spot check of one chunk; a bare expression is only displayed when it is the
# last statement of its cell.
wh_split_chunks[2].page_content

nist_split_chunks = text_splitter.split_documents(nist_docments)

### 1. Section-based chunking

In [20]:

def chunk_pdf_with_sections(doc, sections):
    """
    Build one chunk per listed page, prefixing the page text with its section name.

    Parameters:
    - doc: Indexable collection of loaded pages; each item must expose `page_content`.
    - sections: Dictionary mapping section names to an iterable of indices into `doc`.

    Returns:
    - section_chunks: List of dictionaries with the keys "section" (the section name)
      and "content" ("SECTION: <name>" on the first line, followed by the page text),
      in the order the sections and their indices appear in `sections`.
    """
    return [
        {
            "section": name,
            "content": f"SECTION: {name}\n{doc[index].page_content}",
        }
        for name, indices in sections.items()
        for index in indices
    ]

In [21]:
def extract_full_text_from_pages(doc, start_page=4):
    """Concatenate the page text from index `start_page` through the last page.

    Parameters:
    - doc: Indexable collection of loaded pages; each item must expose `page_content`.
    - start_page: Index of the first page to include (default 4).

    Returns:
    - One string with the selected pages' `page_content` joined with no separator;
      the empty string when `start_page` is at or beyond the end of `doc`.
    """
    return "".join(doc[index].page_content for index in range(start_page, len(doc)))

In [22]:
# Section headers of the NIST document, written as they occur in the extracted
# text (including the embedded newlines). They are regex-escaped and used as split
# delimiters in a later cell, so each string must match the extracted text exactly.
# The trailing comments record the pages each section was observed on.
# NOTE(review): "Human-Al" below is spelled with a lowercase "l", not "AI" — confirm
# this matches the extracted text; a header that never matches yields no chunk.
nist_sections = [
    "\n1. \nIntroduction",  # First section — NOTE(review): original page note was self-contradictory; confirm pages
    "\n2. \nOverview of Risks Unique to or Exacerbated by GAI",  # Starts at page 5, ends before page 8
    "\n2.1. CBRN Information or Capabilities",  # Page 8 only
    "\n2.2. Confabulation",  # Page 9 only
    "\n2.3. Dangerous, Violent, or Hateful Content",  # Pages 9 & 10
    "\n2.4. Data Privacy",  # Pages 10 & 11
    "\n2.5. Environmental Impacts",  # Page 11 only
    "\n2.6. Harmful Bias and Homogenization",  # Pages 11 & 12
    "\n2.7. Human-Al Configuration",  # Page 12 only
    "\n2.8. Information Integrity",  # Pages 12 & 13
    "\n2.9. Information Security",  # Pages 13 & 14
    "\n2.10. \nIntellectual Property",  # Page 14 only
    "\n2.11. \nObscene, Degrading, and/or Abusive Content",  # Pages 14 & 15
    "\n2.12. \nValue Chain and Component Integration",  # Page 15 only
    "\n3. Suggested Actions to Manage GAI Risks",  # Pages 15 to 49
    "Appendix A. Primary GAI Considerations",  # Pages 50 to 56
    "Appendix B. References" # Pages 57 to 63
]

In [23]:
# Section name -> list of indices into `wh_documents` that belong to that section.
# chunk_pdf_with_sections uses each value directly as `doc[index]`, so these are
# 0-based positions in the loaded list; range(a, b) covers a through b - 1.
wh_sections = {
    "USING THIS TECHNICAL COMPANION": [13],
    "SAFE AND EFFECTIVE SYSTEMS": list(range(14, 22)),
    "ALGORITHMIC DISCRIMINATION PROTECTIONS": list(range(22, 29)),
    "DATA PRIVACY": list(range(29, 39)),
    "NOTICE AND EXPLANATION": list(range(39, 45)),
    "HUMAN ALTERNATIVES, CONSIDERATION, AND FALLBACK": list(range(45, 52)),
    "APPENDIX: EXAMPLES OF AUTOMATED SYSTEMS": [52, 53],
    "LISTENING TO THE AMERICAN PEOPLE": list(range(54, 62)),
    "ENDNOTES": list(range(62, 73))
}

In [24]:
# Join the NIST pages from index 4 onward into a single string
full_text = extract_full_text_from_pages(nist_docments, start_page=4)
chunks = {}
# One alternation of the literal (regex-escaped) section headers
pattern = "|".join(map(re.escape, nist_sections))
# The capturing group makes re.split keep the matched headers, so the result is
# [text before first header, header, body, header, body, ...]
sections = re.split(f'({pattern})', full_text)
# Pair each header (odd positions) with the text that follows it (even positions);
# a header that matches more than once keeps only its last body.
nist_chunks = dict(zip(sections[1::2], sections[2::2]))

In [25]:
# One chunk per White House page listed in wh_sections, each prefixed with "SECTION: <name>"
wh_chunks = chunk_pdf_with_sections(wh_documents, wh_sections)

### Task 1: Review the two PDFs and decide how best to chunk up the data with a single strategy to optimally answer the variety of questions you expect to receive from people.

#### Deliverables

* Describe the default chunking strategy that you will use. 

     * After reviewing the PDFs, I wanted to preserve the sections (headers) of the documents. To do this, I identified the sections from the table of contents of each PDF, then used functions that associate each section with its content: a regex match on the section headers for the NIST document, and page numbers for the White House document.

* Articulate a chunking strategy that you would also like to test out.

     * The alternative strategy will be section-based chunks plus semantic chunking. 

* Describe how and why you made these decisions

     * To preserve the sections (headers) of the documents, I took the sections from the table of contents of each PDF and encoded them as Python structures (a list for the NIST document, a dict for the White House paper). I then used functions that associate each section with its content via regex patterns (nist_docs) or page numbers (wh_paper). Because there are only two source documents, I was able to use static rules (regex or page_num) that work for this sample. These rules would be difficult to reuse if new documents were added. In the future, chunking on automatically detected headers may be a more general way to keep sections intact.

### Chunking Strategy 1: Section-based chunks

#### Format into Document object

In [27]:
from langchain_core.documents import Document

In [28]:
def process_nist_chunks(nist_chunks):
    """Turn the {section header: section text} mapping into Document objects.

    Parameters:
    - nist_chunks: Dictionary whose keys are section names and whose values are
      the text of that section.

    Returns:
    - List with one Document per entry, in dictionary order; the text becomes
      `page_content` and the section name is stored under metadata['section'].
    """
    return [
        Document(page_content=text, metadata={'section': header})
        for header, text in nist_chunks.items()
    ]

def process_wh_chunks(wh_chunks):
    """Turn the list of section chunk dictionaries into Document objects.

    Parameters:
    - wh_chunks: Iterable of dictionaries with the keys 'section' and 'content'.

    Returns:
    - List with one Document per chunk, in input order; 'content' becomes
      `page_content` and 'section' is stored under metadata['section'].
    """
    return [
        Document(page_content=entry['content'], metadata={'section': entry['section']})
        for entry in wh_chunks
    ]

In [29]:
# One Document per NIST section, tagged with the section header in its metadata
nist_docs = process_nist_chunks(nist_chunks)

In [30]:
# One Document per White House page, tagged with its section name in its metadata
wh_docs= process_wh_chunks(wh_chunks)

#### Combine wh_docs and nist_docs document objects into one section-based Document

In [33]:
# Section-based corpus (strategy 1): White House documents followed by NIST documents
full_docs = wh_docs + nist_docs

### Chunking Strategy 2: Section (page or topic) + Semantic Chunking (Sentence-level for semantic similarity chunk points)

In [44]:
# Strategy 2: re-split the section-based documents with SemanticChunker,
# using OpenAIEmbeddings() with its default model.
base_text_splitter = SemanticChunker(OpenAIEmbeddings())
# Makes embedding API calls; took about 2 minutes when this notebook was run
sem_documents = base_text_splitter.split_documents(full_docs) # 2 min 

In [91]:
# Number of section-based documents (strategy 1)
len(full_docs)

75

In [90]:
# Number of chunks after semantic splitting (strategy 2)
len(sem_documents)

228

In [93]:
from  langchain.schema import Document
import json
from typing import Iterable

def save_docs_to_jsonl(array:Iterable[Document], file_path:str)->None:
    """Write documents to `file_path` as JSON Lines, one document per line.

    Parameters:
    - array: Documents to save; each must provide a `.json()` method returning
      its JSON serialization as a string.
    - file_path: Destination file; it is created or overwritten.
    """
    # Explicit UTF-8 so the file is portable regardless of the platform default encoding
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')

def load_docs_from_jsonl(file_path)->Iterable[Document]:
    """Read documents back from a JSON Lines file written by save_docs_to_jsonl.

    Parameters:
    - file_path: Path of the JSON Lines file (read as UTF-8).

    Returns:
    - List of Document objects, one per non-blank line, in file order. Each line
      is parsed as a JSON object whose fields are passed to `Document(**fields)`.
    """
    array = []
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Tolerate blank lines (e.g. a trailing newline added by an editor)
            if not line.strip():
                continue
            data = json.loads(line)
            array.append(Document(**data))
    return array
    


In [129]:
# Export the semantic chunks to a JSONL file so they can be transferred to
# Google Colab for fine-tuning the embedding model.
save_docs_to_jsonl(sem_documents,'sem_documents.jsonl')

## Embedding chunks

In [45]:
from langchain_openai import OpenAIEmbeddings

# Smaller alternative, kept for reference:
#embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Embedding model used for both Qdrant vector stores below
embeddings_lg = OpenAIEmbeddings(model="text-embedding-3-large")

In [46]:
from langchain_community.vectorstores import Qdrant

# Baseline store: section-based documents embedded with text-embedding-3-large.
# location=":memory:" means nothing is persisted; the store is rebuilt on each run.
vectorstore = Qdrant.from_documents(
    documents=full_docs,
    embedding=embeddings_lg,
    location=":memory:",
    collection_name="ai_policy"
)
# Retriever with default search settings; used by base_retrieval_augmented_qa_chain
retriever = vectorstore.as_retriever()

In [47]:
# Alternative store: the semantic chunks, embedded with the same model as the
# baseline store and also held in memory only.
vectorstore_1 = Qdrant.from_documents(
    documents=sem_documents,
    embedding=embeddings_lg,
    location=":memory:",
    collection_name="alt_ai_policy"
)
# Retriever with default search settings; used by retrieval_augmented_qa_chain
alt_retriever = vectorstore_1.as_retriever()

### RAG_chain

In [48]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Generation LLM shared by every RAG chain in this notebook
llm = ChatOpenAI(model="gpt-4o")

In [49]:
from langchain.prompts import ChatPromptTemplate

# Prompt template with two placeholders: {context} (filled with the retrieved
# documents) and {question} (the user question). It instructs the model to
# answer from the supplied context only.
RAG_PROMPT = """\
Given a provided context and question, you must answer the question based only on context.

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

### LCEL chain for each different method

#### * base_retrieval_augmented_qa_chain: with section-based chunking only

#### * retrieval_augmented_qa_chain: with section chunks + SemanticChunker at sentence-level processing

In [118]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain.schema import StrOutputParser

In [119]:
# Baseline RAG chain (section-based chunks).
# INVOKE WITH: {"question": "<user question>"}
base_retrieval_augmented_qa_chain = (
    # "context": the question is piped into `retriever`; "question": passed through
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    # Re-assigns "context" to its own value, so this step leaves the data unchanged
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response": prompt -> LLM output; "context": retrieved documents, kept for evaluation
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

In [55]:
# Alternative RAG chain (section + semantic chunks).
retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into alt_retriever
    {"context": itemgetter("question") | alt_retriever, "question": itemgetter("question")}
    # "context"  : re-assigned to the value of the "context" key from the previous step,
    #              so this step leaves the data unchanged
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

In [134]:
# Smoke test of the baseline chain; ['response'] selects the LLM message from the result dict
base_retrieval_augmented_qa_chain.invoke({"question" : "What is the AI framework all about?"})['response']

AIMessage(content='The AI Risk Management Framework (AI RMF) is intended to improve the ability of organizations to incorporate trustworthiness considerations into the design, development, use, and evaluation of AI products, services, and systems. Released in January 2023, it is a voluntary framework developed by the National Institute of Standards and Technology (NIST) in response to President Biden’s Executive Order 14110. The framework helps organizations manage AI risks in a manner that aligns with their goals, legal/regulatory requirements, and best practices. It covers various stages of the AI lifecycle and includes considerations for generative AI, such as governance, content provenance, pre-deployment testing, and incident disclosure. The framework aims to foster the development of trustworthy AI characterized by accuracy, explainability, reliability, privacy, robustness, safety, security, and fairness.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'

In [135]:
# Same question against the semantic-chunk chain. The result dict has the keys
# "response" and "context"; only the response is displayed here.
retrieval_augmented_qa_chain.invoke({"question" : "What is the AI framework all about?"})['response']

AIMessage(content='The AI framework appears to be the "AI Risk Management Framework" developed by the National Institute of Standards and Technology (NIST). It includes several chapters and appendices detailing various aspects of AI risks and trustworthiness, profiles, descriptions of AI actor tasks, and comparisons of AI risks to traditional software risks. The framework also features a playbook and a glossary of terms related to trustworthy AI. The framework aims to address and manage risks associated with AI systems comprehensively.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 93, 'prompt_tokens': 1891, 'total_tokens': 1984, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_e375328146', 'finish_reason': 'stop', 'logprobs': None}, id='run-f6b34af4-7bcc-4873-85d1-3febe8b4aa0c-0', usage_metadata={'input_tokens': 1891, 'output_tokens': 93, 'total_tokens': 1984})

# Task 2

### Build an end-to-end RAG application using an industry-standard open-source stack and your choice of commercial off-the-shelf models

Build a prototype and deploy it to a Hugging Face Space. Include the public URL of your Space, and create a short (< 2 min) Loom video demonstrating some initial testing inputs and outputs.

 * https://huggingface.co/spaces/aaromosshf2424/midterm_aie4

* Loom video link https://www.loom.com/share/701153f9d7234883aad5df9a5621a6ca?sid=d3b24367-41d1-4155-a06c-f73376801ab2

How did you choose your stack, and why did you select each tool the way you did?

* Docker was used to containerize the model & app. Why: it gives a self-contained version of the model and its requirements that can be published.

* Chainlit was used to allow for interaction between user, their query, and model functionality.

* Qdrant is the vectorDb to store embeddings that are created from the doc chunks

* Embedding model: OpenAI's "text-embedding-3-large" is used to create the embeddings

* HuggingFace was used to host model (Open API)

* OpenAI GPT-4o is the generation LLM; it is called through the API with the user query and the retrieved context chunks

* Langchain was used for orchestration across OpenAI API, Qdrant, etc

## Task 3: Creating a Golden Test Data Set

Generate a synthetic test data set and baseline an initial evaluation

#### pip install ragas

In [36]:
! pip install ragas

Collecting ragas
  Using cached ragas-0.1.20-py3-none-any.whl.metadata (5.5 kB)
Collecting datasets (from ragas)
  Using cached datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pysbd>=0.3.4 (from ragas)
  Using cached pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting appdirs (from ragas)
  Using cached appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting filelock (from datasets->ragas)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting pyarrow>=15.0.0 (from datasets->ragas)
  Using cached pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->ragas)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets->ragas)
  Using cached pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting xxhash (from datasets->ragas)
  Using cached xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from 

In [37]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Models for synthetic test-set generation: gpt-3.5-turbo is passed as the
# generator LLM and gpt-4o-mini as the critic LLM.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
# OpenAIEmbeddings() with its default model
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Share of each question type in the generated test set; the weights sum to 1.0
distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Generate 20 synthetic test questions from the section-based documents.
# Calls the OpenAI API; results are not seeded, so a re-run may produce different questions.
testset = generator.generate_with_langchain_docs(full_docs, 20, distributions, with_debugging_logs=True)

In [40]:
# Tabular view of the generated test set
test_df = testset.to_pandas()

In [42]:
# Plain Python lists of the questions and reference answers; every chain below
# is evaluated against these same two lists.
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()

#### chunk strategy 2 rag chain

In [57]:
# Run every test question through the semantic-chunk chain and collect, per
# question, the answer text and the text of each retrieved document.
# NOTE: `answers` and `contexts` are reused by later cells; build the dataset in
# the next cell before re-running any other collection loop.
answers = []
contexts = []

for question in test_questions:
  response = retrieval_augmented_qa_chain.invoke({"question" : question})
  answers.append(response["response"].content)
  contexts.append([context.page_content for context in response["context"]])

In [58]:
from datasets import Dataset

# Evaluation dataset for the semantic-chunk chain; the four lists are aligned by question
response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

#### base response dataset

In [121]:
# Same collection loop for the baseline (section-based) chain.
# NOTE: this overwrites `answers` and `contexts` from the semantic-chunk run.
answers = []
contexts = []

for question in test_questions:
  response = base_retrieval_augmented_qa_chain.invoke({"question" : question})
  answers.append(response["response"].content)
  contexts.append([context.page_content for context in response["context"]])

In [122]:
from datasets import Dataset

# Evaluation dataset for the baseline chain; the four lists are aligned by question
base_response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

### Evaluate our Pipeline with Ragas

In [60]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

# RAGAS metrics computed for every evaluation run in this notebook
metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [61]:
# Score the semantic-chunk chain (20 questions x 5 metrics = 100 evaluations)
results = evaluate(response_dataset, metrics)

Evaluating:  40%|████      | 40/100 [00:17<00:19,  3.02it/s]No statements were generated from the answer.
Evaluating: 100%|██████████| 100/100 [01:20<00:00,  1.24it/s]


In [62]:
# Per-question scores for the semantic-chunk chain
results_df = results.to_pandas()
results_df

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What was President Biden's response to the Sup...,[SECTION: ENDNOTES\n \n \n \n \nENDNOTES\n1.Th...,President Biden's response to the Supreme Cour...,Remarks by President Biden on the Supreme Cour...,1.0,0.941003,1.0,1.0,0.986177
1,What information should be included in regular...,[Reporting. When members of the public wish to...,Regularly-updated reports for entities respons...,Regularly-updated reports for entities respons...,1.0,0.978649,1.0,1.0,0.994118
2,How is systemic bias evaluated and documented ...,[ \nBias exists in many forms and can become i...,Systemic bias in the context of GAI systems is...,Systemic bias in GAI systems is evaluated and ...,1.0,0.992335,1.0,0.916667,0.537858
3,How do text-to-image models make it easy to cr...,[Text-to-image models also make it easy to cre...,Text-to-image models make it easy to create im...,Text-to-image models make it easy to create im...,1.0,0.875886,1.0,1.0,0.768552
4,How is systemic bias evaluated and documented ...,[ \nBias exists in many forms and can become i...,Systemic bias in GAI systems is evaluated and ...,Systemic bias in GAI systems is evaluated and ...,1.0,0.989375,1.0,1.0,0.680668
5,What metrics can be used to reflect the effect...,[Action ID \nSuggested Action \nGAI Risks \nMS...,The effectiveness of security measures in term...,Identify metrics that reflect the effectivenes...,0.764706,0.997419,0.666667,1.0,0.361112
6,How do intellectual property risks arise from ...,[ \nIntellectual property risks from GAI syste...,Intellectual property risks from GAI systems i...,Intellectual property risks from GAI systems m...,1.0,0.987392,0.75,0.75,0.879481
7,What organizations were involved in the meetin...,[SECTION: LISTENING TO THE AMERICAN PEOPLE\nAP...,The organizations involved in the meetings con...,"Adobe, American Civil Liberties Union (ACLU), ...",,0.999274,1.0,1.0,0.982893
8,How can vendor contracts be reviewed to avoid ...,[Address general risks associated with a lack ...,Vendor contracts can be reviewed by avoiding a...,Review vendor contracts to avoid arbitrary or ...,1.0,0.949382,0.25,1.0,0.759448
9,What are some examples of automated systems th...,[SECTION: APPENDIX: EXAMPLES OF AUTOMATED SYST...,Examples of automated systems that impact the ...,Automated systems that impact the safety of co...,1.0,1.0,1.0,1.0,0.967447


In [84]:
# Columns from position 4 onward are the metric scores (they follow question,
# contexts, answer and ground_truth); summarize their distribution.
results_df.iloc[:,4:].describe()

Unnamed: 0,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
count,19.0,20.0,20.0,20.0,20.0
mean,0.970941,0.92653,0.846667,0.945833,0.707715
std,0.060824,0.220406,0.28442,0.109073,0.228754
min,0.764706,0.0,0.0,0.583333,0.226779
25%,0.977273,0.949325,0.729167,0.916667,0.550496
50%,1.0,0.988384,1.0,1.0,0.747616
75%,1.0,0.997882,1.0,1.0,0.901472
max,1.0,1.0,1.0,1.0,0.994118


In [126]:
# Median of each metric for the semantic-chunk chain
results_df.iloc[:,4:].median()

faithfulness          1.000000
answer_relevancy      0.988384
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.747616
dtype: float64

#### base chunking rag eval

In [124]:
# Score the baseline (section-based) chain with the same metrics and show the
# per-question results.
base_results = evaluate(base_response_dataset, metrics)
base_results_df = base_results.to_pandas()
base_results_df

Evaluating:  72%|███████▏  | 72/100 [00:40<00:18,  1.53it/s]No statements were generated from the answer.
Evaluating: 100%|██████████| 100/100 [01:40<00:00,  1.01s/it]


Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What was President Biden's response to the Sup...,[SECTION: ENDNOTES\n \n \n \n \nENDNOTES\n1.Th...,President Biden's response to the Supreme Cour...,Remarks by President Biden on the Supreme Cour...,1.0,0.941003,1.0,1.0,0.834859
1,What information should be included in regular...,[SECTION: SAFE AND EFFECTIVE SYSTEMS\n \n \n \...,Regularly-updated reports for entities respons...,Regularly-updated reports for entities respons...,1.0,0.978649,1.0,1.0,0.635246
2,How is systemic bias evaluated and documented ...,[ \nBias exists in many forms and can become i...,Systemic bias in the context of Generative AI ...,Systemic bias in GAI systems is evaluated and ...,0.916667,0.899814,1.0,1.0,0.58537
3,How do text-to-image models make it easy to cr...,[ \nGAI systems can produce content that is in...,Text-to-image models facilitate the creation o...,Text-to-image models make it easy to create im...,1.0,0.979026,1.0,1.0,0.579387
4,How is systemic bias evaluated and documented ...,[ \nBias exists in many forms and can become i...,Systemic bias in the context of Generative AI ...,Systemic bias in GAI systems is evaluated and ...,1.0,0.90579,1.0,1.0,0.481002
5,What metrics can be used to reflect the effect...,[SECTION: SAFE AND EFFECTIVE SYSTEMS\n \n \n \...,To reflect the effectiveness of security measu...,Identify metrics that reflect the effectivenes...,1.0,1.0,1.0,1.0,0.638213
6,How do intellectual property risks arise from ...,[ \nIntellectual property risks from GAI syste...,Intellectual property risks from GAI (General ...,Intellectual property risks from GAI systems m...,1.0,0.987392,1.0,1.0,0.845114
7,What organizations were involved in the meetin...,[SECTION: LISTENING TO THE AMERICAN PEOPLE\nAP...,The organizations involved in the meetings con...,"Adobe, American Civil Liberties Union (ACLU), ...",,1.0,1.0,0.916667,0.982671
8,How can vendor contracts be reviewed to avoid ...,[ \nGAI systems raise several risks to privacy...,Vendor contracts can be reviewed to avoid arbi...,Review vendor contracts to avoid arbitrary or ...,1.0,0.932752,1.0,1.0,0.992155
9,What are some examples of automated systems th...,[SECTION: APPENDIX: EXAMPLES OF AUTOMATED SYST...,Examples of automated systems that impact the ...,Automated systems that impact the safety of co...,1.0,1.0,1.0,1.0,0.993029


In [125]:
# Median of each metric for the baseline chain
base_results_df.iloc[:,4:].median()

faithfulness          1.000000
answer_relevancy      0.978838
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.774234
dtype: float64

In [127]:
# Median of each metric for the semantic-chunk chain, repeated for side-by-side comparison
results_df.iloc[:,4:].median()

faithfulness          1.000000
answer_relevancy      0.988384
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.747616
dtype: float64

Check out the specific metrics we'll be using in the Ragas documentation:

- [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html)
- [Answer Relevancy](https://docs.ragas.io/en/stable/concepts/metrics/answer_relevance.html)
- [Context Precision](https://docs.ragas.io/en/stable/concepts/metrics/context_precision.html)
- [Context Recall](https://docs.ragas.io/en/stable/concepts/metrics/context_recall.html)
- [Answer Correctness](https://docs.ragas.io/en/stable/concepts/metrics/answer_correctness.html)

In [78]:
def insert_line_breaks(text):
    """Print `text` with every period-plus-space turned into a period and a line break.

    The result is written to standard output; nothing is returned.
    """
    sentences = text.split('. ')
    print('.\n'.join(sentences))

In [79]:
# Row 8, generated answer: it omits the SLA guidance that appears in the ground truth
insert_line_breaks(results_df.iloc[8]['answer'])

Vendor contracts can be reviewed by avoiding arbitrary or capricious termination clauses for critical GAI technologies or vendor services and by ensuring non-standard terms that may amplify or defer liability in unexpected ways are not included.
Additionally, contracts should be scrutinized to prevent clauses that could contribute to unauthorized data collection by vendors or third-parties, such as secondary data use.


In [80]:
# Row 8, reference answer, for comparison with the generated answer
insert_line_breaks(results_df.iloc[8]['ground_truth'])

Review vendor contracts to avoid arbitrary or capricious termination of critical GAI technologies or vendor services and non-standard terms that may amplify or defer liability in unexpected ways and/or contribute to unauthorized data collection by vendors or third-parties.
Consider clear assignment of liability and responsibility for incidents, GAI system changes over time (e.g., fine-tuning, drift, decay); Request notification and disclosure for serious incidents arising from third-party data and systems; Service Level Agreements (SLAs) in vendor contracts that address incident response, response times, and availability of critical support.


### Assess your pipeline using the RAGAS framework including key metrics faithfulness, answer relevancy, context precision, and context recall.  Provide a table of your output results.

### What conclusions can you draw about performance and effectiveness of your pipeline with this information?

#### * Answer: Faithfulness and the context metrics are high (median context precision and context recall are both 1.0, with means of about 0.95 and 0.85), while answer correctness is noticeably lower (median 0.747, mean 0.708). This suggests that the pipeline retrieves the correct context and stays faithful to the source, so the weakness lies less in the retrieval step than in how the final answer is generated or interpreted. Increasing complexity or steps in the generation LLM could boost performance.

## _______________________________

## Task 4: Fine-Tuning Open-Source Embeddings: In Colab

### Generate synthetic fine-tuning data and complete fine-tuning of the open-source embedding model

aie4_midterm/Task4Midterm_Fine_tuning_Embedding_Models_for_RAG.ipynb

#### Deliverables

 #### * Swap out your existing embedding model for the new fine-tuned version.  Provide a link to your fine-tuned embedding model on the Hugging Face Hub.

##### * https://huggingface.co/northstaranlyticsma24/artic_ft_midterm

#### * How did you choose the embedding model for this application?

##### * 

pip install langchain_huggingface

Alternatively, I got around the error by simply reverting to LangChain 2.4.

## Task 5: Assessing Performance

#### Test out new ft embedding model performance on RAG

In [98]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# Load the fine-tuned embedding model from the Hugging Face Hub.
# The first run downloads the model, which took about 5 minutes.
huggingface_embeddings = HuggingFaceEmbeddings(model_name="northstaranlyticsma24/artic_ft_midterm")

  huggingface_embeddings = HuggingFaceEmbeddings(model_name="northstaranlyticsma24/artic_ft_midterm")
Some weights of BertModel were not initialized from the model checkpoint at northstaranlyticsma24/artic_ft_midterm and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [99]:
# Same semantic chunks as the alternative store, but embedded with the fine-tuned
# model so the two retrievers differ only in the embedding model.
vectorstore_ft = Qdrant.from_documents(
    documents=sem_documents,
    embedding=huggingface_embeddings,
    location=":memory:",
    collection_name="ft_snow_ai_policy"
)
# Retriever with default search settings; used by ft_retrieval_augmented_qa_chain
ft_emb_retriever = vectorstore_ft.as_retriever()

In [101]:
# RAG chain backed by the fine-tuned embedding retriever.
ft_retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into ft_emb_retriever
    {"context": itemgetter("question") | ft_emb_retriever, "question": itemgetter("question")}
    # "context"  : re-assigned to the value of the "context" key from the previous step,
    #              so this step leaves the data unchanged
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

In [103]:
# Same collection loop for the fine-tuned-embedding chain.
# NOTE: this overwrites `answers` and `contexts` from the earlier runs.
answers = []
contexts = []

for question in test_questions:
  response = ft_retrieval_augmented_qa_chain.invoke({"question" : question})
  answers.append(response["response"].content)
  contexts.append([context.page_content for context in response["context"]])

In [104]:
from datasets import Dataset

# Assemble the evaluation columns for the fine-tuned-embedding run: each test question
# alongside its generated answer, retrieved contexts, and reference ground truth.
ft_eval_columns = {
    "question": test_questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": test_groundtruths,
}
response_dataset_1 = Dataset.from_dict(ft_eval_columns)

In [105]:
# Spot-check the first record: question, generated answer, retrieved contexts, ground truth.
response_dataset_1[0]

{'question': "What was President Biden's response to the Supreme Court decision to overturn Roe v. Wade?",
 'answer': "President Biden's response to the Supreme Court decision to overturn Roe v. Wade can be found in his remarks, which were made on June 24, 2022. The full text of his remarks is available at the following link: https://www.whitehouse.gov/briefing-room/speeches-remarks/2022/06/24/remarks-by-president-biden-on-the-supreme-court-decision-to-overturn-roe-v-wade/.",
 'contexts': ["SECTION: ENDNOTES\n \n \n \n \nENDNOTES\n1.The Executive Order On Advancing Racial Equity and Support for Underserved Communities Through the\nFederal\xa0Government. https://www.whitehouse.gov/briefing-room/presidential-actions/2021/01/20/executive\norder-advancing-racial-equity-and-support-for-underserved-communities-through-the-federal-government/\n2. The White House. Remarks by President Biden on the Supreme Court Decision to Overturn Roe v. Wade. Jun. 24, 2022. https://www.whitehouse.gov/briefin

### Evaluate our Pipeline with Ragas

In [107]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

# Ragas metrics to compute; the score columns of the results table follow this order.
metrics = [faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness]

In [108]:
# Score the fine-tuned-embedding run's dataset on every metric in `metrics`.
results = evaluate(dataset=response_dataset_1, metrics=metrics)

Evaluating: 100%|██████████| 100/100 [01:02<00:00,  1.59it/s]


In [109]:
# Convert the evaluation result for the fine-tuned-embedding run into a DataFrame
# (one row per test question, one column per metric score) and display it.
results_df_ft = results.to_pandas()
results_df_ft

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What was President Biden's response to the Sup...,[SECTION: ENDNOTES\n \n \n \n \nENDNOTES\n1.Th...,President Biden's response to the Supreme Cour...,Remarks by President Biden on the Supreme Cour...,1.0,0.941003,1.0,1.0,0.986582
1,What information should be included in regular...,[SECTION: NOTICE AND EXPLANATION\n \nYou shoul...,Regularly-updated reports for entities respons...,Regularly-updated reports for entities respons...,1.0,0.978649,1.0,1.0,0.816293
2,How is systemic bias evaluated and documented ...,[ \nBias exists in many forms and can become i...,Systemic bias in the context of GAI (General A...,Systemic bias in GAI systems is evaluated and ...,1.0,0.900467,0.0,0.805556,0.643216
3,How do text-to-image models make it easy to cr...,[Text-to-image models also make it easy to cre...,Text-to-image models make it easy to create im...,Text-to-image models make it easy to create im...,1.0,0.944609,1.0,1.0,0.788668
4,How is systemic bias evaluated and documented ...,[ \nBias exists in many forms and can become i...,Systemic bias in GAI systems is evaluated and ...,Systemic bias in GAI systems is evaluated and ...,1.0,0.989375,0.0,0.805556,0.58012
5,What metrics can be used to reflect the effect...,[CBRN Information or Capabilities; \nInformati...,The metrics that can be used to reflect the ef...,Identify metrics that reflect the effectivenes...,1.0,1.0,1.0,1.0,0.986636
6,How do intellectual property risks arise from ...,[ \nIntellectual property risks from GAI syste...,Intellectual property risks from GAI systems i...,Intellectual property risks from GAI systems m...,1.0,0.987392,0.75,0.805556,0.889606
7,What organizations were involved in the meetin...,[SECTION: LISTENING TO THE AMERICAN PEOPLE\nAP...,The meetings conducted by the Office of Scienc...,"Adobe, American Civil Liberties Union (ACLU), ...",0.6,0.991257,1.0,1.0,0.218192
8,How can vendor contracts be reviewed to avoid ...,[Consider: Clear \nassignment of liability and...,To review vendor contracts in a way that avoid...,Review vendor contracts to avoid arbitrary or ...,0.533333,0.967781,1.0,1.0,0.781875
9,What are some examples of automated systems th...,[SECTION: APPENDIX: EXAMPLES OF AUTOMATED SYST...,Examples of automated systems that impact the ...,Automated systems that impact the safety of co...,1.0,1.0,1.0,1.0,0.993067


In [116]:
# Generated answer for the row at position 14, to compare against its ground truth.
results_df_ft['answer'].iloc[14]

'A variety of stakeholders in the private sector and civil society contributed ideas for the AI Bill of Rights.'

In [117]:
# Reference ground truth for the row at position 14, to compare against the generated answer.
results_df_ft['ground_truth'].iloc[14]

"Adobe, American Civil Liberties Union (ACLU), The Aspen Commission on Information Disorder, The Awood Center, The Australian Human Rights Commission, Biometrics Institute, The Brookings Institute, BSA | The Software Alliance, Cantellus Group, Center for American Progress, Center for Democracy and Technology, Center on Privacy and Technology at Georgetown Law, Christiana Care, Color of Change, Coworker, Data Robot, Data Trust Alliance, Data and Society Research Institute, Deepmind, EdSAFE AI Alliance, Electronic Privacy Information Center (EPIC), Encode Justice, Equal AI, Google, Hitachi's AI Policy Committee, The Innocence Project, Institute of Electrical and Electronics Engineers (IEEE), Intuit, Lawyers Committee for Civil Rights Under Law, Legal Aid Society, The Leadership Conference on Civil and Human Rights, Meta, Microsoft, The MIT AI Policy Forum, Movement Alliance Project, The National Association of Criminal Defense Lawyers, O'Neil Risk Consulting & Algorithmic Auditing, The P

#### Base Chunk RAG 

In [128]:
# Median of each Ragas metric for the base-chunk RAG run.
# The metric columns are selected by name instead of by position (`iloc[:, 4:]`), so the
# summary no longer depends on how many non-metric columns precede the scores.
base_results_df[
    ["faithfulness", "answer_relevancy", "context_recall", "context_precision", "answer_correctness"]
].median()

faithfulness          1.000000
answer_relevancy      0.978838
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.774234
dtype: float64

#### Section + Semantic Chunk

In [110]:
# Median of each Ragas metric for the section + semantic chunking run.
# The metric columns are selected by name instead of by position (`iloc[:, 4:]`), so the
# summary no longer depends on how many non-metric columns precede the scores.
results_df[
    ["faithfulness", "answer_relevancy", "context_recall", "context_precision", "answer_correctness"]
].median()

faithfulness          1.000000
answer_relevancy      0.988384
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.747616
dtype: float64

#### Section & Semantic Chunk + Fine-Tuned Arctic Embedding Model

In [111]:
# Median of each Ragas metric for the run that uses the fine-tuned embedding model.
# The metric columns are selected by name instead of by position (`iloc[:, 4:]`), so the
# summary no longer depends on how many non-metric columns precede the scores.
results_df_ft[
    ["faithfulness", "answer_relevancy", "context_recall", "context_precision", "answer_correctness"]
].median()

faithfulness          1.000000
answer_relevancy      0.987297
context_recall        1.000000
context_precision     1.000000
answer_correctness    0.698568
dtype: float64