In [444]:
import os

import tiktoken
from langchain.text_splitter import MarkdownTextSplitter
from langchain_community.document_loaders import CSVLoader

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# huggingface model
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm.autonotebook import tqdm, trange

from pydantic import BaseModel
from langchain_community.document_loaders.base import BaseLoader
from langchain_text_splitters.base import TextSplitter

import pymupdf4llm
import pymupdf
import pdfplumber
import markdown
import re
import tempfile
from collections import Counter
import numpy as np
import pandas as pd


In [2]:
import os
import getpass

# Ask for the key interactively (getpass does not echo the input) and expose
# it through the environment so it is never written into the notebook itself.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

### pymupdf to markdown

Processing data/Blueprint-for-an-AI-Bill-of-Rights.pdf...


230467

In [435]:
def replace_newlines(text):
    """Normalise line breaks in extracted Markdown before it is chunked.

    The substitutions below are applied in order, each one on the result of
    the previous one.

    Args:
        text: Markdown text.

    Returns:
        The rewritten text.
    """
    substitutions = (
        # Runs of two or more newlines collapse to exactly one blank line.
        (r'\n{2,}', '\n\n'),
        # A lone newline (no newline on either side) becomes a space.
        (r'(?<!\n)\n(?!\n)', ' '),
        # Blank line before a heading that follows a single newline. After the
        # previous step no lone newline is left, so this finds nothing; it is
        # kept so the sequence stays unchanged.
        (r'([^\n])\n(#+)', r'\1\n\n\2'),
        # Any run of '#' preceded by a character other than newline, '|' or
        # '#' is moved onto its own paragraph (this also hits a mid-sentence
        # '#', e.g. "item #1").
        (r'([^\n|#])(#+)', r'\1\n\n\2'),
        # A '-----' rule surrounded by blank lines is replaced by one space.
        (r'\n\n-----\n\n', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def tiktoken_len(text):
    """Return how many tokens ``text`` encodes to.

    The encoding is the one tiktoken resolves for the "gpt-4o-mini" model
    name; it is looked up on every call.
    """
    encoder = tiktoken.encoding_for_model("gpt-4o-mini")
    return len(encoder.encode(text))

def get_markdown_documents(path, pages, margins):
    """Convert selected PDF pages to Markdown and split them into chunks.

    Args:
        path: Location of the PDF file.
        pages: Page selection forwarded unchanged to
            ``pymupdf4llm.to_markdown``.
        margins: Margin setting forwarded unchanged to
            ``pymupdf4llm.to_markdown``.

    Returns:
        The documents produced by ``MarkdownTextSplitter.create_documents``.
        Chunk length is measured with ``tiktoken_len``, so the 400 / 50
        size and overlap settings are token counts.
    """
    raw_markdown = pymupdf4llm.to_markdown(path, pages=pages, margins=margins, force_text=True)
    cleaned_markdown = replace_newlines(raw_markdown)

    splitter = MarkdownTextSplitter(
        chunk_size=400,
        chunk_overlap=50,
        length_function=tiktoken_len,
    )
    return splitter.create_documents([cleaned_markdown])

In [433]:
# Chunk the Blueprint PDF. The page selection is the numbers 1..72 and the
# margins are (10, 40); both are handed to get_markdown_documents as-is.
# NOTE(review): if the converter's page numbers are 0-based, starting at 1
# skips the first page — confirm that is intended.
doc1_path = 'data/Blueprint-for-an-AI-Bill-of-Rights.pdf'

documents1 = get_markdown_documents(doc1_path, pages=list(range(1,73)), margins=(10,40))
# Last expression of the cell: shows how many chunks were produced.
len(documents1)

Processing data/Blueprint-for-an-AI-Bill-of-Rights.pdf...


144

### extract pdf with table

In [506]:
def get_pages(path):
    """Sort the pages of a PDF into those with and without detected tables.

    The PDF is converted page by page with ``pymupdf4llm.to_markdown``
    (``page_chunks=True``, margins ``(10,70)``). For every page chunk the
    reported ``metadata['page']`` minus one is recorded, in the table list
    when the chunk's ``tables`` entry is non-empty and in the text list
    otherwise. Both lists are printed before being returned.

    Args:
        path: Location of the PDF file.

    Returns:
        ``(text_pages, table_pages)``, two lists of page indices.
    """
    page_chunks = pymupdf4llm.to_markdown(path, page_chunks=True, margins=(10,70), force_text=True)

    text_pages = []
    table_pages = []
    for chunk in page_chunks:
        page_index = chunk['metadata']['page'] - 1
        if chunk['tables']:
            table_pages.append(page_index)
        else:
            text_pages.append(page_index)

    print(f'text pages: {text_pages}')
    print(f'table pages: {table_pages}')
    return text_pages, table_pages

def clean_up_table(table):
    """Turn one raw extracted table into an actions frame plus its captions.

    ``None`` cells are dropped from every row first. A table is rejected when
    more than two rows are left with a single cell, or when no row has
    exactly three cells.

    For an accepted table:

    * a single-cell last row is removed and returned as the ``action`` entry;
    * a single-cell first row is removed and returned as the ``gov_id``
      entry, and the row that follows it is skipped when building the frame
      (presumably the column-header row — confirm against the PDF);
    * the remaining rows become a DataFrame with the columns
      ``'Action ID'``, ``'Suggested Action'``, ``'GAI Risks'``.

    The caller's ``table`` list is not modified.

    Args:
        table: List of rows, each a list of cell values (``None`` allowed).

    Returns:
        ``(df, gov_id, action)``. ``(None, None, None)`` for a rejected
        table. ``df`` is ``None`` when a table with a leading single-cell row
        does not fit the three columns; ``gov_id`` and ``action`` are lists
        holding zero or one string each.

    Raises:
        ValueError: from pandas, when a table without a leading single-cell
            row does not fit the three columns.
    """
    columns = ['Action ID', 'Suggested Action', 'GAI Risks']

    rows = [[cell for cell in row if cell is not None] for row in table]
    width_counts = Counter(len(row) for row in rows)
    if width_counts[1] > 2 or width_counts[3] == 0:
        return None, None, None

    gov_id = []
    action = []
    if len(rows[-1]) == 1:
        action.append(rows.pop()[0])
    if len(rows[0]) == 1:
        gov_id.append(rows.pop(0)[0])
        try:
            df = pd.DataFrame(rows[1:], columns=columns)
        except ValueError:
            # pandas signals a column-count mismatch with ValueError; only
            # that case is treated as "no usable rows". Anything else
            # (including KeyboardInterrupt) now propagates instead of being
            # swallowed by a bare except.
            df = None
    else:
        df = pd.DataFrame(rows, columns=columns)
    return df, gov_id, action

def extract_and_process_tables(path, table_pages):
    """Extract the tables on the given pages and save them as two CSV files.

    Every table found on ``table_pages`` is passed through
    ``clean_up_table``. The resulting frames are concatenated and written to
    ``data/actions.csv`` (with a header row). The collected ``gov_id`` and
    ``action`` strings are paired positionally with ``zip`` and written to
    ``data/tasks.csv`` (without a header row); if the two lists differ in
    length the surplus entries are dropped by ``zip``.

    Args:
        path: Location of the PDF file.
        table_pages: Indices into ``pdf.pages`` of the pages to scan.

    Returns:
        ``(df, dsc)``: the concatenated actions frame and the frame of
        (gov_id, action) pairs.

    Raises:
        ValueError: from ``pd.concat`` when no table yields a frame.
    """
    table_settings = {"vertical_strategy": "lines",
                      "horizontal_strategy": "lines",
                      "snap_y_tolerance": 20}

    # Open the PDF in a with-block so the file handle is released even when
    # extraction fails part-way through.
    raw_tables = []
    with pdfplumber.open(path) as pdf:
        for page_index in table_pages:
            raw_tables.extend(pdf.pages[page_index].extract_tables(table_settings))

    dfs = []
    gov_id = []
    actions = []
    for raw_table in raw_tables:
        table_df, gid, action = clean_up_table(raw_table)
        # clean_up_table returns None for tables it rejects; skip those
        # explicitly rather than relying on pd.concat to drop them.
        if table_df is not None:
            dfs.append(table_df)
        if gid:
            gov_id.extend(gid)
        if action:
            actions.extend(action)

    df = pd.concat(dfs)
    dsc = pd.DataFrame(list(zip(gov_id, actions)))

    df.to_csv('data/actions.csv', header=True, index=False)
    dsc.to_csv('data/tasks.csv', header=False, index=False)

    return df, dsc

def get_table_documents(path, field_names=None):
    """Load a CSV file into documents, then delete the file.

    Args:
        path: CSV file to read. It is removed from disk after loading, so a
            second call with the same path needs the file to be written again.
        field_names: Passed to the loader as the ``fieldnames`` CSV argument.

    Returns:
        Whatever ``CSVLoader.load()`` returns for the file.
    """
    csv_args = {
        'delimiter': ',',
        'quotechar': '"',
        'fieldnames': field_names,
    }
    loader = CSVLoader(file_path=path, csv_args=csv_args)
    loaded = loader.load()
    # Remove the file only after loading succeeded.
    os.remove(path)
    return loaded

In [508]:
# NIST PDF: first split its pages into text-only pages and pages with tables,
# then extract the tables (this also writes data/actions.csv and
# data/tasks.csv as a side effect of extract_and_process_tables).
doc2_path = 'data/NIST.AI.600-1.pdf'

text_pages, table_pages = get_pages(doc2_path)

df, dsc = extract_and_process_tables(doc2_path, table_pages)

Processing data/NIST.AI.600-1.pdf...
text pages: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]
table pages: [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]


In [509]:
# Load the two CSV files written by extract_and_process_tables. Each call
# also deletes its file, so this cell cannot be re-run without re-running the
# extraction cell first.
# NOTE(review): csv.DictReader treats the first row as data when field names
# are supplied and as the header when they are not. actions.csv is written
# WITH a header but loaded with explicit names, and tasks.csv is written
# WITHOUT a header but loaded without names — this looks inverted; confirm.
table_documents1 = get_table_documents('data/actions.csv', ['Action ID', 'Suggested Action', 'GAI Risks'])
# Not the last expression of the cell, so this value is not displayed.
len(table_documents1)

table_documents2 = get_table_documents('data/tasks.csv')
# Not the last expression of the cell, so this value is not displayed.
len(table_documents2)

# Single list holding the documents from both CSV files.
table_documents = [*table_documents1, *table_documents2]

213

In [514]:
# Chunk only the pages of the NIST PDF on which no table was detected, using
# the same (10, 70) margins that get_pages uses.
md_documents = get_markdown_documents(doc2_path, text_pages, margins=(10, 70))


Processing data/NIST.AI.600-1.pdf...


In [5]:
from langchain_huggingface import HuggingFaceEmbeddings


# embedding model
# MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
# NOTE(review): the vector-store cell further down embeds with
# `embedding_model` (built from "Snowflake/snowflake-arctic-embed-m"), not
# with EMBEDDING_MODEL — confirm which of the two is meant to be used.
MODEL_ID = 'Snowflake/snowflake-arctic-embed-m-v1.5'
EMBEDDING_MODEL = HuggingFaceEmbeddings(model_name=MODEL_ID)

In [8]:
import json
# Metric values typed in by hand for the default-chunking / base-model run
# and saved so the comparison cells below can read them back.
# NOTE(review): the source of these four numbers is not shown in this
# notebook — confirm where they were computed.
results = {'faithfulness': 0.8049, 'answer_relevancy': 0.8946, 'context_recall': 0.6981, 'context_precision': 0.6903}
# Opening in 'w' mode replaces any existing file at this path.
with open('data/eval_results/default_chunking_base_model.json', 'w') as f:
    json.dump(results, f)

In [14]:
# One result file per evaluated configuration.
path1 = 'data/eval_results/advanced_chunking_base_model.json'
path2 = 'data/eval_results/advanced_chunking_finetuned_model.json'
path3 = 'data/eval_results/default_chunking_base_model.json'
path4 = 'data/eval_results/default_chunking_finetuned_model.json'
paths = [path1, path2, path3, path4]
# Labels for each file, matched to `paths` by position: cols[i] describes
# paths[i], so the two lists must stay in the same order.
cols = [{'chunking': 'advanced', 'model': 'base'}, 
        {'chunking': 'advanced', 'model': 'finetuned'},
        {'chunking': 'default', 'model': 'base'},
        {'chunking': 'default', 'model': 'finetuned'}
        ]

In [16]:
# Read every result file and merge its metrics with the labels of the run it
# belongs to (labels and paths are matched by position).
results = []
for run_labels, result_path in zip(cols, paths):
    with open(result_path, 'r') as f:
        js = json.load(f)
    results.append({**run_labels, **js})
results

[{'chunking': 'advanced',
  'model': 'base',
  'faithfulness': 0.7227251327261983,
  'answer_relevancy': 0.9565417271796411,
  'context_recall': 0.787037037037037,
  'context_precision': 0.8539351851642689},
 {'chunking': 'advanced',
  'model': 'finetuned',
  'faithfulness': 0.8106028029941074,
  'answer_relevancy': 0.9588964495178147,
  'context_recall': 0.8564814814814814,
  'context_precision': 0.9106481481289467},
 {'chunking': 'default',
  'model': 'base',
  'faithfulness': 0.8049,
  'answer_relevancy': 0.8946,
  'context_recall': 0.6981,
  'context_precision': 0.6903},
 {'chunking': 'default',
  'model': 'finetuned',
  'faithfulness': 0.9316390768596651,
  'answer_relevancy': 0.9501681388615635,
  'context_recall': 0.8972222222222221,
  'context_precision': 0.9273148147935765}]

In [19]:
import pandas as pd
# One row per evaluated configuration.
# Note: this rebinds `df`, which the table-extraction cell earlier in the
# notebook also assigns.
df = pd.DataFrame(results)
# Rounded copy for display only; `df` itself keeps full precision.
df.round(3)

Unnamed: 0,chunking,model,faithfulness,answer_relevancy,context_recall,context_precision
0,advanced,base,0.723,0.957,0.787,0.854
1,advanced,finetuned,0.811,0.959,0.856,0.911
2,default,base,0.805,0.895,0.698,0.69
3,default,finetuned,0.932,0.95,0.897,0.927


### embedding model

In [None]:
# Embedding model that the vector store below is built with.
model_id = "Snowflake/snowflake-arctic-embed-m"
embedding_model = HuggingFaceEmbeddings(model_name=model_id)

### Retriever

In [14]:
# In-memory Qdrant instance: the collection lives only in this kernel.
qdrant_client = QdrantClient(":memory:")
collection_name = "ai-policy"

# NOTE(review): size=768 has to equal the output dimension of
# `embedding_model` — confirm whenever the embedding model is changed.
qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=collection_name,
    embedding=embedding_model,
)

# NOTE(review): no visible notebook-level cell assigns `documents` (the cells
# above produce documents1, md_documents and table_documents). On a fresh
# kernel this line would raise NameError unless it is defined elsewhere —
# confirm which document lists are meant to be indexed.
vector_store.add_documents(documents)

# Retriever returning the 5 best matches per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

### prompt and llm

In [15]:
# Prompt text with two placeholders, {context} and {question}. The backslash
# after the opening quotes keeps the text from starting with an empty line.
RAG_PROMPT = """\
You are an expert in AI ethics and policy. The CEO of a company is asking legal advice from you regarding their investment in AI application. Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.

Context:
{context}

Question:
{question}

Answer:
"""

rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT)

# Chat model used to generate answers; temperature is set to 0.
rag_llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [24]:
def create_rag_chain(rag_prompt_template, vector_store, llm):
    """Assemble the retrieval-augmented generation chain.

    The chain is invoked with a mapping that has a ``"question"`` key and
    yields a mapping with two keys: ``"response"`` (the model output parsed
    to a string) and ``"context"`` (what the retriever returned).

    Args:
        rag_prompt_template: Prompt fed with ``context`` and ``question``.
        vector_store: Store from which a retriever with ``k=5`` is created.
        llm: Model that receives the rendered prompt.

    Returns:
        The composed chain.
    """
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})

    # Step 1: look the question up in the store and keep the question itself.
    gather_inputs = {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    # Step 2: pass everything through, re-assigning "context" to itself.
    carry_context = RunnablePassthrough.assign(context=itemgetter("context"))
    # Step 3: generate the answer and hand the retrieved context back with it.
    answer_step = rag_prompt_template | llm | StrOutputParser()
    build_outputs = {
        "response": answer_step,
        "context": itemgetter("context"),
    }

    return gather_inputs | carry_context | build_outputs

In [18]:
from pydantic import BaseModel, InstanceOf
class RAGRunnables(BaseModel):
    """Bundle of the three objects that ``create_rag_chain`` takes."""
    # Each field is declared with InstanceOf[...] around the expected class.
    rag_prompt_template: InstanceOf[ChatPromptTemplate]
    vector_store: InstanceOf[QdrantVectorStore]
    llm: InstanceOf[ChatOpenAI]

In [19]:
# Collect the prompt, vector store and LLM in one object. The prompt template
# is built again from RAG_PROMPT here rather than reusing rag_prompt_template.
rag_runnables = RAGRunnables(
                        rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT),
                        vector_store = vector_store,
                        llm = rag_llm
                    )

In [None]:
# Build the chain from the bundled components and try it on one question.
chain = create_rag_chain(rag_runnables.rag_prompt_template, rag_runnables.vector_store, rag_runnables.llm)
chain.invoke({'question': 'Who are the authors of NIST Trustworthy and Responsible AI?'})

In [None]:
# Same question again, keeping the output to inspect the first retrieved item.
# Note: this rebinds `results`, which earlier cells use for evaluation metrics.
results = chain.invoke({'question': 'Who are the authors of NIST Trustworthy and Responsible AI?'})
results['context'][0]

In [20]:
from urllib.request import urlopen
# Download the Executive Order page. decode() without arguments decodes the
# response bytes as UTF-8.
with urlopen( 'https://www.whitehouse.gov/briefing-room/presidential-actions/2023/10/30/executive-order-on-the-safe-secure-and-trustworthy-development-and-use-of-artificial-intelligence/' ) as webpage:
    content = webpage.read().decode()

# Save to file.
# NOTE(review): no encoding is given, so the platform default is used for
# writing; on a platform whose default is not UTF-8 this can fail or alter
# non-ASCII characters — confirm together with the encoding the loader reads
# the file with.
with open( 'data/output.html', 'w' ) as output:
    output.write( content )

In [None]:
def get_html_documents(url):
    """Load an HTML file with ``BSHTMLLoader`` and split it into documents.

    Args:
        url: Handed to ``BSHTMLLoader`` as its ``file_path`` argument.
            NOTE(review): despite the parameter name this is used as a file
            path (this notebook downloads the page to ``data/output.html``
            before loading it) — confirm a remote URL is not expected here.

    Returns:
        The result of ``loader.load_and_split()``.
    """
    # The notebook imports BSHTMLLoader only in a cell that comes after this
    # definition; importing it here keeps the function usable regardless of
    # the order in which the cells were run.
    from langchain_community.document_loaders import BSHTMLLoader

    loader = BSHTMLLoader(file_path=url)
    return loader.load_and_split()

In [24]:
from langchain_community.document_loaders import BSHTMLLoader

# Loader for the page saved to disk by the download cell above.
loader = BSHTMLLoader(
    file_path="data/output.html",
)


Document(metadata={'source': 'data/output.html', 'title': 'Executive Order on the Safe, Secure, and Trustworthy Development and Use of Artificial Intelligence | The White House'}, page_content="\n\n\n\n\n\n\n\n\nExecutive Order on the Safe, Secure, and Trustworthy Development and Use of Artificial Intelligence | The White House\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to content\n\n\n\n\n\n\n\n\n\n\nThe White House\n\n\nThe White House\n \n\n\n\n\nThe White House\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\t\t\t\t\t\t\tHome\t\t\t\t\t\t\t\n\n\nAdministration\nPriorities\nThe Record\nBriefing Room\nEspañol\n \n\n\n\nInstagramOpens in a new window\nFacebookOpens in a new window\nXOpens in a new window\nYouTubeOpens in a new window\n \n\n\nContact Us\nPrivacy Policy\nCopyright Policy\nAccessibility Statement\n \n\n\n\n\n\n\n\n\n\nMenu\nClose\n\n\n\n\n\n\n\n\nTo search this site, enter a search term\n

In [28]:
# Load the saved page and split it into documents; display the first one.
docs = loader.load_and_split()
docs[0]

Document(metadata={'source': 'data/output.html', 'title': 'Executive Order on the Safe, Secure, and Trustworthy Development and Use of Artificial Intelligence | The White House'}, page_content='Executive Order on the Safe, Secure, and Trustworthy Development and Use of Artificial Intelligence | The White House\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to content\n\n\n\n\n\n\n\n\n\n\nThe White House\n\n\nThe White House\n \n\n\n\n\nThe White House\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\t\t\t\t\t\t\tHome\t\t\t\t\t\t\t\n\n\nAdministration\nPriorities\nThe Record\nBriefing Room\nEspañol\n \n\n\n\nInstagramOpens in a new window\nFacebookOpens in a new window\nXOpens in a new window\nYouTubeOpens in a new window\n \n\n\nContact Us\nPrivacy Policy\nCopyright Policy\nAccessibility Statement\n \n\n\n\n\n\n\n\n\n\nMenu\nClose\n\n\n\n\n\n\n\n\nTo search this site, enter a search term\n\n\n\nSearch\n\n\n

In [29]:
# Number of documents the HTML page was split into.
len(docs)

41