In [1]:
!pip install --quiet langchain langchain-community langchain-openai langchain-core langchain-huggingface

In [2]:
!pip install --quiet pdfplumber pymupdf4llm pydantic

In [3]:
import os
from typing import List

from langchain_community.document_loaders import PyMuPDFLoader
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PDFPlumberLoader

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# huggingface model
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm.autonotebook import tqdm, trange

from pydantic import BaseModel
from langchain_community.document_loaders.base import BaseLoader
from langchain_text_splitters.base import TextSplitter

  from tqdm.autonotebook import tqdm, trange


In [4]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

### pymupdf to markdown

In [10]:
import pymupdf4llm, pymupdf

In [12]:
docs1 =  pymupdf.open("data/Blueprint-for-an-AI-Bill-of-Rights.pdf")
len(docs1)

73

In [25]:
docs1.get_toc(False)

[[1,
  'Blank Page',
  56,
  {'kind': 1, 'xref': 1711, 'page': 55, 'to': Point(0.0, 0.0), 'zoom': 0.0}],
 [1,
  'Untitled',
  56,
  {'kind': 1,
   'xref': 1714,
   'page': 55,
   'to': Point(-37.0, -173.0),
   'zoom': 0.0}],
 [1,
  'Untitled',
  55,
  {'kind': 1,
   'xref': 1712,
   'page': 54,
   'to': Point(-37.0, 318.0),
   'zoom': 0.0}]]

In [16]:
docs1[0:3]

[page 0 of data/Blueprint-for-an-AI-Bill-of-Rights.pdf,
 page 1 of data/Blueprint-for-an-AI-Bill-of-Rights.pdf,
 page 2 of data/Blueprint-for-an-AI-Bill-of-Rights.pdf]

In [17]:

md_text = pymupdf4llm.to_markdown(docs1[:2])
type(md_text)

TypeError: bad filename: type(filename)=<class 'list'> filename=[page 0 of data/Blueprint-for-an-AI-Bill-of-Rights.pdf, page 1 of data/Blueprint-for-an-AI-Bill-of-Rights.pdf].

In [8]:
# load documents 1
docs1 =  PyMuPDFLoader("data/Blueprint-for-an-AI-Bill-of-Rights.pdf").load()

def tiktoken_len(text):
    tokens = tiktoken.encoding_for_model("gpt-4o-mini").encode(
        text,
    )
    return len(tokens)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 300,
    chunk_overlap = 0,
    length_function = tiktoken_len,
)

split_chunks_1 = text_splitter.split_documents(docs1)

### extract table

In [9]:
# load documents 2
docs2 = PDFPlumberLoader('data/NIST.AI.600-1.pdf').load()

split_chunks_2 = text_splitter.split_documents(docs2)

In [None]:
len(split_chunks_1), len(split_chunks_2)

In [11]:
documents = [*split_chunks_1, *split_chunks_2]

### embedding model

In [None]:
model_id = "Snowflake/snowflake-arctic-embed-m"
embedding_model = HuggingFaceEmbeddings(model_name=model_id)

### Retriever

In [14]:
qdrant_client = QdrantClient(":memory:")
collection_name = "ai-policy"

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=collection_name,
    embedding=embedding_model,
)

vector_store.add_documents(documents)

retriever = vector_store.as_retriever(search_kwargs={"k": 5})

### prompt and llm

In [15]:
RAG_PROMPT = """\
You are an expert in AI ethics and policy. The CEO of a company is asking legal advice from you regarding their investment in AI application. Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.

Context:
{context}

Question:
{question}

Answer:
"""

rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT)

rag_llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [24]:
def create_rag_chain(rag_prompt_template, vector_store, llm):
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})
    rag_chain = ({"context": itemgetter("question") | retriever, "question": itemgetter("question")}
                    | RunnablePassthrough.assign(context=itemgetter("context"))
                    | {"response": rag_prompt_template | llm | StrOutputParser(), "context": itemgetter("context")})
    return rag_chain

In [18]:
from pydantic import BaseModel, InstanceOf
class RAGRunnables(BaseModel):
    rag_prompt_template: InstanceOf[ChatPromptTemplate]
    vector_store: InstanceOf[QdrantVectorStore]
    llm: InstanceOf[ChatOpenAI]

In [19]:
rag_runnables = RAGRunnables(
                        rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT),
                        vector_store = vector_store,
                        llm = rag_llm
                    )

In [None]:
chain = create_rag_chain(rag_runnables.rag_prompt_template, rag_runnables.vector_store, rag_runnables.llm)
chain.invoke({'question': 'Who are the authors of NIST Trustworthy and Responsible AI?'})

In [None]:
results = chain.invoke({'question': 'Who are the authors of NIST Trustworthy and Responsible AI?'})
results['context'][0]