In [1]:
!pip install langchain langchain-community langchain-openai langchain-core langchain-huggingface



In [2]:
!pip install pdfplumber



In [3]:
!pip install pydantic



In [4]:
import os
from typing import List
from chainlit.types import AskFileResponse

import chainlit as cl

from langchain_community.document_loaders import PyMuPDFLoader
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PDFPlumberLoader

2024-09-28 09:27:23 - Loaded .env file


In [5]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
# from langchain_community.vectorstores import Qdrant
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel


In [6]:
# huggingface model
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm.autonotebook import tqdm, trange

  from tqdm.autonotebook import tqdm, trange


In [7]:
import os
import getpass

# Only prompt when the key is not already present in the environment (e.g. supplied
# through a .env file), so a full re-run does not block on interactive input.
# getpass keeps the key out of the notebook source and output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

### Retrieval documents

In [8]:
# Source document 1: the "Blueprint for an AI Bill of Rights" PDF, read with PyMuPDF.
docs1 = PyMuPDFLoader(
    "data/Blueprint-for-an-AI-Bill-of-Rights.pdf"
).load()

def tiktoken_len(text):
    """Return how many tokens ``text`` occupies under the gpt-4o-mini tokenizer.

    Used as the splitter's length function so chunk sizes are measured in
    tokens rather than characters.
    """
    encoder = tiktoken.encoding_for_model("gpt-4o-mini")
    return len(encoder.encode(text))

# Splitter shared by both PDFs: chunk size of 300 measured in tokens
# (via tiktoken_len), with no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=tiktoken_len,
    chunk_size=300,
    chunk_overlap=0,
)

# Chunk the first document set.
split_chunks_1 = text_splitter.split_documents(docs1)

In [9]:
# Source document 2: NIST AI 600-1, read with pdfplumber and chunked with the same splitter.
docs2 = PDFPlumberLoader("data/NIST.AI.600-1.pdf").load()
split_chunks_2 = text_splitter.split_documents(docs2)

In [10]:
# Number of chunks produced from each source document.
tuple(map(len, (split_chunks_1, split_chunks_2)))

(196, 144)

In [11]:
# Single corpus for indexing: chunks of document 1 followed by chunks of document 2.
documents = split_chunks_1 + split_chunks_2

In [12]:
# Sanity check: total number of chunks across both source documents.
len(documents)

340

### Embedding model

In [13]:
# Embedding model used for both indexing the chunks and embedding queries,
# loaded by model name through HuggingFaceEmbeddings.
model_id = "Snowflake/snowflake-arctic-embed-m"
embedding_model = HuggingFaceEmbeddings(model_name=model_id)

2024-09-28 09:28:01 - PyTorch version 2.4.0 available.
2024-09-28 09:28:01 - Use pytorch device_name: mps
2024-09-28 09:28:01 - Load pretrained SentenceTransformer: Snowflake/snowflake-arctic-embed-m
2024-09-28 09:28:03 - 1 prompts are loaded, with the keys: ['query']


### Retriever

In [14]:
# Qdrant client created with ":memory:", so nothing is persisted to disk.
qdrant_client = QdrantClient(":memory:")
collection_name = "ai-policy"

# Derive the vector size from the embedding model instead of hard-coding it,
# so the collection stays consistent with whatever `model_id` is configured.
embedding_dim = len(embedding_model.embed_query("dimension probe"))

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
)

# LangChain wrapper that embeds with `embedding_model` and stores vectors in the collection above.
vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=collection_name,
    embedding=embedding_model,
)

# Embed and index every chunk from both documents.
vector_store.add_documents(documents)

# Retriever returning the 5 closest chunks for a query.
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

### Prompt and LLM

In [15]:
# Prompt text: the model answers as an AI ethics/policy expert from the supplied
# context and is told to say so when it does not know the answer.
# {context} and {question} are filled in by the chain at run time.
RAG_PROMPT = """\
You are an expert in AI ethics and policy. The CEO of a company is asking legal advice from you regarding their investment in AI application. Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.

Context:
{context}

Question:
{question}

Answer:
"""

# Chat prompt built from the template string above.
rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT)

# Chat model that generates the answers; temperature is pinned to 0.
rag_llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")

In [24]:
def create_rag_chain(rag_prompt_template, vector_store, llm, k=5):
    """Build a retrieval-augmented generation chain.

    The returned chain is invoked with ``{"question": <str>}`` and produces a
    dict with two keys:

    * ``"response"`` - the LLM answer, parsed to a plain string.
    * ``"context"``  - the documents retrieved for the question, passed
      through unchanged so callers can inspect or cite the sources.

    Args:
        rag_prompt_template: Prompt with ``{context}`` and ``{question}`` slots.
        vector_store: Vector store from which a retriever is created.
        llm: Chat model that answers the filled-in prompt.
        k: Number of documents to retrieve per question. Defaults to 5.

    Returns:
        The composed runnable chain.
    """
    retriever = vector_store.as_retriever(search_kwargs={"k": k})
    return (
        # Step 1: look up context for the question and carry the question along.
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        # Step 2: pass-through. Keeping a Runnable between the two dict literals
        # matters: `dict | dict` on its own would be a plain Python dict merge,
        # not a pipeline.
        | RunnablePassthrough.assign(context=itemgetter("context"))
        # Step 3: generate the answer and return it together with the context.
        | {"response": rag_prompt_template | llm | StrOutputParser(), "context": itemgetter("context")}
    )

In [18]:
from pydantic import BaseModel, InstanceOf
class RAGRunnables(BaseModel):
    """Container bundling the three components needed to build a RAG chain.

    Every field is annotated with ``InstanceOf[...]``, i.e. the value is
    expected to be an instance of the named class.
    """
    # Prompt template for the RAG prompt.
    rag_prompt_template: InstanceOf[ChatPromptTemplate]
    # Qdrant-backed vector store the retriever is created from.
    vector_store: InstanceOf[QdrantVectorStore]
    # Chat model that generates the answer.
    llm: InstanceOf[ChatOpenAI]

In [19]:
# Bundle prompt, vector store and LLM into a single validated container.
rag_runnables = RAGRunnables(
    rag_prompt_template=ChatPromptTemplate.from_template(RAG_PROMPT),
    vector_store=vector_store,
    llm=rag_llm,
)

In [35]:
# Inspect which fields the container exposes.
rag_runnables.model_dump().keys()


dict_keys(['rag_prompt_template', 'vector_store', 'llm'])

In [36]:
# Assemble the chain from the bundled components, then run a smoke-test question.
chain = create_rag_chain(
    rag_prompt_template=rag_runnables.rag_prompt_template,
    vector_store=rag_runnables.vector_store,
    llm=rag_runnables.llm,
)
chain.invoke({"question": "Who are the authors of NIST Trustworthy and Responsible AI?"})

2024-09-28 09:52:13 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'response': 'I do not know the specific authors of the NIST Trustworthy and Responsible AI document. The provided context does not include that information.',
 'context': [Document(metadata={'source': 'data/NIST.AI.600-1.pdf', 'file_path': 'data/NIST.AI.600-1.pdf', 'page': 2, 'total_pages': 64, 'Author': 'National Institute of Standards and Technology', 'Category': 'NIST AI 600-1', 'Comments': '', 'Company': '', 'ContentTypeId': '0x01010068CEA9BE6E0AF749888425167690E526', 'CreationDate': "D:20240805141702-04'00'", 'Creator': 'Acrobat PDFMaker 24 for Word', 'Keywords': '', 'MediaServiceImageTags': '', 'ModDate': "D:20240805143048-04'00'", 'Producer': 'Adobe PDF Library 24.2.159', 'SourceModified': 'D:20240805160221', 'Subject': '', 'Title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', '_id': '3a0f50a7ef614f4cab39c1131319c43d', '_collection_name': 'ai-policy'}, page_content='About AI at NIST: The National Institute of Standards and Tec

In [40]:
# Keep the chain output so the retrieved documents can be inspected; display the first one.
results = chain.invoke(
    {"question": "Who are the authors of NIST Trustworthy and Responsible AI?"}
)
results["context"][0]

2024-09-28 10:55:43 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Document(metadata={'source': 'data/NIST.AI.600-1.pdf', 'file_path': 'data/NIST.AI.600-1.pdf', 'page': 2, 'total_pages': 64, 'Author': 'National Institute of Standards and Technology', 'Category': 'NIST AI 600-1', 'Comments': '', 'Company': '', 'ContentTypeId': '0x01010068CEA9BE6E0AF749888425167690E526', 'CreationDate': "D:20240805141702-04'00'", 'Creator': 'Acrobat PDFMaker 24 for Word', 'Keywords': '', 'MediaServiceImageTags': '', 'ModDate': "D:20240805143048-04'00'", 'Producer': 'Adobe PDF Library 24.2.159', 'SourceModified': 'D:20240805160221', 'Subject': '', 'Title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', '_id': '3a0f50a7ef614f4cab39c1131319c43d', '_collection_name': 'ai-policy'}, page_content='About AI at NIST: The National Institute of Standards and Technology (NIST) develops measurements,\ntechnology, tools, and standards to advance reliable, safe, transparent, explainable, privacy-enhanced,\nand fair artificial intellige

In [42]:
# The document title is available in the metadata of the first retrieved chunk.
results['context'][0].metadata['Title']

'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile'

In [38]:
async def arun_chain(prompt):
    """Asynchronously run the module-level ``chain`` on ``prompt``.

    Wraps ``prompt`` as the ``'question'`` input and returns whatever
    ``chain.ainvoke`` yields.
    """
    return await chain.ainvoke({"question": prompt})

In [39]:
arun_chain('Who are the authors of NIST Trustworthy and Responsible AI?')

<coroutine object arun_chain at 0x17ba8dfc0>

### Chain

In [16]:
# Build the chain through create_rag_chain instead of repeating the same
# pipeline inline, so there is a single definition of the retrieval/answer flow.
# The factory creates its own k=5 retriever from `vector_store`.
rag_chain = create_rag_chain(rag_prompt_template, vector_store, rag_llm)

In [17]:
# Smoke test: ask one question and display the answer together with the retrieved context.
rag_chain.invoke({'question': 'Who are the authors of NIST Trustworthy and Responsible AI?'})

2024-09-28 09:28:24 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'response': 'I do not know the specific authors of the NIST Trustworthy and Responsible AI document. The provided context does not include that information.',
 'context': [Document(metadata={'source': 'data/NIST.AI.600-1.pdf', 'file_path': 'data/NIST.AI.600-1.pdf', 'page': 2, 'total_pages': 64, 'Author': 'National Institute of Standards and Technology', 'Category': 'NIST AI 600-1', 'Comments': '', 'Company': '', 'ContentTypeId': '0x01010068CEA9BE6E0AF749888425167690E526', 'CreationDate': "D:20240805141702-04'00'", 'Creator': 'Acrobat PDFMaker 24 for Word', 'Keywords': '', 'MediaServiceImageTags': '', 'ModDate': "D:20240805143048-04'00'", 'Producer': 'Adobe PDF Library 24.2.159', 'SourceModified': 'D:20240805160221', 'Subject': '', 'Title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', '_id': '3a0f50a7ef614f4cab39c1131319c43d', '_collection_name': 'ai-policy'}, page_content='About AI at NIST: The National Institute of Standards and Tec

In [25]:
from pydantic import BaseModel

In [32]:
from langchain_community.document_loaders.base import BaseLoader
from langchain_text_splitters.base import TextSplitter