In [1]:
import os 
from dotenv import load_dotenv  

load_dotenv() 

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 
CONNECTION = os.getenv("DATABASE_URL") 
CONNECTION

'postgresql+psycopg://postgres:postgres@localhost:5432/rag_db'

In [2]:
# extract from pdf 
from PyPDF2 import PdfReader 

def read_pdf(file_path: str) -> str: 
    with open(file_path, 'rb') as file: 
        pdf_reader = PdfReader(file) 
        pdf_text = '' 
        for page in pdf_reader.pages: 
            pdf_text += page.extract_text() + '\n' 
        return pdf_text 
    
pdf_text = read_pdf('data/nfp.pdf')

In [3]:
pdf_text

' \n \nTransmission of material in this news release is embargoed unti l USDL -24-2052 \n8:30 a.m. (E T) Friday,  Octo ber 4, 2024 \n Technical information:  \nHousehold data: (202) 691-6378  •  cpsinfo@bls.gov  •  www.bls.gov/cps  \nEstablishment data:  (202) 691-6555  •  cesinfo@bls.gov  •  www.bls.gov/ces \n  \nMedia contact:  (202) 691-5902  •  PressOffice@bls.gov \n  \nTHE EMPLOYMENT  SITUATION — SEPTEMBER 2024  \n  \nTotal  nonfarm payroll  employment increased by 254,000 in September, and the unemployment rate  \nchanged little at 4.1 percent, the U.S. Bureau of Labor Statistics reported today. Employment continued \nto trend up in food services and drinking places , health care, government, social assistance, and \nconstruction.   \n \n \nT\nhis news release presents statistics from two monthly surveys. The household survey measures labor \nforce status, including unemployment, by demographic characteristics. The establishment survey  \nmeasures nonfarm  employment, hours, and 

In [4]:
# vector store 
from langchain_openai import OpenAIEmbeddings 
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain_postgres.vectorstores import PGVector 

collection_name = "pgvector_docs"
embeddings = OpenAIEmbeddings() 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150) 
store = PGVector(embeddings=embeddings, collection_name=collection_name, connection=CONNECTION) 

In [5]:
# documents 
documents = text_splitter.create_documents(texts=[pdf_text])
splits = text_splitter.split_documents(documents)
print(f"Num Splits: {splits}")

Num Splits: [Document(page_content='Transmission of material in this news release is embargoed unti l USDL -24-2052 \n8:30 a.m. (E T) Friday,  Octo ber 4, 2024 \n Technical information:  \nHousehold data: (202) 691-6378  •  cpsinfo@bls.gov  •  www.bls.gov/cps  \nEstablishment data:  (202) 691-6555  •  cesinfo@bls.gov  •  www.bls.gov/ces \n  \nMedia contact:  (202) 691-5902  •  PressOffice@bls.gov \n  \nTHE EMPLOYMENT  SITUATION — SEPTEMBER 2024  \n  \nTotal  nonfarm payroll  employment increased by 254,000 in September, and the unemployment rate  \nchanged little at 4.1 percent, the U.S. Bureau of Labor Statistics reported today. Employment continued \nto trend up in food services and drinking places , health care, government, social assistance, and \nconstruction.   \n \n \nT\nhis news release presents statistics from two monthly surveys. The household survey measures labor \nforce status, including unemployment, by demographic characteristics. The establishment survey  \nmeasures non

In [6]:
# store to vector db 
store.from_documents(splits, embeddings, connection=CONNECTION, collection_name=collection_name) 

<langchain_postgres.vectorstores.PGVector at 0x1e7cf941b20>

In [7]:
retriever = store.as_retriever()
retriever.invoke("What was the unemployment rate for September 2024?")

[Document(page_content='Sept. 2024\nEmployment status\nCivilian noninstitutional population........................................... 267,428 268,644 268,856 269,080 224\nCivilian labor force........................................................... 167,897 168,429 168,549 168,699 150\nParticipation rate.......................................................... 62.8 62.7 62.7 62.7 0.0\nEmployed................................................................... 161,550 161,266 161,434 161,864 430\nEmployment-population ratio.......................................... 60.4 60.0 60.0 60.2 0.2\nUnemployed............................................................... 6,347 7,163 7,115 6,834 -281\nUnemployment rate.................................................... 3.8 4.3 4.2 4.1 -0.1\nNot in labor force............................................................ 99,531 100,215 100,306 100,381 75\nUnemployment rates\nTotal, 16 years and over...............................................

In [25]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain 
from langchain.chains import create_retrieval_chain
from langchain_openai import ChatOpenAI 

history = [
    {"role": "AI", "content": "how can I help you"},
    {"role": "HUMAN", "content": "what is 1 + 1"}
]

formatted_history = [
    {"role": item["role"].lower(), "content": item["content"]} 
    for item in history
]

llm = ChatOpenAI(model="gpt-4o-mini")
system_prompt = "You are a helpful assistant. {context}" 
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}")
    ]
)

qa = create_stuff_documents_chain(llm, prompt) 
rag_chain = create_retrieval_chain(retriever, qa) 


In [26]:
rag_chain.invoke({"input": "What was the unemployment rate for September 2024?", "chat_history": formatted_history})

{'input': 'What was the unemployment rate for September 2024?',
 'chat_history': [{'role': 'ai', 'content': 'how can I help you'},
  {'role': 'human', 'content': 'what is 1 + 1'}],
 'context': [Document(page_content='Sept. 2024\nEmployment status\nCivilian noninstitutional population........................................... 267,428 268,644 268,856 269,080 224\nCivilian labor force........................................................... 167,897 168,429 168,549 168,699 150\nParticipation rate.......................................................... 62.8 62.7 62.7 62.7 0.0\nEmployed................................................................... 161,550 161,266 161,434 161,864 430\nEmployment-population ratio.......................................... 60.4 60.0 60.0 60.2 0.2\nUnemployed............................................................... 6,347 7,163 7,115 6,834 -281\nUnemployment rate.................................................... 3.8 4.3 4.2 4.1 -0.1\nNot in labor

In [13]:

# streaming 
stream_response = rag_chain.stream({"input": "What was the unemployment rate for September 2024?"})
for chunk in stream_response:  
    if 'answer' in chunk: 
        print(chunk.get('answer')) 


The
 unemployment
 rate
 for
 September
 
202
4
 was
 
4
.
1
 percent
.

