In [1]:
from pymongo import MongoClient 
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain_community.document_transformers.openai_functions import create_metadata_tagger 
from langchain_mongodb import MongoDBAtlasVectorSearch  
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import os 
from dotenv import load_dotenv 

# Pull variables from a local .env file into the process environment.
load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here keeps a
            missing setting from surfacing later as ``None`` (for example,
            ``MongoClient(None)`` falls back to localhost instead of the
            intended cluster).
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "define it in the shell or in the .env file."
        )
    return value


# Secrets come from the environment and are never hard-coded in the notebook.
OPENAI_API_KEY = _require_env('OPENAI_API_KEY')
MONGO_URI = _require_env('MONGODB_URI')

# Atlas database, collection, and vector search index used throughout.
DATABASE_NAME = 'mongo-rag'
COLLECTION_NAME = 'mongo-rag-collection'
INDEX_NAME = 'mongo_index'

In [2]:
# Open a client against the configured cluster, then resolve the target
# collection in two explicit steps (database first, collection second).
client = MongoClient(MONGO_URI)
database = client[DATABASE_NAME]
collection = database[COLLECTION_NAME]
collection

Collection(Database(MongoClient(host=['ac-ra03c4t-shard-00-00.o6nv3ct.mongodb.net:27017', 'ac-ra03c4t-shard-00-02.o6nv3ct.mongodb.net:27017', 'ac-ra03c4t-shard-00-01.o6nv3ct.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', appname='Cluster0', authsource='admin', replicaset='atlas-g2izdt-shard-0', tls=True), 'mongo-rag'), 'mongo-rag-collection')

In [3]:
# Load the report PDF; each returned Document records its page number in
# its metadata.
pdf_path = './../data/nfp.pdf'
loader = PyPDFLoader(pdf_path)
pages = loader.load()

# Print the document count, then display the first page for a quick check.
print(len(pages))
pages[0]

39


Document(metadata={'source': './../data/nfp.pdf', 'page': 0}, page_content=' \n \nTransmission of material in this news release is embargoed unti l USDL -24-2052 \n8:30 a.m. (E T) Friday,  Octo ber 4, 2024 \n Technical information:  \nHousehold data: (202) 691-6378  •  cpsinfo@bls.gov  •  www.bls.gov/cps  \nEstablishment data:  (202) 691-6555  •  cesinfo@bls.gov  •  www.bls.gov/ces \n  \nMedia contact:  (202) 691-5902  •  PressOffice@bls.gov \n  \nTHE EMPLOYMENT  SITUATION — SEPTEMBER 2024  \n  \nTotal  nonfarm payroll  employment increased by 254,000 in September, and the unemployment rate  \nchanged little at 4.1 percent, the U.S. Bureau of Labor Statistics reported today. Employment continued \nto trend up in food services and drinking places , health care, government, social assistance, and \nconstruction.   \n \n \nT\nhis news release presents statistics from two monthly surveys. The household survey measures labor \nforce status, including unemployment, by demographic characteris

In [4]:
# Minimum number of words a page needs in order to be kept.
MIN_WORDS_PER_PAGE = 20

# Keep only pages with real content. ``str.split()`` with no argument splits
# on any run of whitespace, so doubled spaces no longer inflate the count
# with empty tokens and newline-separated words are counted individually
# (``split(' ')`` got both cases wrong).
cleaned_pages = [
    page for page in pages
    if len(page.page_content.split()) > MIN_WORDS_PER_PAGE
]

# Fail with a clear message instead of an IndexError on the preview below.
if not cleaned_pages:
    raise ValueError("No page passed the minimum word-count filter.")
print(cleaned_pages[0])

page_content=' 
 
Transmission of material in this news release is embargoed unti l USDL -24-2052 
8:30 a.m. (E T) Friday,  Octo ber 4, 2024 
 Technical information:  
Household data: (202) 691-6378  •  cpsinfo@bls.gov  •  www.bls.gov/cps  
Establishment data:  (202) 691-6555  •  cesinfo@bls.gov  •  www.bls.gov/ces 
  
Media contact:  (202) 691-5902  •  PressOffice@bls.gov 
  
THE EMPLOYMENT  SITUATION — SEPTEMBER 2024  
  
Total  nonfarm payroll  employment increased by 254,000 in September, and the unemployment rate  
changed little at 4.1 percent, the U.S. Bureau of Labor Statistics reported today. Employment continued 
to trend up in food services and drinking places , health care, government, social assistance, and 
construction.   
 
 
T
his news release presents statistics from two monthly surveys. The household survey measures labor 
force status, including unemployment, by demographic characteristics. The establishment survey  
measures nonfarm  employment, hours, and earnings

In [5]:
# Chunking parameters: target chunk size and the overlap shared by
# consecutive chunks.
splitter_options = {"chunk_size": 500, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Chunk the filtered pages and display one chunk as a spot check.
split_docs = text_splitter.split_documents(cleaned_pages)
split_docs[12]

Document(metadata={'source': './../data/nfp.pdf', 'page': 1}, page_content='for work during the 4 weeks preceding the survey or were unavailable to take a job. (See table A -1.) \n Among those not in the labor force who wanted a job, the number of people marginally attached to the labor force  increased  by 204,000 to 1.6 million in September. These individuals wanted and were \navailable for work and had looked for a job sometime in the prior 12 months but had not looked for work in the 4 weeks preceding the survey. The number of discouraged workers, a subset  of the')

In [6]:
# Show the preceding chunk: its closing sentence reappears at the start of
# split_docs[12], which makes the configured chunk overlap visible.
split_docs[11]

Document(metadata={'source': './../data/nfp.pdf', 'page': 1}, page_content='full-time employment but were working part time because their hours had been reduced or they were \nunable to find full- time jobs. (See table A -8.) \n The number of people not in the labor force who currently want a job , at 5.7 million, changed little in  \nSeptember . These individuals were not counted as unemployed because they were not actively  looking \nfor work during the 4 weeks preceding the survey or were unavailable to take a job. (See table A -1.)')

In [7]:
# Chat model used to derive the metadata for each page.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, temperature=0, model="gpt-4o-mini")

# Fields the tagger must produce for every document; all are required.
metadata_properties = {
    "title": {"type": "string"},
    "keywords": {"type": "array", "items": {"type": "string"}},
    "hasCode": {"type": "boolean"},
}
schema = {
    "properties": metadata_properties,
    "required": list(metadata_properties),
}
document_transformer = create_metadata_tagger(metadata_schema=schema, llm=llm)

# Tag whole pages first and chunk afterwards, so every chunk carries the
# metadata of the page it was cut from.
docs = document_transformer.transform_documents(cleaned_pages)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)
split_docs = text_splitter.split_documents(docs)
split_docs[12]

Document(metadata={'title': 'Household Survey Data', 'keywords': ['unemployment rate', 'number of unemployed people', 'long-term unemployed', 'labor force participation rate', 'employment-population ratio', 'part-time employment for economic reasons', 'not in the labor force who want a job', 'marginally attached to the labor force', 'discouraged workers', 'total nonfarm payroll employment'], 'hasCode': False, 'source': './../data/nfp.pdf', 'page': 1}, page_content='for work during the 4 weeks preceding the survey or were unavailable to take a job. (See table A -1.) \n Among those not in the labor force who wanted a job, the number of people marginally attached to the labor force  increased  by 204,000 to 1.6 million in September. These individuals wanted and were \navailable for work and had looked for a job sometime in the prior 12 months but had not looked for work in the 4 weeks preceding the survey. The number of discouraged workers, a subset  of the')

In [8]:
# Show the preceding chunk: it comes from the same page as split_docs[12]
# and therefore carries the same page-level metadata.
split_docs[11]

Document(metadata={'title': 'Household Survey Data', 'keywords': ['unemployment rate', 'number of unemployed people', 'long-term unemployed', 'labor force participation rate', 'employment-population ratio', 'part-time employment for economic reasons', 'not in the labor force who want a job', 'marginally attached to the labor force', 'discouraged workers', 'total nonfarm payroll employment'], 'hasCode': False, 'source': './../data/nfp.pdf', 'page': 1}, page_content='full-time employment but were working part time because their hours had been reduced or they were \nunable to find full- time jobs. (See table A -8.) \n The number of people not in the labor force who currently want a job , at 5.7 million, changed little in  \nSeptember . These individuals were not counted as unemployed because they were not actively  looking \nfor work during the 4 weeks preceding the survey or were unavailable to take a job. (See table A -1.)')

In [9]:
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

# Embed and insert the chunks only while the collection is still empty.
# ``from_documents`` inserts on every call, so re-running this cell would
# otherwise store every chunk again and fill later top-k results with
# duplicates.
# NOTE(review): to re-ingest after changing the PDF or the chunking, clear the
# collection first.
if collection.count_documents({}) == 0:
    vector_store = MongoDBAtlasVectorSearch.from_documents(
        split_docs,
        embeddings,
        collection=collection,
        index_name=INDEX_NAME,  # same index the query-side store uses
    )
else:
    # Data is already present: wrap the existing collection without inserting.
    vector_store = MongoDBAtlasVectorSearch(
        collection=collection,
        embedding=embeddings,
        index_name=INDEX_NAME,
    )

In [10]:
# Fully qualified collection name in "<database>.<collection>" form.
namespace = f"{DATABASE_NAME}.{COLLECTION_NAME}"
namespace

'mongo-rag.mongo-rag-collection'

In [11]:
# Query-side handle on the vector store, built straight from the connection
# string and bound to the named Atlas vector search index.
store = MongoDBAtlasVectorSearch.from_connection_string(
    connection_string=MONGO_URI,
    namespace=namespace,
    embedding=embeddings,
    index_name=INDEX_NAME,
)

In [12]:
def query_data(query: str, k: int = 10) -> list:
    """Return the ``k`` stored chunks most similar to ``query``.

    Args:
        query: natural-language question to search for.
        k: number of documents to retrieve; defaults to 10.

    Returns:
        The documents produced by the retriever for ``query``.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    retriever = store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )
    return retriever.invoke(query)


query_data("What was the unemployment rate for September?")

[Document(metadata={'_id': '671cd70e36e5bb11eb7ee73a', 'title': 'Household Survey Data', 'keywords': ['unemployment rate', 'number of unemployed people', 'long-term unemployed', 'labor force participation rate', 'employment-population ratio', 'part-time employment for economic reasons', 'not in the labor force who want a job', 'marginally attached to the labor force', 'discouraged workers', 'total nonfarm payroll employment'], 'hasCode': False, 'source': './../data/nfp.pdf', 'page': 1}, page_content='-2- \n surveys were within normal ranges for September. For information on how unusually severe weather \ncan affect employment and hours estimates, see the Frequently Asked Questions section of this news \nrelease.  \n \nBLS will release the state estimates of employment and unemployment for September on October 22, \n2024, at 10:00 a.m. (ET). \n \n \nHousehold Survey Data  \n Both the unemployment rate , at 4.1 percent, and the number of unemployed people , at 6.8 million,'),
 Document(m

In [13]:
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
# Prompt for the RAG chain. ``{context}`` and ``{question}`` are the two
# placeholders filled in when the prompt is rendered.
# NOTE(review): the "other than MongoDB" instruction looks unrelated to the
# employment report indexed in this notebook -- confirm whether it is wanted.
template = """
    Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Do not answer the question if there is no given context.
    Do not answer the question if it is not related to the context.
    Do not give recommendations to anything other than MongoDB.
    Context:
    {context}
    Question: {question}
    """

In [14]:
def _join_page_contents(docs):
    """Concatenate the text of the retrieved documents, separated by blank lines."""
    return "\n\n".join([d.page_content for d in docs])


custom_rag_prompt = PromptTemplate.from_template(template)

# Similarity search returning the 10 closest chunks.
retriever = store.as_retriever(search_type="similarity", search_kwargs={"k": 10})

# The incoming question feeds both prompt variables: it is used to retrieve
# the context and is also passed through unchanged as the question.
retrieve = {
    "context": retriever | _join_page_contents,
    "question": RunnablePassthrough(),
}

llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)
response_parser = StrOutputParser()

# retrieve -> prompt -> chat model -> plain string
chain = retrieve | custom_rag_prompt | llm | response_parser
answer = chain.invoke('What was the unemployment rate for September?')

In [15]:
# Display the chain's answer, which the output parser returns as a plain string.
answer

'The unemployment rate for September was 4.1 percent.'