In [100]:
import os
import numpy as np


In [2]:
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader

In [5]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

In [35]:
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.chat_models import ChatOpenAI

In [7]:
folder_path = 'D:/abir/ai_ml_projects/rag_multiple_documents/docs/'

In [18]:
def data_ingestion(folder_path):
    loader = PyPDFDirectoryLoader(folder_path)
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=100)
    splitted_document = text_splitter.split_documents(docs)
    return splitted_document

In [29]:
def open_ai_embedding(deployment="text-embedding-3-small"):    
    # Use old version of Ada. You probably want V2 rather than this.
    embeddings = OpenAIEmbeddings(deployment=deployment)
    return embeddings

embeddings = open_ai_embedding()

In [25]:
if __name__ == '__main__':
    embeddings = open_ai_embedding()
    embeddings.embed_query('Hello world')

1536

In [30]:
def get_vector_store(docs):
    faiss_embedding = FAISS.from_documents(
        docs,
        embeddings
    )
    faiss_embedding.save_local("faiss_index")
    return

In [34]:
docs = data_ingestion(folder_path)
get_vector_store(docs)

In [36]:
def llm_model():
    return ChatOpenAI()

In [37]:
prompt_template = """
Human : Use the following pieces of context to provide a concise answer,
to the question at the end. However, please use atlease 250 words with detailed 
explanation. If you don't know the answer, just say that you don't know and don't
try  to make up an answer.
<context>
{context}
</context>

Question: {question}

Assistant:
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context","question"]
)

In [74]:
def get_response(llm,vectorstore_faiss,query):
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type='stuff',
        retriever=vectorstore_faiss.as_retriever(
            search_type='similarity',search_kwargs={"k":6}
        ),
        return_source_documents=True,
        chain_type_kwargs={'prompt':prompt}
    )
    answer = qa({'query':query})
    vectorstore_faiss.similarity

    return answer

In [69]:
faiss_index = FAISS.load_local("faiss_index",embeddings,allow_dangerous_deserialization=True)
# llm = llm_model_openai()
llm = llm_model()
user_question = "What is climate change?"

llm_response = get_response(llm,faiss_index,user_question)

In [97]:
def return_similarity_score(faiss_index,query,k=6):
    similarity_score = faiss_index.similarity_search_with_score(query,k=k)
    score = []
    for score_ in similarity_score:
        score.append(score_[1])
    
    return score

In [101]:
def process_llm_response(llm_response,faiss_index,user_question,k=6):
    print(llm_response['result'])
    print('\n\nSources:')
    score = return_similarity_score(faiss_index,user_question,k)
    i = 0
    for source in llm_response['source_documents']:
        print(source.metadata['source'].split('\\')[-1],' ','page number is', source.metadata['page'],' and score is ',np.round(score[i],2))
        i+=1

In [124]:
source = llm_response['source_documents'][0]
f"{source.metadata['source'].split('\\')[-1]}  page number is  {source.metadata['page']} and score is {np.round(score[i],2)}"

SyntaxError: f-string expression part cannot include a backslash (256530741.py, line 2)

In [127]:
source = llm_response['source_documents'][0]


In [128]:
source

Document(metadata={'source': 'D:\\abir\\ai_ml_projects\\rag_multiple_documents\\docs\\document_4.pdf', 'page': 2}, page_content='The climate problem\nClimate change is a serious and urgent issue. Th  e Earth’s climate is changing, and the scientiﬁ  c consensus is \nnot only that human activities have contributed to it signiﬁ  cantly, but that the change is far more rapid and \ndangerous than thought earlier (IPCC 2007)\n1. In this section, we will only highlight some of these points \n(for more detail, we refer to IPCC 2007 and Stern 2006).\nTh e global mean temperature of the earth is rising; it has risen by 0.7oC in the 20th century, and con-\ntinues on an upward trend. Th  is has already begun to impose costs (e.g., in the form of heat waves, frequency \nof extreme events, and recession of glaciers), but these are still within the bounds of common experience. \n1 Th  e precise statement is that IPCC now has “very high conﬁ  dence that the globally averaged net eﬀ  ect of human \nact

In [135]:
doc = source.metadata['source'].split("\\")[-1]
f'''{doc}'''

'document_4.pdf'

In [102]:
process_llm_response(llm_response,faiss_index,user_question)

Climate change refers to the long-term alteration in Earth's climate patterns, particularly in terms of temperature, precipitation, and weather events. It is a significant and urgent issue that is primarily driven by human activities, such as the emission of greenhouse gases from burning fossil fuels and deforestation. The Earth's climate is changing at a rapid pace, leading to a rise in global mean temperature, increased frequency of extreme weather events, and potential sea-level rise.

The Intergovernmental Panel on Climate Change (IPCC) has highlighted the substantial impact of human activities on global warming since 1750. This warming effect has already begun to impose costs on society and ecosystems, with potential catastrophic consequences if not addressed promptly. Climate change poses severe risks to biodiversity, water resources, agriculture, and human health, among other aspects of life on Earth.

While climate change affects all countries, its impacts can vary significantl

In [121]:
import streamlit as st

def main():
    st.set_page_config("Chat PDF")
    st.header("Chat with multiple PDFs using OpenAI")

    user_question = st.text_input("Ask a Question from the PDF Files, Like- What is climate change?")

    with st.sidebar:
        st.title("Menu:")

        if st.button("Vectors Update"):
            with st.spinner("Processing..."):
                docs = data_ingestion("D:/abir/ai_ml_projects/rag_multiple_documents/docs/")
                get_vector_store(docs)
                st.success('done')
        
    if st.button("LLM output"):
        with st.spinner("Processing..."):
            faiss_index = FAISS.load_local("faiss_index",embeddings,allow_dangerous_deserialization=True)
            llm = llm_model()
            st.write(get_response(llm,faiss_index,user_question))
            st.success("Done")


if __name__ == "__main__":
    main()

2024-12-06 01:20:08.736 
  command:

    streamlit run C:\Users\Abir\AppData\Roaming\Python\Python310\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-12-06 01:20:08.759 Session state does not function when running a script without `streamlit run`
