# Loading Multiple Document Types (Text, PDF, PPTX) for RAG

In [1]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Directory loader pointed at the folder that holds the mixed-format source files.
loader = DirectoryLoader(path="./rag_docs")

In [3]:
# Read the files under the loader's directory into a list of Document objects.
# NOTE(review): the "Cannot set gray non-stroke color" line in this cell's output looks
# like a parser warning raised while reading the PDF, not a failure — confirm.
documents = loader.load()

Cannot set gray non-stroke color because /'P1' is an invalid float value




In [4]:
# Number of Document objects returned by the loader (presumably one per source file — verify).
len(documents)

3

In [5]:
# Inspect the loaded documents: each carries its source path in metadata plus the extracted text.
# NOTE(review): this echoes the complete text of every document; a slice or a per-document
# summary would keep the notebook output readable.
documents

[Document(metadata={'source': 'rag_docs/state_of_the_union.txt'}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.\n\nLast year COVID-19 kept us apart. This year we are finally together again.\n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.\n\nWith a duty to one another to the American people to the Constitution.\n\nAnd with an unwavering resolve that freedom will always triumph over tyranny.\n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated.\n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined.\n\nHe met the Ukrainian people.\n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determin

In [6]:
# Splitter settings: chunk size of 1000 with an overlap of 100 between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)

In [7]:
# Break the loaded documents into smaller chunks with the splitter; the result is what gets embedded.
splitted_docs = splitter.split_documents(documents)

In [8]:
# Total number of chunks produced by the split.
len(splitted_docs)

139

In [9]:
import os
from dotenv import load_dotenv

In [10]:
# Load variables from a local .env file into the process environment.
# presumably this supplies the OpenAI API key used by ChatOpenAI later on — confirm
load_dotenv()

True

In [11]:
from langchain_huggingface import HuggingFaceEmbeddings

In [12]:
# Sentence-transformers model used to embed the document chunks.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [13]:
from langchain_chroma import Chroma

In [14]:
# Embed every chunk and index it in the "multi_type_doc" Chroma collection.
vectorstore = Chroma.from_documents(
    splitted_docs,
    embedding,
    collection_name="multi_type_doc",
)

In [15]:
# Retriever over the vector store that returns the 3 best-matching chunks per query.
retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=3),
)

In [17]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [18]:
# Prompt for the RAG chain. It has two placeholders: {question} (the user's query) and
# {context} (the retrieved material). The wording fixes two typos from the earlier text
# ("the the question", "makeup an answer") so the model gets clean instructions.
template = """You are a helpful assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use ten sentences maximum to keep your answer concise.

Question: {question}
Context: {context}
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)

In [19]:
# Chat model that generates the final answer.
llm = ChatOpenAI(model="gpt-3.5-turbo")

In [20]:
# Parser placed at the end of the chain so the result is a plain string.
# NOTE(review): the name is a misspelling of "output_parser"; it is kept as-is because
# the chain definition refers to it by this name.
output_perser = StrOutputParser()

In [21]:
def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of retrieved Document objects (each exposing ``page_content``).

    Returns:
        The documents' ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: the incoming question is sent to the retriever, the retrieved chunks are
# flattened to plain text, and the question passes through unchanged; both fill the prompt,
# which is answered by the chat model and parsed to a string.
# Without the formatting step the prompt's {context} would receive the repr of a list of
# Document objects (metadata and escaped newlines included) rather than clean text.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | output_perser
)

In [22]:
# Ask the RAG chain a question; the cell displays the answer string it returns.
rag_chain.invoke("Why is Dispersion Important?")

'Dispersion is important because it helps identify variability in the data, allowing for a deeper understanding of how spread out or scattered the data values are around a central value. It helps compare consistency across datasets, indicating which dataset is more stable or reliable. In business and science, choosing the option with less variability is preferred for predictability and control. Dispersion is useful for risk analysis in various fields like business, finance, and healthcare as it helps measure risk or uncertainty, with higher variability usually signifying greater risk. Organizations rely on dispersion to make informed decisions, manage risks, and allocate resources effectively. Businesses prefer suppliers with lower dispersion for consistent performance, highlighting the practical importance of understanding dispersion. In finance, investors use standard deviation to assess risk, with variations in returns indicating the level of volatility in investments. Essentially, 

In [24]:
# Second query, on a topic that appears in the loaded state_of_the_union.txt document.
rag_chain.invoke("Zelenskyy is the president of which country?")

'Zelenskyy is the president of Ukraine.'