# DATA Loading from Different File Types, Creating Embeddings, and Storing in a Vector Database

This notebook demonstrates how to load data from various file types, generate embeddings, and store them in a vector database for efficient retrieval and search.

In [57]:
import os
from langchain_community.document_loaders import TextLoader, PyPDFLoader, WebBaseLoader, CSVLoader
from langchain.embeddings import HuggingFaceEmbeddings



In [36]:
text_loader = TextLoader(file_path="d:/Langchain Tutorial/data retrival/langchain.txt")
pdf_loader = PyPDFLoader(file_path="d:/Langchain Tutorial/data retrival/Medical_book.pdf")
web_loader = WebBaseLoader(web_path="https://langchain-ai.github.io/langgraph/concepts/why-langgraph/")
csv_loader = CSVLoader(file_path="d:/Langchain Tutorial/data retrival/sample_financial_data.csv")


print("Document loaders created successfully!")
print("Files being loaded:")

Document loaders created successfully!
Files being loaded:


In [44]:
text_loader = TextLoader(file_path="d:/Langchain Tutorial/data retrival/langchain.txt")
text_docs = text_loader.load()

text_docs

[Document(metadata={'source': 'd:/Langchain Tutorial/data retrival/langchain.txt'}, page_content='LangChain is an open-source framework designed to help developers build applications using Large Language Models (LLMs) like GPT-4. It provides tools and abstractions that make it easier to integrate LLMs with:\n\nExternal data sources (like databases, PDFs, web pages, or APIs)\n\nReasoning and memory (handling multi-step tasks or tracking conversation history)\n\nTool use (such as calling APIs or executing code)\n\nAgents that can make decisions dynamically based on LLM outputs\n\nðŸ”§ Core Features of LangChain\nChains\nCombine multiple steps (e.g., prompt -> LLM -> output formatting) into a sequence.\n\nAgents\nUse LLMs to make decisions and choose which tools to call (e.g., a calculator, search engine).\n\nTools & Integrations\nPlug in external tools like:\n\nGoogle Search\n\nSQL databases\n\nPython REPL\n\nWolframAlpha\n\nZapier, etc.\n\nMemory\nStore and use conversation history, ena

In [None]:
pdf_loader = PyPDFLoader(file_path="d:/Langchain Tutorial/data retrival/Medical_book.pdf")

pdf_docs = pdf_loader.load()
pdf_docs

In [38]:
import bs4
web_loader = WebBaseLoader(web_path="https://langchain-ai.github.io/langgraph/concepts/why-langgraph/")

web_docs = web_loader.load()
web_docs

[Document(metadata={'source': 'https://langchain-ai.github.io/langgraph/concepts/why-langgraph/', 'title': 'Overview', 'description': 'Build reliable, stateful AI systems, without giving up control', 'language': 'en'}, page_content="\n\n\n\n\n\n\n\n\n\n\n\nOverview\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n          Skip to content\n        \n\n\n\n\n\n\n\nWe are growing and hiring for multiple roles for LangChain, LangGraph and LangSmith.  Join our team!\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n            LangGraph\n          \n\n\n\n            \n              Overview\n            \n          \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n            Initializing search\n          \n\n\n\n\n\n\n\n\n\n\n\n\n    GitHub\n  \n\n\n\n\n\n\n\n\n\n\n          \n  \n  \n    \n  \n  Get started\n\n        \n\n\n\n          \n  \n  \n    \n  \n  Guides\n\n        \n\n\n\n          \n  \n  \n    \n  \n  Reference\n\n        \n\n\n\n          \n  \n  \n

In [None]:
csv_loader = CSVLoader(file_path="d:/Langchain Tutorial/data retrival/sample_financial_data.csv")

csv_docs = csv_loader.load()
csv_docs


682

In [None]:
final_docs = text_docs + csv_docs + pdf_docs + web_docs
len(final_docs)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=100)

chunks_document = text_splitter.split_documents(final_docs)

print('No of chunks:', len(chunks_document))
type(chunks_document)

No of chunks: 6


list

In [88]:
# from langchain.embeddings import HuggingFaceEmbeddings
# def create_embedding():
#     embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#     return embedding

# embedding = create_embedding()
# embedding

In [92]:
from langchain_ollama import OllamaEmbeddings

embed = OllamaEmbeddings(
    model="nomic-embed-text:latest"
)

In [99]:

from langchain_community.vectorstores import Chroma

db=Chroma.from_documents(chunks_document,embed) 

In [100]:
db

<langchain_community.vectorstores.chroma.Chroma at 0x232f637ffe0>

In [104]:
query="india"
result=db.similarity_search(query=query)
result


[Document(metadata={'source': 'd:/Langchain Tutorial/data retrival/langchain.txt'}, page_content="Frameworks: Streamlit, FastAPI, Gradio\n\nOrchestration tools: Airflow, LangServe, LangSmith\n\nIf you're planning to build a production-ready AI app, LangChain gives you the building blocks to do it effectively.\n\nWant a demo or example for a specific use case?"),
 Document(metadata={'source': 'd:/Langchain Tutorial/data retrival/langchain.txt'}, page_content='Agents that can make decisions dynamically based on LLM outputs\n\nðŸ”§ Core Features of LangChain\nChains\nCombine multiple steps (e.g., prompt -> LLM -> output formatting) into a sequence.\n\nAgents\nUse LLMs to make decisions and choose which tools to call (e.g., a calculator, search engine).\n\nTools & Integrations\nPlug in external tools like:\n\nGoogle Search\n\nSQL databases\n\nPython REPL\n\nWolframAlpha\n\nZapier, etc.\n\nMemory\nStore and use conversation history, enabling stateful chatbots.'),
 Document(metadata={'source

In [106]:
retriever = db.as_retriever(
    search_type="mmr", search_kwargs={"k": 1, "fetch_k": 5}
)
result1 =retriever.invoke("what is langchain")
result1

[Document(metadata={'source': 'd:/Langchain Tutorial/data retrival/langchain.txt'}, page_content='LangChain is an open-source framework designed to help developers build applications using Large Language Models (LLMs) like GPT-4. It provides tools and abstractions that make it easier to integrate LLMs with:\n\nExternal data sources (like databases, PDFs, web pages, or APIs)\n\nReasoning and memory (handling multi-step tasks or tracking conversation history)\n\nTool use (such as calling APIs or executing code)\n\nAgents that can make decisions dynamically based on LLM outputs')]