# Chapter 2: RAG Part I: Indexing Your Data

### Converting Your Documents into Text

In [1]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader('./test.txt', encoding="utf-8")
docs = loader.load()

print(docs)



In [2]:
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader('https://www.langchain.com/')
docs = loader.load()

print(docs)

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://www.langchain.com/', 'title': 'LangChain', 'description': 'LangChain’s suite of products supports developers along each step of their development journey.', 'language': 'en'}, page_content="LangChain\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nProducts\n\nFrameworksLangGraphLangChainPlatformsLangSmithLangGraph PlatformResources\n\nGuidesBlogCustomer StoriesLangChain AcademyCommunityEventsChangelogDocs\n\nPythonLangGraphLangSmithLangChainJavaScriptLangGraphLangSmithLangChainCompany\n\nAboutCareersPricingGet a demoSign up\n\n\n\n\n\n\n\n\n\n\n\n\nProducts\n\nFrameworksLangGraphLangChainPlatformsLangSmithLangGraph PlatformResources\n\nGuidesBlogCustomer StoriesLangChain AcademyCommunityEventsChangelogDocs\n\nPythonLangGraphLangSmithLangChainJavaScriptLangGraphLangSmithLangChainCompany\n\nAboutCareersPricingGet a demoSign upThe platform for reliable agents. Tools for every step of the agent development lifecycle -- built to unlock powerful AI\xa0in produ

In [4]:
# install the pdf parsing library !pip install pypdf

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader('./test.pdf')
pages = loader.load()

print(pages)

[Document(metadata={'producer': 'macOS Version 13.4 (Build 22F66) Quartz PDFContext', 'creator': 'Pages', 'creationdate': "D:20250125061027Z00'00'", 'title': 'ancient_greece', 'moddate': "D:20250125061027Z00'00'", 'source': './test.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Life in ancient Greece was centered around the polis, or city-state, which served as the heart of \nsocial, political and economic activity. Citizens would gather in the agora - the central \nmarketplace and meeting place - to trade goods, discuss politics, and engage in philosophical \ndebates. The streets would buzz with activity as merchants sold their wares, from fresh ﬁsh and \nolive oil to handcrafted pottery and textiles.\nEducation was highly valued, though primarily available to male citizens. Young boys would \nattend schools to study reading, writing, mathematics, music, and physical education. The \ngymnasium was an important institution where men would exercise, compete in athl

### Splitting Your Text into Chunks

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.document_loaders import TextLoader

loader = TextLoader('./test.txt', encoding="utf-8")
docs = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splitted_docs = splitter.split_documents(docs)

print(splitted_docs)



In [6]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

PYTHON_CODE = """ def hello_world(): print("Hello, World!") # Call the function hello_world() """

python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=50, chunk_overlap=0
)

python_docs = python_splitter.create_documents([PYTHON_CODE])

print(python_docs)

[Document(metadata={}, page_content='def hello_world(): print("Hello, World!") # Call'), Document(metadata={}, page_content='the function hello_world()')]


In [7]:
markdown_text = """ # 🦜🔗 LangChain ⚡ Building applications with LLMs through composability ⚡ ## Quick Install ```bash pip install langchain ``` As an open source project in a rapidly developing field, we are extremely open     to contributions. """

md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, chunk_size=60, chunk_overlap=0
)

md_docs = md_splitter.create_documents(
    [markdown_text], [{"source": "https://www.langchain.com"}])

print(md_docs)

[Document(metadata={'source': 'https://www.langchain.com'}, page_content='# 🦜🔗 LangChain ⚡ Building applications with LLMs through'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='composability ⚡ ## Quick Install ```bash pip install'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='langchain ``` As an open source project in a rapidly'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='developing field, we are extremely open     to'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='contributions.')]


### Generating Text Embeddings 

In [8]:
from langchain_openai import OpenAIEmbeddings

model = OpenAIEmbeddings(model="text-embedding-3-small")

embeddings = model.embed_documents([
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!"
])

print(embeddings)

[[-0.019119014963507652, -0.03810252994298935, -0.030893942341208458, -0.004644253756850958, -0.035392533987760544, -0.004007405135780573, 0.01292667631059885, 0.051056306809186935, -0.005826489068567753, -0.037208229303359985, -0.010765454731881618, -0.0022188082803040743, 0.02727609872817993, -0.002250989666208625, 0.005911176558583975, 0.03398333862423897, -0.01649031974375248, -0.01016925647854805, -0.03176113963127136, 0.07658445835113525, 0.05989088863134384, -0.018726065754890442, 0.0029674444813281298, 0.018969964236021042, 0.039755627512931824, 0.04574471339583397, 0.020866962149739265, 0.00654802517965436, 0.01336705032736063, -0.004800078459084034, 0.0298912450671196, -0.022330358624458313, 0.006957911886274815, -0.024024104699492455, -0.015013372525572777, -0.0035704183392226696, -0.007574435789138079, 0.018143415451049805, -0.009640807285904884, -0.04249272122979164, 0.01214077789336443, -0.01869896613061428, 0.022316807880997658, -0.0016319251153618097, -0.030758442357182

### Load - Split - Embed

In [18]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

# Load the document
loader = TextLoader("./test.txt", encoding="utf-8")
doc = loader.load()

# Split the document
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(doc)

# Generate embeddings
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
embeddings = embeddings_model.embed_documents(
    [chunk.page_content for chunk in chunks]
)

### Working with Vector Stores

In [22]:
"""
1. Ensure docker is installed and running (https://docs.docker.com/get-docker/)
2. pip install -qU langchain_postgres
3. Run the following command to start the postgres container:
   
docker run \
    --name pgvector-container \
    -e POSTGRES_USER=langchain \
    -e POSTGRES_PASSWORD=langchain \
    -e POSTGRES_DB=langchain \
    -p 6024:5432 \
    -d pgvector/pgvector:pg16
4. Use the connection string below for the postgres container

"""

from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_postgres.vectorstores import PGVector
from langchain_core.documents import Document
import uuid


# See docker command above to launch a postgres instance with pgvector enabled.
connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"

# Load the document, split it into chunks
raw_documents = TextLoader('./test.txt', encoding="utf-8").load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(raw_documents)

# Create embeddings for the documents
embeddings_model = OpenAIEmbeddings()

db = PGVector.from_documents(
    documents, embeddings_model, connection=connection)

results = db.similarity_search("query", k=4)

print(results)

print("Adding documents to the vector store")
ids = [str(uuid.uuid4()), str(uuid.uuid4())]
db.add_documents(
    [
        Document(
            page_content="there are cats in the pond",
            metadata={"location": "pond", "topic": "animals"},
        ),
        Document(
            page_content="ducks are also found in the pond",
            metadata={"location": "pond", "topic": "animals"},
        ),
    ],
    ids=ids,
)

print("Documents added successfully.\n Fetched documents count:",
      len(db.get_by_ids(ids)))

print("Deleting document with id", ids[1])
db.delete({"ids": ids})

print("Document deleted successfully.\n Fetched documents count:",
      len(db.get_by_ids(ids)))

[Document(id='8469c217-1a57-4de3-89ac-f0cc88ff7d05', metadata={'source': './test.txt'}, page_content='V.'), Document(id='841168aa-0e4d-4b3a-a1b3-4b8c60d6e27b', metadata={'source': './test.txt'}, page_content='V.'), Document(id='a7855e59-9235-47af-b716-7d3cb89fcdaf', metadata={'source': './test.txt'}, page_content='II.'), Document(id='38cfc76d-a9f3-4a82-831b-b1afbb303577', metadata={'source': './test.txt'}, page_content='II.')]
Adding documents to the vector store
Documents added successfully.
 Fetched documents count: 2
Deleting document with id d29178e6-d1a3-41ef-95f7-474efc829ec1
Document deleted successfully.
 Fetched documents count: 2


### Tracking Changes to Your Documents

In [1]:
from langchain.indexes import SQLRecordManager, index
from langchain_postgres.vectorstores import PGVector
from langchain_openai import OpenAIEmbeddings
from langchain.docstore.document import Document

connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"
collection_name = "my_docs"
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
namespace = "my_docs_namespace"

vectorstore = PGVector(
    embeddings=embeddings_model,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

record_manager = SQLRecordManager(
    namespace,
    db_url="postgresql+psycopg://langchain:langchain@localhost:6024/langchain",
)

# Create the schema if it doesn't exist
record_manager.create_schema()

# Create documents
docs = [
    Document(page_content='there are cats in the pond', metadata={
             "id": 1, "source": "cats.txt"}),
    Document(page_content='ducks are also found in the pond', metadata={
             "id": 2, "source": "ducks.txt"}),
]

# Index the documents
index_1 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",  # prevent duplicate documents
    source_id_key="source",  # use the source field as the source_id
)

print("Index attempt 1:", index_1)

# second time you attempt to index, it will not add the documents again
index_2 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

print("Index attempt 2:", index_2)

# If we mutate a document, the new version will be written and all old versions sharing the same source will be deleted.

docs[0].page_content = "I just modified this document!"

index_3 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

print("Index attempt 3:", index_3)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable