In [1]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from pydantic import BaseModel

In [2]:
ollama_model = ChatOllama(
    base_url='http://127.0.0.1:11434',
    model="deepseek-r1:1.5b"
)

ollama_model.invoke("Hi!")

AIMessage(content='<think>\n\n</think>\n\nHello! How can I assist you today? 😊', additional_kwargs={}, response_metadata={'model': 'deepseek-r1:1.5b', 'created_at': '2025-07-30T12:29:43.3359255Z', 'done': True, 'done_reason': 'stop', 'total_duration': 4609569300, 'load_duration': 3474870100, 'prompt_eval_count': 5, 'prompt_eval_duration': 147768600, 'eval_count': 16, 'eval_duration': 978133700, 'model_name': 'deepseek-r1:1.5b'}, id='run--ee466577-ece3-495c-bd32-efc72a2d092b-0', usage_metadata={'input_tokens': 5, 'output_tokens': 16, 'total_tokens': 21})

Indexing and retrieving

### Indexing, Ingestion, Embeddings and VectorStore

## Embeddings

- Bag of Words
- LLM-Based

In [3]:
#Load
from langchain_community.document_loaders import TextLoader
path = '../data/test.txt'

loader = TextLoader(path)

docs = loader.load()
docs

[Document(metadata={'source': '../data/test.txt'}, page_content='Test text\n\nAnd Chunks just in case. Fatal\n\nOr not\n\nGreat President and friend')]

In [4]:
from langchain_community.document_loaders import WebBaseLoader
#
#loader = WebBaseLoader("https://www.langchain.com/")
#docs = loader.load()
#
#docs

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [5]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader('../data/test.pdf')

pages = loader.load()

print(pages)


[Document(metadata={'producer': 'MiKTeX-dvipdfmx (20220710)', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-07-10T17:14:12-04:00', 'source': '../data/test.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='Alejandro Cespón\nARTiFiCiALINTELLiGENCERESEARCHER · SOFTWAREMiDDEVELOPER\nSanta Clara, Villa Clara, Cuba\nć cesponalejandro@gmail.com | ^ acferriol | ] alejandro-cespon-b36771209 | Ȉ 0000-0002-8584-6958 | Ƹ Alejandro-Cespon-Ferriol | Ǒ alejandrocespon\n| ŵ Alejandro Cespón Ferriol | 24 years\n“Lifeisthis...,Ilikethis”\nHarveySpecter\nEducation\nGranadaUniversity Granada,Spain\nPHDONCOMPUTERSCiENCE Nov. 2024‑Present\n• DoctoralProgrammeinInformationandCommunicationTechnologies(B25/56/1).\nUCLV(CentralUniversity”MartaAbreu”ofLasVillas) VillaClara,Cuba\nMASTERONCOMPUTERSCiENCE Dec. 2023‑Sept. 2024\n• Studyingfromundergraduatebyaspecialtrainingplan\nUCLV(CentralUniversity”MartaAbreu”ofLasVillas) VillaClara,Cuba\nB.S.ONCOMPUTERSCiENCE Sep. 2019‑Dec. 2023\n• Go

### Chunks

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = TextLoader("../data/test.txt") 
docs = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=10,
    chunk_overlap=2,
)
splitted_docs = splitter.split_documents(docs)

print(len(splitted_docs))

11


In [7]:
splitted_docs

[Document(metadata={'source': '../data/test.txt'}, page_content='Test text'),
 Document(metadata={'source': '../data/test.txt'}, page_content='And'),
 Document(metadata={'source': '../data/test.txt'}, page_content='Chunks'),
 Document(metadata={'source': '../data/test.txt'}, page_content='just in'),
 Document(metadata={'source': '../data/test.txt'}, page_content='case.'),
 Document(metadata={'source': '../data/test.txt'}, page_content='Fatal'),
 Document(metadata={'source': '../data/test.txt'}, page_content='Or not'),
 Document(metadata={'source': '../data/test.txt'}, page_content='Great'),
 Document(metadata={'source': '../data/test.txt'}, page_content='President'),
 Document(metadata={'source': '../data/test.txt'}, page_content='and'),
 Document(metadata={'source': '../data/test.txt'}, page_content='friend')]

In [8]:
### Programming Language
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""
python_splitter = RecursiveCharacterTextSplitter.from_language( #From language for Languages
    language=Language.PYTHON, chunk_size=50, chunk_overlap=0
)
python_docs = python_splitter.create_documents([PYTHON_CODE]) #Create documents for str(not docs)

print(python_docs)

[Document(metadata={}, page_content='def hello_world():\n    print("Hello, World!")'), Document(metadata={}, page_content='# Call the function\nhello_world()')]


In [9]:
markdown_text = """
# LangChain

⚡ Building applications with LLMs through composability ⚡

## Quick Install

```bash
pip install langchain
```

As an open source project in a rapidly developing field, we are extremely open 
    to contributions.
"""

md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, chunk_size=60, chunk_overlap=0
)
md_docs = md_splitter.create_documents(texts = [markdown_text], #List of Texts
    metadatas=[{"source": "https://www.langchain.com"}]) #List of metadata

print(md_docs)

[Document(metadata={'source': 'https://www.langchain.com'}, page_content='# LangChain'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='⚡ Building applications with LLMs through composability ⚡'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='## Quick Install\n\n```bash\npip install langchain'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='```'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='As an open source project in a rapidly developing field, we'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='are extremely open'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='to contributions.')]


### Generate Embeddings

In [10]:
from langchain_ollama import OllamaEmbeddings

model = OllamaEmbeddings(
    base_url='http://127.0.0.1:11434',
    model="deepseek-r1:1.5b"
)

In [11]:
embeddings = model.embed_documents([
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!"
])

In [12]:
len(embeddings)

5

In [13]:
## Complete pipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings

pdfloader = PyPDFLoader('../data/test.pdf')
doc = pdfloader.load()

In [14]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splitted_docs = splitter.split_documents(doc)

splitted_docs

[Document(metadata={'producer': 'MiKTeX-dvipdfmx (20220710)', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-07-10T17:14:12-04:00', 'source': '../data/test.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='Alejandro Cespón\nARTiFiCiALINTELLiGENCERESEARCHER · SOFTWAREMiDDEVELOPER\nSanta Clara, Villa Clara, Cuba\nć cesponalejandro@gmail.com | ^ acferriol | ] alejandro-cespon-b36771209 | Ȉ 0000-0002-8584-6958 | Ƹ Alejandro-Cespon-Ferriol | Ǒ alejandrocespon\n| ŵ Alejandro Cespón Ferriol | 24 years\n“Lifeisthis...,Ilikethis”\nHarveySpecter\nEducation\nGranadaUniversity Granada,Spain\nPHDONCOMPUTERSCiENCE Nov. 2024‑Present\n• DoctoralProgrammeinInformationandCommunicationTechnologies(B25/56/1).\nUCLV(CentralUniversity”MartaAbreu”ofLasVillas) VillaClara,Cuba\nMASTERONCOMPUTERSCiENCE Dec. 2023‑Sept. 2024\n• Studyingfromundergraduatebyaspecialtrainingplan\nUCLV(CentralUniversity”MartaAbreu”ofLasVillas) VillaClara,Cuba\nB.S.ONCOMPUTERSCiENCE Sep. 2019‑Dec. 2023\n• Go

In [15]:
embedding_model = OllamaEmbeddings(
    base_url='http://127.0.0.1:11434',
    model="deepseek-r1:1.5b"
)

In [16]:
embeddings = embedding_model.embed_documents([chunk.page_content for chunk in splitted_docs])
#embeddings

In [17]:
len(embeddings)

10

### Vector Store (Using opensearch)

In [22]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain_core.documents import Document
import uuid
import os

# Load the document, split it into chunks
raw_documents = TextLoader('../data/test.txt').load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10, 
    chunk_overlap=2)
documents = text_splitter.split_documents(raw_documents)

# embed each chunk and insert it into the vector store
embeddings = OllamaEmbeddings(
    base_url="http://127.0.0.1:11434", model="deepseek-r1:1.5b"
)

os_key = os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD")

docsearch = OpenSearchVectorSearch.from_documents(
    docs,
    embeddings,
    opensearch_url="https://localhost:9200",
    http_auth=("admin", os_key),
    use_ssl=False,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
    engine="faiss",
)

docsearch

<langchain_community.vectorstores.opensearch_vector_search.OpenSearchVectorSearch at 0x24291662fb0>

### Search

In [23]:
results = docsearch.similarity_search("query", k=4)
results

[Document(id='2a385fa7-c0b8-4e4e-8efe-00e0b5d7eb48', metadata={'id': 1, 'source': 'cats.txt'}, page_content='there are cats in the pond'),
 Document(id='b0297389-6ab4-4b75-930a-7611941ab29f', metadata={'id': 2, 'source': 'ducks.txt'}, page_content='ducks are also found in the pond')]

### Add docs

In [24]:
ids = [str(uuid.uuid4()), str(uuid.uuid4())]
docsearch.add_documents(
    [
        Document(
            page_content="there are cats in the pond",
            metadata={"location": "pond", "topic": "animals"},
        ),
        Document(
            page_content="ducks are also found in the pond",
            metadata={"location": "pond", "topic": "animals"},
        ),
    ],
    ids=ids,
)

['d585f69b-4f10-409f-a02b-5252101f05db',
 '8869f9e4-1b30-412c-8dc9-1c881f3f6fe3']

### Delete docs

In [25]:
docsearch.delete(ids=[0])

True

## Tracking changes (Step by step with opensearch)

In [26]:
from langchain.indexes import SQLRecordManager, index
from langchain_core.documents import Document
from langchain_elasticsearch import ElasticsearchStore

In [27]:
collection_name = "test_index"

vectorstore = OpenSearchVectorSearch(
    opensearch_url="https://localhost:9200",
    embedding_function=embeddings,
    index_name=collection_name,
    http_auth=("admin", os_key),
    use_ssl=False,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
    engine="faiss",
)

In [28]:
namespace = f"elasticsearch/{collection_name}"
record_manager = SQLRecordManager(
    namespace, db_url="sqlite:///record_manager_cache.sql"
)

In [29]:
record_manager.create_schema()

In [30]:
doc1 = Document(page_content="kitty", metadata={"source": "kitty.txt"})
doc2 = Document(page_content="doggy", metadata={"source": "doggy.txt"})

In [31]:
def _clear():
    """Hacky helper method to clear content. See the `full` mode section to to understand why it works."""
    index([], record_manager, vectorstore, cleanup="full", source_id_key="source")

In [32]:
_clear()

  _warn_about_sha1()


In [33]:
index(
    [doc1, doc1, doc1, doc1, doc1],
    record_manager,
    vectorstore,
    cleanup=None,
    source_id_key="source",
)

{'num_added': 1, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [34]:
index([doc1, doc2], record_manager, vectorstore, cleanup=None, source_id_key="source")

{'num_added': 1, 'num_updated': 0, 'num_skipped': 1, 'num_deleted': 0}

In [35]:
docs = [
    Document(page_content='there are cats in the pond',
        metadata={"id": 1, "source": "cats.txt"}),
    Document(page_content='ducks are also found in the pond',
        metadata={"id": 2, "source": "ducks.txt"}),
]

In [36]:
index_1 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source"
)

print("Index 1: ", index_1)

Index 1:  {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}


In [37]:
index_2 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source"
)

print("Index 2: ", index_2)

Index 2:  {'num_added': 0, 'num_updated': 0, 'num_skipped': 2, 'num_deleted': 0}


In [38]:
## Change document
docs[0].page_content = "Modifications"

index_3 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source"
)

print("Index 3: ", index_3)

Index 3:  {'num_added': 1, 'num_updated': 0, 'num_skipped': 1, 'num_deleted': 1}


>  First, you create a record manager, which keeps track of which documents have been indexed before. Then you use the index function to synchronize your vector store with the new list of documents. In this example, we’re using the incremental mode, so any documents that have the same ID as previous ones will be replaced with the new version.