In [2]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from pydantic import BaseModel

In [3]:
# Chat client bound to the Ollama server at 127.0.0.1:11434 and the deepseek-r1:1.5b model.
ollama_model = ChatOllama(model="deepseek-r1:1.5b", base_url='http://127.0.0.1:11434')

# Smoke test: send a short greeting; the returned message is shown as the cell output.
ollama_model.invoke("Hi!")

AIMessage(content='<think>\n\n</think>\n\nHello! How can I assist you today? 😊', additional_kwargs={}, response_metadata={'model': 'deepseek-r1:1.5b', 'created_at': '2025-07-25T00:47:09.883307Z', 'done': True, 'done_reason': 'stop', 'total_duration': 4744910400, 'load_duration': 3473493700, 'prompt_eval_count': 5, 'prompt_eval_duration': 167907300, 'eval_count': 16, 'eval_duration': 1099525700, 'model_name': 'deepseek-r1:1.5b'}, id='run--46fd7899-b9cf-4f1a-afb7-6c58af4c8e8d-0', usage_metadata={'input_tokens': 5, 'output_tokens': 16, 'total_tokens': 21})

# Indexing and Retrieval

### Indexing, Ingestion, Embeddings and VectorStore

## Embeddings

- Bag of Words
- LLM-Based

In [4]:
# Load a plain-text file into LangChain documents.
from langchain_community.document_loaders import TextLoader

path = '../data/test.txt'
loader = TextLoader(path)
docs = loader.load()

# Show the loaded documents as the cell output.
docs

[Document(metadata={'source': '../data/test.txt'}, page_content='Test text')]

In [5]:
from langchain_community.document_loaders import WebBaseLoader

# Load the LangChain home page as documents.
# NOTE: this rebinds `loader` and `docs`, which the text-file cell also assigns.
loader = WebBaseLoader("https://www.langchain.com/")
docs = loader.load()
docs

[Document(metadata={'source': 'https://www.langchain.com/', 'title': 'LangChain', 'description': 'LangChain’s suite of products supports developers along each step of their development journey.', 'language': 'en'}, page_content="LangChain\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nProducts\n\nFrameworksLangGraphLangChainPlatformsLangSmithLangGraph PlatformResources\n\nResources HubBlogCustomer StoriesLangChain AcademyCommunityExpertsChangelogDocs\n\nPythonLangGraphLangSmithLangChainJavaScriptLangGraphLangSmithLangChainCompany\n\nAboutCareersPricingGet a demoSign up\n\n\n\n\n\n\n\n\n\n\n\n\nProducts\n\nFrameworksLangGraphLangChainPlatformsLangSmithLangGraph PlatformResources\n\nResources HubBlogCustomer StoriesLangChain AcademyCommunityExpertsChangelogDocs\n\nPythonLangGraphLangSmithLangChainJavaScriptLangGraphLangSmithLangChainCompany\n\nAboutCareersPricingGet a demoSign upThe platform for reliable agents. Tools for every step of the agent development lifecycle -- built to unlock powerfu

In [6]:
from langchain_community.document_loaders import PyPDFLoader

# Load the sample PDF; each returned document carries PDF metadata such as `page`.
loader = PyPDFLoader('../data/test.pdf')
pages = loader.load()
print(pages)


[Document(metadata={'producer': 'MiKTeX-dvipdfmx (20220710)', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-07-10T17:14:12-04:00', 'source': '../data/test.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='Alejandro Cespón\nARTiFiCiALINTELLiGENCERESEARCHER · SOFTWAREMiDDEVELOPER\nSanta Clara, Villa Clara, Cuba\nć cesponalejandro@gmail.com | ^ acferriol | ] alejandro-cespon-b36771209 | Ȉ 0000-0002-8584-6958 | Ƹ Alejandro-Cespon-Ferriol | Ǒ alejandrocespon\n| ŵ Alejandro Cespón Ferriol | 24 years\n“Lifeisthis...,Ilikethis”\nHarveySpecter\nEducation\nGranadaUniversity Granada,Spain\nPHDONCOMPUTERSCiENCE Nov. 2024‑Present\n• DoctoralProgrammeinInformationandCommunicationTechnologies(B25/56/1).\nUCLV(CentralUniversity”MartaAbreu”ofLasVillas) VillaClara,Cuba\nMASTERONCOMPUTERSCiENCE Dec. 2023‑Sept. 2024\n• Studyingfromundergraduatebyaspecialtrainingplan\nUCLV(CentralUniversity”MartaAbreu”ofLasVillas) VillaClara,Cuba\nB.S.ONCOMPUTERSCiENCE Sep. 2019‑Dec. 2023\n• Go

### Chunks

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Reload the text file so `docs` holds its contents again.
# `TextLoader` is imported in the earlier text-loading cell.
loader = TextLoader("../data/test.txt")
docs = loader.load()

# Very small chunk size and overlap, which keeps the individual chunks easy to inspect.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=2, chunk_size=10)
splitted_docs = splitter.split_documents(docs)

# Report how many chunks were produced.
print(len(splitted_docs))

7


In [11]:
# Inspect the individual chunks (text and metadata) produced by the splitter.
splitted_docs

[Document(metadata={'source': '../data/test.txt'}, page_content='Test text'),
 Document(metadata={'source': '../data/test.txt'}, page_content='And'),
 Document(metadata={'source': '../data/test.txt'}, page_content='Chunks'),
 Document(metadata={'source': '../data/test.txt'}, page_content='just in'),
 Document(metadata={'source': '../data/test.txt'}, page_content='case.'),
 Document(metadata={'source': '../data/test.txt'}, page_content='Fatal'),
 Document(metadata={'source': '../data/test.txt'}, page_content='Or not')]

In [None]:
### Splitting source code by programming language
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""

# `from_language` builds a splitter for the given language (Python here).
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=50,
    chunk_overlap=0,
)

# `create_documents` takes raw strings, whereas `split_documents` takes already-loaded documents.
python_docs = python_splitter.create_documents([PYTHON_CODE])

print(python_docs)

[Document(metadata={}, page_content='def hello_world():\n    print("Hello, World!")'), Document(metadata={}, page_content='# Call the function\nhello_world()')]


In [14]:
markdown_text = """
# LangChain

⚡ Building applications with LLMs through composability ⚡

## Quick Install

```bash
pip install langchain
```

As an open source project in a rapidly developing field, we are extremely open 
    to contributions.
"""

# Markdown-aware splitter with chunk_size=60 and no overlap.
md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN,
    chunk_size=60,
    chunk_overlap=0,
)

# `texts` and `metadatas` are parallel lists: one metadata dict per input text.
md_docs = md_splitter.create_documents(
    texts=[markdown_text],
    metadatas=[{"source": "https://www.langchain.com"}],
)

print(md_docs)

[Document(metadata={'source': 'https://www.langchain.com'}, page_content='# LangChain'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='⚡ Building applications with LLMs through composability ⚡'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='## Quick Install\n\n```bash\npip install langchain'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='```'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='As an open source project in a rapidly developing field, we'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='are extremely open'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='to contributions.')]


### Generate Embeddings

In [15]:
from langchain_ollama import OllamaEmbeddings

# Embedding client pointed at the same Ollama server and model name as the chat client.
# NOTE(review): deepseek-r1:1.5b is the model used for chat earlier in this notebook; a
# dedicated embedding model may be a better fit for retrieval — confirm before relying on it.
model = OllamaEmbeddings(model="deepseek-r1:1.5b", base_url='http://127.0.0.1:11434')

In [16]:
# Embed five short sample sentences in a single call.
embeddings = model.embed_documents(
    ["Hi there!", "Oh, hello!", "What's your name?", "My friends call me World", "Hello World!"]
)

In [18]:
# Number of embedding vectors returned for the sample sentences.
len(embeddings)

5

In [19]:
## Complete pipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings

pdfloader = PyPDFLoader('../data/test.pdf')
doc = pdfloader.load()

In [21]:
# Split the loaded PDF documents into overlapping chunks.
# NOTE: `splitter` and `splitted_docs` are also assigned by the earlier text-file demo;
# running this cell rebinds both names.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
splitted_docs = splitter.split_documents(doc)

# Show the resulting chunks as the cell output.
splitted_docs

[Document(metadata={'producer': 'MiKTeX-dvipdfmx (20220710)', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-07-10T17:14:12-04:00', 'source': '../data/test.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='Alejandro Cespón\nARTiFiCiALINTELLiGENCERESEARCHER · SOFTWAREMiDDEVELOPER\nSanta Clara, Villa Clara, Cuba\nć cesponalejandro@gmail.com | ^ acferriol | ] alejandro-cespon-b36771209 | Ȉ 0000-0002-8584-6958 | Ƹ Alejandro-Cespon-Ferriol | Ǒ alejandrocespon\n| ŵ Alejandro Cespón Ferriol | 24 years\n“Lifeisthis...,Ilikethis”\nHarveySpecter\nEducation\nGranadaUniversity Granada,Spain\nPHDONCOMPUTERSCiENCE Nov. 2024‑Present\n• DoctoralProgrammeinInformationandCommunicationTechnologies(B25/56/1).\nUCLV(CentralUniversity”MartaAbreu”ofLasVillas) VillaClara,Cuba\nMASTERONCOMPUTERSCiENCE Dec. 2023‑Sept. 2024\n• Studyingfromundergraduatebyaspecialtrainingplan\nUCLV(CentralUniversity”MartaAbreu”ofLasVillas) VillaClara,Cuba\nB.S.ONCOMPUTERSCiENCE Sep. 2019‑Dec. 2023\n• Go

In [22]:
# Embedding client for the pipeline; same server and model name as the chat client.
# NOTE(review): deepseek-r1:1.5b is used for chat elsewhere in this notebook — confirm it is
# the intended model for embeddings.
embedding_model = OllamaEmbeddings(model="deepseek-r1:1.5b", base_url='http://127.0.0.1:11434')

In [24]:
# Embed only the text of each chunk; chunk metadata is not sent to the embedding model.
embeddings = embedding_model.embed_documents(
    [chunk.page_content for chunk in splitted_docs]
)

# NOTE(review): this displays every vector in full, which produces a very long output;
# consider showing only the dimensions or a short slice instead.
embeddings

[[-0.005552281,
  -0.017424496,
  0.014128384,
  0.014555741,
  0.006254114,
  -0.021963235,
  -0.015983272,
  0.018823944,
  0.0026403507,
  -0.019237436,
  -0.020582965,
  0.065632306,
  0.0068130395,
  0.0015379547,
  -0.0010545492,
  -0.0016066234,
  0.010023345,
  0.0042890892,
  -0.019570664,
  0.0015869363,
  0.009677471,
  0.026642052,
  -0.008546325,
  -0.04245476,
  -0.0050431984,
  -0.0035275908,
  -0.033383727,
  -0.04531709,
  -0.016205253,
  0.013955578,
  0.0030691985,
  -0.0052692667,
  0.0074984194,
  -0.008776855,
  -0.003284865,
  0.009277416,
  0.00841886,
  -0.04052571,
  -0.010576697,
  0.015181012,
  -0.008393419,
  -0.0015025861,
  -0.0030670774,
  -0.0030737468,
  -0.0011244768,
  0.01798195,
  0.02487215,
  0.018831091,
  -0.026202133,
  -0.01931684,
  -0.08572697,
  -0.020508604,
  0.018552214,
  0.024783932,
  -0.0031999596,
  -0.0061810696,
  0.019283157,
  -0.00083298876,
  -0.0014481121,
  -0.032434646,
  -0.0043700007,
  -0.021239609,
  0.0010833123,
  -

In [30]:
# Number of embedding vectors produced for the PDF chunks.
len(embeddings)

10

### Vector Store