### Chunking

In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Read the source document explicitly as UTF-8: without `encoding=`, open()
# falls back to the platform's locale encoding, which can raise or silently
# mis-decode non-ASCII characters in the markdown (notably on Windows).
with open("./demo.md", encoding="utf-8") as f:
    file_content = f.read()

# chunk_size / chunk_overlap are passed straight to the splitter: chunks
# target a size of 170, and neighbouring chunks may share up to 20.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=170, chunk_overlap=20  # overlap to maintain context between chunks
)

chunks = text_splitter.split_text(file_content)  # list of split chunks
print(f"Total chunks: {len(chunks)}")

Total chunks: 359


### Embedding

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by the indexing step and the query step below.
hf_model = HuggingFaceEmbeddings(
    model_name="all-mpnet-base-v2",
    model_kwargs=dict(device="cuda"),  # place the model on the GPU
    encode_kwargs=dict(normalize_embeddings=False),  # keep vectors un-normalized
    multi_process=True,  # run encode() on multiple GPUs
)

### Vector store

In [3]:
from langchain_community.vectorstores import FAISS

# Build the FAISS store directly from the raw chunk texts; from_texts embeds
# each text with the supplied embedding model (hf_model).
faiss = FAISS.from_texts(chunks, hf_model)

# Persist the store to the local "faiss_index" folder so it can be reloaded.
faiss.save_local("faiss_index")

In [4]:
# Reload the store saved under "faiss_index", handing it the same embedding
# model so that queries are embedded the same way as the stored chunks.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# pickled data from disk; only acceptable while the folder is one this
# notebook wrote itself. Never point it at an index from an untrusted source.
db = FAISS.load_local(
    "faiss_index",
    hf_model,
    allow_dangerous_deserialization=True,
)

### Query

In [5]:
# Named `query` rather than `input` so the built-in input() is not shadowed
# for the remainder of the kernel session.
query = "What is the deadline for spring 2026 semester?"

# FAISS uses hf_model to convert the query into an embedding and then searches
# that embedding against the stored db; k is the number of nearest neighbours
# (most similar chunks) to retrieve.
res = db.similarity_search(query=query, k=2)

In [6]:
# Display just the text of each retrieved document, in the order returned.
[doc.page_content for doc in res]

['## Application deadlines\n### Domestic\nSpring 2026 semester: December 1\nFall 2026 semester: August 30\n### International applying to US campuses',
 'Spring 2026 semester: November 1\nFall 2026 semester: May 1\n### International applying to Vancouver campus\nSpring 2026 semester: November 1\nFall 2026 semester: June 1']