In [15]:
import numpy as np
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [16]:
'''
1. Read the document.
2. Splitting document.
3. Perform Embedding
'''

'\n1. Read the document.\n2. Splitting document.\n3. Perform Embedding\n'

In [17]:
text = '''
LangChain is an open-source framework for building applications with large language models (LLMs).
It allows developers to combine LLMs with external data sources, APIs, and custom logic.
Retrieval-Augmented Generation (RAG) is a technique that combines information retrieval and LLMs
to answer questions with up-to-date, domain-specific, or long-tail knowledge.

Anurag Awasthi is one of the best trainers of AI.
'''

with open('rag.text', 'w') as f:
    f.write(text)

In [18]:
loader = TextLoader(file_path='rag.text')
doc = loader.load()
doc

[Document(metadata={'source': 'rag.text'}, page_content='\nLangChain is an open-source framework for building applications with large language models (LLMs).\nIt allows developers to combine LLMs with external data sources, APIs, and custom logic.\nRetrieval-Augmented Generation (RAG) is a technique that combines information retrieval and LLMs\nto answer questions with up-to-date, domain-specific, or long-tail knowledge.\n\nAnurag Awasthi is one of the best trainers of AI.\n')]

In [19]:
splitter = RecursiveCharacterTextSplitter(chunk_size= 50, chunk_overlap= 10)
chunks = splitter.split_documents(doc)

for i, chunk in enumerate(chunks):
    print(f'chunk number: {i+1} : data: {chunk.page_content}')

chunk number: 1 : data: LangChain is an open-source framework for
chunk number: 2 : data: for building applications with large language
chunk number: 3 : data: language models (LLMs).
chunk number: 4 : data: It allows developers to combine LLMs with
chunk number: 5 : data: LLMs with external data sources, APIs, and custom
chunk number: 6 : data: custom logic.
chunk number: 7 : data: Retrieval-Augmented Generation (RAG) is a
chunk number: 8 : data: is a technique that combines information
chunk number: 9 : data: retrieval and LLMs
chunk number: 10 : data: to answer questions with up-to-date,
chunk number: 11 : data: domain-specific, or long-tail knowledge.
chunk number: 12 : data: Anurag Awasthi is one of the best trainers of AI.


In [20]:
# Embedding 
from langchain_openai import OpenAIEmbeddings


embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
chunk_list = [c.page_content for c in chunks]
vector = embeddings.embed_documents(chunk_list)

print(f'Vector shape: {len(vector)}')

# We need to store this vector (Embedded format of docs into a vector database, for further uses)


Vector shape: 12


In [21]:
# Initilize the vector storage
from langchain_community.vectorstores import FAISS

vectorstore = FAISS.from_documents(chunks, embeddings)



In [23]:
# Get result from vector storage now:
query = 'What is langchain ?'

res   = vectorstore.similarity_search(query=query, k= 4)

for doc in res:
    print(doc.page_content)

LangChain is an open-source framework for
for building applications with large language
language models (LLMs).
It allows developers to combine LLMs with
