In [1]:
import numpy as np
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
'''
1. Read the document.
2. Splitting document.
3. Perform Embedding
'''

In [2]:
text = '''
LangChain is an open-source framework for building applications with large language models (LLMs).
It allows developers to combine LLMs with external data sources, APIs, and custom logic.
Retrieval-Augmented Generation (RAG) is a technique that combines information retrieval and LLMs
to answer questions with up-to-date, domain-specific, or long-tail knowledge.

Anurag Awasthi is one of the best trainers of AI.
'''

with open('rag.text', 'w') as f:
    f.write(text)

In [3]:
loader = TextLoader(file_path='rag.text')
doc = loader.load()
doc

[Document(metadata={'source': 'rag.text'}, page_content='\nLangChain is an open-source framework for building applications with large language models (LLMs).\nIt allows developers to combine LLMs with external data sources, APIs, and custom logic.\nRetrieval-Augmented Generation (RAG) is a technique that combines information retrieval and LLMs\nto answer questions with up-to-date, domain-specific, or long-tail knowledge.\n\nAnurag Awasthi is one of the best trainers of AI.\n')]

In [None]:
splitter = RecursiveCharacterTextSplitter(chunk_size= 5, chunk_overlap= 3)
chunks = splitter.split_documents(doc)

for i, chunk in enumerate(chunks):
    print(f'chunk number: {i+1} : data: {chunk.page_content}')

chunk number: 1 : data: Lang
chunk number: 2 : data: angCh
chunk number: 3 : data: gChai
chunk number: 4 : data: hain
chunk number: 5 : data: is
chunk number: 6 : data: an
chunk number: 7 : data: open
chunk number: 8 : data: pen-s
chunk number: 9 : data: n-sou
chunk number: 10 : data: sourc
chunk number: 11 : data: urce
chunk number: 12 : data: fram
chunk number: 13 : data: ramew
chunk number: 14 : data: mewor
chunk number: 15 : data: work
chunk number: 16 : data: for
chunk number: 17 : data: buil
chunk number: 18 : data: uildi
chunk number: 19 : data: lding
chunk number: 20 : data: appl
chunk number: 21 : data: pplic
chunk number: 22 : data: licat
chunk number: 23 : data: catio
chunk number: 24 : data: tions
chunk number: 25 : data: with
chunk number: 26 : data: larg
chunk number: 27 : data: arge
chunk number: 28 : data: lang
chunk number: 29 : data: angua
chunk number: 30 : data: guage
chunk number: 31 : data: mode
chunk number: 32 : data: odels
chunk number: 33 : data: (LLM
chunk nu

In [None]:
# Embedding 
from langchain_openai import OpenAIEmbeddings


embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
chunk_list = [c.page_content for c in chunks]
vector = embeddings.embed_documents(chunk_list)

print(f'Vector shape: {len(vector)}')

# We need to store this vector (Embedded format of docs into a vector database, for further uses)


Vector shape: 141


In [None]:
# Initilize the vector storage
from langchain_community.vectorstores import FAISS

vectorstore = FAISS.from_documents(chunks, embeddings)