In [1]:
# 1. Load the text file (the document) from disk.
# 2. Split the document into chunks.
# 3. Perform embedding on these chunks.

In [13]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [4]:
# 1. Load the text file into LangChain Document objects.
# The encoding is pinned explicitly: without it TextLoader falls back to the
# platform default, which is not UTF-8 everywhere (e.g. Windows) and can
# raise a decoding error on a UTF-8 file.
loader = TextLoader('demo.txt', encoding='utf-8')
doc = loader.load()  # list of Document objects (one per loaded file)
print(doc)

[Document(metadata={'source': 'demo.txt'}, page_content='This is an example of using text file in langchain\nlangchain is a powerful framework')]


In [14]:
# 2. Splitter: break the loaded document into small overlapping chunks.
# NOTE(review): a 5-character chunk with 3 characters of overlap cuts words
# in the middle (e.g. 'exam' / 'xampl' / 'mple'); presumably this is for
# illustration only -- confirm before reusing these values on real data.
CHUNK_SIZE = 5
CHUNK_OVERLAP = 3

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_documents(doc)

# Build the separator outside the f-string: reusing the enclosing quote
# character inside a replacement field is a SyntaxError before Python 3.12.
separator = '-' * 20
for i, chunk in enumerate(chunks, start=1):
    print(f' Chunk {i}: \n {chunk.page_content} \n {separator}')

 Chunk 1: 
 This 
 --------------------
 Chunk 2: 
 is 
 --------------------
 Chunk 3: 
 an 
 --------------------
 Chunk 4: 
 exam 
 --------------------
 Chunk 5: 
 xampl 
 --------------------
 Chunk 6: 
 mple 
 --------------------
 Chunk 7: 
 of 
 --------------------
 Chunk 8: 
 usin 
 --------------------
 Chunk 9: 
 sing 
 --------------------
 Chunk 10: 
 text 
 --------------------
 Chunk 11: 
 file 
 --------------------
 Chunk 12: 
 in 
 --------------------
 Chunk 13: 
 lang 
 --------------------
 Chunk 14: 
 angch 
 --------------------
 Chunk 15: 
 gchai 
 --------------------
 Chunk 16: 
 hain 
 --------------------
 Chunk 17: 
 lang 
 --------------------
 Chunk 18: 
 angch 
 --------------------
 Chunk 19: 
 gchai 
 --------------------
 Chunk 20: 
 hain 
 --------------------
 Chunk 21: 
 is a 
 --------------------
 Chunk 22: 
 powe 
 --------------------
 Chunk 23: 
 owerf 
 --------------------
 Chunk 24: 
 erful 
 --------------------
 Chunk 25: 
 fram 
 ------

In [None]:
# 3. Embedding
# Convert the text of every chunk into a vector with an OpenAI embedding
# model, then report how many vectors came back and how long each one is.

from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings(model='text-embedding-3-small')

# embed_documents is given plain strings, so unwrap the Document objects.
chunk_list = [piece.page_content for piece in chunks]
print(f'Chunk list prepared as per the splitting : \n {chunk_list}')

vectors = embedding.embed_documents(chunk_list)

# One vector per chunk; every vector has the same length, so the first one
# is used to read off the dimensionality.
n_chunks = len(vectors)
n_dims = len(vectors[0])
print(f'Open AI Vector shape: {n_chunks} chunks x {n_dims} dims')



Chunk list prepared as per the splitting : 
 ['This', 'is', 'an', 'exam', 'xampl', 'mple', 'of', 'usin', 'sing', 'text', 'file', 'in', 'lang', 'angch', 'gchai', 'hain', 'lang', 'angch', 'gchai', 'hain', 'is a', 'powe', 'owerf', 'erful', 'fram', 'ramew', 'mewor', 'work']
Open AI Vector shape: 28 chunks x 1536 dims


Going one step further, we will compute the cosine similarity between the query vector and the chunk vectors.

In [30]:
# Find the chunk that is closest to a user query:
#   1. take the query text,
#   2. embed it with the same embedding model used for the chunks,
#   3. score it against every chunk vector with cosine similarity,
#   4. report the best-scoring chunk.

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

query = "What is RAG?"
q_vector = embedding.embed_query(query)

# Only the first ten components are printed to keep the output short.
print(f'Query embedding : {q_vector[:10]}')

# The query is wrapped in a list to form a single-row matrix; row 0 of the
# result holds the query's similarity to each chunk vector.
similarity_matrix = cosine_similarity([q_vector], vectors)
sims = similarity_matrix[0]
print("OpenAI Cosine similarities:", sims)

# Position of the highest score, used to look the chunk up again.
top_index = int(np.argmax(sims))
best_chunk = chunks[top_index]
print(f'Most similar part: {best_chunk.page_content}')

Query embedding : [0.0006628383416682482, 0.025741448625922203, 0.007136902771890163, 0.03336041420698166, -0.03193381801247597, -0.027387520298361778, -0.005036199931055307, -0.040007416158914566, 0.006889991462230682, -0.03235709294676781]
OpenAI Cosine similarities: [0.22935791 0.17299077 0.20265067 0.15566929 0.15858994 0.20354475
 0.13930213 0.16244068 0.22398218 0.17405944 0.17203865 0.16569891
 0.17863051 0.24680803 0.23192548 0.17428342 0.17863051 0.24680803
 0.23192548 0.17428342 0.21975848 0.12811939 0.11822654 0.19905233
 0.17930245 0.30871256 0.13441275 0.17606955]
Most similar part: ramew
