In [55]:
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings

In [56]:
import os


load_dotenv()
open_ai_key = os.getenv("OPENAI_KEY")

In [57]:
text = "Hola amigo!"

In [58]:
embedding = OpenAIEmbeddings(
    api_key=open_ai_key,
    model="text-embedding-3-small"
)

In [59]:
converted = embedding.embed_query(text)
converted

[0.017818517982959747,
 -0.04433789104223251,
 -0.06450095027685165,
 0.03131265938282013,
 -0.03865889087319374,
 -0.015838682651519775,
 0.015526076778769493,
 0.05387236177921295,
 -0.035584934055805206,
 -0.04933958128094673,
 -0.02987988293170929,
 0.0020954343490302563,
 -0.008290559984743595,
 -0.0402219183743,
 -0.002863922854885459,
 0.04418158903717995,
 -0.037121910601854324,
 -0.015513051301240921,
 -0.021608861163258553,
 0.03613199293613434,
 0.021621884778141975,
 0.017558014020323753,
 -0.017258433625102043,
 -0.010485311970114708,
 0.010048966854810715,
 -0.00026803486980497837,
 -0.02909836918115616,
 0.016568096354603767,
 0.024852143600583076,
 0.010218294337391853,
 0.05569589510560036,
 -0.02922862209379673,
 0.0014970876509323716,
 -0.026558449491858482,
 -0.015460950322449207,
 0.0139890993013978,
 -0.005148223135620356,
 0.007658836431801319,
 -0.023692898452281952,
 -0.005480366293340921,
 0.03084374964237213,
 -0.011670608073472977,
 0.03925805166363716,
 0.0

In [60]:
len(converted)

1536

In [61]:
# Lets try this whole mechanism of ingestion, chunking, vectorization and storage(using chromadb)

In [62]:
from langchain_community.document_loaders import PyPDFLoader

In [63]:
docs = PyPDFLoader(file_path="data/EduTrack_FAQ_assignment.pdf").load()

incorrect startxref pointer(1)
parsing for Object Streams


In [64]:
len(docs)

2

In [65]:
# Chunking
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [66]:
splitter = RecursiveCharacterTextSplitter(chunk_size = 50, chunk_overlap = 10)

In [67]:
chunked_docs = splitter.split_documents(docs)

In [68]:
len(chunked_docs)

106

In [69]:
# Vectorize and storage
from langchain_community.vectorstores import Chroma

In [70]:
db = Chroma.from_documents(chunked_docs, embedding)

In [71]:
db

<langchain_community.vectorstores.chroma.Chroma at 0x135df9790>

In [72]:
# Querying stored data

In [77]:
query = "What type of data, EduTrack analyze?"

In [78]:
results = db.similarity_search(query)
print(results)

[Document(metadata={'source': 'data/EduTrack_FAQ_assignment.pdf', 'creator': '(unspecified)', 'page': 0, 'subject': '(unspecified)', 'trapped': '/False', 'creationdate': '2025-06-05T12:53:08+00:00', 'page_label': '1', 'total_pages': 2, 'producer': 'ReportLab PDF Library - www.reportlab.com', 'keywords': '', 'moddate': '2025-06-05T12:53:08+00:00', 'author': '(anonymous)', 'title': '(anonymous)'}, page_content='EduTrack analyze?'), Document(metadata={'page_label': '1', 'title': '(anonymous)', 'trapped': '/False', 'author': '(anonymous)', 'producer': 'ReportLab PDF Library - www.reportlab.com', 'keywords': '', 'source': 'data/EduTrack_FAQ_assignment.pdf', 'creationdate': '2025-06-05T12:53:08+00:00', 'moddate': '2025-06-05T12:53:08+00:00', 'subject': '(unspecified)', 'total_pages': 2, 'page': 0, 'creator': '(unspecified)'}, page_content='EduTrack analyze?'), Document(metadata={'title': '(anonymous)', 'trapped': '/False', 'moddate': '2025-06-05T12:53:08+00:00', 'subject': '(unspecified)', '

In [79]:
len(results)

4

In [80]:
for result in results:
    print(result.page_content)

EduTrack analyze?
EduTrack analyze?
Q3: What types of student data does EduTrack
Q3: What types of student data does EduTrack
