In [None]:
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [105]:
pdf_docs = PyPDFLoader(file_path="data/EduTrack_FAQ_assignment.pdf").load()
len(pdf_docs)

pdf_docs_str = ""

for doc in pdf_docs:
    pdf_docs_str += doc.page_content

In [106]:
splitter = RecursiveCharacterTextSplitter(chunk_size=360, chunk_overlap = 50)
chunked_docs = splitter.split_documents(pdf_docs)
chunked_docs

# For comaparing accuracy
text_spliter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=30)
chunked_docs_from_str = text_spliter.split_text(pdf_docs_str)

chunked_docs_from_str

['EduTrack – Frequently Asked Questions\nQ1: What is EduTrack used for?\nA1: EduTrack helps educational institutions monitor student engagement, analyze learning\nbehavior, and proactively support at-risk learners through data-driven insights.\nQ2: Which platforms does EduTrack integrate with?\nA2: EduTrack integrates seamlessly with LMS platforms such as Moodle, Canvas,\nBlackboard, Google Classroom, and can be extended to custom LMS solutions via API.\nQ3: What types of student data does EduTrack analyze?\nA3: It analyzes logins, session duration, assignment submission patterns, quiz scores,\ndiscussion participation, video viewing activity, and more.\nQ4: How does EduTrack predict dropout risks?\nA4: The platform uses machine learning algorithms that flag students at risk based on\ninactivity, declining academic performance, missed deadlines, and low participation.\nQ5: How does EduTrack benefit teachers?\nA5: Instructors receive weekly summaries, alerts about disengaged students, a

In [107]:
embedding_model_name = "nomic-embed-text:latest"
embedding = OllamaEmbeddings(
    model=embedding_model_name
)

In [108]:
db = FAISS.from_documents(chunked_docs, embedding)
db2 = FAISS.from_texts(texts=chunked_docs_from_str, embedding=embedding)
print(db)
print(db2)

<langchain_community.vectorstores.faiss.FAISS object at 0x1335c7500>
<langchain_community.vectorstores.faiss.FAISS object at 0x1335c3410>


In [109]:
res = db.similarity_search("Can students access their own dashboards?")
print(res)

res2 = db2.similarity_search("Can students access their own dashboards?")
print(res2)

[Document(id='57a5a190-2e9f-40cf-a1af-36c0ebab90ae', metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-06-05T12:53:08+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-06-05T12:53:08+00:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'data/EduTrack_FAQ_assignment.pdf', 'total_pages': 2, 'page': 1, 'page_label': '2'}, page_content='A18: Yes. It can sync with most SIS platforms to fetch enrollment, demographics, and\nacademic standing data.\nQ19: How does EduTrack notify instructors about at-risk students?\nA19: Faculty receive weekly alerts and visual cues on their dashboards, highlighting students\nwho need attention based on defined risk thresholds.'), Document(id='90544395-41d9-4d39-86fc-63b71279749d', metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-06-05T12:53:08+00:00', 'author': '(anonymous)'

In [110]:
res3 = db.similarity_search("Can EduTrack be used in hybrid or blended learning models?")
res4 = db2.similarity_search("Can EduTrack be used in hybrid or blended learning models?")
print(res3)
print(res4)

[Document(id='9777a2d6-f3bc-4b25-b864-4287ab578208', metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-06-05T12:53:08+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-06-05T12:53:08+00:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'data/EduTrack_FAQ_assignment.pdf', 'total_pages': 2, 'page': 1, 'page_label': '2'}, page_content='premium clients.\nQ14: Can EduTrack be used in hybrid or blended learning models?\nA14: Yes. EduTrack consolidates both online interaction data and offline metrics like\nattendance to give a full picture of learner behavior.\nQ15: Does EduTrack support attendance tracking?'), Document(id='3b18570c-638b-4c23-845d-706bdcdad222', metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-06-05T12:53:08+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-06-05T12:53:08+00

In [112]:
# I am probably doing something wrong, because res4 is always returnig full stored context! Leaving it for now!

In [118]:
## Storing the db locally
db.save_local('kw1_vector_store')

# and to load it again
db = FAISS.load_local('kw1_vector_store', embeddings=embedding, allow_dangerous_deserialization=True)
res5 = db.similarity_search("Can EduTrack be used in hybrid or blended learning models?")
res5

[Document(id='9777a2d6-f3bc-4b25-b864-4287ab578208', metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-06-05T12:53:08+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-06-05T12:53:08+00:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'data/EduTrack_FAQ_assignment.pdf', 'total_pages': 2, 'page': 1, 'page_label': '2'}, page_content='premium clients.\nQ14: Can EduTrack be used in hybrid or blended learning models?\nA14: Yes. EduTrack consolidates both online interaction data and offline metrics like\nattendance to give a full picture of learner behavior.\nQ15: Does EduTrack support attendance tracking?'),
 Document(id='3b18570c-638b-4c23-845d-706bdcdad222', metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-06-05T12:53:08+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-06-05T12:53:08+0