In [26]:
import os
import discord
from discord.ext import commands
import pickle
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms import LlamaCpp
from langchain_core.runnables import RunnablePassthrough
import warnings
warnings.filterwarnings("ignore")

In [19]:
DISCORD_TOKEN = "sk-MpxgiXLmODS8OSEUIy1fMsjfecR028VKWXqtYk5n4VT3BlbkFJehlXBhVZlOGUmEcSae9gbAKrEJhzU0md1yBoMuwqcA"
PDF_DIRECTORY = "/Users/ybandla/Documents/Code/rag bot/algoragbot/algorithms_docs"
FAISS_INDEX_PATH = "./faiss_index"

LLM_MODEL_PATH = "./models/llama-2-7b-chat.Q4_K_M.gguf"
USE_LOCAL_LLM = True

In [17]:
def load_pdfs_metadata(directory_path):
    documents = []
    for file in os.listdir(directory_path):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(directory_path, file)
            try:
                # Determine document type from filename or folder structure
                doc_type = "unknown"
                if "lecture" in file.lower():
                    doc_type = "lecture"
                elif "hw" in file.lower():
                    if "sol" in file.lower():
                        doc_type = "homework_solution"
                    else:
                        doc_type = "homework"
                elif "review" in file.lower():
                    doc_type = "exam"
                elif "midterm" in file.lower():
                    doc_type = "exam"
                elif "practice" in file.lower():
                    doc_type = "practice_problem"
                    
                # Load PDF
                loader = PyPDFLoader(pdf_path)
                docs = loader.load()
                    
                    # Add metadata to each page
                for doc in docs:
                    doc.metadata["source_type"] = doc_type
                    doc.metadata["filename"] = file
                    
                documents.extend(docs)
                print(f"Loaded: {file} as {doc_type}")
            except Exception as e:
                print(f"Error loading {file}: {e}")
    return documents

In [20]:
documents = load_pdfs_metadata(PDF_DIRECTORY)
print(f"Loaded {len(documents)} document pages in total")

Loaded: HW07_sol.pdf as homework_solution
Loaded: HW06_sol.pdf as homework_solution
Loaded: HW05.pdf as homework
Loaded: Lecture 02.pdf as lecture
Loaded: HW04_sol.pdf as homework_solution
Loaded: Lecture 03.pdf as lecture
Loaded: HW10.pdf as homework
Loaded: HW04.pdf as homework
Loaded: Lecture 0.pdf as lecture
Loaded: HW06.pdf as homework
Loaded: PracticeProblems4.pdf as practice_problem
Loaded: Lecture 01.pdf as lecture
Loaded: HW07.pdf as homework
Loaded: HW03.pdf as homework
Loaded: Lecture 10.pdf as lecture
Loaded: Lecture 04.pdf as lecture
Loaded: PracticeProblems1.pdf as practice_problem


parsing for Object Streams


Loaded: Lecture 05.pdf as lecture
Loaded: Lecture 11.pdf as lecture
Loaded: HW02.pdf as homework
Loaded: Lecture 07.pdf as lecture
Loaded: Lecture 13.pdf as lecture
Loaded: PracticeProblems2.pdf as practice_problem
Loaded: PracticeProblems3.pdf as practice_problem
Loaded: Lecture 12.pdf as lecture
Loaded: Lecture 06.pdf as lecture
Loaded: HW05_sol.pdf as homework_solution
Loaded: HW01.pdf as homework
Loaded: HW10_sol.pdf as homework_solution
Loaded: MidtermB.pdf as exam
Loaded: Lecture 08.pdf as lecture
Loaded: MidtermA.pdf as exam
Loaded: HW09_sol.pdf as homework_solution
Loaded: Lecture 09.pdf as lecture
Loaded: HW08_sol.pdf as homework_solution
Loaded: HW01_sol.pdf as homework_solution
Loaded: HW09.pdf as homework
Loaded: HW08.pdf as homework
Loaded: Final Review.pdf as exam
Loaded: HW0.pdf as homework
Loaded: Midterm Review.pdf as exam
Loaded: HW03_sol.pdf as homework_solution
Loaded: HW02_sol.pdf as homework_solution
Loaded 1014 document pages in total


In [21]:
doc_types = {}
for doc in documents:
    doc_type = doc.metadata.get("source_type", "unknown")
    doc_types[doc_type] = doc_types.get(doc_type, 0) + 1

print("Document types distribution:")
for doc_type, count in doc_types.items():
    print(f"- {doc_type}: {count} pages")

Document types distribution:
- homework_solution: 43 pages
- homework: 24 pages
- lecture: 719 pages
- practice_problem: 100 pages
- exam: 128 pages


In [22]:
if documents:
    print(f"Preview of the first document ({documents[0].metadata["filename"]}):")
    preview_text = documents[0].page_content[:500] + "..." if len(documents[0].page_content) > 500 else documents[0].page_content
    print(preview_text)

Preview of the first document (HW07_sol.pdf):
Johnson and Mirzaei CS 3330 Homework 7 Solution
Problem 1
(a) A = [2, 6, 7, 1, 3, 5, 4]
(b) j will never reach 6, as the loop is exclusive. It iterates through 0 to 5 for j.
Value ofj Value ofi Array
0 0 [2,6,7,1,3,5,4]
1 0 [2,6,7,1,3,5,4]
2 0 [2,6,7,1,3,5,4]
3 1 [2,1,7,6,3,5,4]
4 2 [2,1,3,6,7,5,4]
5 2 [2,1,3,6,7,5,4]
(c) A = [2, 1, 3, 4, 7, 5, 6]
Problem 2
(a) Taking either the smallest or the largest element of A as the pivot would lead to the worst partition.
In this case, at each iteration, ...


In [23]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, separators=["\n\n", "\n", ".", " ", ""])
chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks from {len(documents)} document pages")

Created 1560 chunks from 1014 document pages


In [15]:
if chunks:
    print(f"Example chunk (from{chunks[0].metadata["filename"]}):\n")
    preview_chunk = chunks[0].page_content[:300] + "..." if len(chunks[0].page_content) > 300 else chunks[0].page_content
    print(preview_chunk)
    print("\nChunk metadata:", chunks[0].metadata)

Example chunk (fromHW07_sol.pdf):

Johnson and Mirzaei CS 3330 Homework 7 Solution
Problem 1
(a) A = [2, 6, 7, 1, 3, 5, 4]
(b) j will never reach 6, as the loop is exclusive. It iterates through 0 to 5 for j.
Value ofj Value ofi Array
0 0 [2,6,7,1,3,5,4]
1 0 [2,6,7,1,3,5,4]
2 0 [2,6,7,1,3,5,4]
3 1 [2,1,7,6,3,5,4]
4 2 [2,1,3,6,7,5,4]
...

Chunk metadata: {'producer': 'pdfTeX-1.40.26', 'creator': 'TeX', 'creationdate': '2024-11-19T19:48:06+00:00', 'moddate': '2024-11-19T19:48:06+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0', 'trapped': '/False', 'source': '/Users/ybandla/Documents/Code/rag bot/algoragbot/algorithms_docs/HW07_sol.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1', 'source_type': 'homework_solution', 'filename': 'HW07_sol.pdf'}


In [28]:
embeddings = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(documents=chunks, embedding = embeddings)
vectorstore.save_local(FAISS_INDEX_PATH)
print(f"Vector store created and saved to {FAISS_INDEX_PATH}")

NameError: name 'init_empty_weights' is not defined