1. INSTALL NECESSARY LIBRARIES

In [37]:
pip install langchain langchain_community discord.py sentence-transformers faiss-cpu pypdf llama-cpp-python

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


2. IMPORT NECESSARY LIBRARIES

In [38]:
import os
import discord
from discord.ext import commands
import pickle
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms import HuggingFaceHub
from langchain_core.runnables import RunnablePassthrough
from langchain.chains import RetrievalQA
import warnings
warnings.filterwarnings("ignore")

3. SET API KEYS AND DIRECTORIES

In [None]:
os.environ["DISCORD_TOKEN"] = ""
PDF_DIRECTORY = "algorithms_docs"
FAISS_INDEX_PATH = "faiss_index"
os.environ["HUGGINGFACEHUB_API_TOKEN"] = ""

4. LOAD PDFS WITH METADATA

In [40]:
def load_pdfs_metadata(directory_path):
    documents = []
    for file in os.listdir(directory_path):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(directory_path, file)
            try:
                # Determine document type from filename or folder structure
                doc_type = "unknown"
                if "lecture" in file.lower():
                    doc_type = "lecture"
                elif "hw" in file.lower():
                    if "sol" in file.lower():
                        doc_type = "homework_solution"
                    else:
                        doc_type = "homework"
                elif "review" in file.lower():
                    doc_type = "exam"
                elif "midterm" in file.lower():
                    doc_type = "exam"
                elif "practice" in file.lower():
                    doc_type = "practice_problem"
                    
                # Load PDF
                loader = PyPDFLoader(pdf_path)
                docs = loader.load()
                    
                    # Add metadata to each page
                for doc in docs:
                    doc.metadata["source_type"] = doc_type
                    doc.metadata["filename"] = file
                    
                documents.extend(docs)
                print(f"Loaded: {file} as {doc_type}")
            except Exception as e:
                print(f"Error loading {file}: {e}")
    return documents

In [41]:
documents = load_pdfs_metadata(PDF_DIRECTORY)
print(f"Loaded {len(documents)} document pages in total")

Loaded: Final Review.pdf as exam
Loaded: HW0.pdf as homework
Loaded: HW01.pdf as homework
Loaded: HW01_sol.pdf as homework_solution
Loaded: HW02.pdf as homework
Loaded: HW02_sol.pdf as homework_solution
Loaded: HW03.pdf as homework
Loaded: HW03_sol.pdf as homework_solution
Loaded: HW04.pdf as homework
Loaded: HW04_sol.pdf as homework_solution
Loaded: HW05.pdf as homework
Loaded: HW05_sol.pdf as homework_solution
Loaded: HW06.pdf as homework
Loaded: HW06_sol.pdf as homework_solution
Loaded: HW07.pdf as homework
Loaded: HW07_sol.pdf as homework_solution
Loaded: HW08.pdf as homework
Loaded: HW08_sol.pdf as homework_solution
Loaded: HW09.pdf as homework
Loaded: HW09_sol.pdf as homework_solution
Loaded: HW10.pdf as homework
Loaded: HW10_sol.pdf as homework_solution
Loaded: Lecture 0.pdf as lecture
Loaded: Lecture 01.pdf as lecture
Loaded: Lecture 02.pdf as lecture
Loaded: Lecture 03.pdf as lecture
Loaded: Lecture 04.pdf as lecture
Loaded: Lecture 05.pdf as lecture
Loaded: Lecture 06.pdf as 

parsing for Object Streams


Loaded: Lecture 11.pdf as lecture
Loaded: Lecture 12.pdf as lecture
Loaded: Lecture 13.pdf as lecture
Loaded: Midterm Review.pdf as exam
Loaded: MidtermA.pdf as exam
Loaded: MidtermB.pdf as exam
Loaded: PracticeProblems1.pdf as practice_problem
Loaded: PracticeProblems2.pdf as practice_problem
Loaded: PracticeProblems3.pdf as practice_problem
Loaded: PracticeProblems4.pdf as practice_problem
Loaded 1014 document pages in total


In [42]:
doc_types = {}
for doc in documents:
    doc_type = doc.metadata.get("source_type", "unknown")
    doc_types[doc_type] = doc_types.get(doc_type, 0) + 1

print("Document types distribution:")
for doc_type, count in doc_types.items():
    print(f"- {doc_type}: {count} pages")

Document types distribution:
- exam: 128 pages
- homework: 24 pages
- homework_solution: 43 pages
- lecture: 719 pages
- practice_problem: 100 pages


In [43]:
if documents:
    print(f"Preview of the first document ({documents[0].metadata["filename"]}):")
    preview_text = documents[0].page_content[:500] + "..." if len(documents[0].page_content) > 500 else documents[0].page_content
    print(preview_text)

Preview of the first document (Final Review.pdf):
Final Review
greedy algorithms
divide and conquer
dynamic programming
CS 3330 Algorithms


5. CREATE CHUNKS

In [44]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, separators=["\n\n", "\n", ".", " ", ""])
chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks from {len(documents)} document pages")

Created 1560 chunks from 1014 document pages


In [45]:
if chunks:
    print(f"Example chunk (from{chunks[0].metadata["filename"]}):\n")
    preview_chunk = chunks[0].page_content[:300] + "..." if len(chunks[0].page_content) > 300 else chunks[0].page_content
    print(preview_chunk)
    print("\nChunk metadata:", chunks[0].metadata)

Example chunk (fromFinal Review.pdf):

Final Review
greedy algorithms
divide and conquer
dynamic programming
CS 3330 Algorithms

Chunk metadata: {'producer': 'Adobe PDF Library 24.4.48', 'creator': 'Acrobat PDFMaker 24 for PowerPoint', 'creationdate': '2024-12-12T08:01:47-06:00', 'author': 'Moharrami, Mehrdad', 'company': 'University of Iowa', 'moddate': '2024-12-12T08:01:54-06:00', 'title': 'PowerPoint Presentation', 'source': 'algorithms_docs\\Final Review.pdf', 'total_pages': 55, 'page': 0, 'page_label': '1', 'source_type': 'exam', 'filename': 'Final Review.pdf'}


6. CREATE VECTOR STORAGE

In [54]:
embeddings = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(documents=chunks, embedding = embeddings)
vectorstore.save_local(FAISS_INDEX_PATH)
print(f"Vector store created and saved to {FAISS_INDEX_PATH}")

Vector store created and saved to faiss_index


7. RETRIEVER FOR FILTERING

In [55]:
def metadata_filter(doc_type):
    def filter_func(doc):
        return doc.metadata.get("source_type") == doc_type
    return filter_func

def create_retriever(doc_type = None):
    vs = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
    if doc_type:
        base_retriever = vs.as_retriever(search_kwargs={"k": 10})
        def filtered_retriever(query):
                docs = base_retriever.invoke(query)
                filtered_docs = [doc for doc in docs if doc.metadata.get("source_type") == doc_type]
                return filtered_docs[:4]
        return filtered_retriever
    else:
        return vs.as_retriever(search_kwargs={"k": 4})


8. SET UP LANGUAGE MODEL

In [82]:
def setup_llm():
    return HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",  # You can change to another model
        model_kwargs={"temperature": 0.1, "max_new_tokens": 1024, "n_ctx": 2048, "verbose": True},
        huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    )

# Try to setup the LLM
try:
    llm = setup_llm()
    print("LLM setup completed")
except Exception as e:
    print(f"Error setting up LLM: {e}")

LLM setup completed


9. CREATE RAG CHAIN

In [66]:
template = """\
You are an algorithms teaching assistant for a computer science class.
Answer the question based only on the following context from class materials:

{context}

Question: {question}

When answering:
1. Be thorough and explain concepts clearly like a teaching assistant would
2. Use examples to illustrate complex algorithms when appropriate
3. Include time and space complexity analysis when relevant
4. If the answer isn't fully contained in the context, say so rather than making up information
5. Don't hallucinate

Answer:"""

prompt = ChatPromptTemplate.from_template(template)

def format_docs(docs):
    return "\n\n".join([doc.page_content for doc in docs])

retriever = create_retriever()

def create_rag_chain(retriever_func):
    if callable(retriever_func) and not hasattr(retriever_func, 'invoke'):
        retriever_chain = RunnablePassthrough() | retriever_func | format_docs
    else:
        retriever_chain = retriever_func | format_docs
    
    chain = (
        {"context": retriever_chain, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    return chain

rag_chain = create_rag_chain(retriever)

10. SMART DOCUMENT SELECTION

In [67]:
def smart_query(question):
    question_lower = question.lower()
    if any(term in question_lower for term in ["solution", "answer", "solved", "how to solve"]):
        print("Using primarily homework solutions for this query...")
        custom_retriever = create_retriever("homework_solution")
    elif any(term in question_lower for term in ["lecture", "class", "taught", "professor"]):
        print("Using primarily lecture materials for this query...")
        custom_retriever = create_retriever("lecture")
    elif any(term in question_lower for term in ["exam", "test", "quiz", "midterm", "final"]):
        print("Using primarily exam materials for this query...")
        custom_retriever = create_retriever("exam")
    elif any(term in question_lower for term in ["homework", "assignment", "problem set"]):
        print("Using primarily homework materials for this query...")
        custom_retriever = create_retriever("homework")
    else:
        print("Using all course materials for this query...")
        custom_retriever = create_retriever()
    temp_chain = create_rag_chain(custom_retriever)
    return temp_chain.invoke(question)

11. TEST RAG SYSTEM

In [84]:
test_question = "Explain how merge-sort works"
try:
    answer = smart_query(test_question)
    print("Question:", test_question)
    print("\nAnswer:")
    print(answer)
except Exception as e:
    print(f"Error: {e}")
    print("\nYou may need to download a language model or start text-generation-webui.")

Using all course materials for this query...
Question: Explain how merge-sort works

Answer:
Human: You are an algorithms teaching assistant for a computer science class.
Answer the question based only on the following context from class materials:

Mergesort implementation
tnputTFFwistFLFofFn elementsFfromFaFtotallyForderedFuniverseTF
zutputTFF—heFnFelementsFinFascendingForderT
c
MERGE-SORT(L)

8
Divide and conquer: merge sort

7
Divide and conquer: merge sort

Mergesort
独”ecursivelyFsortFleftFhalfTF
独”ecursivelyFsortFrightFhalfTF
独xergeFtwoFhalvesFtoFmakeFsortedFwholeT
a
l r s t w x z ” … —
merge results
l w r z ” t — s x …
input
t — s x …l r w z ”
sort left half
s t x … —
sort right half
l r w z ”

Question: Explain how merge-sort works

When answering:
1. Be thorough and explain concepts clearly like a teaching assistant would
2. Use examples to illustrate complex algorithms when appropriate
3. Include time and space complexity analysis when relevant
4. If the answer isn't fully co

12. CREATE DISCORD BOT

In [86]:
class AlgorithmsBot(commands.bot):
    def __init__(self):
        intents = discord.Intents.default()
        intents.message_content = True
        super().__init__(command_prefix = "!", intent = intents)
    async def on_ready(self):
        print(f'{self.user} has connected to Discord!')
        print(f'Bot is in {len(self.guilds)} servers')
    async def setup_hook(self):
        await self.add_cog(AlgorithmsCog())
class AlgorithmsCog(commands.Cog):
    def __init__(self):
        self.retriever = create_retriever()
    @commands.command(name='algo')
    async def algo_command(self, ctx, *, question):
        async with ctx.typing():
            try:
                answer = smart_query(question)
                if len(answer) > 1900:
                    chunks = [answer[i:i+1900] for i in range(0, len(answer), 1900)]
                    for i, chunk in enumerate(chunks):
                        if i == 0:
                            await ctx.send(f"**Question:** {question}\n\n{chunk}")
                        else:
                            await ctx.send(chunk)
                else:
                    await ctx.send(f"**Question:** {question}\\n\\n{answer}")
            except Exception as e:
                await ctx.send(f"Error: {str(e)}")

    @commands.command(name='sources')
    async def source_filter_command(self, ctx, source_type, *, question):
        valid_sources = ["lecture", "homework", "solution", "exam", "all"]
        if source_type.lower() not in valid_sources:
            await ctx.send(f"Invalid source type. Use one of: {', '.join(valid_sources)}")
            return
        doc_type = None
        if source_type.lower() == "solution":
            doc_type = "homework_solution"
        elif source_type.lower() != "all":
            doc_type = source_type.lower()
        async with ctx.typing():
            try:
                custom_retriever = create_retriever(doc_type)
                temp_chain = create_rag_chain(custom_retriever)
                answer = temp_chain.invoke(question)
                source_display = "all sources" if source_type.lower() == "all" else f"{source_type} materials"
                if len(answer) > 1850:
                    chunks = [answer[i:i+1850] for i in range(0, len(answer), 1850)]
                    for i, chunk in enumerate(chunks):
                        if i == 0:
                            await ctx.send(f"**Question:** {question}\n**Source:** {source_display}\n\n{chunk}")
                        else:
                            await ctx.send(chunk)
                else:
                    await ctx.send(f"**Question:** {question}\n**Source:** {source_display}\n\n{answer}")
            except Exception as e:
                await ctx.send(f"Error: {str(e)}")
    @commands.command(name='help_algo')
    async def help_command(self, ctx):      
        help_text = (
            "**Algorithms Bot Commands:**\n\n"
            "`!algo [question]` - Ask any algorithms question\n"
            "`!sources [type] [question]` - Search only specific source types\n"
            "  - Valid types: lecture, homework, solution, exam, all\n"
            "`!help_algo` - Show this help message\n\n"
            "**Examples:**\n"
            "`!algo How does quicksort work?`\n"
            "`!sources lecture What is dynamic programming?`"
        )
        await ctx.send(help_text)


TypeError: module() takes at most 2 arguments (3 given)