In [1]:
!pip install -r ../requirements.txt

Collecting langgraph (from -r ../requirements.txt (line 13))
  Downloading langgraph-0.2.35-py3-none-any.whl.metadata (13 kB)
Collecting langgraph-checkpoint<3.0.0,>=2.0.0 (from langgraph->-r ../requirements.txt (line 13))
  Downloading langgraph_checkpoint-2.0.1-py3-none-any.whl.metadata (4.6 kB)
Collecting msgpack<2.0.0,>=1.1.0 (from langgraph-checkpoint<3.0.0,>=2.0.0->langgraph->-r ../requirements.txt (line 13))
  Downloading msgpack-1.1.0-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Downloading langgraph-0.2.35-py3-none-any.whl (108 kB)
   ---------------------------------------- 0.0/108.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/108.7 kB ? eta -:--:--
   --- ------------------------------------ 10.2/108.7 kB ? eta -:--:--
   -------------- ------------------------ 41.0/108.7 kB 393.8 kB/s eta 0:00:01
   ---------------------- ---------------- 61.4/108.7 kB 469.7 kB/s eta 0:00:01
   --------------------------------- ----- 92.2/108.7 kB 479.1 kB/s eta 0:00:01


In [9]:
# Report the version of the interpreter that is actually running this kernel.
# (`!python --version` asks whichever `python` is first on PATH, which may differ.)
import platform
print("Python", platform.python_version())

Python 3.12.4


# Importing

In [8]:
import os
from dotenv import load_dotenv
load_dotenv()
import streamlit as st
from IPython.display import Markdown
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from langchain_experimental.text_splitter import SemanticChunker
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.chat_message_histories import SQLChatMessageHistory, ChatMessageHistory
from langchain_community.vectorstores import FAISS
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import BaseMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langgraph.graph.message import add_messages
from typing_extensions import Annotated, TypedDict
from typing import Sequence
import warnings
warnings.filterwarnings("ignore")

In [7]:
# We define a dict representing the state of the application.
# This state has the same input and output keys as `rag_chain`.
class State(TypedDict):
    # The user's question for the current turn.
    input: str
    # Conversation messages; the `add_messages` annotation is the reducer applied
    # when a node returns new messages for this key.
    chat_history: Annotated[Sequence[BaseMessage], add_messages]
    # Retrieved context for the current turn.
    # NOTE(review): annotated as `str`, but the recorded `result['context']` output
    # in this notebook is a list of Document objects — confirm the intended type.
    context: str
    # The model's answer text for the current turn.
    answer: str

In [124]:
class chatmodel():
    """Retrieval-augmented chat assistant over PDF documents.

    The object wires a history-aware retrieval chain (Groq chat model plus a
    FAISS retriever over semantically chunked PDF pages) into a single-node
    LangGraph workflow that is compiled with an in-memory checkpointer.

    The setup methods store their results on ``self`` and must be called in
    this order before ``ask_query`` is used, because each one reads attributes
    written by an earlier one:

        1. ``_initialize_api``
        2. ``_initialize_model``
        3. ``await _load_pdfs(...)``
        4. ``_initialize_semantic_chunker``
        5. ``_initialize_retriever_chain``
    """

    def __init__(self) -> None:
        """Build and compile the workflow: ``START -> "model"``."""
        self._workflow = StateGraph(state_schema=State)
        # Register the (single) node first, then wire the entry edge that targets it.
        self._workflow.add_node("model", self._call_model)
        self._workflow.add_edge(START, "model")

        memory = MemorySaver()
        # In the invocation process, _app can now handle config for personalized queries
        self._app = self._workflow.compile(checkpointer=memory)

    def _call_model(self, state: State, config: dict = None):
        """Graph node: run the RAG chain on ``state`` and return the state update.

        Args:
            state: Current graph state; it is passed unchanged to ``self._rag_chain``.
            config: Optional invocation config. When it carries
                ``{"configurable": {"thread_id": ...}}`` the id is echoed back.

        Returns:
            dict with the new ``chat_history`` messages (the question and the
            answer), the chain's ``context`` and ``answer``, and ``thread_id``
            (``None`` when no config / thread id was supplied).
        """
        # `config` defaults to None, so guard before reading from it.
        configurable = (config or {}).get("configurable") or {}
        thread_id = configurable.get("thread_id")

        response = self._rag_chain.invoke(state)
        return {
            "chat_history": [
                HumanMessage(state["input"]),
                AIMessage(response["answer"]),
            ],
            "context": response["context"],
            "answer": response["answer"],
            # NOTE(review): "thread_id" is not a key declared on `State`; kept for
            # backward compatibility with direct callers of this method.
            "thread_id": thread_id
        }

    def ask_query(self, input_text: str, config: dict = None):
        """Ask one question through the compiled graph.

        Args:
            input_text: The user's question.
            config: Invocation config forwarded to the graph, e.g.
                ``{"configurable": {"thread_id": "user1"}}``.
                NOTE(review): the graph is compiled with a checkpointer, which
                presumably requires a ``thread_id`` here — confirm before
                calling without a config.

        Returns:
            Whatever ``self._app.invoke`` returns for this turn; the notebook
            reads ``result["answer"]`` and ``result["context"]`` from it.
        """
        # Only `input` carries new information; the other keys are sent empty.
        state = {
            "input": input_text,
            "chat_history": [],
            "context": "",
            "answer": ""
        }
        # Invoke _app with state and config for user-specific query handling
        return self._app.invoke(state, config=config)

    def _initialize_api(
            self,
            key_groq: str,
            key_hf: str):
        """Read the API keys from the environment and configure tracing.

        Args:
            key_groq: Name of the environment variable holding the Groq API key.
            key_hf: Name of the environment variable holding the Hugging Face token.

        Returns: None

        Side effects:
            Sets ``self._groq_api_key`` and ``self._hf_api_key`` (``None`` when the
            variable is not set). Sets ``LANGCHAIN_TRACING_V2=true`` only when
            ``LANGCHAIN_API_KEY`` is present in the environment.
        """
        self._groq_api_key = os.getenv(key_groq)
        self._hf_api_key = os.getenv(key_hf)
        # Writing `os.getenv(...)` back into `os.environ` raises TypeError when the
        # variable is unset (environment values must be strings), so tracing is
        # switched on only if a key is actually configured.
        if os.getenv("LANGCHAIN_API_KEY"):
            os.environ["LANGCHAIN_TRACING_V2"] = "true"

    def _initialize_model(
            self,
            model_name: str,
            temperature: float,
            embedding_model_name: str):
        """Create the chat model and the embedding model.

        Requires ``_initialize_api`` to have been called (reads ``self._groq_api_key``).

        Args:
            model_name: Model identifier passed to ``ChatGroq``.
            temperature: Sampling temperature passed to ``ChatGroq``.
            embedding_model_name: Model name passed to ``HuggingFaceEmbeddings``.
        """
        self._model_name = model_name
        self._temperature = temperature
        self._model = ChatGroq(model=self._model_name, groq_api_key=self._groq_api_key, temperature=self._temperature)
        self._embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

    def _initialize_semantic_chunker(
            self,
            breakpoint_threshold_type: str,
    ):
        """Split every loaded page into semantic chunks tagged with their PDF.

        Requires ``_initialize_model`` (reads ``self._embeddings``) and
        ``_load_pdfs`` (reads ``self._pages``) to have run.

        Args:
            breakpoint_threshold_type: Forwarded to ``SemanticChunker``.

        Side effects:
            Sets ``self._semantic_chunker`` and ``self._semantic_chunks``; every
            chunk gets ``metadata["pdf_name"]`` copied from its source page.
        """
        self._semantic_chunker = SemanticChunker(self._embeddings, breakpoint_threshold_type=breakpoint_threshold_type)
        self._semantic_chunks = []
        for page in self._pages:
            # Pages without any text would only add empty chunks to the index.
            if not page["content"].strip():
                continue
            for chunk in self._semantic_chunker.create_documents([page["content"]]):
                chunk.metadata["pdf_name"] = page["pdf_name"]
                self._semantic_chunks.append(chunk)

    def _initialize_retriever_chain(
            self
    ):
        """Index the chunks and assemble the history-aware RAG chain.

        Requires ``_initialize_model`` and ``_initialize_semantic_chunker`` to
        have run.

        Side effects:
            Sets ``self._vectorstore``, ``self._retriever``,
            ``self._history_aware_retriever``, ``self._system_prompt``,
            ``self._prompt``, ``self._question_answer_chain`` and
            ``self._rag_chain``.
        """
        self._vectorstore = FAISS.from_documents(self._semantic_chunks, self._embeddings)
        self._retriever = self._vectorstore.as_retriever()

        # Contextualize question
        contextualize_q_system_prompt = (
            "Given a chat history and the latest user question "
            "which might reference context in the chat history, "
            "formulate a standalone question which can be understood "
            "without the chat history. Do NOT answer the question, "
            "just reformulate it if needed and otherwise return it as is."
        )

        contextualize_q_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", contextualize_q_system_prompt),
                MessagesPlaceholder("chat_history"),
                ("human", "{input}"),
            ]
        )

        self._history_aware_retriever = create_history_aware_retriever(self._model, self._retriever, contextualize_q_prompt)

        # Incorporate the history aware retriever into a question-answering chain.
        # Adjacent string literals are concatenated verbatim, so every fragment
        # that continues a sentence must end with a space.
        self._system_prompt = (
            "You are an assistant for helping students for questions regarding academic policies or bylaws. You have been provided information from official sources. "
            "Use ONLY the following pieces of retrieved context to answer "
            "the question. If the answer can be quoted from the PDFs then do that. "
            "If the question is not related to academic policies/bylaws then DO NOT ANSWER. If the question is relevant to academic policies/bylaws and you do not know the answer "
            "then say that you DO NOT know. Please Keep the answer moderately concise."
            "\n\n"
            "{context}"
        )

        self._prompt = ChatPromptTemplate.from_messages(
            [
                ("system", self._system_prompt),
                MessagesPlaceholder("chat_history"),
                ("human", "{input}"),
            ]
        )

        self._question_answer_chain = create_stuff_documents_chain(self._model, self._prompt)
        self._rag_chain = create_retrieval_chain(self._history_aware_retriever, self._question_answer_chain)

    async def _load_pdfs(
            self,
            paths: list
    ):
        """
        Load multiple PDFs and store their content in self._pages with metadata.

        Any previously loaded pages are discarded. Each entry of ``self._pages``
        is a dict with ``"content"`` (the page text) and ``"pdf_name"`` (the
        path the page was loaded from, exactly as given in ``paths``).

        Args:
            paths: List of PDF paths.
        """
        self._pages = []
        for path in paths:
            loader = PyPDFLoader(path)
            async for page in loader.alazy_load():
                # Keep the page text together with the path of its source PDF.
                self._pages.append({
                    "content": page.page_content,
                    "pdf_name": path
                })

In [125]:
# params
# Tunable settings consumed by `chatmodel._initialize_model` in the next cell.

# Chat model identifier handed to ChatGroq.
model_name = "gemma2-9b-it"
# Sentence-embedding model handed to HuggingFaceEmbeddings (used for chunking and the FAISS index).
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Sampling temperature handed to ChatGroq.
temperature = 0.1

In [126]:
# Build the assistant, then read API keys from the named environment variables
# and create the chat + embedding models from the params above.
model_obj = chatmodel()
model_obj._initialize_api("GROQ_API_KEY", "HF_TOKEN")
model_obj._initialize_model(model_name=model_name, temperature=temperature, embedding_model_name=embedding_model_name)

In [127]:
# Policy PDFs to index (relative paths). `_load_pdfs` is a coroutine, hence the
# top-level `await`.
file_paths = ["auditing_courses_0.pdf","academic_accommodation_for_students_with_disabilities_amended_july_17_2017.pdf"]
await model_obj._load_pdfs(file_paths)

In [128]:
# model_obj._pages

In [129]:
# Chunk the loaded pages, then build the FAISS index and the RAG chain on top of
# those chunks. Order matters: the retriever chain reads the chunks created first.
model_obj._initialize_semantic_chunker(breakpoint_threshold_type="percentile")
model_obj._initialize_retriever_chain()

In [85]:
# Markdown(model_obj._semantic_chunks[4].page_content)

In [130]:
# Retrieval sanity check: query the vector store directly (no chat model involved)
# and show the 3 closest chunks.
model_obj._vectorstore.similarity_search("What are the facilities for disabled students?", k=3)

[Document(metadata={'pdf_name': 'academic_accommodation_for_students_with_disabilities_amended_july_17_2017.pdf'}, page_content='2. The\tunivers ity\trecognizes\tits\tresponsibility\tto\tmaintain\tequal\trights\tfor\tall\tstudents\twith\tdisabilities\tin\ta\tfair\tand\t\nnon- discriminatory\tmanner. The\tuniversity\twill\tuphold\tits\tresponsibilities\tby\tensuring\tthe\tphysical\taccessibility\tof\t\nthe\tcampus,\tas\twell\tas,\tby\tcrea ting\tan\tenvironment\tthat\tis\tfree\tfrom\tbias\tand,\tinstead,\tembodies\tinclusiveness,\t\nmutual\trespect\tand\tunderstanding. 3. The\tUniversity\tof\tWindsor’s\t Student\tAccessibility\tServices \twill\twork\twith\tstudents\tand\tfaculty\tmembers\tto\t\nidentify\tpotential\tbarriers\tto\tacademic\tparticipation\tand\tto\tdetermine\tappropriate\taccommodations\tto\taddress\tthose\tbarriers\tfor\tstudents\twith\tdisabilities. This\twill\tbe\tdone\ton\ta\tcase\tby\tcase\tbasis\tprovided\tthat\tstudents\thave\tprovided\t Student\tAccessibility\tServ

## User 1 — conversation on thread `user1`

In [132]:
# Thread "user1", turn 1: ask for the course-auditing procedure.
config = dict(configurable=dict(thread_id="user1"))
result = model_obj.ask_query("What are the steps for auditing courses?", config)
Markdown(result["answer"])

Here are the steps to audit a course, according to the provided text:

1. **Contact the Office of the Registrar:** Reach out to them just before the course begins.
2. **Consultation:** The Office of the Registrar will work with the relevant faculty to determine if auditing is possible, considering space availability. 
3. **"Audit" Registration:** If approved, you'll be registered as an audit student. 



Let me know if you have any other questions. 


In [133]:
# Thread "user1", turn 2: a factual lookup that must come from the auditing PDF.
config = dict(configurable=dict(thread_id="user1"))
result = model_obj.ask_query("When was the policy on auditing courses approved by Senate?", config)
Markdown(result["answer"])

The policy on auditing courses was approved by the Senate on June 7, 2006.  


In [134]:
# Inspect the evidence behind the previous answer: the 4 closest chunks for the
# same question, each paired with its relevance score.
# Alternative call kept for reference:
# model_obj._vectorstore.similarity_search_with_score("When was the policy on auditing courses approved by Senate?", k=4)
model_obj._vectorstore.similarity_search_with_relevance_scores("When was the policy on auditing courses approved by Senate?", k=4)

[(Document(metadata={'pdf_name': 'auditing_courses_0.pdf'}, page_content='Page\t1\tof\t1\t\t\n\t\nSENATE\tPOLICY \t\n\t\nPolicy\ton\tAuditing\tCourses \t\n(Senate-approved:\tJune\t7,\t2006) \t\n\t\nAn\taudit\tstudent\tis\tone\twho,\tupon\tregistration,\tis\tadmitted\tto\ta\tcourse,\tand\tmay\tparticipate\tin\tclass\t discussion,\tbut\t\nwho\tmay\tnot\tsubmit\tassignments,\twrite\texaminations,\tor\tbe\tgraded\tin\tany\tway. Audit\tstudents\tmust\tbe\tadmitted\tto\tthe\tuniversity.'),
  0.7300665047613706),
 (Document(metadata={'pdf_name': 'auditing_courses_0.pdf'}, page_content='Audited\tcourses\twill\tappear\ton\tthe\tstudent\'s\ttranscript\tbut\tno\tcredit\twill\tbe\tgranted\tfor\tthe\tcourse. Audited\t courses\t\ndo\tnot\tcount\ttoward\ta\tstudent\'s\tload\tfor\tOSAP\tpurposes. Not\tall\tcourses\tcan\tbe\taudited. Lab\tcourses\tcannot\tbe\taudited. Only\tcourses\twith\tsufficient\tclassroom\tspace\tand\t\nenrolment\tcapacity\tcan\tbe\taudited. Audit\tstudents\tmust\thave\tcompleted\

In [135]:
# Thread "user1", turn 3: memory check — the answer should list the two earlier
# questions asked on this thread.
config = {"configurable": {"thread_id": "user1"}}
result = model_obj.ask_query(
    input_text="What all questions did I ask about earlier?",
    config=config
)
# Answers are Markdown-formatted (lists, bold); render them like the other cells do.
Markdown(result['answer'])

You asked the following questions:

1. What are the steps for auditing courses?
2. When was the policy on auditing courses approved by Senate? 


Let me know if you have any other questions!


In [136]:
# Thread "user1", turn 4: off-topic probe. The system prompt tells the model not
# to answer questions unrelated to academic policies/bylaws; the recorded output
# below shows whether it complied.
config = {"configurable": {"thread_id": "user1"}}
result = model_obj.ask_query(
    input_text="How many stars in the universe?",
    config=config
)
# Answers are Markdown-formatted (lists, bold); render them like the other cells do.
Markdown(result['answer'])

That's a question that has fascinated astronomers for centuries!  

Unfortunately, there's no single, definitive answer to how many stars are in the universe. Here's why:

* **The universe is vast:**  It's incredibly large, and we can only observe a tiny fraction of it.
* **Stars are constantly being born and dying:** The number is always in flux.
* **Limits of our technology:**  We can't see all stars, especially the faintest ones.

**Estimates:**

Scientists estimate that there are **hundreds of billions of galaxies** in the observable universe. Each galaxy contains **billions or even trillions of stars**.  

So, a rough estimate is that there are **an unfathomably large number of stars** in the universe, likely in the **quadrillions or even higher**. 


Let me know if you have any other space-related questions! 



## User 2 — separate conversation on thread `user2`

In [137]:
# Thread "user2", turn 1: a new thread id starts a separate conversation.
# The answer is rendered in the next cell.
config = dict(configurable=dict(thread_id="user2"))
result = model_obj.ask_query("What is the purpose of academic accomodation for disabled students?", config)

In [138]:
# Render the answer from the previous cell as Markdown.
Markdown(result['answer'])

"The overarching purpose of academic accommodations is to ensure that students with disabilities may fulfill the essential requirements of their academic programs as well as learn and demonstrate learning in ways that accurately reflect each student’s potential." 


In [139]:
# Show the `context` entry of the last result, i.e. the retrieved documents that
# accompanied that answer.
result['context']

[Document(metadata={'pdf_name': 'academic_accommodation_for_students_with_disabilities_amended_july_17_2017.pdf'}, page_content='This\tincludes\tcourse\trequirements\tand\tstandards”. Essential\trequirements\tcan\ttherefore\tbe\tdefined\tby\ttwo\tfactors:\ta\tskill\tthat\tmust\tbe\tnecessarily\tdemonstrated\tin\t\norder\tto\tmeet\tthe\t objectives\tof\ta\tcourse;\tand\ta\tskill\tthat\tmust\tbe\tdemonstrated\tin\ta\tprescribed\tmanner. Academic\tAccommodations \t\n\tAn\taccommodation\trefers\tto\tany\tservice,\tequipment,\tor\tarrangement\tthat\tis\tput\tin\tplace\tto\tsupport\ta\tstudent\twith\ta\t\ndisability\tin\tthe\tuniversity\t setting\tand\tis\tmeant\tto\tenable\tstudents\tto\tbest\tperform\tthe\tessential\trequirements\tof\t\ntheir\tacademic\tprogram. Functional\tlimitations\tthat\tlead\tto\taccommodations\twill\tbe\tidentified\tby\tmedical\t\nprofessionals\twho\tare\tsupporting\tthe\tstudent. Accommodations\tdo\t not\tundermine\tor\tcompromise\tthe\tessential\t\nrequirements\to

In [140]:
# Query the vector store directly with the same question to see the 4 closest
# chunks and their relevance scores.
model_obj._vectorstore.similarity_search_with_relevance_scores("What is the purpose of academic accomodation for disabled students?", k=4)

[(Document(metadata={'pdf_name': 'academic_accommodation_for_students_with_disabilities_amended_july_17_2017.pdf'}, page_content='This\tincludes\tcourse\trequirements\tand\tstandards”. Essential\trequirements\tcan\ttherefore\tbe\tdefined\tby\ttwo\tfactors:\ta\tskill\tthat\tmust\tbe\tnecessarily\tdemonstrated\tin\t\norder\tto\tmeet\tthe\t objectives\tof\ta\tcourse;\tand\ta\tskill\tthat\tmust\tbe\tdemonstrated\tin\ta\tprescribed\tmanner. Academic\tAccommodations \t\n\tAn\taccommodation\trefers\tto\tany\tservice,\tequipment,\tor\tarrangement\tthat\tis\tput\tin\tplace\tto\tsupport\ta\tstudent\twith\ta\t\ndisability\tin\tthe\tuniversity\t setting\tand\tis\tmeant\tto\tenable\tstudents\tto\tbest\tperform\tthe\tessential\trequirements\tof\t\ntheir\tacademic\tprogram. Functional\tlimitations\tthat\tlead\tto\taccommodations\twill\tbe\tidentified\tby\tmedical\t\nprofessionals\twho\tare\tsupporting\tthe\tstudent. Accommodations\tdo\t not\tundermine\tor\tcompromise\tthe\tessential\t\nrequirements\t

In [141]:
# Thread "user2", turn 2: the name has not been mentioned yet on this thread, so
# the assistant should not know it.
config = {"configurable": {"thread_id": "user2"}}
result = model_obj.ask_query(
    input_text="What is my name?",
    config=config
)
# Render the answer as Markdown, consistent with the other answer cells.
Markdown(result['answer'])

As an AI, I do not have access to personal information about you, including your name. 



In [142]:
# Thread "user2", turn 3: give the assistant a fact to remember.
config = {"configurable": {"thread_id": "user2"}}
result = model_obj.ask_query(
    input_text="My name is Arnab",
    config=config
)
# Render the answer as Markdown, consistent with the other answer cells.
Markdown(result['answer'])

Hello Arnab, it's nice to meet you! 

Is there anything I can help you with regarding academic policies or bylaws? 



In [143]:
# Thread "user2", turn 4: memory check — the name given in the previous turn
# should now be recalled from this thread's history.
config = {"configurable": {"thread_id": "user2"}}
result = model_obj.ask_query(
    input_text="What is my name?",
    config=config
)
# Render the answer as Markdown, consistent with the other answer cells.
Markdown(result['answer'])

Your name is Arnab.  

Is there anything else I can help you with?
