## Generate Question Answer pairs from a PDF using LangGraph - iterative refinement

#### Imports

In [1]:
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings


from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.output_parsers import StrOutputParser

from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableConfig
from langchain_chroma import Chroma


import operator
from typing import List, Literal, TypedDict
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.runnables import RunnableConfig
from langchain import hub

from decouple import config

from langgraph.graph import END, START, StateGraph

#### Trace in LangSmith

In [2]:
# NOTE(review): getpass is imported but not used in the visible cells.
import getpass
import os

# Enable LangSmith tracing through environment variables.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# `config` here is the lookup function imported from `decouple` in the imports cell.
os.environ["LANGCHAIN_API_KEY"] = config('LANGCHAIN_API_KEY')

#### Load LLM and Embeddings

* Both the chat model and the embedding model are served locally by LM Studio through its OpenAI-compatible endpoint.

In [3]:
# Chat model reached through the local OpenAI-compatible server on port 1234.
llm = ChatOpenAI(
    api_key="lm-studio",
    base_url="http://localhost:1234/v1",
    temperature=0.5,
)

In [4]:
# Embedding model, served by the same local OpenAI-compatible endpoint as the chat model.
embeddings = OpenAIEmbeddings(
    api_key="sk-1234", 
    base_url="http://localhost:1234/v1/", 
    model="nomic-ai/nomic-embed-text-v1.5-GGUF/nomic-embed-text-v1.5.Q4_K_M.gguf",
    # NOTE(review): presumably disabled because the local model is not an
    # OpenAI model and the client-side context-length handling does not apply — confirm.
    check_embedding_ctx_length=False
    )

#### Step-1: Load PDF

In [5]:
# Absolute path of the PDF to process; adjust it for your machine.
pdf_path = "/home/acer/Downloads/NIPS-2012-imagenet-classification-with-deep-convolutional-neural-networks-Paper.pdf"
# Alternative input document:
# pdf_path = "/home/acer/workspace/personal/llms/data/sdgp.pdf"
loader = PyPDFLoader(pdf_path)
# `docs` is the list of documents produced by the loader; it feeds the splitters below.
docs = loader.load()

#### Step-2: Create Question and Answer splitter

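1. Create the question splitter: it cuts the PDF into large chunks that serve as context for question generation.
2. Create the answer splitter by re-splitting the question chunks into smaller, overlapping chunks. This ensures that answers are retrieved only from the text that was used to generate the questions.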

In [None]:
# Large, non-overlapping chunks used as context for question generation.
question_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=0)
# NOTE: despite its name, `questions` holds the context chunks (documents),
# not generated questions.
questions = question_splitter.split_documents(docs)

# Preview the first chunk.
print(questions[0].page_content)

In [None]:
# Smaller, overlapping chunks derived from the question chunks (not from the
# raw pages); these are what gets indexed for answer retrieval later.
answer_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
# NOTE: `answers` holds chunks to retrieve from, not generated answers.
answers = answer_splitter.split_documents(questions)

# Preview the first chunk.
print(answers[0].page_content)

#### Step-3: Create question generation prompt, prompt template, initial question generation chain

In [8]:
# Prompt for the first pass: generate questions from a single chunk, which is
# substituted for the {context} placeholder.
# NOTE(review): the prompt text says "loose"; "lose" is meant. It is left
# unchanged here because the string is sent to the model verbatim.
initial_question_generation_prompt = """
You are an expert in creating questions based on a provided text corpus.
Your goal is to help prepare questions for student tests and exams.
You should do this by asking questions from the text provided below:

------------
{context}
------------

Please ensure that you do not loose any important information.

QUESTIONS:
"""

In [9]:
# Template with a single input variable: the chunk to ask questions about.
initial_question_prompt_template = PromptTemplate(
    template=initial_question_generation_prompt,
    input_variables=["context"],
)

In [10]:
# Pipeline: fill the prompt, call the LLM, return the reply as a plain string.
initial_question_generation_chain = (
    initial_question_prompt_template
    | llm
    | StrOutputParser()
)

#### Step-4: Create question refinement prompt, prompt template, question refinement chain

In [11]:
# Prompt for the refinement passes. {existing_questions} receives the questions
# produced so far and {context} receives the next chunk.
# The "\n" sequences sit in a regular (non-raw) string, so they become real newlines.
# NOTE(review): the text contains the typo "comlpete", and the sentence
# "provided below:" is followed by further instructions before the context
# block. It is left unchanged because the string is sent to the model verbatim.
refine_question_prompt = """
You are an expert in creating and refining questions based on a provided text corpus.
Your goal is to help a faculty prepare questions for student tests and exams.
Here are some of the questions already prepared: \n {existing_questions} \n.

You have an option to refine the existing questions or add new ones if necessary based on the new context provided below: \n
When you have a refined question, just replace the existing question with the new one. 
At the end provide a comlpete list of only the refined questions with a question mark.
\n ------------ \n

{context}

\n ------------ \n


QUESTIONS:
"""


In [12]:
# Template with two input variables: the new chunk and the questions so far.
refine_question_prompt_template = PromptTemplate(
    template=refine_question_prompt,
    input_variables=["context", "existing_questions"],
)

In [13]:
# Pipeline: fill the refinement prompt, call the LLM, return the reply as a string.
refine_summary_chain = (
    refine_question_prompt_template
    | llm
    | StrOutputParser()
)

#### Step-5: Create State Graph using LangGraph

1. Create the state dictionary - the data structure shared between nodes

In [14]:
class State(TypedDict):
    """State shared between the graph nodes."""

    # Text of every context chunk, in processing order.
    contents: List[str]
    # Position in `contents` of the next chunk to process.
    index: int
    # Latest LLM output; in this notebook it holds the generated questions.
    summary: str

2. Define the functions to be executed for each node

In [15]:
async def generate_initial_summary(state: State, config: RunnableConfig):
    """Generate the first batch of questions from the first context chunk.

    Args:
        state: Graph state; only ``state["contents"][0]`` is read.
        config: Runnable config forwarded to the chain call.

    Returns:
        A partial state update: the generated questions under ``"summary"``
        and ``"index"`` set to 1, i.e. the next chunk to process.
    """
    # Name the prompt variable explicitly, the same way refine_summary does,
    # instead of relying on a bare string being mapped onto the template's
    # only input variable.
    summary = await initial_question_generation_chain.ainvoke(
        {"context": state["contents"][0]},
        config,
    )

    return {"summary": summary, "index": 1}
    

In [16]:
async def refine_summary(state: State, config: RunnableConfig):
    """Refine the current questions using the chunk at ``state["index"]``.

    Returns a partial state update with the refined questions under
    ``"summary"`` and ``"index"`` advanced by one.
    """
    position = state["index"]
    chunk = state["contents"][position]
    chain_inputs = {"existing_questions": state["summary"], "context": chunk}

    refined = await refine_summary_chain.ainvoke(chain_inputs, config)

    return {"summary": refined, "index": position + 1}

3. Define stopping criteria

In [17]:
def should_refine(state: State) -> Literal["refine_summary", END]:
    """Route to another refinement pass while unprocessed chunks remain, else end."""
    chunks_remaining = state["index"] < len(state["contents"])
    if chunks_remaining:
        return "refine_summary"
    return END

#### Step-6: Create a Graph, add node, add edges, compile, and display

In [None]:
# Build the graph over the shared State and register the two node functions
# under the names used by the edges and by should_refine's return values.
graph = StateGraph(State)
graph.add_node("generate_initial_summary", generate_initial_summary)
graph.add_node("refine_summary", refine_summary)

In [None]:
# Entry point: always start with the initial generation node.
graph.add_edge(START, "generate_initial_summary")
# After either node, should_refine decides between another refinement pass and END.
graph.add_conditional_edges("generate_initial_summary", should_refine)
graph.add_conditional_edges("refine_summary", should_refine)

In [20]:
# Compile the graph definition into the runnable application used below.
app = graph.compile()

In [None]:
from IPython.display import Image
# Render the compiled graph as a Mermaid PNG; being the cell's last
# expression, the image is what the notebook shows as the cell output.
Image(app.get_graph().draw_mermaid_png())

#### Step-7: Execute the graph to generate initial questions and then refine them

In [None]:
questions_list = []
async for step in app.astream(
    {"contents": [doc.page_content for doc in questions]},
    stream_mode = "values",
):

    if summary := step.get("summary"):
        print(summary)
        questions_list.append(summary)

In [23]:
# Take the final refined output (empty if the graph produced nothing) and
# keep only the lines that are questions.
final_output = questions_list[-1] if questions_list else ""
ques_list = final_output.split("\n")
# Strip first: a trailing space or "\r" after the "?" would otherwise make
# endswith("?") fail and silently drop the question.
filtered_ques_list = [
    element.strip() for element in ques_list if element.strip().endswith("?")
]

In [None]:
# Show the questions that will be sent to the QA chain.
print(filtered_ques_list)

#### Step-8: Store answers in vector store, create a qa chain, ask generated questions, generate answers

In [25]:
# Index the small answer chunks in Chroma using the local embedding model.
vector_store = Chroma.from_documents(documents=answers, embedding=embeddings)

In [26]:
# Pull the RAG prompt from the LangChain Hub.
# See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    """Return the ``page_content`` of every document, separated by blank lines."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)


# Retrieval QA pipeline: the incoming question is used both to retrieve
# chunks from the vector store and as the "question" value of the prompt.
qa_chain = (
    {
        # Retrieved documents are flattened into one text block by format_docs.
        "context": vector_store.as_retriever() | format_docs,
        # The question itself is passed through unchanged.
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)


In [None]:
# Answer every generated question through the retrieval QA chain and print
# each question/answer pair followed by a separator line.
for question in filtered_ques_list:
    print(f"Question: {question}")
    response = qa_chain.invoke(question)
    print(f"Answer: {response}", "---", sep="\n")

#### Further Work
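* Refine the question generation prompt to generate better questions
* Use structured output for the generated questions instead of parsing free text?
* Set the graph recursion limit explicitly so that long documents (many chunks) can be processed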