In [2]:
# install dependencies
! pip install langchain_ollama langchain_core 'markitdown[all]' pydantic langgraph



In [9]:
import warnings

warnings.filterwarnings(
    "ignore",
    message="Accessing the 'model_fields' attribute on the instance is deprecated.*",
)

In [4]:
# Start the chat model. Ollama runs on a remote server whose port is forwarded
# to local port 11434.
from langchain_ollama import ChatOllama

OLLAMA_MODEL = "llama3.1:8b"
llm = ChatOllama(model=OLLAMA_MODEL)

In [5]:
from langchain_core.tools import tool
from markitdown import MarkItDown
from pydantic import BaseModel, Field
import os

# Shared document-to-markdown converter.
# `client` is not a MarkItDown option (its LLM hooks are `llm_client` / `llm_model`
# and expect an OpenAI-style client), so the former `client=llm` argument was
# silently ignored. It is dropped so the cell no longer implies that the chat
# model takes part in the conversion.
md = MarkItDown()

@tool
def get_markdown_pdf(file_name: str) -> str:
    """
    Given a file name, return the markdown content of the PDF if the file exists.
    """
    # NOTE: the docstring above is kept verbatim on purpose: `@tool` hands it to
    # the model as the tool description, so editing it changes the prompt.
    basedir = "pdf_repo"
    not_found = f"File {file_name} not found in {basedir}, please check the file name."

    # A missing or unreadable repository is reported as text, like every other
    # failure of this tool, instead of surfacing as an exception.
    try:
        files = os.listdir(basedir)
    except OSError:
        return f"Directory {basedir} is not available, so {file_name} cannot be read."

    # Only names that appear directly in the repository listing are accepted;
    # this also rejects anything that contains a path separator.
    if file_name not in files:
        return not_found

    file_path = os.path.join(basedir, file_name)
    # A sub-directory can carry the requested name; it is not a convertible file.
    if not os.path.isfile(file_path):
        return not_found

    result = md.convert(file_path)
    if result is None:
        return f"Failed to convert {file_name} to markdown."
    return result.text_content

# Example usage: a tool is a runnable, so it is called through `invoke`.
# Calling the tool object directly goes through the deprecated
# `BaseTool.__call__` path and emits a deprecation warning.
get_markdown_pdf.invoke({"file_name": "a.pdf"})

  get_markdown_pdf("a.pdf")


'系\u3000所\u3000別\n\n組\u3000\u3000\u3000別\n\n所 組 代 碼\n\n身\u3000分\u3000別\n\n招 生 名 額\n\n報 名 資 格\n附 加 規 定\n\n報 名 審 查\n資       料\n\n審查\n\n筆試\n\n考試\n項目\n\n口試\n\n審 查\n方 式\n\n佔總成\n績比例\n\n參 加\n資 格\n\n筆 試\n科 目\n\n筆試日\n期地點\n\n筆試注\n意事項\n\n佔總成\n績比例\n\n參 加\n資 格\n\n口試日\n期地點\n\n口試注\n意事項\n\n佔總成\n績比例\n\n生醫電子與資訊學研究所碩士班\n\n丙組(主修生醫資訊)\n\n9210\n\n一般生\n\n7\n\n一、學位證書或學生證\n二、歷年成績單\n三、名次證明書\n四、1.履歷；2.申請動機與個人陳述（查照其他規定第二點）\n五、個人基本資料表（查照其他規定第三點）\n六、其他有助於審查之資料：如研究報告、競賽榮譽證書等皆可列入參考\n七、一封以上推薦函(推薦人或考生皆可上傳，不限格式)\n(報名期間補件請洽所辦公室，上傳截止日後除推薦函外不受理資料補件。)\n\n就所繳資料加以審查\n\n100%\n\n0%\n\n0%\n\n其 他 規 定\n\n一、報名請至研教組網頁。\n二、本所網頁已公告「必繳報名審查資料」之格式，請務必依規定格式繳交。\n三、個人基本資料表：報名前請至本所網站填寫後上傳至報名系統。此「個人基本\n\u3000\u3000資料表」為主要審查之參考，請正確且詳實填寫相關項目。\n四、考生請於鍵入報名資料時確認報考組別。\n五、報名系統關閉後，補繳推薦函需於10月16日(三)中午12:00前mail寄至yuchun\n\u3000\u3000chu@ntu.edu.tw。\n\n放 榜 梯 次\n\n於第1梯次放榜\n\n聯 絡 電 話\n\n(02)33664961\n\n網     址\n\nhttp://www.bebi.ntu.edu.tw\n\n114學年度-229\n\n'

In [6]:
from langgraph.prebuilt import create_react_agent
from langchain_core.messages import HumanMessage
from langgraph.checkpoint.memory import MemorySaver

# In-memory checkpointer handed to the agent.
memory = MemorySaver()

# ReAct-style agent whose only tool is the PDF-to-markdown reader.
agent_executor = create_react_agent(
    model=llm,
    tools=[get_markdown_pdf],
    checkpointer=memory,
)


In [7]:
from langchain_core.prompts import ChatPromptTemplate

# Two-message prompt: a fixed system role plus a human request naming both files.
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a professional pdf analyzer."),
        ("human", "Please compare the following files: {file_name1}, {file_name2}."),
    ]
)

# Example usage: fill in both placeholders and display the resulting messages.
prompted_messages = prompt_template.invoke(
    dict(file_name1="a.pdf", file_name2="b.pdf")
)
prompted_messages.to_messages()

[SystemMessage(content='You are a professional pdf analyzer.', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Please compare the following files: a.pdf, b.pdf.', additional_kwargs={}, response_metadata={})]

In [10]:
# Thread identifier passed to the agent for this conversation.
config = {"configurable": {"thread_id": "abc134"}}

# Build the message list from the prompt template, asking for file 1 before file 2.
first_file = input("請輸入你想比較的文件1: ")
second_file = input("請輸入你想比較的文件2: ")
prompted_messages = prompt_template.invoke(
    {"file_name1": first_file, "file_name2": second_file}
)

# Stream the run and print every emitted chunk followed by a separator line.
agent_inputs = {"messages": prompted_messages.to_messages()}
for chunk in agent_executor.stream(agent_inputs, config):
    print(chunk)
    print("----")


{'agent': {'messages': [AIMessage(content='', additional_kwargs={}, response_metadata={'model': 'llama3.1:8b', 'created_at': '2025-04-26T12:27:40.235670386Z', 'done': True, 'done_reason': 'stop', 'total_duration': 3210384097, 'load_duration': 51231040, 'prompt_eval_count': 187, 'prompt_eval_duration': 1289929223, 'eval_count': 42, 'eval_duration': 1866685007, 'model_name': 'llama3.1:8b'}, id='run-22510df5-842a-4b22-8c28-c325b66a076c-0', tool_calls=[{'name': 'get_markdown_pdf', 'args': {'file_name': 'a.pdf'}, 'id': '874cad62-81f6-4e39-a987-5177298d0039', 'type': 'tool_call'}, {'name': 'get_markdown_pdf', 'args': {'file_name': 'b.pdf'}, 'id': '6c36a08e-ffa0-410c-857b-6f87e3768f1f', 'type': 'tool_call'}], usage_metadata={'input_tokens': 187, 'output_tokens': 42, 'total_tokens': 229})]}}
----
{'tools': {'messages': [ToolMessage(content='系\u3000所\u3000別\n\n組\u3000\u3000\u3000別\n\n所 組 代 碼\n\n身\u3000分\u3000別\n\n招 生 名 額\n\n報 名 資 格\n附 加 規 定\n\n報 名 審 查\n資       料\n\n審查\n\n筆試\n\n考試\n項目\n\n口試\n\n審

In [13]:
# Follow-up question sent with the current `config`, printed chunk by chunk.
follow_up = HumanMessage(content="What's the biggest difference tho?")
for chunk in agent_executor.stream({"messages": [follow_up]}, config):
    print(chunk)
    print("----")

{'agent': {'messages': [AIMessage(content='', additional_kwargs={}, response_metadata={'model': 'llama3.1:8b', 'created_at': '2025-04-26T11:40:35.614033419Z', 'done': True, 'done_reason': 'stop', 'total_duration': 2622053248, 'load_duration': 51951208, 'prompt_eval_count': 1659, 'prompt_eval_duration': 1522296464, 'eval_count': 22, 'eval_duration': 1017401065, 'model_name': 'llama3.1:8b'}, id='run-145e7885-a656-44c5-a51b-fca4d6f96c10-0', tool_calls=[{'name': 'get_markdown_pdf', 'args': {'file_name': 'diff_a.pdf'}, 'id': '4530903e-604a-4e39-b1ac-b00e4f6939b2', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1659, 'output_tokens': 22, 'total_tokens': 1681})]}}
----
{'tools': {'messages': [ToolMessage(content='File diff_a.pdf not found in pdf_repo, please check the file name.', name='get_markdown_pdf', id='c095c36e-2c06-4ef5-91b3-123c1631e3d3', tool_call_id='4530903e-604a-4e39-b1ac-b00e4f6939b2')]}}
----
{'agent': {'messages': [AIMessage(content='Based on my previous analysis, one 

In [14]:
# Build the message list from the prompt template first (file 1, then file 2).
first_file = input("請輸入你想比較的文件1: ")
second_file = input("請輸入你想比較的文件2: ")
prompted_messages = prompt_template.invoke(
    {"file_name1": first_file, "file_name2": second_file}
)
config = {"configurable": {"thread_id": "abc2223"}}

# Example using streaming: with stream_mode="messages" each item is unpacked
# into a message piece and its metadata.
for piece, meta in agent_executor.stream(
    {"messages": prompted_messages.to_messages()},
    config,
    stream_mode="messages",
):
    # Skip everything that was not produced by the "agent" node.
    if meta["langgraph_node"] != "agent":
        continue
    text = piece.text()
    # Empty text is not printed.
    if text:
        print(text, end="|")

Based on the comparison of the two PDF files, here are some observations:

1. Both PDFs appear to be related to graduate school admission at National Taiwan University.
2. The content is mostly identical, with minor differences in formatting and layout.
3. There are two different sections or "組" mentioned: 甲組 (Group A) and 丙組 (Group C).
4. Group A has a specific requirement for the applicant's major to be "生醫電子一" (Biomedical Electronics I), while Group C has a more general requirement for the applicant's major.
5. The two PDFs have different numbers of applicants accepted: 16 in Group A and 7 in Group C.

In summary, while the content is mostly identical, there are some differences between the two PDFs related to specific requirements and applicant numbers for each group.|

In [None]:
# Follow-up question on the thread selected by the current `config`.
# The message is wrapped in a list so the input has the same shape as the
# other follow-up cell, which passes `[HumanMessage(...)]`.
for step, metadata in agent_executor.stream(
    {"messages": [HumanMessage(content="Which group interests you the most?")]},
    config,
    stream_mode="messages",
):
    # Print only non-empty text coming from the "agent" node.
    if metadata["langgraph_node"] == "agent" and (text := step.text()):
        print(text, end="|")