In [1]:
# Install the notebook's dependencies into the active kernel environment (-q: quiet, -U: upgrade).
# NOTE(review): no versions are pinned, so a fresh run may resolve to different releases.
%pip install -qU langchain langchain-community langchain-openai chromadb arxiv tiktoken pymupdf #opentelemetry-sdk==1.31.0

In [2]:
import getpass
import os

# Credentials must never live in the notebook source or its saved outputs.
# The key is taken from the environment; if it is absent, it is requested
# interactively (the typed value is not echoed).
# Any key that was ever pasted into this cell has to be treated as leaked and
# revoked in the OpenAI dashboard.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OPENAI_API_KEY: ")

# Tunable configuration for the rest of the notebook.
LLM_MODEL = "gpt-4o-mini"                  # chat model name passed to ChatOpenAI
EMBED_MODEL = "text-embedding-3-large"     # embedding model name passed to OpenAIEmbeddings
CHROMA_DIR = "./chroma_arxiv"              # on-disk directory of the Chroma store
COLLECTION_NAME = "arxiv_top_tier"         # Chroma collection holding the paper chunks

In [3]:
from langchain_community.retrievers import ArxivRetriever
import re
import arxiv
import fitz

def build_top_tier_query(user_query: str, venues=None) -> str:
    """Build the arXiv search expression for ``user_query``.

    Args:
        user_query: Free-text search terms.
        venues: Optional iterable of venue names, e.g. ``["NeurIPS", "ICML"]``.
            When given and non-empty, the query is restricted to entries whose
            journal-reference (``jr:``) or comment (``co:``) field mentions one
            of the venues. Defaults to ``None`` (no venue restriction), which
            is the behaviour every existing caller gets.

    Returns:
        ``"(<user_query>)"`` when no venues are given, otherwise
        ``"(<venue filter>) AND (<user_query>)"``.
    """
    base = f"({user_query})"
    if not venues:
        return base
    # NOTE(review): venue filtering was previously disabled here; check recall
    # against the arXiv API before enabling it for real searches.
    venue_filter = " OR ".join(f"jr:({v}) OR co:({v})" for v in venues)
    return f"({venue_filter}) AND {base}"


# Shared arXiv retriever; invoked by run_arxiv_search_with_top_tier.
# NOTE(review): unless full-document mode is enabled, ArxivRetriever presumably returns
# abstracts only, and the number of hits may then be governed by `top_k_results` rather
# than `load_max_docs` — confirm against the installed langchain-community version.
retriever = ArxivRetriever(
    load_max_docs=15,
    doc_content_chars_max=30000
)

def extract_common_fields_from_doc(d):
    """Pull the display fields out of a retrieved document.

    Several spellings of each metadata key are tried because the key names
    differ between retriever modes.

    Args:
        d: Object exposing ``metadata`` (dict or ``None``) and ``page_content``
            (str or ``None``).

    Returns:
        A ``(title, url, authors, published, content)`` tuple. ``title``,
        ``url`` and ``authors`` are ``None`` when absent; ``published`` is the
        string form of the date, or ``""`` when absent (never the literal
        ``"None"``); ``content`` is the stripped page text.
    """
    m = d.metadata or {}
    title     = m.get("title") or m.get("Title")
    url       = m.get("entry_id") or m.get("Entry ID") or m.get("Entry_ID") or m.get("url") or m.get("pdf_url")
    authors   = m.get("Authors") or m.get("authors")
    raw_published = m.get("Published") or m.get("published") or m.get("publish_date")
    # str(None) would leak the text "None" into prompts and stored metadata.
    published = str(raw_published) if raw_published else ""
    content   = (d.page_content or "").strip()
    return title, url, authors, published, content


def pretty_print_docs(docs, max_chars=1200):
    """Render retrieved documents as numbered Markdown sections.

    Args:
        docs: Iterable of documents accepted by ``extract_common_fields_from_doc``.
        max_chars: Content longer than this is cut and suffixed with " ...".

    Returns:
        One string with a section per document, sections separated by a blank
        line; an empty string when ``docs`` is empty.
    """
    def _section(position, doc):
        title, url, authors, published, body = extract_common_fields_from_doc(doc)
        if len(body) > max_chars:
            body = body[:max_chars] + " ..."
        return "\n".join((
            f"### [{position}] {title}",
            f"- url: {url}",
            f"- published: {published}",
            f"- authors: {authors}",
            f"- content:\n{body}",
        ))

    return "\n\n".join(_section(i, d) for i, d in enumerate(docs, 1))

def extract_arxiv_id_from_url(url: str) -> str:
    """Extract the version-less arXiv identifier from an abs/pdf URL.

    Both identifier schemes are recognised: the current numeric form
    (``2104.02326``) and the older archive-prefixed form (``hep-th/9901001``,
    ``math.GT/0309136``). A trailing version suffix such as ``v2`` is dropped.

    Args:
        url: An ``arxiv.org/abs/...`` or ``arxiv.org/pdf/...`` URL; may be
            empty or ``None``.

    Returns:
        The identifier, or ``""`` when ``url`` is empty or not an arXiv URL.
    """
    if not url:
        return ""
    m = re.search(
        r'arxiv\.org/(?:abs|pdf)/([0-9]+\.[0-9]+|[a-z\-]+(?:\.[A-Z]{2})?/[0-9]{7})',
        url,
    )
    return m.group(1) if m else ""

def fetch_full_text_from_arxiv_id(arxiv_id: str, char_limit: int = None) -> str:
    """Download a paper's PDF from arXiv and return its extracted text.

    The PDF is written to a temporary directory that is removed before the
    function returns, so repeated calls do not leave files in the working
    directory.

    Args:
        arxiv_id: arXiv identifier, e.g. ``"2104.02326"``. Empty or ``None``
            short-circuits to ``""`` without any network access.
        char_limit: If not ``None``, the text is cut to this many characters
            before stripping.

    Returns:
        The stripped text of all pages concatenated, or ``""`` when the id is
        empty or arXiv returns no result.

    Raises:
        Whatever the arXiv client or the PDF reader raises on network or
        parsing failures; callers decide how to fall back.
    """
    import tempfile  # local import: only this function needs it

    if not arxiv_id:
        return ""
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(iter(arxiv.Client().results(search)), None)
    if paper is None:
        return ""
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = paper.download_pdf(dirpath=tmp_dir)
        # The document is closed before the directory is deleted.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text() for page in doc)
    if char_limit is not None and len(text) > char_limit:
        text = text[:char_limit]
    return text.strip()

In [4]:
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import TokenTextSplitter
from langchain_community.vectorstores import Chroma

# Embedding function handed to the Chroma collection below.
embeddings = OpenAIEmbeddings(model=EMBED_MODEL)
# Token-based splitter: chunks of 3000 tokens, neighbouring chunks overlap by 1000 tokens.
text_splitter = TokenTextSplitter(
    chunk_size=3000, chunk_overlap=1000
)
# Chroma collection persisted under CHROMA_DIR; written by upsert_into_chroma and
# read by the local retriever built in a later cell.
# NOTE(review): the langchain_community Chroma class appears to be deprecated in favour
# of the separate langchain-chroma package — confirm before upgrading dependencies.
vectorstore = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=CHROMA_DIR,
)

def upsert_into_chroma(docs, fulltext_char_limit: int = None):
    """Chunk each retrieved paper and add the chunks to the Chroma collection.

    For every document the full PDF text is preferred; if it cannot be fetched
    (no arXiv id, download/parsing error, empty result) the abstract carried
    by the document is used instead. Documents with no text at all are skipped.

    Args:
        docs: Iterable of documents accepted by ``extract_common_fields_from_doc``.
        fulltext_char_limit: Optional cap on the number of characters of full
            text fetched per paper; ``None`` means no cap.

    Returns:
        None. Chunks are written to the module-level ``vectorstore``.
    """
    import hashlib
    import logging

    log = logging.getLogger(__name__)
    texts, metadatas, ids = [], [], []
    seen_ids = set()

    for d in docs:
        title, url, authors, published, abstract = extract_common_fields_from_doc(d)
        arxiv_id = extract_arxiv_id_from_url(url)

        # Only a real arXiv id can be downloaded; a title slug would just
        # trigger a doomed network request.
        full_text = ""
        if arxiv_id:
            try:
                full_text = fetch_full_text_from_arxiv_id(arxiv_id, char_limit=fulltext_char_limit)
            except Exception as exc:
                # Best-effort by design, but leave a trace instead of failing silently.
                log.warning("Full-text fetch failed for %s (%s); falling back to the abstract.", arxiv_id, exc)
        if not full_text:
            full_text = abstract or ""
        if not full_text.strip():
            continue

        # Stable per-paper key: arXiv id, else a slug of the title, else a
        # content hash so that untitled papers never share the "::<j>" ids.
        doc_key = arxiv_id or (re.sub(r'\W+', '-', str(title))[:50] if title else "")
        if not doc_key:
            doc_key = "doc-" + hashlib.sha1(full_text.encode("utf-8")).hexdigest()[:12]

        for j, chunk in enumerate(text_splitter.split_text(full_text)):
            chunk_id = f"{doc_key}::{j}"
            if chunk_id in seen_ids:
                # The same paper appeared twice in this batch; duplicate ids
                # within one write are not acceptable to the store.
                continue
            seen_ids.add(chunk_id)
            texts.append(chunk)
            # Missing fields are stored as "" because None metadata values can
            # be rejected by the vector store's validation.
            metadatas.append({
                "paper_title": title or "",
                "paper_url": url or "",
                "authors": authors or "",
                "published": published or "",
                "arxiv_id": doc_key,
                "chunk_index": j,
            })
            ids.append(chunk_id)

    if texts:
        vectorstore.add_texts(texts=texts, metadatas=metadatas, ids=ids)
        # Explicit persistence only exists on some store/client versions;
        # call it when available and skip it otherwise.
        persist = getattr(vectorstore, "persist", None)
        if callable(persist):
            persist()

def run_arxiv_search_with_top_tier(user_query: str):
    """Search arXiv for ``user_query``, index the hits, and render them.

    Args:
        user_query: Free-text search terms.

    Returns:
        ``(docs, rendered)``: the retrieved documents and their Markdown
        rendering from ``pretty_print_docs``.
    """
    docs = retriever.invoke(build_top_tier_query(user_query))
    rendered = pretty_print_docs(docs)
    # Index the hits locally (no character cap on the full text) so follow-up
    # questions can be answered from the Chroma collection.
    upsert_into_chroma(docs, fulltext_char_limit=None)
    return docs, rendered

  vectorstore = Chroma(


In [5]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Shared chat model (sampling temperature 1); used by the summary chain below and
# reused by chains defined in later cells.
llm = ChatOpenAI(model=LLM_MODEL, temperature=1)

# Prompt for the first-pass overview: expects `user_query` and `docs_rendered`
# and asks for a Korean overview followed by a reference list.
summary_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are an expert AI research summarizer. Using the provided arXiv documents, "
     "first write a concise overview of the research landscape related to the user's query. "
     "Then provide a bullet list of references with: [Title](URL) — Published — Authors. "
     "The base language is Korean for explanations (not titles/authors/model names)."),
    ("human",
     "User query:\n{user_query}\n\nDocuments:\n{docs_rendered}")
])

# prompt -> chat model -> plain string
summary_chain = summary_prompt | llm | StrOutputParser()

In [6]:
from langchain_core.runnables import RunnableParallel, RunnableLambda

# Similarity retriever over the local Chroma collection; returns the 3 best-matching chunks.
local_retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Prompt for grounded answers: expects `question` and `context` and restricts the
# model to the supplied context chunks.
rag_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "Answer the user's question using ONLY the provided context chunks from previously saved papers. "
     "Cite the paper titles inline when relevant. If not in context, say you don't have it."),
    ("human", "Question: {question}\n\nContext:\n{context}")
])

def format_docs(docs):
    """Turn retrieved chunks into the context text placed in the RAG prompt.

    Each chunk becomes a "[title] (chunk N):" header followed by its text;
    a missing title is shown as "(unknown)" and a missing index as "?".

    Args:
        docs: Iterable of objects with ``metadata`` (dict or ``None``) and
            ``page_content``.

    Returns:
        The formatted chunks joined by blank lines; ``""`` for no chunks.
    """
    def _entry(doc):
        meta = doc.metadata or {}
        title = meta.get("paper_title", "(unknown)")
        return f"[{title}] (chunk {meta.get('chunk_index', '?')}):\n{doc.page_content}\n"

    return "\n\n".join(map(_entry, docs))

# RAG chain, invoked with the raw question string.
# The leading dict fans the input out: `context` sends the question through the local
# retriever and formats the hits, while `question` passes the input through unchanged.
rag_chain = (
    {"context": local_retriever | RunnableLambda(format_docs), "question": RunnableLambda(lambda x: x)}
    | rag_prompt
    | llm
    | StrOutputParser()
)

In [7]:
# ================================
# First-query pipeline
# ================================
def run_pipeline(user_query: str):
    """Run the initial search, print the hits and an LLM overview of them.

    Side effects: prints the rendered documents and the overview, and stores
    the titles of the retrieved papers in the module-level
    ``RECOMMENDED_TITLES`` so follow-up questions can be routed against them.

    Args:
        user_query: Free-text search terms.

    Returns:
        None (everything is printed rather than returned).
    """
    global RECOMMENDED_TITLES

    docs, rendered = run_arxiv_search_with_top_tier(user_query)
    print("=== [Retrieved Docs for Agent] ===\n")  # UPDATED
    print(rendered)

    print("\n\n=== [Summarized Overview + References] ===\n")
    print(summary_chain.invoke({"user_query": user_query, "docs_rendered": rendered}))

    # Keep the titles of the retrieved papers for the follow-up router.
    RECOMMENDED_TITLES = []
    all_titles = (extract_common_fields_from_doc(doc)[0] for doc in docs)
    RECOMMENDED_TITLES.extend(title for title in all_titles if title)
    return #overview

# ================================
# Agent routing for follow-up questions
# ================================
# Router prompt: expects `titles` (the papers from the first query) and `question`.
# The model must reply with the bare label 'RAG' (optionally followed by
# "| <titles>") or 'NO_RAG'.
route_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are a router. Decide whether the user's follow-up question is primarily about any of the following papers. "
     "If yes, answer EXACTLY 'RAG' (and list related titles after a pipe), else answer EXACTLY 'NO_RAG'. "
     "Do not add extra words.\n\nPapers:\n{titles}"),
    ("human", "Question: {question}")
])

# Routing is a classification step whose output is matched as an exact label,
# so it gets its own deterministic model (temperature 0) instead of the shared
# sampling model, which could vary the label between runs.
router_llm = ChatOpenAI(model=LLM_MODEL, temperature=0)
route_chain = route_prompt | router_llm | StrOutputParser()

# Fallback prompt for questions that are not about the retrieved papers:
# expects `question` and asks for a concise Korean answer without local context.
nonrag_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are a helpful AI research assistant. Answer the question concisely in Korean. "
     "Do not assume access to local papers."),
    ("human", "{question}")
])
# prompt -> chat model -> plain string
nonrag_chain = nonrag_prompt | llm | StrOutputParser()

def ask_followup_agent(question: str):
    """Answer a follow-up question, routing it to RAG or to the plain LLM.

    The router sees the titles stored in ``RECOMMENDED_TITLES`` (empty if the
    first-query pipeline has not run) and decides whether the question is
    about those papers. The router decision and the answer are printed.

    Args:
        question: The user's follow-up question.

    Returns:
        None (the answer is printed rather than returned).
    """
    titles_str = "\n".join(f"- {t}" for t in globals().get("RECOMMENDED_TITLES", []))
    decision = route_chain.invoke({"titles": titles_str, "question": question}).strip()
    # The prompt shows the labels in quotes and models sometimes echo the
    # quotes or change case, so normalise before matching. "NO_RAG" never
    # starts with "RAG", so a prefix test on the cleaned label is sufficient.
    label = decision.lstrip("'\"` ").upper()
    use_rag = label.startswith("RAG")
    print(f"[Router] decision = {decision}")

    if use_rag:
        print("=== [RAG Answer From Local Chroma] ===\n")
        ans = rag_chain.invoke(question)
    else:
        print("=== [Non-RAG LLM Answer] ===\n")
        ans = nonrag_chain.invoke({"question": question})
    print(ans)
    return #ans

In [8]:
# Example first query: search arXiv, index the hits, and print the overview.
run_pipeline("CT denoising")


  vectorstore.persist()


=== [Retrieved Docs for Agent] ===

### [1] Self-Supervised Learning based CT Denoising using Pseudo-CT Image Pairs
- url: http://arxiv.org/abs/2104.02326v1
- published: 2021-04-06
- authors: Dongkyu Won, Euijin Jung, Sion An, Philip Chikontwe, Sang Hyun Park
- content:
Recently, Self-supervised learning methods able to perform image denoising
without ground truth labels have been proposed. These methods create
low-quality images by adding random or Gaussian noise to images and then train
a model for denoising. Ideally, it would be beneficial if one can generate
high-quality CT images with only a few training samples via self-supervision.
However, the performance of CT denoising is generally limited due to the
complexity of CT noise. To address this problem, we propose a novel
self-supervised learning-based CT denoising method. In particular, we train
pre-train CT denoising and noise models that can predict CT noise from Low-dose
CT (LDCT) using available LDCT and Normal-dose CT (NDCT)

In [9]:
# Example follow-up question (in Korean) about the third paper's trainable bilateral filter.
ask_followup_agent("세 번째 논문에서 Bilateral Filter가 어떻게 학습 가능하다는거야? 이는 non-trainable한 필터아냐?")

[Router] decision = RAG | Trainable Joint Bilateral Filters for Enhanced Prediction Stability in Low-dose CT
=== [RAG Answer From Local Chroma] ===

세 번째 논문에서는 Bilateral Filter가 학습 가능하다는 것을 어떻게 설명하고 있는지에 대해 설명합니다. 일반적으로 Bilateral Filter는 non-trainable한 필터로 알려져 있지만, 이 연구에서는 "trainable joint bilateral filter (JBF)"를 도입하여 이를 해결합니다. 이 JBF는 깊이 신경망에서 사용할 수 있도록 완전히 미분 가능하게 설계되었으며, 입력 이미지 및 가이드 이미지에 대한 예측을 기반으로 필터 파라미터를 학습합니다.

특히, JBF는 가이드 이미지를 통해 추가 정보를 고려하여 노이즈 제거를 수행하는데, 이 과정에서 네 개의 kernel width 파라미터가 학습 가능한 가중치로 사용됩니다. 이 필터는 [Trainable Joint Bilateral Filters for Enhanced Prediction Stability in Low-dose CT] 논문에 의해 제안된 것으로, 기존의 Bilateral Filter와 달리 깊이 학습 모델의 일부분으로 통합되어 데이터 기반 최적화가 가능하다는 점에서 차별화됩니다 (chunk 0, chunk 1).

이러한 접근법은 기존의 Bilateral Filter의 특징을 유지하면서도 딥러닝 기반 모델의 높은 표현력을 결합하여 이미지 품질 향상을 꾀하며, 필터의 파라미터는 학습 과정 중에 효과적으로 조정될 수 있습니다 (chunk 3).
