# In [None]:
# -*- coding: utf-8 -*-
"""
Daily Research Monitor – Hugging Face Top Papers
Startdatum: 29.10.2025
"""

import os
import sqlite3
import datetime
import requests, json
from bs4 import BeautifulSoup
from huggingface_hub import HfApi
import arxiv
from pypdf import PdfReader
from smolagents import tool, CodeAgent, LiteLLMModel

# ==============================
# Datenbank-Funktionen
# ==============================

# Path of the SQLite database file opened by the database helper functions
# in this file. It is relative, so it resolves against the working directory.
DB_FILE = "papers.db"

def init_db():
    """Create the ``papers`` table if it does not exist yet.

    Opens the SQLite database at ``DB_FILE`` and runs a
    ``CREATE TABLE IF NOT EXISTS`` statement, so repeated calls are harmless.
    The connection is closed even when the statement fails.
    """
    create_stmt = """
        CREATE TABLE IF NOT EXISTS papers (
            id TEXT PRIMARY KEY,
            title TEXT,
            authors TEXT,
            summary TEXT,
            date_processed TEXT
        )
    """
    conn = sqlite3.connect(DB_FILE)
    try:
        # ``with conn`` commits on success and rolls back on error, but it
        # does not close the connection - hence the surrounding try/finally.
        with conn:
            conn.execute(create_stmt)
    finally:
        conn.close()

def already_processed(paper_id):
    """Return whether a paper with the given ID is already stored.

    Args:
        paper_id: Value compared against the ``id`` column of ``papers``.

    Returns:
        True if a matching row exists in the database at ``DB_FILE``,
        False otherwise.
    """
    conn = sqlite3.connect(DB_FILE)
    try:
        row = conn.execute(
            "SELECT 1 FROM papers WHERE id=?", (paper_id,)
        ).fetchone()
    finally:
        # Close the connection even if the query raises (e.g. missing table).
        conn.close()
    return row is not None

def add_paper(paper):
    """Insert a paper into the ``papers`` table unless its ID already exists.

    Args:
        paper: Mapping read via ``.get`` for the keys ``id``, ``title``,
            ``authors`` and ``summary``. Missing keys are stored as empty
            strings. ``authors`` may be a string or a list/tuple, which is
            flattened into one comma-separated string.

    The row is stamped with the current local date as ``date_processed``.
    ``INSERT OR IGNORE`` leaves an existing row with the same ID untouched.
    """
    authors = paper.get("authors", "")
    if isinstance(authors, (list, tuple)):
        # Stringify each item so non-string entries cannot make join() raise.
        authors = ", ".join(str(author) for author in authors)

    row = (
        str(paper.get("id", "")),
        str(paper.get("title", "")),
        authors,
        str(paper.get("summary", "")),
        # Record the actual processing day instead of a fixed calendar date.
        datetime.date.today().isoformat(),
    )

    conn = sqlite3.connect(DB_FILE)
    try:
        # ``with conn`` commits on success and rolls back on error; closing
        # is handled separately in ``finally``.
        with conn:
            conn.execute(
                "INSERT OR IGNORE INTO papers "
                "(id, title, authors, summary, date_processed) "
                "VALUES (?, ?, ?, ?, ?)",
                row,
            )
    finally:
        conn.close()

# ==============================
# Tools
# ==============================

@tool
def get_top_three_papers() -> list:
    """
    Returns a list of the top 3 daily papers (titles) from Hugging Face.
    """
    # NOTE(review): the docstring is intentionally left verbatim; the @tool
    # decorator presumably uses it as the tool description shown to the
    # model - confirm before rewording.
    url = "https://huggingface.co/papers"
    # A timeout keeps the tool from blocking forever on an unresponsive host.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    titles = []
    # The page data is read as JSON from the ``data-props`` attribute of
    # these containers; the first one carrying ``dailyPapers`` is used.
    for container in soup.find_all('div', class_='SVELTE_HYDRATER contents'):
        data_props = container.get('data-props', '')
        if not data_props:
            continue
        try:
            data_json = json.loads(data_props.replace('&quot;', '"'))
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (list, string, number) cannot hold
        # the key we need; skip it instead of failing on the lookup below.
        if not isinstance(data_json, dict) or 'dailyPapers' not in data_json:
            continue
        daily_papers = data_json['dailyPapers']
        if isinstance(daily_papers, list):
            # Entries without a title are skipped rather than raising.
            titles = [
                entry['title']
                for entry in daily_papers[:3]
                if isinstance(entry, dict) and 'title' in entry
            ]
        break

    return titles or ["No papers found."]


@tool
def get_paper_id_by_title(title: str) -> str:
    """
    Returns the arXiv paper ID by its title.

    Args:
        title: The paper title for which to get the ID.
    """
    # NOTE(review): the docstring is intentionally left verbatim; the @tool
    # decorator presumably uses it as the tool description - confirm before
    # rewording.
    api = HfApi()
    papers = api.list_papers(query=title)
    # The search result may be a lazy iterable, which is truthy even when it
    # yields nothing. Pull the first hit with a default instead of testing
    # the container, so an empty search reaches the "not found" branch
    # rather than raising StopIteration.
    first_hit = next(iter(papers or ()), None)
    if first_hit is None:
        return "No paper ID found."
    return first_hit.id


@tool
def download_paper_by_id(paper_id: str) -> str:
    """
    Downloads the arXiv paper by ID and saves it as 'paper_<id>.pdf'.

    Args:
        paper_id: The arXiv ID of the paper to download (e.g., "1706.03762").
    """
    # NOTE(review): the docstring is intentionally left verbatim; the @tool
    # decorator presumably uses it as the tool description - confirm before
    # rewording.
    try:
        client = arxiv.Client()
        search = arxiv.Search(id_list=[paper_id])
        # An empty result set used to surface as a StopIteration, whose text
        # is empty, so the caller saw "Error downloading paper: " with no
        # reason. Report the missing entry explicitly instead.
        paper = next(client.results(search), None)
        if paper is None:
            return (
                "Error downloading paper: "
                f"no arXiv entry found for ID {paper_id!r}"
            )
        filename = f"paper_{paper_id}.pdf"
        paper.download_pdf(filename=filename)
        return filename
    except Exception as e:
        # Best-effort tool: failures are reported as text, not raised.
        return f"Error downloading paper: {e}"


@tool
def read_pdf_file(file_path: str) -> str:
    """
    Reads the first 3 pages of a PDF and returns text.

    Args:
        file_path: Path to the PDF file.
    """
    # The docstring is reproduced verbatim; only the implementation style
    # below differs.
    try:
        first_pages = PdfReader(file_path).pages[:3]
        # Pages without extractable text contribute an empty string.
        chunks = [page.extract_text() or "" for page in first_pages]
        return "".join(chunks).strip()
    except Exception as exc:
        # Any failure is reported as text instead of being raised.
        return f"Error reading {file_path}: {exc}"

# ==============================
# Hauptlogik
# ==============================

def _extract_papers(result):
    """Normalize the agent's final answer into a list of paper entries.

    Accepts an object exposing ``.text``, a JSON string, a list, or a dict
    with a ``"papers"`` key. Anything else yields an empty list.
    """
    # Unwrap objects that carry their payload in ``.text``.
    # NOTE(review): presumably smolagents' text wrapper type - confirm.
    raw = result.text if hasattr(result, "text") else result

    # Plain strings are parsed as well; previously a bare JSON string matched
    # none of the type checks and was silently discarded.
    if isinstance(raw, (str, bytes, bytearray)):
        try:
            raw = json.loads(raw)
        except ValueError:  # includes json.JSONDecodeError
            print("‚ö†Ô∏è  Output war kein valides JSON. Keine neuen Paper gefunden.")
            return []

    if isinstance(raw, dict):
        raw = raw.get("papers", [])
    return raw if isinstance(raw, list) else []


def main():
    """Run one monitoring pass.

    Initializes the database, asks the agent for the top three papers,
    stores every paper whose ID is not yet in the database, and prints the
    titles of the newly stored papers.
    """
    init_db()
    model = LiteLLMModel(model_id="ollama_chat/glm-4.6:cloud")

    agent = CodeAgent(
        tools=[get_top_three_papers, get_paper_id_by_title, download_paper_by_id, read_pdf_file],
        model=model,
        stream_outputs=False
    )

    result = agent.run("""
    Fetch the top 3 papers from Hugging Face daily papers.
    For each paper:
    - get its arXiv ID,
    - download it,
    - read the first pages,
    - extract author list if available,
    - and summarize it.
    Return structured JSON with: title, id, authors, summary.
    """)

    new_papers = []
    for paper in _extract_papers(result):
        # Entries without an ID can be neither de-duplicated nor stored, so
        # skip them instead of crashing on a KeyError/TypeError.
        if not isinstance(paper, dict) or "id" not in paper:
            continue
        # add_paper stores the ID as str; look it up in the same form.
        if not already_processed(str(paper["id"])):
            add_paper(paper)
            new_papers.append(paper)

    if not new_papers:
        print("No new papers found.")
    else:
        print("\n=== NEW PAPERS ===")
        for p in new_papers:
            # A missing title prints as empty, mirroring add_paper's default.
            print(f"- {p.get('title', '')}")

# Run the monitor only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()