# Tools

In [56]:
# ai_scraper_langgraph.py

from typing import List, Dict, Optional
from langchain_core.messages import ToolMessage, SystemMessage, AIMessage
from langgraph.graph import StateGraph,START, END,MessagesState
from langgraph.prebuilt import ToolNode
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from bs4 import BeautifulSoup
from langgraph.prebuilt import tools_condition
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from dotenv import load_dotenv
import re


load_dotenv()

# ------------------- STATE -------------------


class ScraperState(MessagesState):
    """Graph state: the inherited message history plus scraper-specific fields."""

    # Page content handed to the assistant node; assistant_node raises
    # ValueError when this value is empty or None.
    html: Optional[str] = None
    # Tool outputs keyed by the name of the ToolMessage that carried them;
    # written by collect_results.
    # NOTE(review): `{}` is a single class-level object — confirm whether the
    # state framework uses class-level values as defaults at all.
    result: Dict[str, Optional[str]] = {}

# ------------------- TOOLS -------------------

def extract_title(html: str) -> str:
    """
    Pick the element whose text looks most like a news headline.

    Every <h1>, <h2> and <div> with at least 15 characters of stripped text
    is scored: +2 for <h1>, +1 for <h2>, +2 for each topical keyword found in
    the lower-cased text, and -1 when the text is longer than 150 characters.
    The first element reaching the highest strictly positive score wins.

    Args:
        html (str): The HTML content of the web page.

    Returns:
        str: The winning element's text, or an empty string when no element
        scores above zero.
    """
    tag_bonus = {"h1": 2, "h2": 1}
    topic_words = ("casino", "gaming", "launch", "opens", "regulation", "market")

    winner_text = ""
    top_score = 0
    for node in BeautifulSoup(html, "html.parser").find_all(["h1", "h2", "div"]):
        content = node.get_text(strip=True)
        if len(content) < 15:
            continue

        lowered = content.lower()
        points = tag_bonus.get(node.name, 0)
        points += 2 * sum(word in lowered for word in topic_words)
        if len(content) > 150:
            points -= 1

        # Strict comparison keeps the earliest element on ties.
        if points > top_score:
            top_score, winner_text = points, content

    return winner_text

def extract_kicker(html: str) -> str:
    """
    Return the first short <div> text, used as the article's kicker.

    A <div> qualifies when its stripped text is longer than 5 characters and
    contains fewer than 10 whitespace-separated words.

    Args:
        html (str): The HTML content of the web page.

    Returns:
        str: The first qualifying text in document order, or an empty string.
    """
    doc = BeautifulSoup(html, "html.parser")
    div_texts = (block.get_text(strip=True) for block in doc.find_all("div"))
    return next(
        (t for t in div_texts if len(t) > 5 and len(t.split()) < 10),
        "",
    )

# Matches image paths that mention a topical keyword and end in a jpg/png
# extension, e.g. ".../grand-casino-opening.jpg". The dot is escaped with a
# single backslash so it matches a literal "." inside the raw string.
_ARTICLE_IMAGE_RE = re.compile(
    r"(casino|gaming|event|banner).*\.(jpg|png)", re.IGNORECASE
)


def extract_image(html: str) -> str:
    """
    Extracts the most relevant image URL (usually from article or banner).

    The first <img> whose non-empty ``src`` matches the topical pattern is
    preferred; otherwise the first <img> with a non-empty ``src`` is used.

    Args:
        html (str): The HTML content of the web page.

    Returns:
        str: The image URL, or empty string if no image carries a ``src``.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Only keep images that actually carry a usable src, so the fallback
    # below can never hand back None.
    sources = [img["src"] for img in soup.find_all("img", src=True) if img["src"]]

    for src in sources:
        if _ARTICLE_IMAGE_RE.search(src):
            return src
    return sources[0] if sources else ""

def extract_link(html: str) -> str:
    """
    Return the href of the anchor most likely to lead to the full article.

    Anchors with an ``href`` are scanned in document order. The first one
    whose lower-cased text is longer than 10 characters, or contains a
    "read more" style cue, is chosen. If none qualifies, the first anchor's
    href is returned.

    Args:
        html (str): The HTML content of the web page.

    Returns:
        str: The selected URL (href), or an empty string when the page has
        no anchors with an href.
    """
    anchors = BeautifulSoup(html, "html.parser").find_all("a", href=True)
    if not anchors:
        return ""

    call_to_action = ("read more", "ver más")
    for anchor in anchors:
        label = anchor.get_text(strip=True).lower()
        if len(label) > 10 or any(cue in label for cue in call_to_action):
            return anchor["href"]

    return anchors[0]["href"]


In [61]:





# System prompt rendered by assistant_node. `{html}` is the only template
# variable; the braces of the JSON example are doubled so the f-string style
# template emits them literally instead of treating them as placeholders.
llm_prompt = PromptTemplate.from_template("""

You are an intelligent assistant specialized in extracting structured and meaningful information from raw HTML.

Below, you will find a raw HTML snippet. Your job is to analyze this HTML and extract the following fields related to a news article:

- **Title**: The main headline of the article. Prefer <h1>, <h2>, or elements with class names like "title", "headline", or similar.
- **Kicker**: A short introductory phrase or category often found above the title (sometimes labeled as "kicker" or "volanta").
- **Image URL**: The URL of the most representative image related to the article. Look for <img> tags near the title or in a container for the article.
- **Article Link**: The main URL (usually inside <a href="...">) that leads to the full article.

### Input HTML:

{html}

### Instructions:

- Focus on extracting values that are **complete**, **readable**, and **clearly related to a news article**.
- Avoid template content, unrelated links, or empty tags.
- If multiple options exist, **choose the one that best matches a real article** based on context (e.g., close to a headline or kicker).
- If any field is not found, return `"Not found"` for that field.
- Present your final output in this format:

json

{{
  "title": "...",
  "kicker": "...",
  "image_url": "...",
  "link": "..."
}}

"""
 )

# Extraction helpers offered to the model as tools; the same list is passed
# to ToolNode when the graph is built.
TOOLS = [extract_title, extract_kicker, extract_image, extract_link]
# Chat model used by the assistant node.
# NOTE(review): temperature=0 presumably aims at repeatable output — confirm.
llm = ChatOpenAI(model="gpt-4", temperature=0)
# Model with TOOLS bound; this is the object assistant_node invokes.
llm_with_tools = llm.bind_tools(TOOLS)

def assistant_node(state: ScraperState):
    """
    Run one LLM step over the page content and the conversation so far.

    The prompt template is rendered with the state's HTML and sent as a
    system message ahead of the existing messages.

    Args:
        state (ScraperState): Current graph state; must carry non-empty
            ``html`` and a ``messages`` list.

    Returns:
        dict: A state update holding only the model's new reply under
        ``messages``.

    Raises:
        ValueError: If ``html`` is missing, None or empty.
    """
    # .get() so an absent key produces the intended ValueError, not KeyError.
    html = state.get("html")
    if not html:
        raise ValueError("Missing HTML content")
    prompt_with_html = llm_prompt.invoke({"html": html}).text

    system_msg = SystemMessage(content=prompt_with_html)
    # Debug aid: echo the fully rendered system prompt.
    print(system_msg)
    result = llm_with_tools.invoke([system_msg] + state["messages"])
    # The messages channel merges updates into the stored history, so only
    # the new reply needs to be returned.
    return {"messages": [result]}

def router(state: ScraperState):
    """
    Decide which node follows the assistant.

    Returns "tools" when the latest message carries a non-empty
    ``tool_calls`` attribute, and "collect" otherwise (including when there
    are no messages at all).
    """
    history = state["messages"]
    if not history:
        return "collect"

    pending_calls = getattr(history[-1], "tool_calls", None)
    return "tools" if pending_calls else "collect"

def collect_results(state: ScraperState):
    """
    Gather tool outputs from the message history into a single mapping.

    Each ToolMessage contributes ``name -> content``; when several messages
    share a name, the last one wins.

    Returns:
        dict: A state update with the mapping under ``result`` and the
        unchanged history under ``messages``.
    """
    history = state["messages"]
    tool_outputs = {
        entry.name: entry.content
        for entry in history
        if isinstance(entry, ToolMessage)
    }
    return {"result": tool_outputs, "messages": history}

# ------------------- LANGGRAPH SETUP -------------------

# Build the graph over ScraperState with three nodes: the LLM step, a
# ToolNode constructed from TOOLS, and the final result collector.
builder = StateGraph(ScraperState)

builder.add_node("assistant", assistant_node)
builder.add_node("tools", ToolNode(TOOLS))
builder.add_node("collect", collect_results)

# Flow: START -> assistant; router's return value ("tools" or "collect")
# selects the next node; "tools" always loops back to the assistant, and
# "collect" leads to END.
builder.add_edge(START, "assistant")
builder.add_conditional_edges("assistant", router, {"tools": "tools", "collect":"collect"})
builder.add_edge("tools", "assistant")
builder.add_edge("collect", END)

# Compiled graph object used by the runner and the demo code below.
scraper_graph = builder.compile()

# ------------------- SCRAPER RUNNER -------------------

def get_html(url, max_chars=4000):
    """
    Load a page in headless Chrome and condense it into a small snippet.

    The snippet is made of up to two ``<img src='...'/>`` lines (images whose
    src does not contain "logo") followed by the text of <h1>, <h2>, <p> and
    <a> elements longer than 30 characters, one per line.

    Args:
        url: Address of the page to load.
        max_chars: Soft budget for the collected text. Collection stops once
            the accumulated text length exceeds this value, so the result
            can overshoot by the last element added.

    Returns:
        str: The condensed snippet, newline-joined.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Always shut the browser down, even when navigation fails, so no
        # Chrome process is left behind.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")

    # Remove tags that carry no article content.
    for tag in soup(["script", "style", "footer", "nav", "header", "noscript", "meta", "link"]):
        tag.decompose()

    extracted = []
    total_chars = 0  # running length, avoids re-summing the list each pass

    # Collect visible text directly from <h1>, <h2>, <p> and <a>.
    for tag in soup.find_all(["h1", "h2", "p", "a"]):
        text = tag.get_text(strip=True)
        if len(text) > 30:
            extracted.append(text)
            total_chars += len(text)
        if total_chars > max_chars:
            break

    # Also keep the first one or two images with a relevant src.
    img_urls = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if src and "logo" not in src.lower():
            img_urls.append(f"<img src='{src}'/>")
        if len(img_urls) >= 2:
            break

    return "\n".join(img_urls + extracted)
def run_scraper_on_url(url: str) -> Dict[str, Optional[str]]:
    """
    Scrape one URL end to end and return the collected tool outputs.

    Fetches the condensed page content, runs the compiled graph from an
    empty message history, and returns the ``result`` entry of the final
    state.
    """
    initial_state = ScraperState(messages=[], html=get_html(url))
    final_state = scraper_graph.invoke(initial_state)
    return final_state["result"]


In [60]:

# Demo: run the graph once against a live page and print every message.
url = "https://www.yogonet.com/international/"
html = get_html(url)
initial_state = {"messages": [], "html": html}

# Maximum number of steps to take in the graph. The limit is passed to the
# call that actually executes the graph, so it takes effect for this run.
run_config = {"recursion_limit": 150}
messages = scraper_graph.invoke(initial_state, config=run_config)

for m in messages['messages']:
    m.pretty_print()

content='\n\nYou are an intelligent assistant specialized in extracting structured and meaningful information from raw HTML.\n\nBelow, you will find a raw HTML snippet. Your job is to analyze this HTML and extract the following fields related to a news article:\n\n- **Title**: The main headline of the article. Prefer <h1>, <h2>, or elements with class names like "title", "headline", or similar.\n- **Kicker**: A short introductory phrase or category often found above the title (sometimes labeled as "kicker" or "volanta").\n- **Image URL**: The URL of the most representative image related to the article. Look for <img> tags near the title or in a container for the article.\n- **Article Link**: The main URL (usually inside <a href="...">) that leads to the full article.\n\n### Input HTML:\n\n<img src=\'https://imagenesyogonet.b-cdn.net/data/imagenes/2025/04/11/74582/1744379971-grand-island-casino-resort-apertura-01.jpg\'/>\n<img src=\'https://imagenesyogonet.b-cdn.net/data/imagenes/2024/0

[AIMessage(content='As an AI model, I don\'t have the ability to directly interact with tools or functions. However, I can guide you on how to use these functions to extract the required information from the HTML source.\n\n1. **Title**: Use the `extract_title` function. Pass the HTML content as a parameter to this function. It will return the most probable news title from the HTML content.\n\n```javascript\nfunctions.extract_title({html: "your_html_content"});\n```\n\n2. **Kicker**: Use the `extract_kicker` function. Pass the HTML content as a parameter to this function. It will return a short headline or context from the HTML.\n\n```javascript\nfunctions.extract_kicker({html: "your_html_content"});\n```\n\n3. **Image URL**: Use the `extract_image` function. Pass the HTML content as a parameter to this function. It will return the most relevant image URL from the HTML.\n\n```javascript\nfunctions.extract_image({html: "your_html_content"});\n```\n\n4. **Link of the URL**: Use the `extr