In [1]:
import sys
import os

PROJECT_ROOT = r"C:\Users\Angel\OneDrive - Universidad Complutense de Madrid (UCM)\Documentos\MASTER\99_tfm\tfm_newsletter_ai"

if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
from scraping.scraper_base import BaseScraper
import time

class HuggingFaceScraper(BaseScraper):
    def __init__(self, max_pages=50, sleep_time=1.0, scrape_community=False):
        super().__init__("Hugging Face Blog")
        self.base_url = "https://huggingface.co/blog"
        self.max_pages = max_pages
        self.sleep_time = sleep_time
        self.scrape_community = scrape_community

    def get_article_links(self):
        links = set()

        for page in range(1, self.max_pages + 1):
            url = self.base_url if page == 1 else f"{self.base_url}?p={page}"
            soup = self.get_soup(url)

            if soup is None:
                break

            # Buscar todos los enlaces a artículos
            blog_links = soup.find_all("a", href=True)
            
            for link in blog_links:
                href = link["href"]
                
                # Si scrapeamos community, incluimos todo
                if self.scrape_community:
                    if href.startswith("/blog/") and href != "/blog/":
                        full_url = f"https://huggingface.co{href}"
                        links.add(full_url)
                # Si NO scrapeamos community, excluimos esos artículos
                else:
                    if (href.startswith("/blog/") and 
                        not href.startswith("/blog/community") and 
                        href != "/blog/"):
                        full_url = f"https://huggingface.co{href}"
                        links.add(full_url)

            time.sleep(self.sleep_time)

        return list(links)

    def scrape_article(self, url):
        soup = self.get_soup(url)
        if soup is None:
            return None

        title = soup.find("h1")
        content_div = soup.find("div", {"class": "prose"})

        if not title or not content_div:
            return None

        paragraphs = content_div.find_all("p")

        if not paragraphs:
            return None

        return self.build_article(
            url=url,
            title=title.get_text(strip=True),
            content=self.clean_text(paragraphs)
        )

In [2]:
# Opción 1: Solo artículos del blog principal (sin community)
scraper = HuggingFaceScraper(max_pages=3, sleep_time=1, scrape_community=False)
links = scraper.get_article_links()
print(f"Artículos del blog principal: {len(links)}")

# Opción 2: Todos los artículos (blog + community)
scraper_all = HuggingFaceScraper(max_pages=3, sleep_time=1, scrape_community=True)
links_all = scraper_all.get_article_links()
print(f"Todos los artículos: {len(links_all)}")

Artículos del blog principal: 62
Todos los artículos: 64


In [6]:
links_all

['https://huggingface.co/blog/virustotal',
 'https://huggingface.co/blog/gpt-oss-on-intel-xeon',
 'https://huggingface.co/blog/nvidia/nvidia-isaac-for-healthcare',
 'https://huggingface.co/blog/LinkedIn/gpt-oss-agentic-rl',
 'https://huggingface.co/blog/novita/sglang-glm4-moe',
 'https://huggingface.co/blog/ibm-granite/granite-4-nano',
 'https://huggingface.co/blog/nvidia/nemotron-personas-japan-ja',
 'https://huggingface.co/blog/burtenshaw/openenv-scaling',
 'https://huggingface.co/blog/huggingface/shifting-compute-landscape',
 'https://huggingface.co/blog/zilliz/zilliz-semantic-highlight-model',
 'https://huggingface.co/blog/Fannyjrd/interpreto',
 'https://huggingface.co/blog/Arm/arm-at-pytorch-conference',
 'https://huggingface.co/blog/amd/openroboticshackathon',
 'https://huggingface.co/blog/ggml-org/anthropic-messages-api-in-llamacpp',
 'https://huggingface.co/blog/aisheets-unlock-images',
 'https://huggingface.co/blog/vibegame',
 'https://huggingface.co/blog/rteb',
 'https://hugg

In [7]:
from scraping.scraper_base import BaseScraper
import time

class HuggingFaceScraper(BaseScraper):
    def __init__(self, max_pages=50, sleep_time=1.0):
        super().__init__("Hugging Face Blog")
        self.base_url = "https://huggingface.co/blog"
        self.max_pages = max_pages
        self.sleep_time = sleep_time

    def get_article_links(self):
        links = set()

        for page in range(1, self.max_pages + 1):
            url = self.base_url if page == 1 else f"{self.base_url}?p={page}"
            soup = self.get_soup(url)

            if soup is None:
                break

            # Buscar todos los enlaces a artículos
            blog_links = soup.find_all("a", href=True)
            
            for link in blog_links:
                href = link["href"]
                
                if href.startswith("/blog/") and href != "/blog/":
                    full_url = f"https://huggingface.co{href}"
                    links.add(full_url)

            time.sleep(self.sleep_time)

        return list(links)

    def scrape_article(self, url):
        soup = self.get_soup(url)
        if soup is None:
            return None

        title = soup.find("h1")
        content_div = soup.find("div", {"class": "prose"})

        if not title or not content_div:
            return None

        paragraphs = content_div.find_all("p")

        if not paragraphs:
            return None

        return self.build_article(
            url=url,
            title=title.get_text(strip=True),
            content=self.clean_text(paragraphs)
        )

In [28]:
scraper = HuggingFaceScraper(max_pages=100, sleep_time=2)
links = scraper.get_article_links()
print(f"Artículos del blog principal: {len(links)}")

articles = []

for url in links:
    article = scraper.scrape_article(url)
    if article:
        articles.append(article)
        time.sleep(0.5)



Artículos del blog principal: 732




KeyboardInterrupt: 

In [32]:
import requests
from datetime import datetime, timedelta

class HuggingFaceAPIScraper:
    def __init__(self):
        self.base_url = "https://huggingface.co/api"
    
    def get_blog_posts(self, limit=50, days_ago=7):
        """Obtiene posts del blog usando la API oficial (GRATIS)"""
        
        # Endpoint oficial para posts
        url = f"{self.base_url}/posts"
        
        try:
            response = requests.get(url, params={"limit": limit})
            response.raise_for_status()
            posts = response.json()
            
            # Filtrar por fecha (ej: última semana)
            filtered_posts = []
            cutoff_date = datetime.now() - timedelta(days=days_ago)
            
            for post in posts:
                post_date = datetime.fromisoformat(post.get("createdAt", "").replace("Z", "+00:00"))
                
                if post_date >= cutoff_date:
                    filtered_posts.append({
                        "title": post.get("title", ""),
                        "url": f"https://huggingface.co/blog/{post.get('slug', '')}",
                        "author": post.get("author", {}).get("name", ""),
                        "date": post_date.strftime("%Y-%m-%d"),
                        "summary": post.get("summary", ""),
                        "tags": post.get("tags", []),
                        "likes": post.get("likesCount", 0)
                    })
            
            return filtered_posts
            
        except Exception as e:
            print(f"Error con API: {e}")
            return []

# USO (GRATIS):
scraper = HuggingFaceAPIScraper()
posts = scraper.get_blog_posts(limit=20, days_ago=30)

print(f"Encontrados {len(posts)} posts recientes:")
for post in posts[:5]:
    print(f"- {post['date']}: {post['title']} (Likes: {post['likes']})")

Error con API: 'str' object has no attribute 'get'
Encontrados 0 posts recientes:


In [34]:
from scraping.sources.scraper_aws import AWSScraper

aws = AWSScraper(
    blogs=[
        "machine-learning",
        "infrastructure-and-automation",
        "iot",
        "big-data"
    ],
    lang="en",
    max_pages=2
)

links = aws.get_article_links()
print(len(links))


80


In [4]:
import feedparser
import requests

class AIBusinessScraper:
    def __init__(self):
        self.rss_url = "https://aibusiness.com/feed/"
    
    def get_articles_via_rss(self):
        """Usa RSS feed (legal y fácil)"""
        feed = feedparser.parse(self.rss_url)
        articles = []
        
        for entry in feed.entries[:20]:  # Últimos 20
            articles.append({
                "title": entry.title,
                "url": entry.link,
                "date": entry.published if 'published' in entry else "",
                "summary": entry.summary if 'summary' in entry else "",
                "content": entry.content[0].value if 'content' in entry else ""
            })
        
        return articles

# Uso simple y legal
scraper = AIBusinessScraper()
articles = scraper.get_articles_via_rss()

len(articles)

0

In [4]:
len(articles)

0

In [6]:
from scraping.scraper_base import BaseScraper
import time
import logging

class HuggingFaceScraper(BaseScraper):
    def __init__(self, max_pages=10, sleep_time=2.0):
        super().__init__("Hugging Face Blog", base_domains=["huggingface.co"])
        self.base_url = "https://huggingface.co/blog"
        self.max_pages = max_pages
        self.sleep_time = sleep_time

    def get_article_links(self):
        links = set()
        for page in range(1, self.max_pages + 1):
            url = self.base_url if page == 1 else f"{self.base_url}?p={page}"
            soup = self.get_soup(url)
            if soup is None:
                break

            # Buscar enlaces a artículos (normalmente dentro de article o en h2)
            for link in soup.find_all("a", href=True):
                href = link['href']
                if href.startswith("/blog/") and href != "/blog/" and "/blog/community" not in href:
                    full_url = "https://huggingface.co" + href
                    links.add(full_url)

            time.sleep(self.sleep_time)

        return list(links)

    def scrape_article(self, url):
        soup = self.get_soup(url)
        if soup is None:
            return None

        title = soup.find("h1")
        content_div = soup.find("div", class_="prose")
        if not content_div:
            content_div = soup.find("div", class_="markdown")
        if not content_div:
            return None

        paragraphs = content_div.find_all("p")
        if not paragraphs:
            return None

        return self.build_article(
            url=url,
            title=title.get_text(strip=True) if title else "Sin título",
            content=self.clean_text(paragraphs)
        )

In [9]:
from scraping.normalization import normalize_article
import pandas as pd

scraper = HuggingFaceScraper(max_pages=3, sleep_time=1)

links = scraper.get_article_links()

articles = []
for url in links:
    article = scraper.scrape_article(url)
    if article:
        articles.append(article)

normalized_articles = [normalize_article(article) for article in articles]

df = pd.DataFrame(normalized_articles)

df_clean = df[df["is_valid"]].copy()
df_clean.shape

INFO:numexpr.utils:NumExpr defaulting to 16 threads.


(61, 9)

In [10]:
df_clean

Unnamed: 0,source,url,title,content,scraping_date,content_length,word_count,language,is_valid
0,Hugging Face Blog,https://huggingface.co/blog/upskill,We got Claude to teach open models how to writ...,What are agent skills? 1. Get the teacher (Cla...,2026-02-15 21:44:54.273143,8669,1442,en,True
1,Hugging Face Blog,https://huggingface.co/blog/nvidia/nvidia-isaa...,How to Build a Healthcare Robot from Simulatio...,Simulation has been a cornerstone in medical i...,2026-02-15 21:44:54.731683,3598,519,en,True
2,Hugging Face Blog,https://huggingface.co/blog/huggingface/shifti...,On the Shifting Global Compute Landscape,"The status quo of AI chip usage, that was once...",2026-02-15 21:44:55.422200,20200,2942,en,True
3,Hugging Face Blog,https://huggingface.co/blog/flymy-ai/craft-1,CRAFT: Continuous Reasoning and Agentic Feedba...,CRAFT adds thinking into text to image generat...,2026-02-15 21:44:56.078311,1796,278,en,True
4,Hugging Face Blog,https://huggingface.co/blog/transformersjs-v4,Transformers.js v4 Preview: Now Available on NPM!,Performance & Runtime Improvements Repository ...,2026-02-15 21:44:56.841038,6726,979,en,True
...,...,...,...,...,...,...,...,...,...
56,Hugging Face Blog,https://huggingface.co/blog/intel-deepmath,DeepMath: A lightweight math reasoning Agent w...,Why DeepMath? How It Works Training with GRPO...,2026-02-15 21:45:32.709601,6683,925,en,True
57,Hugging Face Blog,https://huggingface.co/blog/MiniMax-AI/alignin...,Aligning to What? Rethinking Agent Generalizat...,The Real Agent Alignment Problem: Benchmarks o...,2026-02-15 21:45:33.287329,3620,589,en,True
58,Hugging Face Blog,https://huggingface.co/blog/virustotal,Hugging Face and VirusTotal collaborate to str...,Why this matters How the collaboration works B...,2026-02-15 21:45:33.869750,1751,268,en,True
59,Hugging Face Blog,https://huggingface.co/blog/lerobotxnvidia-hea...,Building a Healthcare Robot from Simulation to...,"A hands-on guide to collecting data, training ...",2026-02-15 21:45:34.593898,3372,486,en,True


In [12]:
import feedparser
import logging
from scraping.scraper_base import BaseScraper

class AIBusinessScraper(BaseScraper):
    """
    Usa RSS porque el sitio web tiene Cloudflare y es difícil de scrapear directamente.
    """
    def __init__(self):
        super().__init__("AI Business", base_domains=["aibusiness.com"])
        self.feed_url = "https://aibusiness.com/feed/"

    def get_article_links(self):
        feed = feedparser.parse(self.feed_url)
        links = [entry.link for entry in feed.entries]
        return links

    def scrape_article(self, url):
        # Podríamos intentar scrapear el contenido de la página, pero con Cloudflare es complicado.
        # Mejor devolvemos solo la URL y título del feed, y el contenido lo obtenemos del feed también.
        # O podemos usar un método alternativo con requests-html, pero no garantizado.
        # Aquí optamos por devolver lo que da el feed.
        feed = feedparser.parse(self.feed_url)
        for entry in feed.entries:
            if entry.link == url:
                return self.build_article(
                    url=url,
                    title=entry.title,
                    content=entry.summary if hasattr(entry, 'summary') else entry.description
                )
        return None
    
scraper = AIBusinessScraper()

links = scraper.get_article_links()
len(links)
# articles = []
# for url in links:
#     article = scraper.scrape_article(url)
#     if article:
#         articles.append(article)

# normalized_articles = [normalize_article(article) for article in articles]

# df = pd.DataFrame(normalized_articles)

# df_clean = df[df["is_valid"]].copy()
# df_clean.shape

0

In [15]:
from scraping.scraper_base import BaseScraper
import time

class AllenAIScraper(BaseScraper):
    def __init__(self, max_pages=5, sleep_time=1.0):
        super().__init__("Allen AI", base_domains=["allenai.org"])
        self.base_url = "https://allenai.org/research"
        self.max_pages = max_pages
        self.sleep_time = sleep_time

    def get_article_links(self):
        links = set()
        # El blog de Allen AI parece tener paginación por ?page=
        for page in range(1, self.max_pages + 1):
            url = f"{self.base_url}?page={page}" if page > 1 else self.base_url
            soup = self.get_soup(url)
            if soup is None:
                break

            # Buscar enlaces a artículos (probablemente dentro de article o h2)
            for a in soup.find_all("a", href=True):
                href = a['href']
                if href.startswith("/research/") and href != "/research/":
                    full_url = "https://allenai.org" + href
                    links.add(full_url)
                elif href.startswith("https://allenai.org/research/"):
                    links.add(href)

            time.sleep(self.sleep_time)

        return list(links)

    def scrape_article(self, url):
        soup = self.get_soup(url)
        if soup is None:
            return None

        title = soup.find("h1")
        # El contenido puede estar en un div con clase "blog-post" o "content"
        content_div = soup.find("div", class_="blog-post") or soup.find("div", class_="content") or soup.find("article")
        if not content_div:
            return None

        paragraphs = content_div.find_all("p")
        if not paragraphs:
            return None

        return self.build_article(
            url=url,
            title=title.get_text(strip=True) if title else "Sin título",
            content=self.clean_text(paragraphs)
        )
    

scraper = AllenAIScraper(max_pages=6, sleep_time=1)
links = scraper.get_article_links()
len(links)
# articles = []
# for url in links:
#     article = scraper.scrape_article(url)
#     if article:
#         articles.append(article)

# normalized_articles = [normalize_article(article) for article in articles]

# df = pd.DataFrame(normalized_articles)

# df_clean = df[df["is_valid"]].copy()
# df_clean.shape

0

In [16]:
from scraping.scraper_base import BaseScraper
import time
import xml.etree.ElementTree as ET
import requests

class OpenAIScraper(BaseScraper):
    def __init__(self, max_articles=50, sleep_time=1.0):
        super().__init__("OpenAI", base_domains=["openai.com"])
        self.sitemap_url = "https://openai.com/sitemap.xml"
        self.max_articles = max_articles
        self.sleep_time = sleep_time

    def get_article_links(self):
        # Obtener todas las URLs del sitemap que correspondan a blog
        try:
            response = requests.get(self.sitemap_url, headers=self.headers, timeout=10)
            response.raise_for_status()
            root = ET.fromstring(response.content)
            # Espacio de nombres típico de sitemap
            ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            urls = []
            for url in root.findall('sm:url', ns):
                loc = url.find('sm:loc', ns).text
                if '/blog/' in loc:
                    urls.append(loc)
            return urls[:self.max_articles]
        except Exception as e:
            logging.warning(f"[OpenAI] Error obteniendo sitemap: {e}")
            return []

    def scrape_article(self, url):
        soup = self.get_soup(url)
        if soup is None:
            return None

        title = soup.find("h1")
        # El contenido puede estar en un artículo o div con clase específica
        content_div = soup.find("article") or soup.find("div", class_="prose") or soup.find("div", class_="content")
        if not content_div:
            return None

        paragraphs = content_div.find_all("p")
        if not paragraphs:
            return None

        return self.build_article(
            url=url,
            title=title.get_text(strip=True) if title else "Sin título",
            content=self.clean_text(paragraphs)
        )
    
scraper = OpenAIScraper(max_articles=30, sleep_time=1)
links = scraper.get_article_links()
len(links)

0

In [3]:
from scraping.scraper_base import BaseScraper
import time
import logging

class MicrosoftAIScraper(BaseScraper):
    def __init__(self, sources=None, max_pages=5, sleep_time=2.0):
        super().__init__("Microsoft AI", base_domains=["microsoft.com", "azure.microsoft.com"])
        self.sources = sources or [
            {"name": "news", "url": "https://news.microsoft.com/source/topics/ai/", "pagination": "page/{page}/"},
            # {"name": "azure", "url": "https://azure.microsoft.com/en-us/blog/", "pagination": "page/{page}/"},
            {"name": "ai blog", "url": "https://blogs.microsoft.com/ai/", "pagination": "page/{page}/"}
        ]
        self.max_pages = max_pages
        self.sleep_time = sleep_time

    def get_article_links(self):
        links = set()
        for source in self.sources:
            base = source["url"]
            for page in range(1, self.max_pages + 1):
                if page == 1:
                    url = base
                else:
                    url = base + source["pagination"].format(page=page)
                soup = self.get_soup(url)
                if soup is None:
                    break

                # Buscar enlaces a artículos
                for a in soup.find_all("a", href=True):
                    href = a['href']
                    # Filtrar enlaces que parezcan artículos (que contengan fecha o /año/mes/)
                    if any(pat in href for pat in ['/202', '/20', '/article', '/post']):
                        if href.startswith('/'):
                            full_url = "https://" + source["url"].split('/')[2] + href
                        elif href.startswith('http'):
                            full_url = href
                        else:
                            full_url = base.rstrip('/') + '/' + href.lstrip('/')
                        if any(domain in full_url for domain in self.base_domains):
                            links.add(full_url)

                time.sleep(self.sleep_time)

        return list(links)

    def scrape_article(self, url):
        soup = self.get_soup(url)
        if soup is None:
            return None

        title = soup.find("h1")
        # Contenido puede variar según el blog
        content_div = (soup.find("article") or 
                       soup.find("div", class_="entry-content") or 
                       soup.find("div", class_="post-content") or
                       soup.find("div", class_="content"))
        if not content_div:
            return None

        paragraphs = content_div.find_all("p")
        if not paragraphs:
            return None

        return self.build_article(
            url=url,
            title=title.get_text(strip=True) if title else "Sin título",
            content=self.clean_text(paragraphs)
        )
    
# scraper = MicrosoftAIScraper(max_pages=2, sleep_time=1)
# links = scraper.get_article_links()
# len(links)

In [4]:
scraper = MicrosoftAIScraper(max_pages=30, sleep_time=1)
links = scraper.get_article_links()
len(links)



45

In [5]:
class GoogleNewsScraper(BaseScraper):
    def __init__(self):
        super().__init__("Google News")
        self.base_url = "https://news.google.com"
    
    def get_ai_news(self):
        # Google News permite scraping fácil
        soup = self.get_soup(f"{self.base_url}/search?q=AI+artificial+intelligence&hl=es")
        links = []
        for a in soup.find_all("a", href=True):
            if "article" in a["href"]:
                links.append(self.base_url + a["href"][1:])  # Remover el . inicial
        return links


In [6]:
scraper = GoogleNewsScraper()
links = scraper.get_ai_news()

In [7]:
len(links)

0

In [8]:
from scraping.scraper_base import BaseScraper
import time
import logging

class OpenAIScraper(BaseScraper):
    """
    Scraper para el blog de OpenAI (https://openai.com/blog)
    Funciona con BeautifulSoup sin necesidad de JavaScript.
    """

    def __init__(self, max_pages=5, sleep_time=2.0):
        super().__init__("OpenAI Blog", base_domains=["openai.com"])
        self.base_url = "https://openai.com/blog"
        self.max_pages = max_pages
        self.sleep_time = sleep_time

        # Headers realistas para evitar bloqueos
        self.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
        })

    def get_article_links(self):
        """
        Obtiene los enlaces a los artículos del blog de OpenAI.
        OpenAI usa paginación por scroll infinito, pero la página inicial ya carga ~20 artículos.
        """
        links = set()

        # OpenAI no tiene paginación tradicional (?page=2), pero la URL base carga todos los recientes.
        # Para obtener más, podemos usar el parámetro ?page (aunque no es oficial, a veces funciona)
        for page in range(1, self.max_pages + 1):
            if page == 1:
                url = self.base_url
            else:
                # Parámetro de paginación no oficial, pero a veces devuelve artículos antiguos
                url = f"{self.base_url}?page={page}"

            logging.info(f"Scrapeando página {page}: {url}")
            soup = self.get_soup(url)

            if soup is None:
                logging.warning(f"No se pudo obtener {url}")
                continue

            # Buscar enlaces a artículos individuales
            # Estructura típica: <a href="/blog/titulo-del-articulo"> dentro de un contenedor
            article_links = soup.select('a[href^="/blog/"]')

            # Filtrar enlaces que no sean artículos (como /blog, /blog?page=...)
            for a in article_links:
                href = a.get("href")
                if href and href.startswith("/blog/") and href != "/blog":
                    # Excluir enlaces que contengan parámetros
                    if "?" not in href and "#" not in href:
                        full_url = f"https://openai.com{href}"
                        links.add(full_url)

            # Si no se encuentran más enlaces, salir
            if len(article_links) == 0:
                break

            time.sleep(self.sleep_time)

        logging.info(f"Total enlaces encontrados: {len(links)}")
        return list(links)

    def scrape_article(self, url):
        """
        Extrae el título y contenido de un artículo de OpenAI.
        """
        soup = self.get_soup(url)
        if soup is None:
            return None

        # Título: suele estar en un h1
        title_elem = soup.find("h1")
        if not title_elem:
            # Alternativa: meta og:title
            meta_title = soup.find("meta", property="og:title")
            title = meta_title["content"] if meta_title else "Sin título"
        else:
            title = title_elem.get_text(strip=True)

        # Contenido: OpenAI usa un div con clase "prose" o "article-content"
        content_div = soup.select_one("div.prose, div.article-content, article")
        if not content_div:
            logging.warning(f"No se encontró contenido en {url}")
            return None

        # Extraer párrafos
        paragraphs = content_div.find_all("p")
        if not paragraphs:
            return None

        # Limpiar y filtrar párrafos muy cortos o irrelevantes
        valid_paragraphs = []
        for p in paragraphs:
            text = p.get_text(strip=True)
            if len(text) > 30 and not text.startswith("Share"):
                valid_paragraphs.append(p)

        if not valid_paragraphs:
            return None

        content = self.clean_text(valid_paragraphs)

        return self.build_article(
            url=url,
            title=title,
            content=content
        )


# Ejemplo de uso rápido (para probar)
if __name__ == "__main__":
    scraper = OpenAIScraper(max_pages=1, sleep_time=3)
    links = scraper.get_article_links()
    print(f"Artículos encontrados: {len(links)}")

    if links:
        # Probar el primer artículo
        articulo = scraper.scrape_article(links[0])
        if articulo:
            print(f"Título: {articulo['title']}")
            print(f"Contenido: {articulo['content'][:200]}...")

INFO:root:Scrapeando página 1: https://openai.com/blog
INFO:root:Total enlaces encontrados: 0


Artículos encontrados: 0


In [9]:
# En tu script principal
scraper = OpenAIScraper(max_pages=1, sleep_time=3)
links = scraper.get_article_links()[:5]  # Solo 5 para prueba

articulos = []
for url in links:
    articulo = scraper.scrape_article(url)
    if articulo:
        articulos.append(articulo)
    time.sleep(2)  # Delay entre artículos

print(f"Obtenidos {len(articulos)} artículos")

INFO:root:Scrapeando página 1: https://openai.com/blog
INFO:root:Total enlaces encontrados: 0


Obtenidos 0 artículos


In [10]:
import feedparser
from datetime import datetime, timedelta

class AINewsletterCollector:
    """Recolector de noticias para newsletter de IA"""
    
    def __init__(self):
        self.sources = [
            {"name": "OpenAI", "url": "https://openai.com/news.xml"},
            {"name": "DeepMind", "url": "https://deepmind.google/blog/rss.xml"},
            {"name": "Anthropic", "url": "https://www.anthropic.com/rss.xml"},
            {"name": "Hugging Face", "url": "https://huggingface.co/blog/feed.xml"},
            {"name": "TechCrunch AI", "url": "https://techcrunch.com/tag/artificial-intelligence/feed/"},
            {"name": "MIT AI", "url": "http://news.mit.edu/topic/mitartificial-intelligence2-rss.xml"},
        ]
    
    def get_news_last_days(self, days=7):
        """Obtiene noticias de los últimos X días"""
        all_news = []
        cutoff = datetime.now() - timedelta(days=days)
        
        for source in self.sources:
            print(f"Procesando {source['name']}...")
            feed = feedparser.parse(source["url"])
            
            for entry in feed.entries:
                # Intentar obtener fecha
                pub_str = entry.get('published', entry.get('updated', ''))
                try:
                    pub_date = datetime.strptime(pub_str, '%a, %d %b %Y %H:%M:%S %z')
                    pub_date = pub_date.replace(tzinfo=None)  # Quitar timezone
                except:
                    # Si no podemos parsear, asumir que es reciente
                    pub_date = datetime.now()
                
                if pub_date >= cutoff:
                    all_news.append({
                        'source': source['name'],
                        'title': entry.get('title', ''),
                        'url': entry.get('link', ''),
                        'date': pub_date.strftime('%Y-%m-%d'),
                        'summary': entry.get('summary', '')[:300],
                    })
        
        # Ordenar por fecha
        all_news.sort(key=lambda x: x['date'], reverse=True)
        return all_news
    
    def generate_newsletter_html(self, news):
        """Genera HTML para newsletter"""
        html = "<h1>📰 Newsletter IA Semanal</h1>\n"
        
        # Agrupar por fuente
        by_source = {}
        for item in news:
            by_source.setdefault(item['source'], []).append(item)
        
        for source, items in by_source.items():
            html += f"<h2>{source}</h2>\n<ul>\n"
            for item in items:
                html += f'  <li><a href="{item["url"]}">{item["title"]}</a> - {item["date"]}<br>'
                html += f'  <small>{item["summary"]}</small></li>\n'
            html += "</ul>\n"
        
        return html



In [11]:
# Uso
collector = AINewsletterCollector()
news = collector.get_news_last_days(days=7)
print(f"Noticias de los últimos 7 días: {len(news)}")

# Generar HTML
html = collector.generate_newsletter_html(news)
with open("newsletter_ia_semanal.html", "w", encoding="utf-8") as f:
    f.write(html)

Procesando OpenAI...
Procesando DeepMind...
Procesando Anthropic...
Procesando Hugging Face...
Procesando TechCrunch AI...
Procesando MIT AI...
Noticias de los últimos 7 días: 741


In [13]:
html[:500]

'<h1>📰 Newsletter IA Semanal</h1>\n<h2>Hugging Face</h2>\n<ul>\n  <li><a href="https://huggingface.co/blog/custom-cuda-kernels-agent-skills">Custom Kernels for All from Codex and Claude</a> - 2026-02-16<br>  <small></small></li>\n  <li><a href="https://huggingface.co/blog/openenv-turing">OpenEnv in Practice: Evaluating Tool-Using Agents in Real-World Environments</a> - 2026-02-16<br>  <small></small></li>\n  <li><a href="https://huggingface.co/blog/transformersjs-v4">Transformers.js v4 Preview: Now Av'

In [14]:
import feedparser
import time
import logging
from datetime import datetime, timedelta
from urllib.parse import urlparse
from scraping.scraper_base import BaseScraper  # tu clase base

# Configuración básica de logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class RSSContentScraper(BaseScraper):
    """
    Extiende BaseScraper para añadir extracción de contenido específica por dominio.
    """
    def __init__(self, source_name, base_domains):
        super().__init__(source_name, base_domains)
        # Headers más realistas (opcional, ya los tiene BaseScraper)
        self.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
        })

    def extract_full_content(self, url):
        """
        Intenta extraer el contenido completo de un artículo según el dominio.
        Retorna el texto del contenido o None si falla.
        """
        parsed = urlparse(url)
        domain = parsed.netloc.replace("www.", "")

        soup = self.get_soup(url)
        if not soup:
            return None

        # Diccionario de selectores por dominio (puedes ampliarlo)
        selectors = {
            "openai.com": "div.prose, article",
            "deepmind.google": "div[class*='blog-post'], article",
            "anthropic.com": "div.content, article",
            "huggingface.co": "div.prose",
            "techcrunch.com": "div.article-content, div.post-content",
            "news.mit.edu": "div.entry-content",
            "ai.meta.com": "div.blog-content",
            "wired.com": "div.article__body",
            "aibusiness.com": "div.article-content",  # probar, quizás no funcione
        }

        selector = selectors.get(domain, "article, main, .content, .post-content")  # fallback genérico
        content_div = soup.select_one(selector)

        if not content_div:
            # Intento más genérico: buscar el primer div con muchos párrafos
            candidates = soup.find_all('div')
            for div in candidates:
                p_count = len(div.find_all('p'))
                if p_count > 3:
                    content_div = div
                    break

        if not content_div:
            logging.warning(f"No se pudo encontrar contenido para {url}")
            return None

        paragraphs = content_div.find_all('p')
        if not paragraphs:
            return None

        # Limpiar párrafos muy cortos o sospechosos (publicidad, etc.)
        clean_paras = []
        for p in paragraphs:
            text = p.get_text(strip=True)
            if len(text) > 40 and not any(x in text.lower() for x in ['copyright', 'all rights reserved', 'compartir en']):
                clean_paras.append(p)

        if not clean_paras:
            return None

        return self.clean_text(clean_paras)


class AINewsletterCollector:
    """
    Recolecta noticias de múltiples fuentes vía RSS y luego enriquece con contenido completo.
    """

    def __init__(self, days=7):
        self.days = days
        self.sources = [
            {"name": "OpenAI", "url": "https://openai.com/news.xml", "domain": "openai.com"},
            {"name": "DeepMind", "url": "https://deepmind.google/blog/rss.xml", "domain": "deepmind.google"},
            {"name": "Anthropic", "url": "https://www.anthropic.com/rss.xml", "domain": "anthropic.com"},
            {"name": "Hugging Face", "url": "https://huggingface.co/blog/feed.xml", "domain": "huggingface.co"},
            {"name": "TechCrunch AI", "url": "https://techcrunch.com/tag/artificial-intelligence/feed/", "domain": "techcrunch.com"},
            {"name": "MIT AI", "url": "http://news.mit.edu/topic/mitartificial-intelligence2-rss.xml", "domain": "news.mit.edu"},
        ]
        # Inicializar scrapers por dominio (para reutilizar)
        self.scrapers = {}
        for source in self.sources:
            domain = source["domain"]
            if domain not in self.scrapers:
                self.scrapers[domain] = RSSContentScraper(source["name"], [domain])

    def fetch_articles_from_rss(self):
        """Obtiene artículos básicos (sin contenido completo) de los últimos días."""
        articles = []
        cutoff = datetime.now() - timedelta(days=self.days)

        for source in self.sources:
            logging.info(f"Obteniendo RSS de {source['name']}...")
            feed = feedparser.parse(source["url"])

            for entry in feed.entries:
                # Parsear fecha
                pub_str = entry.get('published', entry.get('updated', ''))
                try:
                    pub_date = datetime.strptime(pub_str, '%a, %d %b %Y %H:%M:%S %z')
                    pub_date = pub_date.replace(tzinfo=None)
                except:
                    pub_date = datetime.now()  # si no hay fecha, asumimos reciente

                if pub_date >= cutoff:
                    # Intentar obtener contenido del RSS (a veces viene completo)
                    rss_content = ''
                    if 'content' in entry and entry.content:
                        rss_content = entry.content[0].value
                    elif 'summary' in entry:
                        rss_content = entry.summary

                    articles.append({
                        'source': source['name'],
                        'title': entry.get('title', ''),
                        'url': entry.get('link', ''),
                        'date': pub_date.strftime('%Y-%m-%d'),
                        'rss_content': rss_content,  # guardamos lo que trae el RSS
                        'full_content': None,        # lo rellenaremos después
                    })
            time.sleep(1)  # pequeño delay entre fuentes

        # Ordenar por fecha
        articles.sort(key=lambda x: x['date'], reverse=True)
        logging.info(f"Total artículos obtenidos del RSS: {len(articles)}")
        return articles

    def enrich_with_full_content(self, articles, max_articles=10, delay_between=3):
        """
        Para cada artículo, intenta obtener el contenido completo:
        - Si el RSS ya trae contenido sustancial (>500 chars), lo usa.
        - Si no, hace scraping con el scraper correspondiente.
        Solo procesa los primeros `max_articles` para no saturar.
        """
        enriched = []
        for i, art in enumerate(articles[:max_articles]):
            logging.info(f"Procesando {i+1}/{min(max_articles, len(articles))}: {art['title'][:60]}...")

            # 1. Comprobar si el RSS ya trae contenido suficiente
            if art['rss_content'] and len(art['rss_content']) > 500:
                art['full_content'] = art['rss_content']
                logging.info("  Usando contenido del RSS")
            else:
                # 2. Scraping
                domain = art['url'].split('/')[2].replace('www.', '')
                scraper = self.scrapers.get(domain)
                if not scraper:
                    # Buscar por dominio en sources (por si hay varios dominios por fuente)
                    for source in self.sources:
                        if source['name'] == art['source']:
                            domain = source['domain']
                            scraper = self.scrapers.get(domain)
                            break
                if scraper:
                    content = scraper.extract_full_content(art['url'])
                    if content:
                        art['full_content'] = content
                        logging.info("  Scraping exitoso")
                    else:
                        art['full_content'] = art['rss_content'] or "Contenido no disponible"
                        logging.warning("  Scraping falló, se usa contenido parcial")
                else:
                    art['full_content'] = art['rss_content'] or "Contenido no disponible"
                    logging.warning(f"  No hay scraper para dominio {domain}")

            enriched.append(art)
            if i < max_articles - 1:
                time.sleep(delay_between)  # delay entre artículos

        return enriched

    def generate_newsletter_html(self, articles):
        """Genera HTML con título, fecha, enlace y contenido completo."""
        html = "<h1>📰 Newsletter IA Semanal</h1>\n"
        for art in articles:
            html += f"<h2><a href='{art['url']}'>{art['title']}</a></h2>\n"
            html += f"<p><strong>{art['source']}</strong> - {art['date']}</p>\n"
            html += f"<div>{art['full_content']}</div>\n"
            html += "<hr>\n"
        return html


# ========== EJEMPLO DE USO ==========
if __name__ == "__main__":
    collector = AINewsletterCollector(days=7)

    # Paso 1: obtener artículos del RSS
    articulos_rss = collector.fetch_articles_from_rss()
    print(f"Artículos encontrados en RSS: {len(articulos_rss)}")

    # Paso 2: enriquecer con contenido completo (solo los 5 primeros para pruebas)
    articulos_completos = collector.enrich_with_full_content(articulos_rss, max_articles=5, delay_between=5)

    # Paso 3: generar HTML final
    html_final = collector.generate_newsletter_html(articulos_completos)

    with open("newsletter_completa.html", "w", encoding="utf-8") as f:
        f.write(html_final)

    print("Newsletter generada: newsletter_completa.html")

INFO:root:Obteniendo RSS de OpenAI...
INFO:root:Obteniendo RSS de DeepMind...
INFO:root:Obteniendo RSS de Anthropic...
INFO:root:Obteniendo RSS de Hugging Face...
INFO:root:Obteniendo RSS de TechCrunch AI...
INFO:root:Obteniendo RSS de MIT AI...
INFO:root:Total artículos obtenidos del RSS: 741
INFO:root:Procesando 1/5: Custom Kernels for All from Codex and Claude...


Artículos encontrados en RSS: 741


INFO:root:  Scraping exitoso
INFO:root:Procesando 2/5: OpenEnv in Practice: Evaluating Tool-Using Agents in Real-Wo...
INFO:root:  Scraping exitoso
INFO:root:Procesando 3/5: Transformers.js v4 Preview: Now Available on NPM!...
INFO:root:  Scraping exitoso
INFO:root:Procesando 4/5: Introducing SyGra Studio...
INFO:root:  Scraping exitoso
INFO:root:Procesando 5/5: Nemotron ColEmbed V2: Raising the Bar for Multimodal Retriev...
INFO:root:  Scraping exitoso


Newsletter generada: newsletter_completa.html


In [17]:
len(html_final)


30794

In [22]:
import feedparser
import time
import logging
from scraping.scraper_base import BaseScraper

logging.basicConfig(level=logging.INFO)

class RSSContentCollector2:
    """
    Colector que combina RSS + scraping para obtener contenido completo.
    Devuelve artículos en el mismo formato que tus scrapers directos.
    """

    def __init__(self):
        # Fuentes: nombre y feed RSS
        self.sources = [
            {"name": "OpenAI", "rss": "https://openai.com/news.xml"},
            {"name": "DeepMind", "rss": "https://deepmind.google/blog/rss.xml"},
            {"name": "Anthropic", "rss": "https://www.anthropic.com/rss.xml"},
            # {"name": "Hugging Face", "rss": "https://huggingface.co/blog/feed.xml"},
            # {"name": "TechCrunch AI", "rss": "https://techcrunch.com/tag/artificial-intelligence/feed/"},
            {"name": "MIT AI", "rss": "http://news.mit.edu/topic/mitartificial-intelligence2-rss.xml"},
        ]
        # Scrapers por dominio (reutilizables)
        self.scrapers = {}

    def _get_scraper(self, source_name, url):
        """Obtiene o crea un scraper para el dominio de la URL."""
        from urllib.parse import urlparse
        domain = urlparse(url).netloc.replace("www.", "")
        
        if domain not in self.scrapers:
            # Crear un scraper específico para este dominio
            class DynamicScraper(BaseScraper):
                def __init__(self, name, domain):
                    super().__init__(name, base_domains=[domain])
                    self.headers.update({
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                    })
                
                def scrape_article(self, url):
                    soup = self.get_soup(url)
                    if not soup:
                        return None
                    
                    # Título
                    title = soup.find("h1")
                    if not title:
                        title = soup.find("meta", property="og:title")
                        title = title["content"] if title else "Sin título"
                    else:
                        title = title.get_text(strip=True)
                    
                    # Contenido - buscar contenedor principal
                    content_div = (
                        soup.find("div", class_="prose") or
                        soup.find("article") or
                        soup.find("main") or
                        soup.find("div", class_="content")
                    )
                    
                    if not content_div:
                        return None
                    
                    paragraphs = content_div.find_all("p")
                    if not paragraphs:
                        return None
                    
                    # Filtrar párrafos cortos o basura
                    valid_paras = [p for p in paragraphs if len(p.get_text(strip=True)) > 50]
                    
                    return self.build_article(
                        url=url,
                        title=title,
                        content=self.clean_text(valid_paras)
                    )
            
            self.scrapers[domain] = DynamicScraper(source_name, domain)
        
        return self.scrapers[domain]

    def collect_articles(self, max_articles=20, days=7):
        """
        Recoge artículos de todas las fuentes.
        Devuelve lista de dicts con: source, url, title, content
        """
        from datetime import datetime, timedelta
        import time
        
        all_articles = []
        cutoff = datetime.now() - timedelta(days=days)
        
        for source in self.sources:
            logging.info(f"Procesando {source['name']}...")
            feed = feedparser.parse(source["rss"])
            
            for entry in feed.entries:
                # Filtrar por fecha si es posible
                pub_str = entry.get('published', entry.get('updated', ''))
                try:
                    pub_date = datetime.strptime(pub_str, '%a, %d %b %Y %H:%M:%S %z')
                    pub_date = pub_date.replace(tzinfo=None)
                    if pub_date < cutoff:
                        continue
                except:
                    pass  # si no hay fecha, lo incluimos
                
                url = entry.get('link', '')
                title = entry.get('title', '')
                
                if not url or not title:
                    continue
                
                # Obtener scraper y scrapear contenido
                scraper = self._get_scraper(source['name'], url)
                article = scraper.scrape_article(url)
                
                if article:
                    all_articles.append(article)
                    logging.info(f"  ✓ {title[:60]}...")
                else:
                    logging.warning(f"  ✗ Falló: {title[:60]}...")
                
                # Delay entre artículos para no saturar
                time.sleep(2)
                
                if len(all_articles) >= max_articles:
                    break
            
            if len(all_articles) >= max_articles:
                break
        
        return all_articles

In [23]:
collector = RSSContentCollector2()

# Obtener máximo 15 artículos (ajusta según necesites)
articulos = collector.collect_articles(max_articles=300, days=7)

print(f"\nTotal artículos obtenidos: {len(articulos)}")

# Mostrar ejemplo del primero
if articulos:
    print("\nEjemplo del primer artículo:")
    for key, value in articulos[0].items():
        if key == 'content':
            print(f"  {key}: {value[:200]}...")
        else:
            print(f"  {key}: {value}")

INFO:root:Procesando OpenAI...
INFO:root:Procesando DeepMind...
INFO:root:Procesando Anthropic...
INFO:root:Procesando MIT AI...
INFO:root:  ✓ New J-PAL research and policy initiative to test and scale A...
INFO:root:  ✓ Accelerating science with AI and simulations...
INFO:root:  ✓ Using synthetic biology and AI to address global antimicrobi...
INFO:root:  ✓ AI algorithm enables tracking of vital white matter pathways...
INFO:root:  ✓ 3 Questions: Using AI to help Olympic skaters land a quint...



Total artículos obtenidos: 5

Ejemplo del primer artículo:
  source: MIT AI
  url: https://news.mit.edu/2026/new-j-pal-research-policy-initiative-to-test-scale-ai-innovations-fight-poverty-0212
  title: New J-PAL research and policy initiative to test and scale AI innovations to fight poverty
  content: The Abdul Latif Jameel Poverty Action Lab (J-PAL) at MIT has awarded funding to eight new research studies to understand how artificial intelligence innovations can be used in the fight against povert...
  scraping_date: 2026-02-16 12:10:02.621821


In [25]:
len(articulos)

5