In [None]:
import sys
import os

PROJECT_ROOT = r"C:\Users\Angel\OneDrive - Universidad Complutense de Madrid (UCM)\Documentos\MASTER\99_tfm\tfm_newsletter_ai"

if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
from scraping.scraper_base import BaseScraper
import time

class HuggingFaceScraper(BaseScraper):
    def __init__(self, max_pages=50, sleep_time=1.0, scrape_community=False):
        super().__init__("Hugging Face Blog")
        self.base_url = "https://huggingface.co/blog"
        self.max_pages = max_pages
        self.sleep_time = sleep_time
        self.scrape_community = scrape_community

    def get_article_links(self):
        links = set()

        for page in range(1, self.max_pages + 1):
            url = self.base_url if page == 1 else f"{self.base_url}?p={page}"
            soup = self.get_soup(url)

            if soup is None:
                break

            # Buscar todos los enlaces a artículos
            blog_links = soup.find_all("a", href=True)
            
            for link in blog_links:
                href = link["href"]
                
                # Si scrapeamos community, incluimos todo
                if self.scrape_community:
                    if href.startswith("/blog/") and href != "/blog/":
                        full_url = f"https://huggingface.co{href}"
                        links.add(full_url)
                # Si NO scrapeamos community, excluimos esos artículos
                else:
                    if (href.startswith("/blog/") and 
                        not href.startswith("/blog/community") and 
                        href != "/blog/"):
                        full_url = f"https://huggingface.co{href}"
                        links.add(full_url)

            time.sleep(self.sleep_time)

        return list(links)

    def scrape_article(self, url):
        soup = self.get_soup(url)
        if soup is None:
            return None

        title = soup.find("h1")
        content_div = soup.find("div", {"class": "prose"})

        if not title or not content_div:
            return None

        paragraphs = content_div.find_all("p")

        if not paragraphs:
            return None

        return self.build_article(
            url=url,
            title=title.get_text(strip=True),
            content=self.clean_text(paragraphs)
        )

In [2]:
# Opción 1: Solo artículos del blog principal (sin community)
scraper = HuggingFaceScraper(max_pages=3, sleep_time=1, scrape_community=False)
links = scraper.get_article_links()
print(f"Artículos del blog principal: {len(links)}")

# Opción 2: Todos los artículos (blog + community)
scraper_all = HuggingFaceScraper(max_pages=3, sleep_time=1, scrape_community=True)
links_all = scraper_all.get_article_links()
print(f"Todos los artículos: {len(links_all)}")

Artículos del blog principal: 62
Todos los artículos: 64


In [6]:
links_all

['https://huggingface.co/blog/virustotal',
 'https://huggingface.co/blog/gpt-oss-on-intel-xeon',
 'https://huggingface.co/blog/nvidia/nvidia-isaac-for-healthcare',
 'https://huggingface.co/blog/LinkedIn/gpt-oss-agentic-rl',
 'https://huggingface.co/blog/novita/sglang-glm4-moe',
 'https://huggingface.co/blog/ibm-granite/granite-4-nano',
 'https://huggingface.co/blog/nvidia/nemotron-personas-japan-ja',
 'https://huggingface.co/blog/burtenshaw/openenv-scaling',
 'https://huggingface.co/blog/huggingface/shifting-compute-landscape',
 'https://huggingface.co/blog/zilliz/zilliz-semantic-highlight-model',
 'https://huggingface.co/blog/Fannyjrd/interpreto',
 'https://huggingface.co/blog/Arm/arm-at-pytorch-conference',
 'https://huggingface.co/blog/amd/openroboticshackathon',
 'https://huggingface.co/blog/ggml-org/anthropic-messages-api-in-llamacpp',
 'https://huggingface.co/blog/aisheets-unlock-images',
 'https://huggingface.co/blog/vibegame',
 'https://huggingface.co/blog/rteb',
 'https://hugg

In [7]:
from scraping.scraper_base import BaseScraper
import time

class HuggingFaceScraper(BaseScraper):
    def __init__(self, max_pages=50, sleep_time=1.0):
        super().__init__("Hugging Face Blog")
        self.base_url = "https://huggingface.co/blog"
        self.max_pages = max_pages
        self.sleep_time = sleep_time

    def get_article_links(self):
        links = set()

        for page in range(1, self.max_pages + 1):
            url = self.base_url if page == 1 else f"{self.base_url}?p={page}"
            soup = self.get_soup(url)

            if soup is None:
                break

            # Buscar todos los enlaces a artículos
            blog_links = soup.find_all("a", href=True)
            
            for link in blog_links:
                href = link["href"]
                
                if href.startswith("/blog/") and href != "/blog/":
                    full_url = f"https://huggingface.co{href}"
                    links.add(full_url)

            time.sleep(self.sleep_time)

        return list(links)

    def scrape_article(self, url):
        soup = self.get_soup(url)
        if soup is None:
            return None

        title = soup.find("h1")
        content_div = soup.find("div", {"class": "prose"})

        if not title or not content_div:
            return None

        paragraphs = content_div.find_all("p")

        if not paragraphs:
            return None

        return self.build_article(
            url=url,
            title=title.get_text(strip=True),
            content=self.clean_text(paragraphs)
        )

In [28]:
scraper = HuggingFaceScraper(max_pages=100, sleep_time=2)
links = scraper.get_article_links()
print(f"Artículos del blog principal: {len(links)}")

articles = []

for url in links:
    article = scraper.scrape_article(url)
    if article:
        articles.append(article)
        time.sleep(0.5)



Artículos del blog principal: 732




KeyboardInterrupt: 

In [32]:
import requests
from datetime import datetime, timedelta

class HuggingFaceAPIScraper:
    def __init__(self):
        self.base_url = "https://huggingface.co/api"
    
    def get_blog_posts(self, limit=50, days_ago=7):
        """Obtiene posts del blog usando la API oficial (GRATIS)"""
        
        # Endpoint oficial para posts
        url = f"{self.base_url}/posts"
        
        try:
            response = requests.get(url, params={"limit": limit})
            response.raise_for_status()
            posts = response.json()
            
            # Filtrar por fecha (ej: última semana)
            filtered_posts = []
            cutoff_date = datetime.now() - timedelta(days=days_ago)
            
            for post in posts:
                post_date = datetime.fromisoformat(post.get("createdAt", "").replace("Z", "+00:00"))
                
                if post_date >= cutoff_date:
                    filtered_posts.append({
                        "title": post.get("title", ""),
                        "url": f"https://huggingface.co/blog/{post.get('slug', '')}",
                        "author": post.get("author", {}).get("name", ""),
                        "date": post_date.strftime("%Y-%m-%d"),
                        "summary": post.get("summary", ""),
                        "tags": post.get("tags", []),
                        "likes": post.get("likesCount", 0)
                    })
            
            return filtered_posts
            
        except Exception as e:
            print(f"Error con API: {e}")
            return []

# USO (GRATIS):
scraper = HuggingFaceAPIScraper()
posts = scraper.get_blog_posts(limit=20, days_ago=30)

print(f"Encontrados {len(posts)} posts recientes:")
for post in posts[:5]:
    print(f"- {post['date']}: {post['title']} (Likes: {post['likes']})")

Error con API: 'str' object has no attribute 'get'
Encontrados 0 posts recientes:


In [34]:
from scraping.sources.scraper_aws import AWSScraper

aws = AWSScraper(
    blogs=[
        "machine-learning",
        "infrastructure-and-automation",
        "iot",
        "big-data"
    ],
    lang="en",
    max_pages=2
)

links = aws.get_article_links()
print(len(links))


80


In [38]:
import feedparser
import requests

class AIBusinessScraper:
    def __init__(self):
        self.rss_url = "https://aibusiness.com/feed/"
    
    def get_articles_via_rss(self):
        """Usa RSS feed (legal y fácil)"""
        feed = feedparser.parse(self.rss_url)
        articles = []
        
        for entry in feed.entries[:20]:  # Últimos 20
            articles.append({
                "title": entry.title,
                "url": entry.link,
                "date": entry.published if 'published' in entry else "",
                "summary": entry.summary if 'summary' in entry else "",
                "content": entry.content[0].value if 'content' in entry else ""
            })
        
        return articles

# Uso simple y legal
scraper = AIBusinessScraper()
articles = scraper.get_articles_via_rss()

In [39]:
len(articles)

0