In [None]:
import sys
import os

PROJECT_ROOT = r"C:\Users\Angel\OneDrive - Universidad Complutense de Madrid (UCM)\Documentos\MASTER\99_tfm\tfm_newsletter_ai"

if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

Project root: C:\Users\Angel\OneDrive - Universidad Complutense de Madrid (UCM)\Documentos\MASTER\99_tfm\tfm_newsletter_ai


In [None]:
from scraping.sources.scraper_xataka import XatakaScraper
import pandas as pd

scraper = XatakaScraper()

links = scraper.get_article_links()
print(f"Artículos encontrados: {len(links)}")

articles = []
for url in links[:5]:  # solo 5 para pruebas
    article = scraper.scrape_article(url)
    if article:
        articles.append(article)

df = pd.DataFrame(articles)
df.head()

In [None]:
from scraping.sources.scraper_xataka import XatakaScraper
from scraping.sources.scraper_openai import OpenAIScraper
import pandas as pd

scrapers = [
    XatakaScraper(),
    OpenAIScraper()
]

articles = []

for scraper in scrapers:
    links = scraper.get_article_links()
    for url in links[:3]:  # pocos para test
        article = scraper.scrape_article(url)
        if article:
            articles.append(article)

df = pd.DataFrame(articles)
df[["source", "title"]]


In [None]:
from scraping.sources.scraper_xataka import XatakaScraper
from scraping.sources.scraper_huggingface import HuggingFaceScraper

scrapers = [
    XatakaScraper(),
    HuggingFaceScraper()
]

articles = []

for scraper in scrapers:
    links = scraper.get_article_links()
    for url in links[:3]:  # pocos para test
        article = scraper.scrape_article(url)
        if article:
            articles.append(article)

df = pd.DataFrame(articles)
df[["source", "title"]]
df

In [None]:
from scraping.normalization import normalize_article

normalized_articles = [
    normalize_article(a) for a in articles
]

df = pd.DataFrame(normalized_articles)
df[["source", "word_count", "language", "is_valid"]]



In [None]:
df_clean = df[df["is_valid"]].copy()
df_clean.shape

output_path = "data/raw/articles_normalized.csv"

df_clean.to_csv(output_path, index=False)

# Más fuentes

In [1]:
from scraping.sources.scraper_xataka import XatakaScraper
from scraping.sources.scraper_huggingface import HuggingFaceScraper
from scraping.sources.scraper_techcrunch import TechCrunchScraper
from scraping.sources.scraper_aws import AWSScraper
from scraping.sources.scraper_wired import WiredScraper
from scraping.sources.scraper_microsoft import MicrosoftNewsScraper
from scraping.sources.scraper_aibusiness import AIBusinessScraper

scrapers = [
    XatakaScraper(),
    # HuggingFaceScraper(max_pages=3),
    TechCrunchScraper(max_pages=50),
    AWSScraper(max_pages=50,
               blogs=["machine-learning",
                    "infrastructure-and-automation",
                    "iot",
                    "big-data"
                    ]
            ),
    WiredScraper(max_pages=50)
    # MicrosoftNewsScraper(),
    # AIBusinessScraper(max_pages=2)
]

articles = []

for scraper in scrapers:
    links = scraper.get_article_links()
    for url in links:
        article = scraper.scrape_article(url)
        if article:
            articles.append(article)

len(links)



1193

In [1]:
from scraping.sources.scraper_microsoft import MicrosoftNewsScraper
ms = MicrosoftNewsScraper(max_pages=2)
links = ms.get_article_links()
print(len(links))
print(links[:5])

0
[]


  k = self.parse_starttag(i)


In [None]:
import feedparser
import logging
from scraping.scraper_base import BaseScraper

class TechCrunchScraper(BaseScraper):

    def __init__(self, tags=None):
        super().__init__("TechCrunch")

        self.base_feed_url = "https://techcrunch.com/tag"
        self.tags = tags or [
            "artificial-intelligence",
            "cloud-computing",
            "robotics"
        ]

    def get_article_links(self):
        links = set()

        for tag in self.tags:
            feed_url = f"{self.base_feed_url}/{tag}/feed/"
            logging.info(f"[TechCrunch RSS] Leyendo {feed_url}")

            feed = feedparser.parse(feed_url)

            for entry in feed.entries:
                if "link" in entry:
                    links.add(entry.link)

        logging.info(f"[TechCrunch RSS] Total links: {len(links)}")
        return list(links)

    def scrape_article(self, url):
        soup = self.get_soup(url)
        if soup is None:
            return None

        title = soup.find("h1")
        paragraphs = soup.select("div.article-content p")

        if not title or not paragraphs:
            return None

        return self.build_article(
            url=url,
            title=title.get_text(strip=True),
            content=self.clean_text(paragraphs)
        )


techcrunch = TechCrunchScraper(
    tags=[
        "artificial-intelligence",
        "cloud-computing",
        "robotics"
    ]
)

links = techcrunch.get_article_links()
print(len(links))
print(links[:5])

In [1]:
from scraping.scraper_base import BaseScraper
import logging

class TechCrunchScraper(BaseScraper):

    def __init__(self, tags=None, max_pages=20):
        super().__init__("TechCrunch")

        self.tags = tags or [
            "artificial-intelligence",
            "cloud-computing",
            "robotics"
        ]

        self.base_url = "https://techcrunch.com/tag"
        self.max_pages = max_pages

    def get_article_links(self):
        links = []

        for tag in self.tags:
            for page in range(1, self.max_pages + 1):

                if page == 1:
                    url = f"{self.base_url}/{tag}/"
                else:
                    url = f"{self.base_url}/{tag}/page/{page}/"

                soup = self.get_soup(url)
                if soup is None:
                    break

                articles = soup.select("a.loop-card__title-link")

                if not articles:
                    logging.info(f"[TechCrunch] No más artículos en {tag}, page {page}")
                    break

                for a in articles:
                    href = a.get("href")
                    if href and href.startswith("https://techcrunch.com/"):
                        links.append(href)

        return list(dict.fromkeys(links))

    def scrape_article(self, url):
        soup = self.get_soup(url)
        if soup is None:
            return None

        title = soup.find("h1")
        paragraphs = soup.find_all("p")

        if not title or not paragraphs:
            return None

        return self.build_article(
            url,
            title.get_text(strip=True),
            self.clean_text(paragraphs)
        )

    
techcrunch = TechCrunchScraper(
    max_pages=2
)

links = techcrunch.get_article_links()
print(len(links))
print(links[:5])

187
['https://techcrunch.com/2026/01/13/ai-drug-discovery-startup-converge-bio-pulls-in-25m-from-bessemer-and-execs-from-meta-openai-and-wiz/', 'https://techcrunch.com/2025/10/31/meta-bought-1-gw-of-solar-this-week/', 'https://techcrunch.com/2025/08/26/how-one-ai-startup-is-helping-rice-farmers-battle-climate-change/', 'https://techcrunch.com/2025/08/20/harvard-dropouts-to-launch-always-on-ai-smart-glasses-that-listen-and-record-every-conversation/', 'https://techcrunch.com/2025/08/20/meta-to-add-100-mw-of-solar-power-from-u-s-gear/']


In [None]:
from scraping.sources.scraper_techcrunch import TechCrunchScraper

techcrunch = TechCrunchScraper(
    max_pages=2
)

links = techcrunch.get_article_links()
print(len(links))
print(links[:5])

In [2]:
import pandas as pd
from scraping.normalization import normalize_article

normalized_articles = [
    normalize_article(a) for a in articles
]

df = pd.DataFrame(normalized_articles)
df[["source", "word_count", "language", "is_valid"]]

df

INFO:numexpr.utils:NumExpr defaulting to 16 threads.


Unnamed: 0,source,url,title,content,scraping_date,content_length,word_count,language,is_valid
0,Xataka,https://www.xataka.com/empresas-y-economia/mas...,"MásMóvil compra Sofiathinks, la startup sevill...",Desarrollar hardware en España no es tan habit...,2026-01-27 22:20:23.210903,2311,361,es,True
1,Xataka,https://www.xataka.com/robotica-e-ia/plan-indu...,El plan industrial de EEUU se desmorona porque...,La IA generativa es tontísima. Es la opinión d...,2026-01-27 22:20:23.439998,5739,983,es,True
2,Xataka,https://www.xataka.com/robotica-e-ia/no-coca-c...,"No, Coca-Cola no está usando inteligencia arti...",La fiebre por la inteligencia artificial está ...,2026-01-27 22:20:24.200502,2926,489,es,True
3,Xataka,https://www.xataka.com/componentes/huawei-kunp...,Huawei Kunpeng 920: así es el SoC de 7 nm con ...,Huawei no se anda con «chiquitas»: según la ma...,2026-01-27 22:20:24.502001,3463,568,es,True
4,Xataka,https://www.xataka.com/empresas-y-economia/rea...,Realme más allá del smartphone: así quieren co...,Realme se ha convertido rápidamente en una mar...,2026-01-27 22:20:24.722319,3769,614,es,True
...,...,...,...,...,...,...,...,...,...
6946,Wired ES,https://es.wired.com/articulos/deepseek-r1-pue...,El futuro es la eficiencia: DeepSeek R1 puede ...,"La inteligencia artificial, o ya popularmente ...",2026-01-28 00:18:13.756831,6678,1101,es,True
6947,Wired ES,https://es.wired.com/articulos/demanda-energet...,La demanda energética de la IA está fuera de c...,"En estos momentos, es imposible ignorar la int...",2026-01-28 00:18:14.107712,10412,1670,es,True
6948,Wired ES,https://es.wired.com/articulos/el-36-por-cient...,El 36% de las personas se sienten cómodas leye...,El uso de la inteligencia artificial (IA) en l...,2026-01-28 00:18:14.539258,5331,859,es,True
6949,Wired ES,https://es.wired.com/articulos/x-lanza-oficial...,"X lanza oficialmente Grok 3, su nuevo modelo d...","Tras muchos meses de espera, xAI, la empresa d...",2026-01-28 00:18:15.263931,3882,657,es,True


In [None]:
import pandas as pd
from scraping.normalization import normalize_article

normalized_articles = [
    normalize_article(a) for a in articles
]

df = pd.DataFrame(normalized_articles)
df[["source", "word_count", "language", "is_valid"]]

df_clean = df[df["is_valid"]].copy()
df_clean.shape

output_path = "data/raw/even_more_articles_normalized.csv"

df_clean.to_csv(output_path, index=False, sep=";")
df_clean = pd.read_csv(output_path, sep=";")  # para verificar que se guarda bien
df_clean.to_parquet("data/raw/even_more_articles_normalized.parquet", index=False)

In [13]:
from config.paths import RAW_DATA_DIR
import os
import pandas as pd

df = pd.read_csv(
    os.path.join(RAW_DATA_DIR, "even_more_articles_normalized.csv"),
    sep=";"
)

df_filtered = df[df.source != "Wired ES"]

df_filtered.shape

(5754, 9)

In [10]:
df.groupby("source").size()

source
AWS Blog      1447
TechCrunch    3715
Wired ES      1193
Xataka         592
dtype: int64

In [3]:
import pandas
output_path = "data/raw/even_more_articles_normalized.csv"
output_path = os.path.join(PROJECT_ROOT,"data","raw","even_more_articles_normalized.csv")

df_clean = pandas.read_csv(output_path, sep=";")  # para verificar que se guarda bien
# df_clean.to_parquet("data/raw/even_more_articles_normalized.parquet", index=False)

In [None]:
df_clean.iloc[200:]

In [5]:
from config.paths import PA

print(PA)

ImportError: cannot import name 'PA' from 'config.paths' (C:\Users\Angel\OneDrive - Universidad Complutense de Madrid (UCM)\Documentos\MASTER\99_tfm\tfm_newsletter_ai\config\paths.py)

In [3]:
from config.paths import RAW_DATA_DIR
print(RAW_DATA_DIR)

C:\Users\Angel\OneDrive - Universidad Complutense de Madrid (UCM)\Documentos\MASTER\99_tfm
