# AI-powered Newsletter Generator
**TFM – End-to-end pipeline**

Este notebook implementa un pipeline completo para:
- Agregar noticias tecnológicas de múltiples fuentes
- Representarlas semánticamente mediante embeddings
- Agruparlas automáticamente por áreas temáticas
- Curarlas y priorizarlas para la generación de una newsletter

El foco no está solo en la parte técnica, sino también en justificar cada decisión
desde el punto de vista del **valor para el negocio**.

# 1. Libraries and other imports

In [None]:
import sys
import os

# Root folder of the project; the `config`, `scraping` and `nlp` packages
# imported below are resolved relative to it.
# The TFM_PROJECT_ROOT environment variable overrides the author's default
# location, so the notebook can run on another machine without editing this cell.
PROJECT_ROOT = os.environ.get(
    "TFM_PROJECT_ROOT",
    r"C:\Users\Angel\OneDrive - Universidad Complutense de Madrid (UCM)\Documentos\MASTER\99_tfm\tfm_newsletter_ai",
)

# Make the project packages importable. The membership check keeps repeated
# executions of this cell from appending duplicate entries to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from config.paths import (
    RAW_DATA_DIR,
    PROCESSED_DATA_DIR,
    DIAGNOSTICS_DIR,
    NEWSLETTER_DIR,
    MODEL_DIR
)

In [None]:
# General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from config.load_config import load_config
from sklearn.cluster import KMeans, HDBSCAN


# Scraping imports
# from scraping.scraper_base import BaseScraper
from scraping.normalization import normalize_article
from scraping.sources.scraper_xataka import XatakaScraper
from scraping.sources.scraper_huggingface import HuggingFaceScraper
from scraping.sources.scraper_techcrunch import TechCrunchScraper
from scraping.sources.scraper_aws import AWSScraper
from scraping.sources.scraper_wired import WiredScraper
from scraping.sources.scraper_microsoft import MicrosoftNewsScraper
from scraping.sources.scraper_aibusiness import AIBusinessScraper

# NLP imports
from nlp.preprocessing import basic_preprocess
from nlp.embeddings import SentenceTransformerEmbedder
from nlp.cleaning_tfidf import clean_for_tfidf, compute_tfidf
from nlp.clustering import fit_kmeans, find_optimal_k, compute_similarity_to_centroid
from nlp.interpretation import top_terms_per_cluster, name_clusters
from nlp.scoring import compute_source_score, compute_novelty_scores, compute_recency_score, compute_final_score

ModuleNotFoundError: No module named 'nlp'

# 2. Web Scraping

In [None]:
# Scrapers that are active for this run. Commented-out entries are sources
# that are currently disabled; uncomment a line to re-enable it.
scrapers = [
    XatakaScraper(),
    # HuggingFaceScraper(max_pages=3),
    TechCrunchScraper(max_pages=50),
    AWSScraper(
        max_pages=50,
        blogs=[
            "machine-learning",
            "infrastructure-and-automation",
            "iot",
            "big-data",
        ],
    ),
    # Trailing comma on the last active entry so the disabled scrapers below
    # can be uncommented without producing a syntax error.
    WiredScraper(max_pages=50),
    # MicrosoftNewsScraper(),
    # AIBusinessScraper(max_pages=2),
]

# Raw article records collected from every scraper.
articles = []

for scraper in scrapers:
    links = scraper.get_article_links()
    for url in links:
        article = scraper.scrape_article(url)
        # Keep only truthy results; a falsy return is skipped.
        if article:
            articles.append(article)

# Report the total over all scrapers. `links` only holds the URLs of the last
# scraper, and a bare expression in the middle of a cell is never displayed.
print(f"Scraped {len(articles)} articles from {len(scrapers)} scrapers")

normalized_articles = [normalize_article(article) for article in articles]

df = pd.DataFrame(normalized_articles)

# When nothing was scraped the frame has no columns, so indexing "is_valid"
# would raise KeyError. Fall back to an empty frame instead, which the later
# `df_clean.empty` checks can detect.
if "is_valid" in df.columns:
    df_clean = df[df["is_valid"]].copy()
else:
    df_clean = df.copy()
df_clean.shape

In [None]:
# Save cleaned data (CSV with ';' separator plus a parquet copy).
# Only a non-empty result is persisted: the embeddings step reloads
# definite_articles.parquet when df_clean is empty, so writing an empty frame
# here would overwrite the very file that fallback depends on.
if not df_clean.empty:
    df_clean.to_csv(os.path.join(RAW_DATA_DIR, "definite_articles.csv"), index=False, sep=";")

    df_clean.to_parquet(os.path.join(RAW_DATA_DIR, "definite_articles.parquet"), index=False)

# 3. NLP

## 3.1 Embeddings

In [None]:
# Reuse the articles already in memory; otherwise fall back to the last saved
# scrape. A fresh kernel that skipped the scraping section has no `df_clean`
# at all, which is treated the same way as an empty frame.
try:
    reload_articles = df_clean.empty
except NameError:
    reload_articles = True

if reload_articles:
    df_clean = pd.read_parquet(os.path.join(RAW_DATA_DIR, "definite_articles.parquet"))

# Text fed to the embedding models: title and body joined, then preprocessed.
df_clean["text_for_embedding"] = (
    df_clean["title"] + ". " + df_clean["content"]
).apply(basic_preprocess)

# Model keys passed to SentenceTransformerEmbedder, one embedding set each.
models_to_test = [
    "miniLM_multilingual",
    "distiluse_multilingual",
    "mpnet_en",
]

# Embeddings kept in memory, keyed by model name.
embeddings_by_model = {}

for model_name in models_to_test:
    embedder = SentenceTransformerEmbedder(model_name)
    embeddings = embedder.encode(df_clean["text_for_embedding"].tolist())
    embeddings_by_model[model_name] = embeddings
    # Stored as one Python list per row so the column can be written to parquet.
    df_clean[f"embedding_{model_name}"] = embeddings.tolist()

df_clean.to_parquet(os.path.join(PROCESSED_DATA_DIR, "definite_articles_with_embeddings.parquet"))

In [None]:
# Embedding model whose vectors are used for clustering.
EMBEDDING_MODEL = "miniLM_multilingual"

# Stack the per-article vectors stored in the DataFrame column into one array.
embedding_column = f"embedding_{EMBEDDING_MODEL}"
X_embeddings = np.vstack(df_clean[embedding_column].values)

# Pick the number of clusters, searching with k_min=4 and k_max=12.
optimal_k = find_optimal_k(X_embeddings, k_min=4, k_max=12)
optimal_k

In [None]:
# Fit the clustering model with the selected number of clusters and attach
# the resulting label of each article to the DataFrame.
kmeans = fit_kmeans(X_embeddings, n_clusters=optimal_k)
cluster_labels = kmeans.labels_
df_clean["cluster"] = cluster_labels

## 3.2 TF-IDF

In [None]:
# Fall back to the file written at the end of the embeddings step. That step
# saves a parquet file, so it must be read back with read_parquet from the
# matching ".parquet" path (no ".pkl" file is ever written).
# NOTE: that file is saved before the `cluster` column is assigned, so after a
# reload the clustering cells have to be re-run before clusters are interpreted.
if df_clean.empty:
    df_clean = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, "definite_articles_with_embeddings.parquet"))

# Clean each article body according to its language. The result is stored on
# df_clean, the frame that compute_tfidf reads from below; assigning it to the
# unfiltered `df` left df_clean without the "text_tfidf" column.
df_clean["text_tfidf"] = df_clean.apply(
    lambda row: clean_for_tfidf(row["content"], row["language"]),
    axis=1,
)

# TF-IDF matrix plus the fitted vectorizer used later to recover term names.
X_tfidf, tfidf_vectorizer = compute_tfidf(df_clean["text_tfidf"])

In [None]:
# Extract the 10 top TF-IDF terms of every cluster.
cluster_labels_for_terms = df_clean["cluster"]
cluster_terms = top_terms_per_cluster(X_tfidf, cluster_labels_for_terms, tfidf_vectorizer, top_n=10)

# Derive a name per cluster from its terms and map it onto each article
# through the article's cluster label.
cluster_names = name_clusters(cluster_terms)
df_clean["cluster_name"] = df_clean["cluster"].map(cluster_names)