# AI-powered Newsletter Generator
**TFM ‚Äì End-to-end pipeline**

Este notebook implementa un pipeline completo para:
- Agregar noticias tecnol√≥gicas de m√∫ltiples fuentes
- Representarlas sem√°nticamente mediante embeddings
- Agruparlas autom√°ticamente por √°reas tem√°ticas
- Curarlas y priorizarlas para la generaci√≥n de una newsletter

El foco no est√° solo en la parte t√©cnica, sino en justificar cada decisi√≥n
desde un punto de vista de **valor para negocio**.

# 1. Libraries and other imports

In [4]:
import sys
import os

PROJECT_ROOT = r"C:\Users\Angel\OneDrive - Universidad Complutense de Madrid (UCM)\Documentos\MASTER\99_tfm\tfm_newsletter_ai"

if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from config.paths import (
    RAW_DATA_DIR,
    PROCESSED_DATA_DIR,
    DIAGNOSTICS_DIR,
    NEWSLETTER_DIR,
    MODEL_DIR
)

In [5]:
# General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from config.load_config import load_config
from sklearn.cluster import KMeans, HDBSCAN
from IPython.display import display, HTML


# Scraping imports
from scraping.scraper_base import BaseScraper
from scraping.normalization import normalize_article
from scraping.sources.scraper_xataka import XatakaScraper
from scraping.sources.scraper_huggingface import HuggingFaceScraper
from scraping.sources.scraper_techcrunch import TechCrunchScraper
from scraping.sources.scraper_aws import AWSScraper
from scraping.sources.scraper_wired import WiredScraper
from scraping.sources.scraper_microsoft import MicrosoftNewsScraper
from scraping.sources.scraper_aibusiness import AIBusinessScraper
from scraping.sources.scraper_openai import OpenAIScraper

# NLP imports
from nlp.preprocessing import basic_preprocess
from nlp.embeddings import SentenceTransformerEmbedder
from nlp.cleaning_tfidf import clean_for_tfidf, compute_tfidf
from nlp.clustering import fit_kmeans, find_optimal_k, compute_similarity_to_centroid
from nlp.interpretation import top_terms_per_cluster, name_clusters
from nlp.scoring import compute_source_score, compute_novelty_scores, compute_recency_score, compute_final_score

# 2. Web Scraping

In [None]:

scrapers = [
    XatakaScraper(),
    HuggingFaceScraper(max_pages=50, sleep_time=2),
    TechCrunchScraper(max_pages=50),
    AWSScraper(max_pages=50,
               blogs=["machine-learning",
                    "infrastructure-and-automation",
                    "iot",
                    "big-data"
                    ]
            ),
    WiredScraper(max_pages=50)
    # # MicrosoftNewsScraper(),
    # # AIBusinessScraper(),
    # OpenAIScraper()
]

from time import sleep
import random

articles = []

for scraper in scrapers:
    links = scraper.get_article_links()
    for url in links:
        article = scraper.scrape_article(url)
        if article:
            articles.append(article)
        sleep(1+random.uniform(0, 3))

len(links)

normalized_articles = [normalize_article(article) for article in articles]

df = pd.DataFrame(normalized_articles)

df_clean = df[df["is_valid"]].copy()
df_clean.shape

(7678, 9)

In [None]:
# Save cleaned data
df_clean.to_csv(os.path.join(RAW_DATA_DIR, "definite_articles.csv"), index=False, sep=";")
df_clean.to_pickle(os.path.join(RAW_DATA_DIR, "definite_articles.pkl"))
df_clean.to_parquet(os.path.join(RAW_DATA_DIR, "definite_articles.parquet"), index=False)

# 3. NLP

## 3.1 Embeddings

In [26]:
if 'df_clean' not in locals() or df_clean.empty:
    df_clean = pd.read_pickle(os.path.join(RAW_DATA_DIR, "definite_articles.pkl"))

df_clean["text_for_embedding"] = (
    df_clean["title"] + ". " + df_clean["content"]
).apply(basic_preprocess)

models_to_test = [
    "miniLM_multilingual",
    "distiluse_multilingual",
    "mpnet_en"
]

embeddings_by_model = {}

for model_name in models_to_test:
    embedder = SentenceTransformerEmbedder(model_name)
    embeddings = embedder.encode(df_clean["text_for_embedding"].tolist())
    embeddings_by_model[model_name] = embeddings
    df_clean[f"embedding_{model_name}"] = embeddings.tolist()

df_clean.to_parquet(os.path.join(PROCESSED_DATA_DIR, "definite_articles_with_embeddings.parquet"))
df_clean.to_pickle(os.path.join(PROCESSED_DATA_DIR, "definite_articles_with_embeddings.pkl"))

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


Batches:   0%|          | 0/240 [00:00<?, ?it/s]

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/distiluse-base-multilingual-cased-v2


Batches:   0%|          | 0/240 [00:00<?, ?it/s]

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Batches:   0%|          | 0/240 [00:00<?, ?it/s]

In [17]:
if 'df_clean' not in locals() or df_clean.empty:
    df_clean = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, "definite_articles_with_embeddings.pkl"))

df_no_wired = df_clean[~df_clean["source"].str.contains("Wired", case=False, na=False)].copy()

# Create embeddings for each model
models_names = ["miniLM_multilingual", "distiluse_multilingual", "mpnet_en"]
var_emb1 = np.vstack(df_no_wired[f"embedding_{models_names[0]}"].values)
var_emb2 = np.vstack(df_no_wired[f"embedding_{models_names[1]}"].values)
var_emb3 = np.vstack(df_no_wired[f"embedding_{models_names[2]}"].values)

# Find optimal k for each model
optimal_k1, scores1 = find_optimal_k(var_emb1, k_min=4, k_max=12)
optimal_k2, scores2 = find_optimal_k(var_emb2, k_min=4, k_max=12)
optimal_k3, scores3 = find_optimal_k(var_emb3, k_min=4, k_max=12)

print(f"Optimal k for {models_names[0]}: {optimal_k1}")
print(f"Optimal k for {models_names[1]}: {optimal_k2}")
print(f"Optimal k for {models_names[2]}: {optimal_k3}")

Optimal k for miniLM_multilingual: 5
Optimal k for distiluse_multilingual: 5
Optimal k for mpnet_en: 4


In [9]:
scores1, scores2, scores3

({4: np.float64(0.06697685694394702),
  5: np.float64(0.08959622504079924),
  6: np.float64(0.07579938788502158),
  7: np.float64(0.08477468668931956),
  8: np.float64(0.08581159379752108),
  9: np.float64(0.08026068897747898),
  10: np.float64(0.06099539506644597),
  11: np.float64(0.07109295975101365),
  12: np.float64(0.06316784819323495)},
 {4: np.float64(0.3292829678608477),
  5: np.float64(0.33068088213379837),
  6: np.float64(0.05648172617268138),
  7: np.float64(0.05422970686438908),
  8: np.float64(0.05420400658382348),
  9: np.float64(0.0581838171757661),
  10: np.float64(0.05403503002803746),
  11: np.float64(0.056382463566916244),
  12: np.float64(0.05567519347729249)},
 {4: np.float64(0.13428056520618917),
  5: np.float64(0.10852472425648775),
  6: np.float64(0.13424625344534427),
  7: np.float64(0.11699992347266841),
  8: np.float64(0.11322286815076396),
  9: np.float64(0.10913348864307991),
  10: np.float64(0.10923094818766808),
  11: np.float64(0.10748423482756943),
  1

In [18]:
# Fit KMeans for each model
kmeans1 = fit_kmeans(var_emb1, k=optimal_k1)
kmeans2 = fit_kmeans(var_emb2, k=optimal_k2)
kmeans3 = fit_kmeans(var_emb3, k=optimal_k3)

# Add cluster assignments for each model (using the first model for primary clustering)
df_no_wired["cluster"] = kmeans1[1]
df_no_wired["cluster_model2"] = kmeans2[1]
df_no_wired["cluster_model3"] = kmeans3[1]

## 3.2 TF-IDF

In [None]:
!python -m spacy download es_core_news_sm
!python -m spacy download en_core_web_sm

In [None]:
if 'df_no_wired' not in locals() or df_no_wired.empty:
    df_no_wired = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, "definite_articles_with_embeddings.pkl"))
    df_no_wired = df_no_wired[~df_no_wired["source"].str.contains("Wired", case=False, na=False)].copy()

if 'var_emb1' not in locals():
    models_names = ["miniLM_multilingual", "distiluse_multilingual", "mpnet_en"]
    var_emb1 = np.vstack(df_no_wired[f"embedding_{models_names[0]}"].values)
    var_emb2 = np.vstack(df_no_wired[f"embedding_{models_names[1]}"].values)
    var_emb3 = np.vstack(df_no_wired[f"embedding_{models_names[2]}"].values)

df_no_wired["text_tfidf"] = df_no_wired.apply(
    lambda row: clean_for_tfidf(row["content"], row["language"]),
    axis=1
)

X_tfidf, tfidf_vectorizer = compute_tfidf(df_no_wired["text_tfidf"])

In [40]:
cluster_terms = top_terms_per_cluster(
    X_tfidf,
    df_no_wired["cluster"].values,
    tfidf_vectorizer ,
    top_n=10
)

cluster_names = name_clusters(cluster_terms)
df_no_wired["cluster_name"] = df_no_wired["cluster"].map(cluster_names)

df_no_wired.to_pickle(os.path.join(PROCESSED_DATA_DIR, "definite_articles_clustered.pkl"))

## 3.3 Duplicates and ranking

In [19]:
df_processed = df_no_wired.copy()

models_names = ["miniLM_multilingual", "distiluse_multilingual", "mpnet_en"]
# Create embeddings for each model
var_emb1 = np.vstack(df_processed[f"embedding_{models_names[0]}"].values)
var_emb2 = np.vstack(df_processed[f"embedding_{models_names[1]}"].values)
var_emb3 = np.vstack(df_processed[f"embedding_{models_names[2]}"].values)

# Create similarity matrices for each model
similarity_matrix1 = cosine_similarity(var_emb1)
similarity_matrix2 = cosine_similarity(var_emb2)
similarity_matrix3 = cosine_similarity(var_emb3)

# Average similarity matrix from all models
similarity_matrix = (similarity_matrix1 + similarity_matrix2 + similarity_matrix3) / 3

In [15]:
similarity_matrix.shape[0]

6485

In [33]:
SIM_THRESHOLD = 0.85

duplicates = []
n = similarity_matrix.shape[0]

for i in range(n):
    for j in range(i + 1, n):
        if similarity_matrix[i, j] > SIM_THRESHOLD:
            duplicates.append((i, j, similarity_matrix[i, j]))

to_drop = set()

indices_validos = set(df_processed.index)

for i, j, sim in duplicates:
    if i in indices_validos and j in indices_validos:
        if df_processed.loc[i, "word_count"] >= df_processed.loc[j, "word_count"]:
            to_drop.add(j)
        else:
            to_drop.add(i)

df_curated = df_processed.drop(index=to_drop).reset_index(drop=True)

models_names = ["miniLM_multilingual", "distiluse_multilingual", "mpnet_en"]
# Recreate embeddings for curated dataset
var_emb1 = np.vstack(df_curated[f"embedding_{models_names[0]}"].values)
var_emb2 = np.vstack(df_curated[f"embedding_{models_names[1]}"].values)
var_emb3 = np.vstack(df_curated[f"embedding_{models_names[2]}"].values)

In [34]:
# Compute scores using all three models
# novelty_score1 = compute_novelty_scores(var_emb1, df_curated["cluster"].values)
novelty_score2 = compute_novelty_scores(var_emb2, df_curated["cluster_model2"].values)
novelty_score3 = compute_novelty_scores(var_emb3, df_curated["cluster_model3"].values)

# Average novelty score from all models
df_curated["novelty_score"] = (novelty_score2)
df_curated["recency_score"] = compute_recency_score(df_curated)
df_curated["source_score"] = compute_source_score(df_curated)

# Compute similarity to centroid for each model
# sim_to_centroid1 = compute_similarity_to_centroid(var_emb1, df_curated["cluster"].values, kmeans1[2])
sim_to_centroid2 = compute_similarity_to_centroid(var_emb2, df_curated["cluster_model2"].values, kmeans2[2])
sim_to_centroid3 = compute_similarity_to_centroid(var_emb3, df_curated["cluster_model3"].values, kmeans3[2])

# Average similarity to centroid
df_curated["similarity_to_centroid"] = (sim_to_centroid2)

df_scored = compute_final_score(df_curated,
                                w_similarity=0.4,
                                w_novelty=0.3,
                                w_recency=0.2,
                                w_source=0.1)

## 3.4 Adjusting to AMC Profiles

In [22]:
AMC_PROFILES = {
    "Data & AI": "machine learning artificial intelligence data science models analytics",
    "IT & Cloud": "cloud infrastructure devops automation security networks azure aws",
    "Business": "strategy innovation digital transformation operations performance",
    "Risk & Compliance": "regulation ai ethics governance risk compliance privacy",
    "Marketing": "digital marketing customer engagement personalization content automation advertising social media",
    "Category Management": "product portfolio pricing market trends competition consumer insights procurement",
    "Supply Chain": "logistics operations forecasting demand planning inventory optimization manufacturing",
    "Sales / Commercial": "sales enablement customer acquisition crm revenue optimization commercial strategy",
    "Board": "executive leadership corporate strategy decision making innovation governance investment",
    "New Product Development": "product design innovation prototyping research development user needs experimentation"
}

In [35]:
models_names = ["miniLM_multilingual", "distiluse_multilingual", "mpnet_en"]
# Get embeddings for scored articles
var_emb1 = np.vstack(df_scored[f"embedding_{models_names[0]}"].values)
var_emb2 = np.vstack(df_scored[f"embedding_{models_names[1]}"].values)
var_emb3 = np.vstack(df_scored[f"embedding_{models_names[2]}"].values)

# Create profile embeddings for each model
embedder1 = SentenceTransformerEmbedder(models_names[0])
embedder2 = SentenceTransformerEmbedder(models_names[1])
embedder3 = SentenceTransformerEmbedder(models_names[2])

profile_embs1 = embedder1.encode(list(AMC_PROFILES.values()))
profile_embs2 = embedder2.encode(list(AMC_PROFILES.values()))
profile_embs3 = embedder3.encode(list(AMC_PROFILES.values()))

def assign_area(article_emb1, article_emb2, article_emb3, profile_embs1, profile_embs2, profile_embs3, areas):
    sims1 = cosine_similarity([article_emb1], profile_embs1)[0]
    sims2 = cosine_similarity([article_emb2], profile_embs2)[0]
    sims3 = cosine_similarity([article_emb3], profile_embs3)[0]
    # Average similarities from all models
    avg_sims = (sims2)
    return areas[np.argmax(avg_sims)]

areas = list(AMC_PROFILES.keys())
df_scored["area"] = [
    assign_area(var_emb1[i], var_emb2[i], var_emb3[i], profile_embs1, profile_embs2, profile_embs3, areas)
    for i in range(len(df_scored))
]

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/distiluse-base-multilingual-cased-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

# 4. HTML - Nwesletter

In [44]:
def generate_newsletter(df_area, area_name):
    html = f"<h1>Newsletter ‚Äì {area_name}</h1><ul>"
    for _, row in df_area.sort_values("final_score", ascending=False).head(3).iterrows():
        html += f"""
        <li>
            <b>{row['title']}</b><br>
            <i>{row['source']}</i><br>
            <a href="{row['url']}">Leer m√°s</a>
        </li>
        """
    html += "</ul>"
    return html

for area in AMC_PROFILES.keys():
    # df_area = df_scored[df_scored["area"] == area]
    # newsletter_html = generate_newsletter(df_area, area)
    # with open(os.path.join(NEWSLETTER_DIR, f"newsletter_{area.replace(' ', '_')}.html"), "w", encoding="utf-8") as f:
    #     f.write(newsletter_html)
    display(HTML(generate_newsletter(df_scored[df_scored["area"] == area], area)))
    

# 5. Saving results

In [None]:
import joblib
import json
from datetime import datetime

models_names = ["miniLM_multilingual", "distiluse_multilingual", "mpnet_en"]

# Saving KMeans for all three models
joblib.dump(kmeans1[0], os.path.join(MODEL_DIR, "kmeans_model1.pkl"))
joblib.dump(kmeans2[0], os.path.join(MODEL_DIR, "kmeans_model2.pkl"))
joblib.dump(kmeans3[0], os.path.join(MODEL_DIR, "kmeans_model3.pkl"))

# Saving cluster metadata for each model
for idx, (kmeans, model_name) in enumerate([(kmeans1, models_names[0]), (kmeans2, models_names[1]), (kmeans3, models_names[2])], 1):
    cluster_metadata = {
        "model_name": model_name,
        "n_clusters": kmeans[0].n_clusters,
        "inertia": kmeans[0].inertia_,
        "centroids": kmeans[0].cluster_centers_.tolist()
    }
    with open(os.path.join(MODEL_DIR, f"cluster_metadata_model{idx}.json"), "w") as f:
        json.dump(cluster_metadata, f, indent=2)

# Saving processed_urls
with open(os.path.join(PROCESSED_DATA_DIR, "processed_urls.json"), "w") as f:
    json.dump(links, f)

print("Modelos guardados correctamente")

Modelos guardados correctamente


In [38]:
df_no_wired.shape, df_curated.shape, df_scored.shape

((6485, 16), (6120, 22), (6120, 22))