### Entrega 3
#### Topic Modelling and Keywords

##### Dependencias y Drive

In [None]:
!pip install spacy
!python -m spacy download es_core_news_sm
!pip install bertopic sentence-transformers umap-learn hdbscan gensim


# Mount Google Drive at /content/drive (Colab-only helper); later cells read
# files under /content/drive/Shareddrives/NLP.
from google.colab import drive
drive.mount('/content/drive')

Collecting es-core-news-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Mounted at /content/drive


#### TÉCNICAS DE SHALLOW LEARNING

#### Bag of Words + Latent Dirichlet Allocation (LDA)

##### Preparación para Bag of words

In [None]:
import os
import glob
import csv
import re
import unicodedata
import spacy

# Mount Google Drive again so this cell can be run on its own.
from google.colab import drive
drive.mount('/content/drive')

import os

# Input: raw transcripts (*.txt) stored on the shared drive.
BASE_DIR = "/content/drive/Shareddrives/NLP/transcriptions"
TEXT_DIR = os.path.join(BASE_DIR, "all_transcriptions")

# Output: per-document cleaned files are written to the Colab local disk.
OUTPUT_DIR  = r"/content/output/preprocessing_steps"
# spaCy pipeline name passed to spacy.load() in main().
SPACY_MODEL = "es_core_news_sm"
# When True, main() skips tokens that spaCy flags as stop words.
REMOVE_STOPWORDS = True

# A token is kept only if it consists entirely of two or more letters from
# A-Z, a-z, or the Latin-1 ranges À-Ö, Ø-ö, ø-ÿ (the ranges skip × and ÷).
TOKEN_PATTERN = re.compile(r"^[A-Za-zÀ-ÖØ-öø-ÿ]{2,}$")

def normalize_text(text: str) -> str:
    """Return `text` in Unicode NFKC form, lowercased with ``str.lower``.

    Accents are kept; only compatibility characters (ligatures, full-width
    forms, ...) are folded by the NFKC step.
    """
    composed = unicodedata.normalize("NFKC", text)
    return composed.lower()

def ensure_dir(path: str):
    """Create directory `path` together with any missing parents.

    Calling it on a directory that already exists is a no-op.
    """
    os.makedirs(name=path, exist_ok=True)

def main():
    """Tokenise every transcript in TEXT_DIR with spaCy and write cleaned outputs.

    Each ``*.txt`` file is normalised with ``normalize_text`` and run through
    the spaCy pipeline. A token is dropped when it is whitespace, does not
    match ``TOKEN_PATTERN``, or (with ``REMOVE_STOPWORDS`` set) is a stop word.

    Written to OUTPUT_DIR, per document:
      * ``<name>__clean_raw.txt``   - surviving surface tokens, space-joined
      * ``<name>__clean_lemma.txt`` - their lemmas, space-joined
      * ``<name>__tokens.csv``      - one (raw, lemma) pair per row
    plus ``_corpus_index.csv`` with per-document token counts. A corpus
    summary is printed at the end.

    Raises:
        FileNotFoundError: if TEXT_DIR contains no ``.txt`` files.
    """
    ensure_dir(OUTPUT_DIR)
    print(f"Cargando modelo spaCy: {SPACY_MODEL}")
    nlp = spacy.load(SPACY_MODEL, disable=["ner"])

    files = sorted(glob.glob(os.path.join(TEXT_DIR, "*.txt")))
    if not files:
        raise FileNotFoundError(f"No se encuentran .txt en: {TEXT_DIR}")

    index_rows = []
    # Distinct surface tokens over the whole corpus. Collected while processing
    # so the cleaned files do not have to be re-opened (and left unclosed) later.
    corpus_vocab = set()

    for path in files:
        file_name = os.path.basename(path)
        base = os.path.splitext(file_name)[0]
        # errors="ignore": undecodable bytes are dropped instead of aborting the run.
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()

        doc = nlp(normalize_text(text))

        # raw_tokens and lemmas are appended together, so they stay aligned.
        raw_tokens, lemmas = [], []
        for tok in doc:
            if tok.is_space:
                continue
            if not TOKEN_PATTERN.match(tok.text):
                continue
            if REMOVE_STOPWORDS and tok.is_stop:
                continue
            raw_tokens.append(tok.text)
            # Fall back to the surface form when spaCy gives an empty lemma.
            lemmas.append(tok.lemma_ or tok.text)

        doc_vocab = set(raw_tokens)
        corpus_vocab |= doc_vocab

        # --- Save cleaned outputs ---
        with open(os.path.join(OUTPUT_DIR, f"{base}__clean_raw.txt"), "w", encoding="utf-8") as f:
            f.write(" ".join(raw_tokens))
        with open(os.path.join(OUTPUT_DIR, f"{base}__clean_lemma.txt"), "w", encoding="utf-8") as f:
            f.write(" ".join(lemmas))

        # Per-document CSV (raw vs lemma)
        per_doc_csv = os.path.join(OUTPUT_DIR, f"{base}__tokens.csv")
        with open(per_doc_csv, "w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(["raw", "lemma"])
            w.writerows(zip(raw_tokens, lemmas))

        index_rows.append([file_name, len(raw_tokens), len(doc_vocab)])
        print(f"Procesado: {file_name}  (tokens: {len(raw_tokens)}, únicos: {len(doc_vocab)})")

    # Corpus index
    with open(os.path.join(OUTPUT_DIR, "_corpus_index.csv"), "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["file", "token_count", "unique_token_count"])
        w.writerows(index_rows)

    # OVERALL STATISTICS - TOKEN COUNT INFORMATION
    total_tokens = sum(row[1] for row in index_rows)
    total_unique_tokens = len(corpus_vocab)

    print("\n=== Corpus Statistics ===")
    print(f"Total documents: {len(files)}")
    print(f"Total tokens: {total_tokens}")
    print(f"Total unique tokens: {total_unique_tokens}")

    print("\nCompleted. Information stored in:", OUTPUT_DIR)

# Run the preprocessing pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main()

Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m107.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Mounted at /content/drive
Cargando modelo spaCy: es_core_news_sm
Procesado: 11 Hábitos Diarios para tener una Salud del 1% (Fuerza Explosiva).txt  (tokens: 6047, únicos: 2132)
Procesado: 15 Hábitos para Vivir con Abundancia y Tener Éxito (Sergio Fernández).txt  (tokens: 5447, únicos: 2079)
Procesado: 299 DÍAS SECUESTRADO, ¿c

##### Creación de Bag of words

In [None]:
!pip install gensim
# --- Bag of Words + LDA sobre salidas limpias ---
import os
import glob
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from tabulate import tabulate

# Optional: topic coherence with gensim (imports are skipped when disabled)
USE_GENSIM_COHERENCE = True
if USE_GENSIM_COHERENCE:
    from gensim import corpora
    from gensim.models import CoherenceModel

# Cleaned per-document files produced by the preprocessing cell.
INPUT_DIR = r"/content/output/preprocessing_steps"
# NOTE(review): this rebinds OUTPUT_DIR, which the preprocessing main() also
# reads at call time; calling main() again after this cell would write there.
OUTPUT_DIR = r"/content/output/bag_of_words"
USE_LEMMAS = True      # True = read __clean_lemma.txt ; False = __clean_raw.txt
N_TOP_WORDS_TOPIC = 10 # top N words reported per LDA topic
N_TOPICS = 20          # number of LDA topics (tune as needed)
MAX_ITER = 50          # LDA iterations
LEARNING_METHOD = "batch"  # "batch" or "online" (author's note: batch is stable, online is faster on large corpora)
RANDOM_STATE = 42

os.makedirs(OUTPUT_DIR, exist_ok=True)

def read_documents(output_dir: str, use_lemmas: bool = True):
    """Load the cleaned documents found in `output_dir`.

    Args:
        output_dir: directory holding the ``*__clean_lemma.txt`` /
            ``*__clean_raw.txt`` files.
        use_lemmas: pick the lemma files when True, the raw-token files otherwise.

    Returns:
        ``(doc_names, docs)``: two parallel lists ordered by sorted file path.
        Each name is the file's basename with the suffix removed and trailing
        underscores stripped; each doc is the file content with surrounding
        whitespace stripped.

    Raises:
        FileNotFoundError: if no file matches the selected suffix.
    """
    if use_lemmas:
        pattern = "__clean_lemma.txt"
    else:
        pattern = "__clean_raw.txt"

    matches = sorted(glob.glob(os.path.join(output_dir, f"*{pattern}")))
    if not matches:
        raise FileNotFoundError(f"No cleaned files found in {output_dir} with pattern *{pattern}")

    doc_names, docs = [], []
    for file_path in matches:
        with open(file_path, "r", encoding="utf-8") as handle:
            contents = handle.read()
        docs.append(contents.strip())
        # Base name without the suffix, for readability
        doc_names.append(os.path.basename(file_path).replace(pattern, "").rstrip("_"))
    return doc_names, docs

def build_bow(docs, min_df=2, max_df=0.95, ngram_range=(1,1), vocabulary=None):
    """Fit a whitespace-token bag-of-words model on `docs`.

    Args:
        docs: iterable of strings whose tokens are separated by whitespace.
        min_df: ignore terms found in fewer than `min_df` documents (int) or
            in less than that proportion of documents (float).
        max_df: ignore terms found in more than `max_df` of the documents
            (e.g. 0.95).
        ngram_range: (1, 1) = unigrams, (1, 2) = unigrams + bigrams.
        vocabulary: optional fixed vocabulary (mapping or iterable).

    Returns:
        ``(X, vocab, vectorizer)``: the document-term count matrix, the
        feature names from ``get_feature_names_out()``, and the fitted
        ``CountVectorizer``.
    """
    vectorizer = CountVectorizer(
        tokenizer=str.split,      # tokens are already whitespace-separated
        # token_pattern is unused once a tokenizer is given; passing None
        # avoids scikit-learn's "token_pattern will not be used" UserWarning.
        token_pattern=None,
        preprocessor=None,
        lowercase=False,          # inputs are already lowercased
        stop_words=None,          # stop words were removed upstream
        min_df=min_df,
        max_df=max_df,
        ngram_range=ngram_range,
        vocabulary=vocabulary
    )
    X = vectorizer.fit_transform(docs)
    vocab = vectorizer.get_feature_names_out()
    return X, vocab, vectorizer

def to_dataframe(X, vocab, doc_names):
    """Wrap sparse matrix `X` in a sparse DataFrame.

    Rows are labelled with `doc_names`, columns with `vocab`; the columns are
    then put in sorted order so the layout is reproducible.
    """
    frame = pd.DataFrame.sparse.from_spmatrix(X, index=doc_names, columns=vocab)
    ordered_columns = sorted(frame.columns)
    return frame.reindex(columns=ordered_columns)

# --- Load the cleaned documents ---
doc_names, docs = read_documents(INPUT_DIR, use_lemmas=USE_LEMMAS)

# --- Bag of words ---
X_bow, vocab, vectorizer = build_bow(
    docs,
    min_df=8,           # drop very rare terms; tune to the corpus
    max_df=0.85,        # drop terms present in more than 85% of the documents
    ngram_range=(1,1)   # unigrams only for LDA (author's recommendation)
)
# Persist the document-term matrix as CSV (documents as rows, terms as columns).
df_bow = to_dataframe(X_bow, vocab, doc_names)
bow_csv = os.path.join(OUTPUT_DIR, "_bow_matrix.csv")
df_bow.to_csv(bow_csv, encoding="utf-8")
print(f"Saved BoW matrix to: {bow_csv}  shape={df_bow.shape}")





Saved BoW matrix to: /content/output/bag_of_words/_bow_matrix.csv  shape=(104, 2932)


##### Latent Dirichlet Allocation (LDA)

In [None]:
# --- LDA (scikit-learn) fitted on the count matrix ---
lda = LatentDirichletAllocation(
    n_components=N_TOPICS,
    max_iter=MAX_ITER,
    learning_method=LEARNING_METHOD,
    random_state=RANDOM_STATE,
    evaluate_every=0,    # set >0 to have the bound evaluated every n iterations
)
lda.fit(X_bow)

# Highest-weighted terms of every topic
def get_top_words_per_topic(lda_model, feature_names, n_top_words=10):
    """List the `n_top_words` heaviest terms of each row of ``components_``.

    Args:
        lda_model: object exposing ``components_`` (one row of term weights
            per topic; the weights are used as-is, not normalised).
        feature_names: sequence mapping a column index to its term.
        n_top_words: how many terms to keep per topic.

    Returns:
        A list with one dict per topic: ``{"topic_id": k, "top_terms":
        [(term, weight), ...]}``, terms ordered by decreasing weight.
    """
    def _rank(weights):
        order = np.argsort(weights)[::-1][:n_top_words]
        return [(feature_names[idx], float(weights[idx])) for idx in order]

    return [
        {"topic_id": topic_id, "top_terms": _rank(weights)}
        for topic_id, weights in enumerate(lda_model.components_)
    ]

topics = get_top_words_per_topic(lda, vocab, n_top_words=N_TOP_WORDS_TOPIC)

# Flatten the topics into one (topic_id, rank, term, weight) record per term
topic_rows = [
    {
        "topic_id": topic["topic_id"],
        "rank": position,
        "term": word,
        "weight": score
    }
    for topic in topics
    for position, (word, score) in enumerate(topic["top_terms"], start=1)
]
df_topics = pd.DataFrame(topic_rows, columns=["topic_id", "rank", "term", "weight"])
topics_csv = os.path.join(OUTPUT_DIR, "_lda_topics_top_terms.csv")
df_topics.to_csv(topics_csv, index=False, encoding="utf-8")
print(f"Saved LDA topics top-terms to: {topics_csv}")

# Document-topic distribution: one row per document, one column per topic
doc_topic = lda.transform(X_bow)  # shape: [n_docs, n_topics]
topic_columns = [f"topic_{i}" for i in range(N_TOPICS)]
df_doc_topic = pd.DataFrame(doc_topic, index=doc_names, columns=topic_columns)
doc_topic_csv = os.path.join(OUTPUT_DIR, "_lda_doc_topic_distribution.csv")
df_doc_topic.to_csv(doc_topic_csv, encoding="utf-8")
print(f"Saved doc-topic distributions to: {doc_topic_csv}")

Saved LDA topics top-terms to: /content/output/bag_of_words/_lda_topics_top_terms.csv
Saved doc-topic distributions to: /content/output/bag_of_words/_lda_doc_topic_distribution.csv


##### Métricas de coherencia (C_v y UMass)

In [None]:
if USE_GENSIM_COHERENCE:
    # gensim works on token lists; the cleaned documents are whitespace-separated,
    # so splitting each one recovers its tokens.
    token_lists = list(map(str.split, docs))
    dictionary = corpora.Dictionary(token_lists)
    corpus = list(map(dictionary.doc2bow, token_lists))

    # Topics in gensim format: one list of term strings per topic
    top_terms_per_topic = [
        [term for term, _weight in topic["top_terms"]]
        for topic in topics
    ]

    # C_v is computed from the texts, UMass from the bag-of-words corpus
    shared_args = dict(topics=top_terms_per_topic, dictionary=dictionary)
    cm_cv = CoherenceModel(texts=token_lists, coherence='c_v', **shared_args)
    cm_umass = CoherenceModel(corpus=corpus, coherence='u_mass', **shared_args)

    coherence_cv = cm_cv.get_coherence()
    coherence_umass = cm_umass.get_coherence()
    print(f"\nCoherencia C_v: {coherence_cv:.4f}")
    print(f"Coherencia UMass: {coherence_umass:.4f}")

    # Persist both metrics
    coh_path = os.path.join(OUTPUT_DIR, "_lda_coherence.txt")
    with open(coh_path, "w", encoding="utf-8") as f:
        f.writelines([
            f"Coherencia C_v: {coherence_cv:.6f}\n",
            f"Coherencia UMass: {coherence_umass:.6f}\n",
        ])
    print(f"Saved coherence metrics to: {coh_path}")


Coherencia C_v: 0.5212
Coherencia UMass: -0.6793
Saved coherence metrics to: /content/output/bag_of_words/_lda_coherence.txt


##### Testeo de los topics con el ground truth

In [None]:
import os
import pandas as pd
import numpy as np
import unicodedata

# --- Configuration ---
# Absolute path of the directory the LDA cell wrote its CSVs to. A relative
# "output/bag_of_words" only resolves when the working directory is /content.
OUTPUT_DIR = "/content/output/bag_of_words"
DOC_TOPIC_CSV = os.path.join(OUTPUT_DIR, "_lda_doc_topic_distribution.csv")
TOPICS_CSV = os.path.join(OUTPUT_DIR, "_lda_topics_top_terms.csv")
# Ground-truth file; read with pandas.read_csv and expected to provide the
# 'podcast' and 'labels' columns used below.
GROUNDTRUTH_CSV = "/content/drive/Shareddrives/NLP/transcription_summaries/topics.txt"
TOP_K_TOPICS = 2        # number of most probable topics taken per document
TOP_WORDS_PER_TOPIC = 2 # number of top words taken from each selected topic

# --- Text normalisation helper (lowercase + accent removal) ---
def normalize_text(text):
    """Lowercase `text` and remove its combining marks.

    The string is decomposed with NFD and every character of Unicode category
    ``Mn`` is discarded, so accented letters lose their accent (and "ñ"
    becomes "n").
    """
    decomposed = unicodedata.normalize('NFD', text.lower())
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# --- Load data ---
df_doc_topic = pd.read_csv(DOC_TOPIC_CSV, index_col=0)
df_topics = pd.read_csv(TOPICS_CSV)
df_gt = pd.read_csv(GROUNDTRUTH_CSV)


def _strip_txt_suffix(name):
    """Remove one trailing '.txt' (any case) from `name`; keep every other dot."""
    name = str(name)
    return name[:-4] if name.lower().endswith(".txt") else name


# Normalise document names and labels.
# Only a literal ".txt" suffix is removed: the document names in the
# doc-topic CSV carry no extension, so os.path.splitext would cut any title
# containing a dot at its last dot and the inner merge below would then
# silently drop that document.
df_doc_topic.index = [normalize_text(_strip_txt_suffix(name)) for name in df_doc_topic.index]
df_gt['podcast'] = df_gt['podcast'].apply(lambda x: normalize_text(_strip_txt_suffix(x)))
# Labels are comma-separated; strip each one so "a, b" and "a,b" give the same labels.
df_gt['labels_list'] = df_gt['labels'].apply(
    lambda x: [normalize_text(w.strip()) for w in str(x).split(',') if w.strip()]
)

# --- Map topic id -> top words (ordered by rank) ---
topic_words_map = {}
for topic_id, group in df_topics.groupby('topic_id'):
    sorted_terms = group.sort_values('rank')['term'].tolist()
    topic_words_map[int(topic_id)] = [normalize_text(w) for w in sorted_terms]

# --- Predictions: top-K topics per document, top-N words per topic ---
# Columns are named "topic_<id>"; parse the ids once instead of once per row.
topic_ids = np.array([int(c.split('_')[1]) for c in df_doc_topic.columns])

pred_rows = []
for doc_name, row in df_doc_topic.iterrows():
    probs = row.values.astype(float)
    top_idx = np.argsort(probs)[::-1][:TOP_K_TOPICS]
    selected_topics = topic_ids[top_idx]
    selected_words = []
    for tid in selected_topics:
        selected_words.extend(topic_words_map.get(tid, [])[:TOP_WORDS_PER_TOPIC])
    pred_rows.append({
        'podcast': doc_name,
        'predicted_keywords': selected_words
    })

df_pred = pd.DataFrame(pred_rows)

# --- Join with the ground truth (documents without a match are dropped) ---
df_eval = df_pred.merge(df_gt[['podcast', 'labels_list']], on='podcast', how='inner')

# --- Metrics ---
def precision_recall_f1_jaccard(y_true, y_pred):
    """Set-based precision, recall, F1 and Jaccard between two label collections.

    Both arguments are converted to sets, so duplicates and order are ignored.
    Any metric whose denominator would be zero is reported as 0.0.

    Returns:
        ``(precision, recall, f1, jaccard)`` as floats.
    """
    truth, predicted = set(y_true), set(y_pred)
    hits = len(truth & predicted)
    union_size = len(truth | predicted)

    prec = hits / len(predicted) if predicted else 0.0
    rec = hits / len(truth) if truth else 0.0
    if (prec + rec) > 0:
        f1 = (2*prec*rec)/(prec+rec)
    else:
        f1 = 0.0
    jacc = hits / union_size if union_size > 0 else 0.0
    return prec, rec, f1, jacc

def _score_row(row):
    """Build the per-document metrics record for one row of df_eval."""
    prec, rec, f1, jacc = precision_recall_f1_jaccard(row['labels_list'], row['predicted_keywords'])
    return {
        'podcast': row['podcast'],
        'groundtruth': ', '.join(row['labels_list']),
        'predicted': ', '.join(row['predicted_keywords']),
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'jaccard': jacc
    }


metrics_rows = [_score_row(row) for _, row in df_eval.iterrows()]
df_metrics = pd.DataFrame(metrics_rows)

# --- Macro metrics: plain mean over documents ---
macro_precision = df_metrics['precision'].mean()
macro_recall = df_metrics['recall'].mean()
macro_f1 = df_metrics['f1'].mean()
macro_jaccard = df_metrics['jaccard'].mean()

# --- Save results ---
results_dir = os.path.join(OUTPUT_DIR, 'evaluation')
os.makedirs(results_dir, exist_ok=True)
metrics_csv = os.path.join(results_dir, '_lda_vs_groundtruth_metrics.csv')
df_metrics.to_csv(metrics_csv, index=False, encoding='utf-8')

# --- Show summary ---
print(f"Macro Precision: {macro_precision:.4f}")
print(f"Macro Recall:    {macro_recall:.4f}")
print(f"Macro F1:        {macro_f1:.4f}")
print(f"Macro Jaccard:   {macro_jaccard:.4f}\n")
print("=== Ejemplos ===")
print(df_metrics.head(10).to_string(index=False))


Macro Precision: 0.0000
Macro Recall:    0.0000
Macro F1:        0.0000
Macro Jaccard:   0.0000

=== Ejemplos ===
                                           podcast                                                   groundtruth                            predicted  precision  recall  f1  jaccard
       abogados tumban los bulos sobre extranjeria           inmigracion, ley, racismo, desinformacion, derechos        espanol, espana, pais, pueblo        0.0     0.0 0.0      0.0
         aranceles_ trump revienta su propia bolsa  aranceles, proteccionismo, mercados, deslocalizacion, crisis         unidos, europa, pais, pueblo        0.0     0.0 0.0      0.0
analizamos el acuerdo comercial entre la ue y eeuu acuerdo_comercial, sumision, dependencia, soberania, economia       unidos, europa, espana, israel        0.0     0.0 0.0      0.0
                        cayendo por su propio peso                    colapso, crisis, deuda, inflacion, fracaso       pais, pueblo, partido, derecha        0

#### TÉCNICAS DE DEEP LEARNING

#### BERTopic
**- Comparativa de embeddings: MiniLM vs DistilUSE**

**- Dos perfiles de HDBSCAN: eom (estricto) vs leaf (granular)**

In [None]:
import os, glob
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

# ------------------------------
# 1) Load the clean_raw documents
# ------------------------------
# NOTE(review): this reads preprocessed files from the shared drive, not from
# the /content/output/preprocessing_steps folder written earlier in this
# notebook - confirm both hold the same corpus.
INPUT_DIR = "/content/drive/Shareddrives/NLP/entrega2_preprocessing+embeddings/preprocessing_steps"
pattern = "__clean_raw.txt"
paths = sorted(glob.glob(os.path.join(INPUT_DIR, f"*{pattern}")))

docs, doc_names = [], []
for p in paths:
    with open(p, "r", encoding="utf-8") as f:
        text = f.read().strip()
    if not text:
        # Empty files are skipped
        continue
    docs.append(text)
    doc_names.append(os.path.basename(p).replace(pattern, "").rstrip("_"))

print(f"Documentos cargados: {len(docs)}")
if not docs:
    raise FileNotFoundError(f"No se encontraron archivos con patrón {pattern} en {INPUT_DIR}")

# --------------------------------------------
# 2) Vectorizer (no stop-word list: author's note says they were already filtered)
# --------------------------------------------
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    lowercase=True,
    token_pattern=r"(?u)\b\w\w+\b",
    max_df=0.95,
    min_df=2
)

# --------------------------------------------
# 3) UMAP and HDBSCAN (two clustering profiles)
# --------------------------------------------
# Fixed seed: UMAP is stochastic, and without it the number of topics (and
# therefore which result file is "best") can change from one run to the next.
UMAP_RANDOM_STATE = 42

# Small n_neighbors keeps UMAP local (author's intent: favour more clusters)
umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.1,
    metric="cosine",
    random_state=UMAP_RANDOM_STATE,
)

# Stricter HDBSCAN profile
hdbscan_eom = HDBSCAN(min_cluster_size=10, metric="euclidean", cluster_selection_method="eom")

# More granular HDBSCAN profile (author's intent: more clusters, fewer -1 outliers)
hdbscan_leaf = HDBSCAN(min_cluster_size=5, min_samples=1, metric="euclidean", cluster_selection_method="leaf")

# -------------------------------------------------
# 4) Function: train BERTopic + save results
# -------------------------------------------------
def _write_viz(label, build_figure, html_path):
    """Render one BERTopic figure to `html_path`; report a failure instead of raising."""
    try:
        build_figure().write_html(html_path)
    except Exception as e:  # best-effort: a failed plot must not abort the run
        print(f"[Aviso] {label} falló: {e}")


def run_bertopic(embedding_model_name, docs, doc_names, output_dir, hdbscan_model):
    """Fit one BERTopic configuration, save its artefacts and return summary stats.

    Uses the module-level ``umap_model`` and ``vectorizer_model`` together
    with the given HDBSCAN instance.

    Args:
        embedding_model_name: name passed to ``SentenceTransformer``.
        docs: list of document strings (whitespace-separated tokens).
        doc_names: document identifiers, parallel to `docs`.
        output_dir: directory for CSV / HTML outputs (created if missing).
        hdbscan_model: configured ``HDBSCAN`` instance for the clustering step.

    Returns:
        ``(stats, topic_model)``. `stats` is a dict with the keys
        ``embedding``, ``hdbscan``, ``min_cluster_size``, ``num_topics``,
        ``noise_rate`` and ``coherence_Cv`` (NaN when it cannot be computed);
        `topic_model` is the fitted ``BERTopic`` object.
    """
    print(f"\n[RUN] Embedding: {embedding_model_name} | HDBSCAN: {hdbscan_model.cluster_selection_method} | min_cluster_size={hdbscan_model.min_cluster_size}")
    embedding_model = SentenceTransformer(embedding_model_name)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        language="spanish",  # Spanish corpus
        verbose=True
    )

    topics, probs = topic_model.fit_transform(docs)

    # --- Basic metrics ---
    # Topic -1 is the outlier/noise bucket and is not counted as a topic.
    num_noise = sum(1 for t in topics if t == -1)
    noise_rate = num_noise / len(topics)
    topic_info = topic_model.get_topic_info()
    valid_topic_ids = [tid for tid in topic_info["Topic"].tolist() if tid != -1]
    num_topics = len(valid_topic_ids)
    print(f"Tópicos válidos: {num_topics} | Ruido (-1): {num_noise} ({noise_rate:.1%})")

    # --- Save results ---
    os.makedirs(output_dir, exist_ok=True)
    tag = f"{embedding_model_name.replace('/', '_')}_{hdbscan_model.cluster_selection_method}_mincs{hdbscan_model.min_cluster_size}"

    topic_info.to_csv(os.path.join(output_dir, f"_topics_{tag}.csv"), index=False)
    pd.DataFrame({"podcast": doc_names, "topic": topics, "probability": probs}).to_csv(
        os.path.join(output_dir, f"_doc_topics_{tag}.csv"), index=False
    )

    # --- Visualisations (best-effort) ---
    # visualize_topics is skipped below 3 topics to avoid a crash.
    if num_topics >= 3:
        _write_viz(
            "visualize_topics",
            topic_model.visualize_topics,
            os.path.join(output_dir, f"viz_topics_{tag}.html"),
        )
    else:
        print(f"[Aviso] visualize_topics requiere >=3 tópicos. Actual: {num_topics}. Se omite.")

    # These are attempted regardless of the number of topics.
    remaining_figures = (
        ("barchart", topic_model.visualize_barchart),
        ("hierarchy", topic_model.visualize_hierarchy),
        ("heatmap", topic_model.visualize_heatmap),
        ("documents", lambda: topic_model.visualize_documents(docs)),
    )
    for viz_name, build_figure in remaining_figures:
        _write_viz(
            f"visualize_{viz_name}",
            build_figure,
            os.path.join(output_dir, f"viz_{viz_name}_{tag}.html"),
        )

    # --- C_v coherence ---
    texts = [d.split() for d in docs]
    dictionary = Dictionary(texts)
    # The dictionary holds unigrams only, so bigrams from the (1, 2) vectorizer
    # and any empty placeholder words cannot be scored. Keeping only known
    # tokens makes the emptiness check below reflect what can be evaluated.
    known_tokens = dictionary.token2id
    topics_words = [
        [w for (w, _) in topic_model.get_topic(tid) if w in known_tokens]
        for tid in valid_topic_ids
    ]

    if num_topics > 0 and all(len(tw) > 0 for tw in topics_words):
        coherence_model = CoherenceModel(topics=topics_words, texts=texts, dictionary=dictionary, coherence="c_v")
        coherence_score = coherence_model.get_coherence()
    else:
        coherence_score = float("nan")
        print("[Aviso] No hay términos suficientes para calcular coherencia C_v.")

    print(f"Coherencia C_v: {coherence_score}\n")
    return {
        "embedding": embedding_model_name,
        "hdbscan": hdbscan_model.cluster_selection_method,
        "min_cluster_size": hdbscan_model.min_cluster_size,
        "num_topics": num_topics,
        "noise_rate": noise_rate,
        "coherence_Cv": coherence_score,
    }, topic_model

# --------------------------------------------
# 5) Run the comparison (2 embeddings x 2 HDBSCAN profiles)
# --------------------------------------------
output_dir = "/content/output/bertopic"

embeddings_to_try = [
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",  # baseline
    "distiluse-base-multilingual-cased-v1"                          # author's note: tends to do better on Spanish
]
hdbscan_profiles = (hdbscan_eom, hdbscan_leaf)

# Only the stats dict of each run is kept; the fitted model is discarded.
results = [
    run_bertopic(model_name, docs, doc_names, output_dir, hdb)[0]
    for model_name in embeddings_to_try
    for hdb in hdbscan_profiles
]

# --- Comparison table ---
df_results = pd.DataFrame(results).sort_values(["embedding", "num_topics"], ascending=[True, False])
print("\nResultados comparativos:")
print(df_results)
df_results.to_csv(os.path.join(output_dir, "_bertopic_comparativa.csv"), index=False)

# ---------------------------------------------------
# 6) (Optional) Quick pick of the best experiment
# ---------------------------------------------------
# Rewards more topics and higher coherence, penalises noise.
topic_term = df_results["num_topics"] * 1.0
coherence_term = df_results["coherence_Cv"].fillna(0) * 2.0
noise_term = df_results["noise_rate"] * 1.0
df_results["score"] = topic_term + coherence_term - noise_term

best = df_results.sort_values("score", ascending=False).iloc[:1]
print("\nMejor configuración según score (2*Cv + num_topics - noise):")
print(best)


Documentos cargados: 104

[RUN] Embedding: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 | HDBSCAN: eom | min_cluster_size=10


2025-11-28 15:47:00,922 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-11-28 15:47:02,534 - BERTopic - Embedding - Completed ✓
2025-11-28 15:47:02,535 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-28 15:47:02,689 - BERTopic - Dimensionality - Completed ✓
2025-11-28 15:47:02,690 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-28 15:47:02,697 - BERTopic - Cluster - Completed ✓
2025-11-28 15:47:02,699 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-28 15:47:03,289 - BERTopic - Representation - Completed ✓


Tópicos válidos: 2 | Ruido (-1): 27 (26.0%)
[Aviso] visualize_topics requiere >=3 tópicos. Actual: 2. Se omite.
Coherencia C_v: 0.4237413339464669


[RUN] Embedding: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 | HDBSCAN: leaf | min_cluster_size=5


2025-11-28 15:47:12,104 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-11-28 15:47:13,729 - BERTopic - Embedding - Completed ✓
2025-11-28 15:47:13,733 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-28 15:47:13,919 - BERTopic - Dimensionality - Completed ✓
2025-11-28 15:47:13,919 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-28 15:47:13,926 - BERTopic - Cluster - Completed ✓
2025-11-28 15:47:13,931 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-28 15:47:14,530 - BERTopic - Representation - Completed ✓


Tópicos válidos: 9 | Ruido (-1): 5 (4.8%)
Coherencia C_v: 0.5706526997320949


[RUN] Embedding: distiluse-base-multilingual-cased-v1 | HDBSCAN: eom | min_cluster_size=10


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

2025-11-28 15:47:29,776 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-11-28 15:47:31,300 - BERTopic - Embedding - Completed ✓
2025-11-28 15:47:31,301 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-28 15:47:31,470 - BERTopic - Dimensionality - Completed ✓
2025-11-28 15:47:31,471 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-28 15:47:31,480 - BERTopic - Cluster - Completed ✓
2025-11-28 15:47:31,484 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-28 15:47:32,405 - BERTopic - Representation - Completed ✓


Tópicos válidos: 2 | Ruido (-1): 12 (11.5%)
[Aviso] visualize_topics requiere >=3 tópicos. Actual: 2. Se omite.
Coherencia C_v: 0.4310259748293079


[RUN] Embedding: distiluse-base-multilingual-cased-v1 | HDBSCAN: leaf | min_cluster_size=5


2025-11-28 15:47:39,270 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-11-28 15:47:40,744 - BERTopic - Embedding - Completed ✓
2025-11-28 15:47:40,745 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-28 15:47:40,907 - BERTopic - Dimensionality - Completed ✓
2025-11-28 15:47:40,907 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-28 15:47:40,913 - BERTopic - Cluster - Completed ✓
2025-11-28 15:47:40,915 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-28 15:47:41,532 - BERTopic - Representation - Completed ✓


Tópicos válidos: 11 | Ruido (-1): 21 (20.2%)
Coherencia C_v: 0.5507473168124948


Resultados comparativos:
                                           embedding hdbscan  \
3               distiluse-base-multilingual-cased-v1    leaf   
2               distiluse-base-multilingual-cased-v1     eom   
1  sentence-transformers/paraphrase-multilingual-...    leaf   
0  sentence-transformers/paraphrase-multilingual-...     eom   

   min_cluster_size  num_topics  noise_rate  coherence_Cv  
3                 5          11    0.201923      0.550747  
2                10           2    0.115385      0.431026  
1                 5           9    0.048077      0.570653  
0                10           2    0.259615      0.423741  

Mejor configuración según score (2*Cv + num_topics - noise):
                              embedding hdbscan  min_cluster_size  num_topics  \
3  distiluse-base-multilingual-cased-v1    leaf                 5          11   

   noise_rate  coherence_Cv      score  
3    0

##### Comparación de métricas con el ground truth

In [None]:

import os
import unicodedata
import pandas as pd

# =============================
# PATH CONFIGURATION
# =============================
# Ground-truth labels (read with pandas.read_csv below).
GROUNDTRUTH_PATH = "/content/drive/Shareddrives/NLP/transcription_summaries/topics.txt"
# Outputs of one specific BERTopic run (distiluse embeddings, leaf profile,
# min_cluster_size=5). NOTE(review): hard-coded - update if another
# configuration should be evaluated.
DOC_TOPICS_PATH = "/content/output/bertopic/_doc_topics_distiluse-base-multilingual-cased-v1_leaf_mincs5.csv"
TOPICS_INFO_PATH = "/content/output/bertopic/_topics_distiluse-base-multilingual-cased-v1_leaf_mincs5.csv"
# Destination of the per-document comparison table.
OUTPUT_PATH = "/content/output/bertopic/_comparativa_groundtruth.csv"

# =============================
# HELPER FUNCTIONS
# =============================
def normalize(s):
    """Return ``str(s)`` in NFKC form, lowercased, with outer whitespace removed."""
    as_text = str(s)
    composed = unicodedata.normalize("NFKC", as_text)
    return composed.lower().strip()

def to_set(field):
    """Turn a comma-separated field into a set of normalised labels.

    A missing value (``pd.isna``) gives an empty set; blank pieces are skipped.
    """
    if pd.isna(field):
        return set()
    pieces = str(field).split(",")
    return {normalize(piece) for piece in pieces if piece.strip()}

def jaccard(a, b):
    """Jaccard similarity |a & b| / |a | b| of two sets; 0.0 when both are empty."""
    union = a | b
    if not union:
        return 0.0
    return len(a & b) / len(union)

# =============================
# 1) Load files
# =============================
# Fail early, naming the missing input, before any CSV is parsed.
for path, name in [(GROUNDTRUTH_PATH, "groundtruth"), (DOC_TOPICS_PATH, "doc_topics"), (TOPICS_INFO_PATH, "topics_info")]:
    if not os.path.exists(path):
        raise FileNotFoundError(f"No se encontró {name} en {path}")

gt_df = pd.read_csv(GROUNDTRUTH_PATH)
doc_topics_df = pd.read_csv(DOC_TOPICS_PATH)
topics_info_df = pd.read_csv(TOPICS_INFO_PATH)

# Normalise names so both tables can be matched on "podcast".
# The literal ".txt" is removed from ground-truth names (every occurrence,
# not only a trailing one).
gt_df["podcast"] = gt_df["podcast"].str.replace(".txt", "", regex=False).apply(normalize)
doc_topics_df["podcast"] = doc_topics_df["podcast"].apply(normalize)

# =============================
# 2) Topic -> keywords dictionary
# =============================
topic_keywords = {}
for _, row in topics_info_df.iterrows():
    tid = row["Topic"]
    if tid == -1:
        # The outlier topic gets no keywords
        continue
    name_parts = [part for part in str(row.get("Name", "")).split("_") if part]
    # The Name column starts with the topic id ("<id>_<word>_<word>...").
    # Drop that prefix so the id is not scored as if it were a keyword.
    if name_parts and name_parts[0] == str(tid):
        name_parts = name_parts[1:]
    topic_keywords[tid] = [normalize(part) for part in name_parts]

# =============================
# 3) Document-by-document comparison
# =============================
rows = []
for _, row in doc_topics_df.iterrows():
    doc_name = row["podcast"]
    tid = row["topic"]
    prob = row.get("probability", float("nan"))

    # Documents without a ground-truth entry are left out of the evaluation
    gt_row = gt_df[gt_df["podcast"] == doc_name]
    if gt_row.empty:
        continue

    gt_labels = to_set(gt_row.iloc[0]["labels"])
    topic_words = set(topic_keywords.get(tid, []))

    inter = gt_labels & topic_words
    n_hits = len(inter)

    precision = 0.0
    if topic_words:
        precision = n_hits / len(topic_words)

    recall = 0.0
    if gt_labels:
        recall = n_hits / len(gt_labels)

    f1 = 0.0
    if (precision + recall) > 0:
        f1 = (2 * precision * recall / (precision + recall))

    jac = jaccard(gt_labels, topic_words)

    record = {
        "podcast": doc_name,
        "groundtruth": ", ".join(sorted(gt_labels)),
        "predicted": ", ".join(sorted(topic_words)),
        "topic_id": tid,
        "probability": prob,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "jaccard": jac
    }
    rows.append(record)

results_df = pd.DataFrame(rows)

# =============================
# 4) Global metrics
# =============================
# Plain means over all evaluated documents (including those assigned to the
# outlier topic, whose predicted keyword set is empty).
macro_precision = results_df["precision"].mean()
macro_recall = results_df["recall"].mean()
macro_f1 = results_df["f1"].mean()
macro_jaccard = results_df["jaccard"].mean()

print("=== Métricas Globales ===")
print(f"Precision media: {macro_precision:.4f}")
print(f"Recall media   : {macro_recall:.4f}")
print(f"F1 media       : {macro_f1:.4f}")
print(f"Jaccard medio  : {macro_jaccard:.4f}")

# Save the per-document table as CSV
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
results_df.to_csv(OUTPUT_PATH, index=False)
print(f"Resultados guardados en: {OUTPUT_PATH}")

# =============================
# 5) Show examples
# =============================
print("\n=== Ejemplos ===")
# Ascending sort: these are the 8 rows with the lowest f1 / jaccard.
examples = results_df.sort_values(["f1", "jaccard"]).head(8)
print(examples[["podcast", "groundtruth", "predicted", "precision", "recall", "f1", "jaccard"]].to_string(index=False))


=== Métricas Globales ===
Precision media: 0.0000
Recall media   : 0.0000
F1 media       : 0.0000
Jaccard medio  : 0.0000
Resultados guardados en: /content/output/bertopic/_comparativa_groundtruth.csv

=== Ejemplos ===
                                           podcast                                                   groundtruth                               predicted  precision  recall  f1  jaccard
       abogados tumban los bulos sobre extranjería           derechos, desinformación, inmigración, ley, racismo                                                0.0     0.0 0.0      0.0
         aranceles_ trump revienta su propia bolsa  aranceles, crisis, deslocalización, mercados, proteccionismo 2, europea, trump, unión, unión europea        0.0     0.0 0.0      0.0
analizamos el acuerdo comercial entre la ue y eeuu acuerdo_comercial, dependencia, economía, soberanía, sumisión 2, europea, trump, unión, unión europea        0.0     0.0 0.0      0.0
                        cayendo por su pr