In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os

# Root folder of the transcriptions on the mounted shared drive.
BASE_DIR = "/content/drive/Shareddrives/NLP/transcriptions"
# Folder searched for the raw *.txt transcripts by the preprocessing cell.
TEXT_DIR = os.path.join(BASE_DIR, "all_transcriptions")

Mounted at /content/drive


## **Técnicas de representación de palabras**

### Case-folding, stopword removal and lemmatization

In [2]:
!pip install spacy
!python -m spacy download es_core_news_sm

import os
import glob
import csv
import re
import unicodedata
import spacy


# Folder where the cleaned token files and the CSV reports are written.
OUTPUT_DIR  = r"/content/output/preprocessing_steps"
# Name of the spaCy pipeline loaded by main().
SPACY_MODEL = "es_core_news_sm"
# When True, tokens that spaCy flags as stop words are dropped.
REMOVE_STOPWORDS = True

# A token is kept only if it consists entirely of two or more Latin letters
# (accented letters included); digits and punctuation make it fail the match.
TOKEN_PATTERN = re.compile(r"^[A-Za-zÀ-ÖØ-öø-ÿ]{2,}$")

def normalize_text(text: str) -> str:
    """Return ``text`` in Unicode NFKC form, converted to lower case."""
    compat_form = unicodedata.normalize("NFKC", text)
    lowered = compat_form.lower()
    return lowered

def ensure_dir(path: str):
    """Create ``path`` (including missing parents); do nothing if it already exists."""
    os.makedirs(name=path, exist_ok=True)

def main():
    """Clean every transcript in TEXT_DIR and write the results to OUTPUT_DIR.

    Each ``*.txt`` file is NFKC-normalised and lower-cased, parsed with spaCy,
    and filtered: whitespace tokens, tokens that do not match TOKEN_PATTERN
    and (when REMOVE_STOPWORDS is set) stop words are dropped.

    Per document three files are written: ``<base>__clean_raw.txt``,
    ``<base>__clean_lemma.txt`` (space-separated tokens) and
    ``<base>__tokens.csv`` (raw/lemma pairs).  A ``_corpus_index.csv`` summary
    is written at the end and corpus statistics are printed.

    Raises:
        FileNotFoundError: if TEXT_DIR contains no ``.txt`` file.
    """
    ensure_dir(OUTPUT_DIR)
    print(f"Cargando modelo spaCy: {SPACY_MODEL}")
    nlp = spacy.load(SPACY_MODEL, disable=["ner"])

    files = sorted(glob.glob(os.path.join(TEXT_DIR, "*.txt")))
    if not files:
        raise FileNotFoundError(f"No se encuentran .txt en: {TEXT_DIR}")

    index_rows = []
    # Corpus-wide vocabulary, accumulated while processing so the output files
    # do not have to be reopened (and left unclosed) afterwards.
    corpus_vocab = set()
    for path in files:
        file_name = os.path.basename(path)
        base = os.path.splitext(file_name)[0]
        # errors="ignore": undecodable bytes are dropped instead of aborting the run.
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()

        doc = nlp(normalize_text(text))

        # raw_tokens and lemmas are appended together, so they stay aligned.
        raw_tokens, lemmas = [], []
        for tok in doc:
            if tok.is_space:
                continue
            if not TOKEN_PATTERN.match(tok.text):
                continue
            if REMOVE_STOPWORDS and tok.is_stop:
                continue
            raw_tokens.append(tok.text)
            lemmas.append(tok.lemma_ if tok.lemma_ else tok.text)

        # --- Save cleaned outputs ---
        with open(os.path.join(OUTPUT_DIR, f"{base}__clean_raw.txt"), "w", encoding="utf-8") as f:
            f.write(" ".join(raw_tokens))
        with open(os.path.join(OUTPUT_DIR, f"{base}__clean_lemma.txt"), "w", encoding="utf-8") as f:
            f.write(" ".join(lemmas))

        # Per-document CSV (raw vs lemma)
        per_doc_csv = os.path.join(OUTPUT_DIR, f"{base}__tokens.csv")
        with open(per_doc_csv, "w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(["raw", "lemma"])
            w.writerows(zip(raw_tokens, lemmas))

        doc_vocab = set(raw_tokens)
        corpus_vocab |= doc_vocab
        index_rows.append([file_name, len(raw_tokens), len(doc_vocab)])
        print(f"Procesado: {file_name}  (tokens: {len(raw_tokens)}, únicos: {len(doc_vocab)})")

    # Corpus index
    with open(os.path.join(OUTPUT_DIR, "_corpus_index.csv"), "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["file", "token_count", "unique_token_count"])
        w.writerows(index_rows)

    # OVERALL STATISTICS - TOKEN COUNT INFORMATION
    total_tokens = sum(row[1] for row in index_rows)
    total_unique_tokens = len(corpus_vocab)

    print("\n=== Corpus Statistics ===")
    print(f"Total documents: {len(files)}")
    print(f"Total tokens: {total_tokens}")
    print(f"Total unique tokens: {total_unique_tokens}")

    print("\nCompleted. Information stored in:", OUTPUT_DIR)

if __name__ == "__main__":
    main()

Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m113.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Cargando modelo spaCy: es_core_news_sm
Procesado: 11 Hábitos Diarios para tener una Salud del 1% (Fuerza Explosiva).txt  (tokens: 6047, únicos: 2132)
Procesado: 15 Hábitos para Vivir con Abundancia y Tener Éxito (Sergio Fernández).txt 

### Bag of Words

In [None]:
# --- Bag of Words from cleaned outputs + top N_TOP terms per podcast ---

import os
import glob
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from tabulate import tabulate

# Folder holding the cleaned files written by the preprocessing cell.
INPUT_DIR = r"/content/output/preprocessing_steps"
# Folder that receives the BoW matrix and the top-terms table.
OUTPUT_DIR = r"/content/output/bag_of_words"
# True -> read *__clean_lemma.txt, False -> read *__clean_raw.txt.
USE_LEMMAS = True 
# Number of highest-count terms listed per podcast.
N_TOP = 5     

os.makedirs(OUTPUT_DIR, exist_ok=True)

def read_documents(output_dir: str, use_lemmas: bool = True):
    """Load the cleaned documents stored in ``output_dir``.

    Args:
        output_dir: folder searched (non-recursively) for the cleaned files.
        use_lemmas: read ``*__clean_lemma.txt`` when True, ``*__clean_raw.txt``
            otherwise.

    Returns:
        ``(doc_names, docs)``: parallel lists ordered by file path.  Each name
        is the file name without the suffix; each doc is the stripped content.

    Raises:
        FileNotFoundError: if no file in ``output_dir`` carries the suffix.
    """
    pattern = "__clean_lemma.txt" if use_lemmas else "__clean_raw.txt"
    paths = sorted(glob.glob(os.path.join(output_dir, f"*{pattern}")))
    if not paths:
        raise FileNotFoundError(f"No cleaned files found in {output_dir} with pattern *{pattern}")
    docs = []
    doc_names = []
    for p in paths:
        with open(p, "r", encoding="utf-8") as f:
            docs.append(f.read().strip())
        # Cut off only the trailing suffix (the glob guarantees it is there).
        # str.replace() would also delete the marker inside a name, and
        # rstrip("_") would eat underscores that belong to the file name.
        doc_names.append(os.path.basename(p)[: -len(pattern)])
    return doc_names, docs

def build_bow(docs, min_df=1, max_df=1.0, ngram_range=(1,1), vocabulary=None):
    """
    Build a term-count (Bag of Words) matrix from pre-tokenised documents.

    Documents are split on whitespace only; no lowercasing, preprocessing or
    stop-word filtering is applied by the vectorizer.

    - docs: iterable of strings whose tokens are separated by whitespace.
    - min_df: ignore terms that appear in fewer than min_df docs (int) or proportion (float).
    - max_df: ignore terms that appear in more than max_df docs (e.g., 0.95).
    - ngram_range: (1,1)=unigrams, (1,2)=unigrams+bigrams.
    - vocabulary: optional fixed vocab dict or list.

    Returns (X, vocab): the sparse count matrix and the feature names in
    column order.
    """
    vectorizer = CountVectorizer(
        tokenizer=str.split,
        # A custom tokenizer makes token_pattern unused; setting it to None
        # avoids scikit-learn's "token_pattern will not be used" warning.
        token_pattern=None,
        preprocessor=None,
        lowercase=False,
        stop_words=None,
        min_df=min_df,
        max_df=max_df,
        ngram_range=ngram_range,
        vocabulary=vocabulary
    )
    X = vectorizer.fit_transform(docs)
    vocab = vectorizer.get_feature_names_out()
    return X, vocab

def to_dataframe(X, vocab, doc_names):
    """Wrap sparse matrix ``X`` in a sparse DataFrame with alphabetically ordered columns.

    Rows are labelled with ``doc_names`` and columns with ``vocab``.
    """
    frame = pd.DataFrame.sparse.from_spmatrix(X, index=doc_names, columns=vocab)
    # Alphabetical column order keeps the saved matrix reproducible.
    ordered_terms = sorted(frame.columns)
    return frame.reindex(columns=ordered_terms)

# Load the cleaned documents (lemma or raw files, depending on USE_LEMMAS).
doc_names, docs = read_documents(INPUT_DIR, use_lemmas=USE_LEMMAS)

# --- Bag of Words ---
# Unigrams + bigrams; terms present in fewer than 2 documents or in more than
# 95% of the documents are discarded.
X_bow, vocab = build_bow(
    docs,
    min_df=2,           
    max_df=0.95,        
    ngram_range=(1,2)  
)
df_bow = to_dataframe(X_bow, vocab, doc_names)
bow_csv = os.path.join(OUTPUT_DIR, "_bow_matrix.csv")
df_bow.to_csv(bow_csv, encoding="utf-8")
print(f"Saved BoW matrix to: {bow_csv}  shape={df_bow.shape}")

# --- Top-N terms by count for each podcast (table) ---
rows = []
# Convert to dense per-row only when needed for small slices (keeps memory reasonable)
for podcast in df_bow.index:
    # One row of the matrix: term -> count for this podcast.
    counts = df_bow.loc[podcast].astype("int64")
    top_terms = counts.sort_values(ascending=False).head(N_TOP)
    # rank starts at 1 for the most frequent term.
    for rank, (term, count) in enumerate(top_terms.items(), start=1):
        rows.append({
            "podcast": podcast,
            "rank": rank,
            "term": term,
            "count": int(count)
        })

top_table = pd.DataFrame(rows, columns=["podcast", "rank", "term", "count"])

# Print a single tidy table
print("\n=== Top terms by count for each podcast ===")
top_table_sorted = top_table.sort_values(["podcast", "rank"])
print(tabulate(top_table_sorted.values.tolist(), headers=top_table_sorted.columns, tablefmt="fancy_grid"))

# Save the table
top_csv = os.path.join(OUTPUT_DIR, "_top_terms_per_podcast.csv")
top_table_sorted.to_csv(top_csv, index=False, encoding="utf-8")
print(f"\nSaved top-terms table to: {top_csv}")



Saved BoW matrix to: /content/output/bag_of_words/_bow_matrix.csv  shape=(104, 31988)

=== Top terms by count for each podcast ===
| podcast                                                                                  |   rank | term                    |   count |
|------------------------------------------------------------------------------------------|--------|-------------------------|---------|
| 11 Hábitos Diarios para tener una Salud del 1% (Fuerza Explosiva)                        |      1 | persona                 |      92 |
| 11 Hábitos Diarios para tener una Salud del 1% (Fuerza Explosiva)                        |      2 | comer                   |      70 |
| 11 Hábitos Diarios para tener una Salud del 1% (Fuerza Explosiva)                        |      3 | salud                   |      63 |
| 11 Hábitos Diarios para tener una Salud del 1% (Fuerza Explosiva)                        |      4 | tomar                   |      59 |
| 11 Hábitos Diarios para tener una 

### TF-IDF (Term Frequency - Inverse Document Frequency)

In [None]:
# --- TF-IDF from cleaned outputs + top N_TOP distinctive terms per podcast ---

import os
import glob
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from tabulate import tabulate

# Folder holding the cleaned files. It has to be the folder the preprocessing
# cell writes to: read_documents() globs it non-recursively, so pointing at the
# parent "/content/output" finds no file on a fresh runtime.
INPUT_DIR = r"/content/output/preprocessing_steps"
# Folder that receives the TF-IDF matrix and the derived tables.
OUTPUT_DIR = r"/content/output/tf_idf"
# True -> read *__clean_lemma.txt, False -> read *__clean_raw.txt.
USE_LEMMAS = True
# Number of highest-scoring terms listed per podcast.
N_TOP = 5
# True -> unigrams + bigrams, False -> unigrams only.
INCLUDE_BIGRAMS = True
# Document-frequency bounds passed to the vectorizer.
MIN_DF = 2
MAX_DF = 0.95

os.makedirs(OUTPUT_DIR, exist_ok=True)

# === Utilities ===
def ensure_dir(path: str):
    """Create ``path`` (including missing parents); do nothing if it already exists."""
    os.makedirs(name=path, exist_ok=True)

def read_documents(output_dir: str, use_lemmas: bool = True):
    """Load the cleaned documents stored in ``output_dir``.

    Args:
        output_dir: folder searched (non-recursively) for the cleaned files.
        use_lemmas: read ``*__clean_lemma.txt`` when True, ``*__clean_raw.txt``
            otherwise.

    Returns:
        ``(doc_names, docs)``: parallel lists ordered by file path.  Each name
        is the file name without the suffix; each doc is the stripped content.

    Raises:
        FileNotFoundError: if no file in ``output_dir`` carries the suffix.
    """
    pattern = "__clean_lemma.txt" if use_lemmas else "__clean_raw.txt"
    paths = sorted(glob.glob(os.path.join(output_dir, f"*{pattern}")))
    if not paths:
        raise FileNotFoundError(f"No cleaned files found in {output_dir} with pattern *{pattern}")
    docs, doc_names = [], []
    for p in paths:
        with open(p, "r", encoding="utf-8") as f:
            docs.append(f.read().strip())
        # Cut off only the trailing suffix (the glob guarantees it is there).
        # str.replace() would also delete the marker inside a name, and
        # rstrip("_") would eat underscores that belong to the file name.
        doc_names.append(os.path.basename(p)[: -len(pattern)])
    return doc_names, docs

def build_tfidf(docs, min_df=1, max_df=1.0, ngram_range=(1,1), vocabulary=None):
    """
    Builds TF-IDF matrix using CountVectorizer + TfidfTransformer.
    - tokenizer=str.split assumes tokens are space-separated (already preprocessed).
    - docs: iterable of strings whose tokens are separated by whitespace.
    - min_df / max_df: document-frequency bounds (int count or float proportion).
    - ngram_range: (1,1)=unigrams, (1,2)=unigrams+bigrams.
    - vocabulary: optional fixed vocab dict or list.

    Returns (X_tfidf, vocab): the L2-normalised sparse TF-IDF matrix and the
    feature names in column order.
    """
    vectorizer = CountVectorizer(
        tokenizer=str.split,
        # A custom tokenizer makes token_pattern unused; setting it to None
        # avoids scikit-learn's "token_pattern will not be used" warning.
        token_pattern=None,
        preprocessor=None,
        lowercase=False,
        stop_words=None,
        min_df=min_df,
        max_df=max_df,
        ngram_range=ngram_range,
        vocabulary=vocabulary
    )
    X_counts = vectorizer.fit_transform(docs)
    vocab = vectorizer.get_feature_names_out()

    # Raw counts -> smoothed IDF weighting, each row scaled to unit L2 norm.
    tfidf = TfidfTransformer(
        norm="l2",
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False
    )
    X_tfidf = tfidf.fit_transform(X_counts)
    return X_tfidf, vocab

def to_dataframe(X, vocab, doc_names):
    """Wrap sparse matrix ``X`` in a sparse DataFrame with alphabetically ordered columns.

    Rows are labelled with ``doc_names`` and columns with ``vocab``.
    """
    frame = pd.DataFrame.sparse.from_spmatrix(X, index=doc_names, columns=vocab)
    ordered_terms = sorted(frame.columns)
    return frame.reindex(columns=ordered_terms)


# 1) Load cleaned docs
doc_names, docs = read_documents(INPUT_DIR, use_lemmas=USE_LEMMAS)

# 2) Build TF-IDF
# INCLUDE_BIGRAMS switches between unigrams+bigrams and unigrams only.
ngram_range = (1, 2) if INCLUDE_BIGRAMS else (1, 1)
X_tfidf, vocab = build_tfidf(
    docs,
    min_df=MIN_DF,
    max_df=MAX_DF,
    ngram_range=ngram_range
)
df_tfidf = to_dataframe(X_tfidf, vocab, doc_names)

# 3) Save TF-IDF matrix
tfidf_csv = os.path.join(OUTPUT_DIR, "_tfidf_matrix.csv")
df_tfidf.to_csv(tfidf_csv, encoding="utf-8")
print(f"Saved TF-IDF matrix to: {tfidf_csv}  shape={df_tfidf.shape}")

# 4) Compute Top-N TF-IDF terms per podcast
rows = []
for podcast in df_tfidf.index:
    # One row of the matrix: term -> TF-IDF score for this podcast.
    scores = df_tfidf.loc[podcast].astype("float64")
    top_terms = scores.sort_values(ascending=False).head(N_TOP)
    # rank starts at 1 for the highest-scoring term.
    for rank, (term, score) in enumerate(top_terms.items(), start=1):
        rows.append({
            "podcast": podcast,
            "rank": rank,
            "term": term,
            "tfidf": float(score)
        })

top_table = pd.DataFrame(rows, columns=["podcast", "rank", "term", "tfidf"]).sort_values(["podcast", "rank"])

# 5) Pretty print with tabulate (no row numbers)
print("\n### Top TF-IDF terms per podcast")
print(tabulate(top_table.values.tolist(), headers=top_table.columns, tablefmt="fancy_grid"))

# 6) Save the top-terms table
top_csv = os.path.join(OUTPUT_DIR, "_top_tfidf_terms_per_podcast.csv")
top_table.to_csv(top_csv, index=False, encoding="utf-8")
print(f"\nSaved top TF-IDF terms table to: {top_csv}")



Saved TF-IDF matrix to: /content/output/tf_idf/_tfidf_matrix.csv  shape=(104, 31988)

### Top TF-IDF terms per podcast
╒══════════════════════════════════════════════════════════════════════════════════════════╤════════╤═════════════════════════╤═══════════╕
│ podcast                                                                                  │   rank │ term                    │     tfidf │
╞══════════════════════════════════════════════════════════════════════════════════════════╪════════╪═════════════════════════╪═══════════╡
│ 11 Hábitos Diarios para tener una Salud del 1% (Fuerza Explosiva)                        │      1 │ salud                   │ 0.230739  │
├──────────────────────────────────────────────────────────────────────────────────────────┼────────┼─────────────────────────┼───────────┤
│ 11 Hábitos Diarios para tener una Salud del 1% (Fuerza Explosiva)                        │      2 │ comer                   │ 0.214823  │
├──────────────────────────────────────

In [None]:
# --- Cosine Similarity between all podcasts ---
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity (N x N)
# NOTE(review): relies on df_tfidf, doc_names, OUTPUT_DIR, pd, os and tabulate
# left in the kernel by the TF-IDF cell -- run that cell first.
sim_matrix = cosine_similarity(df_tfidf)  # rows & columns correspond to doc_names order
df_sim = pd.DataFrame(sim_matrix, index=doc_names, columns=doc_names)
df_sim_rounded = df_sim.round(3)

# Save to CSV
sim_csv = os.path.join(OUTPUT_DIR, "_cosine_similarity_matrix.csv")
df_sim_rounded.to_csv(sim_csv, encoding="utf-8")
print(f"Saved cosine similarity matrix to: {sim_csv}  shape={df_sim_rounded.shape}")

# Prints the full N x N matrix (one column per podcast).
df_print = df_sim_rounded.reset_index().rename(columns={"index": "podcast"})
print("\n### Cosine similarity between podcasts (TF-IDF)")
print(tabulate(df_print.values.tolist(), headers=df_print.columns.tolist(), tablefmt="fancy_grid"))

# Show the top most similar pairs (excluding self-similarity)
pairs = []
# j starts at i + 1, so each unordered pair appears once and the diagonal is skipped.
for i in range(len(doc_names)):
    for j in range(i + 1, len(doc_names)):
        pairs.append((doc_names[i], doc_names[j], sim_matrix[i, j]))
if pairs:
    # Keep the 5 pairs with the highest similarity.
    pairs_sorted = sorted(pairs, key=lambda x: x[2], reverse=True)[:5]
    pairs_table = pd.DataFrame(pairs_sorted, columns=["podcast_A", "podcast_B", "cosine_sim"]).round(3)
    print("\n### Top most similar podcast pairs (excluding identical)")
    print(tabulate(pairs_table.values.tolist(), headers=pairs_table.columns.tolist(), tablefmt="fancy_grid"))


Saved cosine similarity matrix to: /content/output/tf_idf/_cosine_similarity_matrix.csv  shape=(104, 104)

### Cosine similarity between podcasts (TF-IDF)
╒══════════════════════════════════════════════════════════════════════════════════════════╤═════════════════════════════════════════════════════════════════════╤═════════════════════════════════════════════════════════════════════════╤════════════════════════════════════════════════════════════════════════════════╤═══════════════════════════════════════════════╤═════════════════════════════════════════════╤══════════════════════════════════════════════════════╤══════════════════════════════╤═══════════════════════════════════════════════════════════════╤═════════════════════════════════════════════════════════════════════╤═══════════════════════════════════════════════════════════════════╤════════════════════════════════════════════════════════════════╤════════════════════════════════════════════════════════════════════════════════╤

### FastText

In [None]:
# --- FastText (direct) + doc embeddings + similarities ---
!pip install -q fasttext pandas scikit-learn tabulate

import os
import glob
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate
import fasttext

# ---- CONFIG ----
# Location of the pre-trained FastText binary on the mounted drive.
EMB_PATH = r"/content/drive/MyDrive/NLP_artifacts/cc.es.300.bin" 
# Folder holding the cleaned files written by the preprocessing cell.
INPUT_DIR = r"/content/output/preprocessing_steps"           
# True -> read *__clean_lemma.txt, False -> read *__clean_raw.txt.
USE_LEMMAS = True
# NOTE(review): EMB_DIM and N_NEIGHBORS are not referenced in the cells shown
# here (the code asks the model for its dimension) -- confirm they are needed.
EMB_DIM = 300
N_NEIGHBORS = 5

# ---- 1) Load FastText model ----
# Fail early with a readable message when the model file is missing.
if not os.path.exists(EMB_PATH):
    raise FileNotFoundError(
        f"FastText model not found at:\n{EMB_PATH}\n"
        "Make sure you downloaded cc.es.300.bin and the path is correct (note 'MyDrive')."
    )

print("Loading FastText Spanish model (cc.es.300.bin) ...")
ft = fasttext.load_model(EMB_PATH)
print(f"Loaded model: dim={ft.get_dimension()}, vocab≈{len(ft.get_words())}")

# ---- Load cleaned podcast documents ----
def read_documents(output_dir: str, use_lemmas: bool = True):
    """Load the cleaned documents stored in ``output_dir``.

    Args:
        output_dir: folder searched (non-recursively) for the cleaned files.
        use_lemmas: read ``*__clean_lemma.txt`` when True, ``*__clean_raw.txt``
            otherwise.

    Returns:
        ``(names, docs)``: parallel lists ordered by file path.  Each name is
        the file name without the suffix; each doc is the stripped content.

    Raises:
        FileNotFoundError: if no file in ``output_dir`` carries the suffix.
    """
    pattern = "__clean_lemma.txt" if use_lemmas else "__clean_raw.txt"
    paths = sorted(glob.glob(os.path.join(output_dir, f"*{pattern}")))
    if not paths:
        raise FileNotFoundError(f"No cleaned files found in {output_dir} with pattern *{pattern}")
    docs, names = [], []
    for p in paths:
        with open(p, "r", encoding="utf-8") as f:
            docs.append(f.read().strip())
        # Cut off only the trailing suffix (the glob guarantees it is there).
        # str.replace() would also delete the marker inside a name, and
        # rstrip("_") would eat underscores that belong to the file name.
        names.append(os.path.basename(p)[: -len(pattern)])
    return names, docs

doc_names, docs = read_documents(INPUT_DIR, use_lemmas=USE_LEMMAS)
print(f"Loaded {len(docs)} cleaned documents.")

# ---- Embedding helpers  ----
def word_vec(token: str) -> np.ndarray:
    """Return the vector the loaded FastText model ``ft`` assigns to ``token``."""
    # Per the original note, FastText also yields vectors for OOV tokens
    # through subword information.
    vector = ft.get_word_vector(token)
    return vector

def doc_embedding(tokens, normalize=True) -> np.ndarray:
    """Average the word vectors of ``tokens`` into one float32 document vector.

    An empty token list gives a zero vector of the model's dimension.  With
    ``normalize`` set, a mean vector of non-zero length is scaled to unit
    L2 norm.
    """
    if not tokens:
        return np.zeros(ft.get_dimension(), dtype=np.float32)

    centroid = np.mean([word_vec(tok) for tok in tokens], axis=0)
    # Skip the division for a zero-length mean to avoid dividing by zero.
    if normalize and (length := np.linalg.norm(centroid)) > 0:
        centroid = centroid / length
    return centroid.astype(np.float32)

# ---- 4) Build episode embeddings ----
doc_tokens = [d.split() for d in docs]  # already space-separated from your preprocessing
# One row per document, in doc_names order.
doc_embs = np.vstack([doc_embedding(toks) for toks in doc_tokens])
print(f"Document embeddings shape: {doc_embs.shape}")  # (n_docs, 300)

# ---- 5) Cosine similarity between podcasts ----
sim = cosine_similarity(doc_embs)
df_sim = pd.DataFrame(sim, index=doc_names, columns=doc_names).round(3)

# Prints the full N x N matrix (one column per podcast).
print("\n= = = = = = = = = = = = = = = Cosine similarity between podcasts = = = = = = = = = = = = = = =")
df_print = df_sim.reset_index().rename(columns={"index": "podcast"})
print(tabulate(df_print.values.tolist(), headers=df_print.columns.tolist(), tablefmt="fancy_grid"))

# ---- 6) Top similar pairs (excluding identical) ----
pairs = []
# j starts at i + 1, so each unordered pair appears once and the diagonal is skipped.
for i in range(len(doc_names)):
    for j in range(i + 1, len(doc_names)):
        pairs.append((doc_names[i], doc_names[j], float(sim[i, j])))
# Keep the 10 pairs with the highest similarity.
pairs_sorted = sorted(pairs, key=lambda x: x[2], reverse=True)[:10]
pairs_df = pd.DataFrame(pairs_sorted, columns=["podcast_A", "podcast_B", "cosine_sim"]).round(3)

print("\n = = = = = = = = = = = = = = = Top similar podcast pairs = = = = = = = = = = = = = = =")
print(tabulate(pairs_df.values.tolist(), headers=pairs_df.columns.tolist(), tablefmt="fancy_grid"))


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
Loading FastText Spanish model (cc.es.300.bin) ...
Loaded model: dim=300, vocab≈2000000
Loaded 104 cleaned documents.
Document embeddings shape: (104, 300)

= = = = = = = = = = = = = = = Cosine similarity between podcasts = = = = = = = = = = = = = = =
╒══════════════════════════════════════════════════════════════════════════════════════════╤═════════════════════════════════════════════════════════════════════╤═════════════════════════════════════════════════════════════════════════╤═════════════════════════════════

Nearest neighbour words - FastText embeddings demonstration

In [None]:
import numpy as np
from numpy.linalg import norm
from collections import Counter

# ---- Build corpus vocab and frequencies ----
# NOTE(review): doc_tokens comes from the FastText embedding cell -- run it first.
freqs = Counter(t for doc in doc_tokens for t in doc)
lemma_vocab = set(freqs.keys())

# ---- Config for corpus-restricted neighbors ----
# Number of neighbours printed per query word.
K = 5     
# Minimum corpus frequency a token needs to be considered as a neighbour.
MIN_FREQ = 3    
#MAX_CANDIDATES = 100000 

def cosine(u: np.ndarray, v: np.ndarray) -> float:
    """Cosine similarity of ``u`` and ``v``; 0.0 when either vector has zero norm."""
    magnitudes = (norm(u), norm(v))
    # A zero-length vector has no direction: report 0.0 instead of dividing by zero.
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0.0
    similarity = np.dot(u, v) / (magnitudes[0] * magnitudes[1])
    return float(similarity)

def corpus_neighbors_fasttext(ft_model, target: str, vocab: set, freqs: Counter,
                              k: int = 10, min_freq: int = 1, max_candidates: int | None = None):
    """
    Return top-k nearest neighbors using FastText vectors but restricted to your corpus tokens.
    - vocab: set of tokens (e.g., lemmas) from your cleaned docs.
    - freqs: Counter(token -> frequency) to allow freq-based filtering.
    - min_freq: keep only tokens with frequency >= min_freq.
    - max_candidates: optional speed cap; evaluates the most frequent tokens first.
    """
    # Ensure target is in corpus vocab
    target_vec = ft_model.get_word_vector(target)
    if norm(target_vec) == 0:
        raise ValueError(f"Target '{target}' produced a zero vector (unexpected).")

    # Candidate pool: tokens from corpus, excluding the target itself
    candidates = [t for t in vocab if t != target and freqs[t] >= min_freq]

    #if max_candidates is not None and len(candidates) > max_candidates:
    #    candidates = [t for t, _ in freqs.most_common(max_candidates) if t != target and freqs[t] >= min_freq]

    scored = []
    for t in candidates:
        v = ft_model.get_word_vector(t)
        c = cosine(target_vec, v)
        scored.append((t, c))

    # Sort by cosine desc and return top-k
    scored.sort(key=lambda x: x[1], reverse=True)
    return scored[:k]

# ---- get neighbours on corpus for selected  ----
query_words = ["inteligencia", "guerra", "política", "economía", "vida", "salud"]

for w in query_words:
    # A failure for one query word is reported and the loop moves on to the next.
    try:
        nbrs = corpus_neighbors_fasttext(
            ft_model=ft,
            target=w,
            vocab=lemma_vocab,
            freqs=freqs,
            k=K,
            min_freq=MIN_FREQ
        )
        print(f"\nCorpus-related neighbors for '{w}' (K={K}, min_freq={MIN_FREQ})")
        print(f" - "*24)
        # One line per neighbour: token, cosine similarity, corpus frequency.
        for term, score in nbrs:
            print(f"  {term:25s} {score:.3f}   (freq={freqs[term]})")
    except Exception as e:
        print(f"\nCould not compute neighbors for '{w}': {e}")


Corpus-related neighbors for 'inteligencia' (K=5, min_freq=3)
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - 
  intuición                 0.599   (freq=6)
  sabiduría                 0.589   (freq=6)
  habilidad                 0.569   (freq=56)
  creatividad               0.557   (freq=11)
  imaginación               0.518   (freq=3)

Corpus-related neighbors for 'guerra' (K=5, min_freq=3)
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - 
  batalla                   0.691   (freq=50)
  contienda                 0.632   (freq=3)
  rebelión                  0.571   (freq=3)
  bélico                    0.569   (freq=17)
  invasión                  0.568   (freq=17)

Corpus-related neighbors for 'política' (K=5, min_freq=3)
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - 
  ideológica                0.623   (freq=3)
  geopolítica               0.610   (freq=7)
  diplomacia                0.607   (freq=3)
  ideología 

FastText embedding statistics

In [None]:
import os
from collections import Counter
import numpy as np
import pandas as pd
from tabulate import tabulate

# ----------  Build corpus-level token stats ----------
# NOTE(review): docs, doc_names and ft come from the FastText cell -- run it first.
doc_tokens = [d.split() for d in docs]

token_counter = Counter(t for doc in doc_tokens for t in doc)
total_tokens = sum(token_counter.values())
unique_tokens = len(token_counter)

# FastText vocabulary
ft_vocab = set(ft.get_words())

# "Types" are distinct tokens; "tokens" count every occurrence.
in_vocab_types = {t for t in token_counter if t in ft_vocab}        
oov_type_set   = set(token_counter.keys()) - in_vocab_types          

in_vocab_token_count = sum(token_counter[t] for t in in_vocab_types)
oov_token_count = total_tokens - in_vocab_token_count

# Percentages fall back to 0.0 on an empty corpus to avoid dividing by zero.
coverage_token_pct = (in_vocab_token_count / total_tokens * 100) if total_tokens else 0.0
coverage_type_pct  = (len(in_vocab_types) / unique_tokens * 100) if unique_tokens else 0.0

# ----------  B) Per-document coverage ----------
per_doc_rows = []
for name, toks in zip(doc_names, doc_tokens):
    cnt = Counter(toks)
    tok_total = sum(cnt.values())

    in_cnt = sum(freq for t, freq in cnt.items() if t in ft_vocab)
    oov_cnt = tok_total - in_cnt

    # Use distinct names to avoid shadowing the corpus-level sets
    in_types_count  = sum(1 for t in cnt if t in ft_vocab)
    oov_types_count = sum(1 for t in cnt if t not in ft_vocab)

    per_doc_rows.append({
        "podcast": name,
        "tokens": tok_total,
        "in_vocab_tokens": in_cnt,
        "oov_tokens": oov_cnt,
        "token_cov_%": round((in_cnt / tok_total * 100) if tok_total else 0.0, 2),
        "unique_types": len(cnt),
        "in_vocab_types": in_types_count,
        "oov_types": oov_types_count,
        "type_cov_%": round((in_types_count / len(cnt) * 100) if len(cnt) else 0.0, 2),
    })

df_per_doc = pd.DataFrame(per_doc_rows)

# ----------  C) Top OOV tokens ----------
TOP_N_OOV = 25
if oov_type_set:
    # Most frequent out-of-vocabulary tokens first.
    top_oov = sorted(((t, token_counter[t]) for t in oov_type_set),
                     key=lambda x: x[1], reverse=True)[:TOP_N_OOV]
    df_top_oov = pd.DataFrame(top_oov, columns=["token", "freq"])
else:
    df_top_oov = pd.DataFrame(columns=["token", "freq"])

# ----------  D) Docs with zero vectors (rare, but can happen if document is empty) ----------
zero_vec_docs = []
# doc_embs only exists if the embedding cell was run; the check is skipped otherwise.
if 'doc_embs' in globals():
    for name, v in zip(doc_names, doc_embs):
        if np.linalg.norm(v) == 0.0:
            zero_vec_docs.append(name)

# ----------  E) Print nice summaries ----------
print("\n Corpus-level FastText coverage")
print(f" - "*24)
summary = [
    ["Total tokens", total_tokens],
    ["Unique tokens (types)", unique_tokens],
    ["In-vocab tokens", in_vocab_token_count],
    ["OOV tokens (by vocab)", oov_token_count],
    ["Token coverage %", f"{coverage_token_pct:.2f}%"],
    ["In-vocab types", len(in_vocab_types)],
    ["OOV types", len(oov_type_set)],
    ["Type coverage %", f"{coverage_type_pct:.2f}%"],
]
print(tabulate(summary, headers=["Metric", "Value"], tablefmt="fancy_grid"))

print("\n Per-podcast coverage")
print(f" - "*24)
# Print every podcast; a fixed head(104) would silently truncate the table
# as soon as the corpus grows beyond 104 documents.
print(tabulate(df_per_doc.values.tolist(), headers=df_per_doc.columns, tablefmt="fancy_grid"))

if not df_top_oov.empty:
    print(f"\n Top {min(TOP_N_OOV, len(df_top_oov))} OOV tokens (by frequency)")
    print(f" - "*24)
    print(tabulate(df_top_oov.values.tolist(), headers=df_top_oov.columns, tablefmt="fancy_grid"))
else:
    print("\n Top OOV tokens")
    print("No OOV tokens detected (all tokens appear in FastText vocabulary).")

if zero_vec_docs:
    print("\n Documents with zero embedding (likely empty after cleaning)")
    print(f" - "*24)
    for name in zero_vec_docs:
        print(f"- {name}")

# ----------  F) (Optional) Save stats ----------
# The tables go to a "fasttext_es" sub-folder of INPUT_DIR.
OUT_DIR = os.path.join(INPUT_DIR, "fasttext_es")
os.makedirs(OUT_DIR, exist_ok=True)
df_per_doc.to_csv(os.path.join(OUT_DIR, "_coverage_per_podcast.csv"), index=False, encoding="utf-8")
df_top_oov.to_csv(os.path.join(OUT_DIR, "_top_oov_tokens.csv"), index=False, encoding="utf-8")

print(f"\nSaved coverage tables to: {OUT_DIR}")


 Corpus-level FastText coverage
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - 
╒═══════════════════════╤═════════╕
│ Metric                │ Value   │
╞═══════════════════════╪═════════╡
│ Total tokens          │ 285393  │
├───────────────────────┼─────────┤
│ Unique tokens (types) │ 24347   │
├───────────────────────┼─────────┤
│ In-vocab tokens       │ 277827  │
├───────────────────────┼─────────┤
│ OOV tokens (by vocab) │ 7566    │
├───────────────────────┼─────────┤
│ Token coverage %      │ 97.35%  │
├───────────────────────┼─────────┤
│ In-vocab types        │ 19600   │
├───────────────────────┼─────────┤
│ OOV types             │ 4747    │
├───────────────────────┼─────────┤
│ Type coverage %       │ 80.50%  │
╘═══════════════════════╧═════════╛

 Per-podcast coverage
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - 
╒══════════════════════════════════════════════════════════════════════════════════════════╤══════════╤═══════════

### Word2Vec

In [14]:
import gensim.models
from gensim import utils
from gensim.models.callbacks import CallbackAny2Vec
import glob, os
from tabulate import tabulate

class EpochLogger(CallbackAny2Vec):
    """Training callback that prints a loss figure at the end of every epoch.

    The value printed for epoch 0 is the loss reported by the model; for later
    epochs it is the difference from the loss reported one epoch earlier.
    """

    def __init__(self):
        self.epoch = 0
        self.loss_previous_step = 0.0

    def on_epoch_end(self, model):
        reported = model.get_latest_training_loss()
        shown = reported if self.epoch == 0 else reported - self.loss_previous_step
        print(f'Loss after epoch {self.epoch}: {shown}')
        # Remember the reported loss so the next epoch can print its own share.
        self.loss_previous_step = reported
        self.epoch += 1

class MyCorpus:
    """Restartable iterable that yields one token list per line of each file.

    The files are expected to hold already-cleaned, space-separated tokens, so
    lines are split on whitespace only.  Re-tokenising them with gensim's
    ``simple_preprocess`` would silently drop every token shorter than 2 or
    longer than 15 characters and make the training vocabulary differ from
    what a plain ``str.split`` of the same files gives.
    """
    def __init__(self, paths):
        # paths: file paths to read, in iteration order.
        self.paths = paths
    def __iter__(self):
        len_lines = 0
        for path in self.paths:
            with open(path, 'r', encoding='utf-8') as file:
                for line in file:
                    len_lines += 1
                    yield line.split()
        # Printed once per full pass over the corpus.
        print(f"Number of docs: {len_lines}")

# Folder holding the cleaned files; despite its name it is used as the INPUT
# of this cell (the files are only read here).
OUTPUT_DIR = "/content/output/preprocessing_steps"
# True -> train on *__clean_lemma.txt, False -> on *__clean_raw.txt.
USE_LEMMAS = True
SUFFIX = "__clean_lemma.txt" if USE_LEMMAS else "__clean_raw.txt"

corpus_paths = sorted(glob.glob(os.path.join(OUTPUT_DIR, f"*{SUFFIX}")))
if not corpus_paths:
    raise FileNotFoundError(f"No se encontraron ficheros con sufijo {SUFFIX} en {OUTPUT_DIR}")

sentences = MyCorpus(corpus_paths)
epoch_logger = EpochLogger()

# ------------------------------------------------------------------------------
# TRAINING
# ------------------------------------------------------------------------------

# NOTE(review): no seed is passed, so repeated runs presumably give different
# vectors -- confirm whether reproducibility matters here.
model = gensim.models.Word2Vec(
    sentences=sentences,
    vector_size=300,
    window=10,
    min_count=1,
    compute_loss=True,
    callbacks=[epoch_logger],
    negative=5,
    epochs=100,
    sg=1
)

# ------------------------------------------------------------------------------
# TSNE (visualización de los embeddings en dos componentes)
# ------------------------------------------------------------------------------
'''
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

words = list(model.wv.index_to_key)[:2000]
vectors = [model.wv[word] for word in words]


tsne = TSNE(n_components=2, random_state=0, n_iter=300, perplexity=30)
tsne_vectors = tsne.fit_transform(np.array(vectors))

plt.figure(figsize=(16, 16))
plt.scatter(tsne_vectors[:, 0], tsne_vectors[:, 1], s=6, alpha=0.7)

for i, word in enumerate(words):
    plt.annotate(word, (tsne_vectors[i, 0], tsne_vectors[i, 1]), fontsize=7, alpha=0.7)

plt.show()
'''

# ------------------------------------------------------------------------------
# SAVE AND RELOAD THE MODEL
# ------------------------------------------------------------------------------
save_path_model = "/content/output/word2vec_model"
save_path_vectors = "/content/output/word2vec_vectors"

# The full model and its word vectors are stored as two separate artifacts.
model.save(save_path_model)
model.wv.save(save_path_vectors)
print("Guardado:")
print(" -", save_path_model)
print(" -", save_path_vectors)

# Reload both artifacts from disk; the queries below use the reloaded vectors.
loaded_model = gensim.models.Word2Vec.load(save_path_model)
loaded_wv = gensim.models.KeyedVectors.load(save_path_vectors, mmap='r')

# ------------------------------------------------------------------------------
# COMPUTE NEIGHBOURS
# ------------------------------------------------------------------------------
queries = ["inteligencia", "guerra", "política", "economía", "vida", "salud"]
rows = []

for q in queries:
    if q in loaded_wv.key_to_index:
        # Ten nearest neighbours of the query word, one table row each.
        neighbors = loaded_wv.most_similar(q, topn=10)
        for t, s in neighbors:
            rows.append([q, t, f"{s:.3f}"])
    else:
        # Query word absent from the trained vocabulary: placeholder row.
        rows.append([q, "(sin vector)", "-"])

# Show the table
print(tabulate(rows, headers=["Palabra", "Vecino", "Similitud"], tablefmt="fancy_grid"))

Number of docs: 104
Number of docs: 104
Loss after epoch 0: 2586989.75
Number of docs: 104
Loss after epoch 1: 2149869.25
Number of docs: 104
Loss after epoch 2: 1881025.0
Number of docs: 104
Loss after epoch 3: 1880289.0
Number of docs: 104
Loss after epoch 4: 1576767.0
Number of docs: 104
Loss after epoch 5: 1554610.0
Number of docs: 104
Loss after epoch 6: 1492687.0
Number of docs: 104
Loss after epoch 7: 1476963.0
Number of docs: 104
Loss after epoch 8: 1411750.0
Number of docs: 104
Loss after epoch 9: 1283006.0
Number of docs: 104
Loss after epoch 10: 1115754.0
Number of docs: 104
Loss after epoch 11: 1073912.0
Number of docs: 104
Loss after epoch 12: 1047486.0
Number of docs: 104
Loss after epoch 13: 1016306.0
Number of docs: 104
Loss after epoch 14: 1007558.0
Number of docs: 104
Loss after epoch 15: 1000770.0
Number of docs: 104
Loss after epoch 16: 1005390.0
Number of docs: 104
Loss after epoch 17: 1044646.0
Number of docs: 104
Loss after epoch 18: 998616.0
Number of docs: 104
