In [None]:
# Mount Google Drive so the shared-drive transcripts are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import os

# Project root on the shared drive; later cells also write their CSV and model outputs here.
BASE_DIR = "/content/drive/Shareddrives/NLP/transcriptions"
# Folder holding the transcript .txt files that the cells below read.
TEXT_DIR = os.path.join(BASE_DIR, "all_transcriptions")

Mounted at /content/drive


Case-folding, stopword removal and lemmatization

In [None]:
!pip install spacy
!python -m spacy download es_core_news_sm

import os
import glob
import csv
import re
import unicodedata
import spacy


# Directory that receives the cleaned text files and the token CSVs.
# NOTE(review): this path is under /content, not under the mounted Drive BASE_DIR —
# presumably local Colab storage; confirm the outputs do not need to persist.
OUTPUT_DIR  = r"/content/output/preprocessing_steps"
# Name of the spaCy pipeline used for tokenization, stopword flags and lemmas.
SPACY_MODEL = "es_core_news_sm"
# When True, tokens that spaCy flags as stopwords are dropped.
REMOVE_STOPWORDS = True

# Accept only tokens made entirely of letters (ASCII plus accented Latin-1 letters)
# that are at least two characters long; digits, punctuation and mixed tokens fail.
TOKEN_PATTERN = re.compile(r"^[A-Za-zÀ-ÖØ-öø-ÿ]{2,}$")

def normalize_text(text: str) -> str:
    """Return ``text`` after NFKC Unicode normalization and lowercasing."""
    composed = unicodedata.normalize("NFKC", text)
    return composed.lower()

def ensure_dir(path: str):
    """Create directory ``path`` (including missing parents) unless it already exists."""
    if not os.path.isdir(path):
        # exist_ok keeps this safe if the directory appears between the check and the call
        os.makedirs(path, exist_ok=True)

def main():
    """Clean every transcript in TEXT_DIR and write the results to OUTPUT_DIR.

    Each ``*.txt`` file is case-folded, run through the spaCy pipeline and
    reduced to the tokens that match TOKEN_PATTERN (minus stopwords when
    REMOVE_STOPWORDS is set). Per transcript, three files are written:

    * ``<name>__clean_raw.txt``   - kept surface forms, space separated
    * ``<name>__clean_lemma.txt`` - lemmas of the kept tokens, space separated
    * ``<name>__tokens.csv``      - one ``raw,lemma`` row per kept token

    A corpus-wide ``_corpus_index.csv`` with token counts per file is written
    at the end.

    Raises:
        FileNotFoundError: If TEXT_DIR contains no ``.txt`` files.
    """
    ensure_dir(OUTPUT_DIR)
    print(f"Cargando modelo spaCy: {SPACY_MODEL}")
    nlp = spacy.load(SPACY_MODEL, disable=["ner"])

    files = sorted(glob.glob(os.path.join(TEXT_DIR, "*.txt")))
    if not files:
        raise FileNotFoundError(f"No se encuentran .txt en: {TEXT_DIR}")

    index_rows = []
    for path in files:
        filename = os.path.basename(path)
        base = os.path.splitext(filename)[0]
        # Undecodable bytes are dropped rather than aborting the whole run.
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()

        doc = nlp(normalize_text(text))

        # One (surface form, lemma) pair per surviving token. Building the pairs
        # together keeps both columns aligned by construction, so no padding
        # logic is needed when writing the CSV.
        kept = [
            (tok.text, tok.lemma_ if tok.lemma_ else tok.text)
            for tok in doc
            if not tok.is_space
            and TOKEN_PATTERN.match(tok.text)
            and not (REMOVE_STOPWORDS and tok.is_stop)
        ]
        raw_tokens = [raw for raw, _ in kept]
        lemmas = [lemma for _, lemma in kept]

        # --- Save cleaned outputs ---
        with open(os.path.join(OUTPUT_DIR, f"{base}__clean_raw.txt"), "w", encoding="utf-8") as f:
            f.write(" ".join(raw_tokens))
        with open(os.path.join(OUTPUT_DIR, f"{base}__clean_lemma.txt"), "w", encoding="utf-8") as f:
            f.write(" ".join(lemmas))

        # Per-document CSV (raw vs lemma)
        per_doc_csv = os.path.join(OUTPUT_DIR, f"{base}__tokens.csv")
        with open(per_doc_csv, "w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(["raw", "lemma"])
            w.writerows(kept)

        token_count = len(raw_tokens)
        unique_count = len(set(raw_tokens))
        index_rows.append([filename, token_count, unique_count])
        print(f"Procesado: {filename}  (tokens: {token_count}, únicos: {unique_count})")

    # Corpus index
    with open(os.path.join(OUTPUT_DIR, "_corpus_index.csv"), "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["file", "token_count", "unique_token_count"])
        w.writerows(index_rows)

    print("\nListo. Salidas en:", OUTPUT_DIR)

# Run the preprocessing pipeline when this cell (or script) is executed directly.
if __name__ == "__main__":
    main()

Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m149.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Cargando modelo spaCy: es_core_news_sm
Procesado: 11 Hábitos Diarios para tener una Salud del 1% (Fuerza Explosiva).txt  (tokens: 6047, únicos: 2132)
Procesado: 15 Hábitos para Vivir con Abundancia y Tener Éxito (Sergio Fernández).txt 

Pre-trained model

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
from tqdm import tqdm

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# BETO checkpoint (uncased Spanish BERT) and its matching tokenizer
model_name = "dccuchile/bert-base-spanish-wwm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# .to() and .eval() both return the module, so loading, device placement and
# switching to evaluation mode are chained.
model = AutoModel.from_pretrained(model_name).to(device).eval()

print("BETO model loaded successfully!")

def get_embeddings(text, method="mean"):
    """
    Get a contextual embedding for a text using the module-level BETO model.

    Only the first 512 tokens of ``text`` contribute: the tokenizer is called
    with ``truncation=True`` and ``max_length=512``, so the remainder of a
    longer transcript is ignored.

    Args:
        text (str): Input text
        method (str): How to aggregate token embeddings
                     "mean" - attention-mask-weighted mean of all token embeddings
                     "cls" - use [CLS] token embedding
                     "pooler" - use pooler output. NOTE: the load log recorded in
                                this notebook reports the pooler weights of this
                                checkpoint as newly initialized, so this output
                                comes from untrained weights.

    Returns:
        numpy array: Embedding vector (singleton dimensions squeezed out)

    Raises:
        ValueError: If ``method`` is not one of the supported names, or if
            "pooler" is requested but the model returns no pooler output.
    """
    # Reject an unknown method before paying for tokenization and a forward pass.
    if method not in ("mean", "cls", "pooler"):
        raise ValueError("Method must be 'mean', 'cls', or 'pooler'")

    # Tokenize the text
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,  # BERT's maximum length
        return_attention_mask=True
    )

    # Move to device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get embeddings without calculating gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # Choose embedding aggregation method
    if method == "cls":
        # Use [CLS] token embedding (first position)
        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    elif method == "mean":
        # Mean of all token embeddings (excluding padding tokens)
        attention_mask = inputs['attention_mask']
        token_embeddings = outputs.last_hidden_state

        # Broadcast the mask over the hidden dimension so padding contributes zero
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

        # Sum embeddings and divide by number of non-padding tokens;
        # the clamp avoids a division by zero for an all-padding row.
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        embedding = (sum_embeddings / sum_mask).cpu().numpy()
    else:
        # method == "pooler"
        pooled = getattr(outputs, "pooler_output", None)
        if pooled is None:
            raise ValueError("'pooler' is unavailable: the model returned no pooler_output")
        embedding = pooled.cpu().numpy()

    return embedding.squeeze()

def process_transcript_files(text_dir, output_file="beto_embeddings.csv"):
    """
    Embed every ``.txt`` transcript in a directory with BETO and save the result.

    For each non-empty file both the "mean" and the "cls" embedding are
    computed. Files that are empty are skipped with a warning; files that raise
    are reported and skipped. Rows follow the order returned by ``os.listdir``.

    Args:
        text_dir (str): Directory containing transcript files
        output_file (str): Name of the CSV file, written inside BASE_DIR

    Returns:
        pandas.DataFrame: One row per processed transcript
    """
    transcript_names = [name for name in os.listdir(text_dir) if name.endswith('.txt')]
    print(f"Found {len(transcript_names)} transcript files")

    records = []

    for filename in tqdm(transcript_names, desc="Processing files"):
        file_path = os.path.join(text_dir, filename)

        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                transcript = handle.read().strip()

            if transcript:
                # Two aggregations of the same text: masked mean and [CLS]
                mean_vector = get_embeddings(transcript, method="mean")
                cls_vector = get_embeddings(transcript, method="cls")

                records.append({
                    'filename': filename,
                    'text_length': len(transcript),
                    'mean_embedding': mean_vector.tolist(),
                    'cls_embedding': cls_vector.tolist(),
                    'embedding_dim': len(mean_vector)
                })
            else:
                print(f"Warning: Empty file {filename}")

        except Exception as e:
            # Best effort: report the failing file and carry on with the rest
            print(f"Error processing {filename}: {str(e)}")

    # Assemble the table and persist it next to the transcripts' parent folder
    df = pd.DataFrame(records)
    output_path = os.path.join(BASE_DIR, output_file)
    df.to_csv(output_path, index=False)
    print(f"Embeddings saved to {output_path}")

    return df

# Function to load and visualize embeddings
def load_and_analyze_embeddings(csv_path):
    """
    Load saved embeddings, print a short summary and parse the vectors.

    The ``mean_embedding`` and ``cls_embedding`` columns hold Python list
    literals stored as text. They are parsed with ``ast.literal_eval`` rather
    than ``eval`` so that a tampered CSV cannot execute code.

    Args:
        csv_path (str): Path of a CSV with ``mean_embedding``, ``cls_embedding``
            and ``embedding_dim`` columns.

    Returns:
        pandas.DataFrame: The loaded frame with two extra columns,
        ``mean_embedding_array`` and ``cls_embedding_array`` (numpy arrays).

    Raises:
        ValueError / SyntaxError: If an embedding cell is not a plain literal.
    """
    import ast  # local import keeps this function self-contained in the notebook

    def _to_array(cell):
        # Cells read back from CSV are strings; anything else is passed through
        return np.array(ast.literal_eval(cell)) if isinstance(cell, str) else np.array(cell)

    df = pd.read_csv(csv_path)

    print("Embeddings DataFrame Info:")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    # iloc[0] would raise on a header-only file, so only report when rows exist
    if len(df) > 0:
        print(f"Embedding dimension: {df['embedding_dim'].iloc[0]}")

    # Convert string representations back to numpy arrays
    df['mean_embedding_array'] = df['mean_embedding'].apply(_to_array)
    df['cls_embedding_array'] = df['cls_embedding'].apply(_to_array)

    return df

# Main execution
if __name__ == "__main__":
    # Embed every transcript with the pre-trained BETO model
    print("Starting BETO embedding generation...")
    embeddings_df = process_transcript_files(TEXT_DIR)

    # Display basic info
    print("\nEmbedding generation completed!")
    print(f"Processed {len(embeddings_df)} files")

    # A DataFrame built from zero records has no columns, so every column
    # access stays behind this guard (the dimension print used to sit outside
    # it and raised when no file could be processed).
    if len(embeddings_df) > 0:
        print(f"Embedding dimension: {embeddings_df['embedding_dim'].iloc[0]}")

        # Example: Access embeddings for the first file
        first_embedding = np.array(embeddings_df['mean_embedding'].iloc[0])
        print(f"First embedding shape: {first_embedding.shape}")
        print(f"First embedding sample (first 10 dims): {first_embedding[:10]}")

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BETO model loaded successfully!
Starting BETO embedding generation...
Found 104 transcript files


Processing files:   3%|▎         | 3/104 [00:00<00:26,  3.77it/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Processing files: 100%|██████████| 104/104 [00:14<00:00,  7.36it/s]


Embeddings saved to /content/drive/Shareddrives/NLP/transcriptions/beto_embeddings.csv

Embedding generation completed!
Processed 104 files
Embedding dimension: 768
First embedding shape: (768,)
First embedding sample (first 10 dims): [ 0.26182157 -0.43910819  0.27464247 -0.16144565  0.76865429  0.21929124
 -0.16313431 -0.1653112  -0.24447471 -0.02274783]


Fine-tuning

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM, get_linear_schedule_with_warmup
from torch.optim import AdamW
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

class TranscriptDataset(Dataset):
    """Dataset that tokenizes one text per item to a fixed length.

    Each item is a dict with 1-D ``input_ids`` and ``attention_mask`` tensors,
    truncated or padded to ``max_length`` by the supplied tokenizer.
    """

    # Keys copied from the tokenizer output into each item, in this order
    _FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Entries are coerced to str before tokenization
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # The tokenizer returns a leading batch axis; collapse it to 1-D
        return {field: encoded[field].reshape(-1) for field in self._FIELDS}

def prepare_fine_tuning_data(text_dir, train_ratio=0.8, seed=None):
    """Read the transcripts in ``text_dir`` and split them into train/validation.

    Args:
        text_dir (str): Directory containing ``.txt`` transcript files.
        train_ratio (float): Fraction of the non-empty transcripts that goes to
            the training split; must lie in [0, 1].
        seed (int | None): When given, the shuffle uses a private
            ``random.Random(seed)`` so the split is reproducible. When None
            (default) the module-level ``random`` state is used, as before.

    Returns:
        tuple[list[str], list[str]]: ``(train_texts, val_texts)``.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

    # os.listdir order is arbitrary; sorting fixes the pre-shuffle order so the
    # same seed always yields the same split.
    text_files = sorted(f for f in os.listdir(text_dir) if f.endswith('.txt'))
    texts = []

    for filename in text_files:
        file_path = os.path.join(text_dir, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                transcript = f.read().strip()
            if transcript:
                texts.append(transcript)
        except Exception as e:
            # Best effort: an unreadable file is reported and left out
            print(f"Error reading {filename}: {e}")

    # Shuffle and split
    shuffler = random.Random(seed) if seed is not None else random
    shuffler.shuffle(texts)
    split_idx = int(len(texts) * train_ratio)
    train_texts = texts[:split_idx]
    val_texts = texts[split_idx:]

    return train_texts, val_texts

def _mask_tokens(input_ids, attention_mask, tokenizer, mlm_probability=0.15):
    """Build masked-LM inputs and labels from a batch of token ids.

    Positions are selected with probability ``mlm_probability``; padding
    (``attention_mask == 0``) and the tokenizer's special tokens are never
    selected. Of the selected positions, 80% become [MASK], 10% become a random
    vocabulary id and 10% keep the original token. Labels are -100 everywhere
    except at selected positions, so only those contribute to the loss.

    Returns:
        tuple[Tensor, Tensor]: ``(inputs, labels)`` on the device of ``input_ids``.
    """
    device = input_ids.device
    inputs = input_ids.clone()
    labels = input_ids.clone()

    # Positions that may be masked: real tokens that are not special tokens
    maskable = attention_mask.bool()
    for special_id in tokenizer.all_special_ids:
        maskable &= input_ids != special_id

    probability_matrix = torch.full(labels.shape, mlm_probability, device=device)
    probability_matrix.masked_fill_(~maskable, 0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # Only compute loss on masked tokens

    # 80% of the time, replace masked tokens with [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8, device=device)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time (half of the remaining 20%), replace with a random word
    indices_random = (
        torch.bernoulli(torch.full(labels.shape, 0.5, device=device)).bool()
        & masked_indices
        & ~indices_replaced
    )
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long, device=device)
    inputs[indices_random] = random_words[indices_random]

    # The remaining 10% keep their original token
    return inputs, labels


def fine_tune_mlm(model, tokenizer, train_texts, val_texts, output_dir,
                  batch_size=8, epochs=3, learning_rate=2e-5, mlm_probability=0.15):
    """
    Fine-tune BETO using Masked Language Modeling

    Training and validation batches go through the same masking routine, so
    the validation loss measures the same task as the training loss. (The
    validation inputs were previously left unmasked, which let the model see
    the tokens it was asked to predict.) Validation masks are drawn at random
    on every epoch.

    Args:
        model: Masked-LM model; moved to GPU when available and updated in place.
        tokenizer: Tokenizer matching ``model``; saved alongside the weights.
        train_texts (list[str]): Texts used for optimization.
        val_texts (list[str]): Texts used for per-epoch validation; may be empty,
            in which case validation is skipped.
        output_dir (str): Directory passed to ``save_pretrained``.
        batch_size (int): Batch size of both data loaders.
        epochs (int): Number of passes over ``train_texts``.
        learning_rate (float): AdamW learning rate (linear decay, no warmup).
        mlm_probability (float): Fraction of eligible tokens selected for masking.

    Raises:
        ValueError: If ``train_texts`` is empty.
    """
    if len(train_texts) == 0:
        raise ValueError("train_texts is empty; nothing to fine-tune on")

    # Create datasets
    train_dataset = TranscriptDataset(train_texts, tokenizer)
    val_dataset = TranscriptDataset(val_texts, tokenizer)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Setup optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Training loop
    model.train()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        total_loss = 0

        for batch in tqdm(train_loader, desc="Training"):
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Prepare inputs for MLM - randomly mask tokens
            inputs, labels = _mask_tokens(input_ids, attention_mask, tokenizer, mlm_probability)

            # Forward pass
            outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Average training loss: {avg_loss:.4f}")

        # Validation (skipped when there is nothing to validate on, which
        # would otherwise divide by zero below)
        if len(val_loader) == 0:
            print("No validation samples; skipping validation.")
            continue

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                # Same masking as in training so both losses are comparable
                inputs, labels = _mask_tokens(input_ids, attention_mask, tokenizer, mlm_probability)

                outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Average validation loss: {avg_val_loss:.4f}")
        model.train()

    # Save the fine-tuned model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Fine-tuned model saved to {output_dir}")

def get_fine_tuned_embeddings(text, model, tokenizer, method="mean"):
    """
    Get an embedding for ``text`` from the encoder of a fine-tuned model.

    The MLM head is bypassed: ``model.bert`` is used when present, otherwise
    ``model.base_model``. Only the first 512 tokens of ``text`` are encoded
    (``truncation=True``, ``max_length=512``).

    Args:
        text (str): Input text.
        model: Fine-tuned model whose encoder produces the hidden states.
        tokenizer: Tokenizer matching ``model``.
        method (str): "mean" (attention-mask-weighted mean of token embeddings),
            "cls" (first-token embedding) or "pooler" (encoder pooler output).

    Returns:
        numpy array: Embedding vector (singleton dimensions squeezed out).

    Raises:
        ValueError: If ``method`` is unknown, or if "pooler" is requested but
            the encoder returns no ``pooler_output`` (presumably the case for
            masked-LM checkpoints — confirm for the model in use).
    """
    # Reject an unknown method before paying for tokenization and a forward pass.
    if method not in ("mean", "cls", "pooler"):
        raise ValueError("Method must be 'mean', 'cls', or 'pooler'")

    # Use the base model for embeddings (not the MLM head)
    base_model = model.bert if hasattr(model, 'bert') else model.base_model

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
        return_attention_mask=True
    )

    # Run on whatever device the model's parameters already live on
    device = next(model.parameters()).device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = base_model(**inputs)

    if method == "cls":
        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    elif method == "mean":
        attention_mask = inputs['attention_mask']
        token_embeddings = outputs.last_hidden_state
        # Broadcast the mask over the hidden dimension so padding contributes zero
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # The clamp avoids a division by zero for an all-padding row
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        embedding = (sum_embeddings / sum_mask).cpu().numpy()
    else:
        # method == "pooler": fail with a clear message instead of an
        # AttributeError on None when the encoder has no pooling output.
        pooled = getattr(outputs, "pooler_output", None)
        if pooled is None:
            raise ValueError(
                "'pooler' is unavailable: the encoder returned no pooler_output; "
                "use 'mean' or 'cls'"
            )
        embedding = pooled.cpu().numpy()

    return embedding.squeeze()

def main():
    """Fine-tune BETO on the transcripts with MLM, then embed them again.

    Steps: load the masked-LM checkpoint, split the transcripts into
    train/validation, fine-tune and save the model under
    ``<BASE_DIR>/fine_tuned_beto``, reload it, and write mean/[CLS] embeddings
    for every transcript to ``<BASE_DIR>/fine_tuned_embeddings.csv``.
    """
    # Seed Python's and PyTorch's RNGs so the shuffle-based split, the batch
    # order and the random masking repeat across runs (for a given directory
    # listing order).
    seed = 42
    random.seed(seed)
    torch.manual_seed(seed)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Paths are redefined locally so this cell does not depend on earlier cells
    BASE_DIR = "/content/drive/Shareddrives/NLP/transcriptions"
    TEXT_DIR = os.path.join(BASE_DIR, "all_transcriptions")
    FINE_TUNED_DIR = os.path.join(BASE_DIR, "fine_tuned_beto")

    # Load model for masked language modeling
    model_name = "dccuchile/bert-base-spanish-wwm-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    print("Original BETO model loaded successfully!")

    # Prepare data for fine-tuning
    print("Preparing data for fine-tuning...")
    train_texts, val_texts = prepare_fine_tuning_data(TEXT_DIR)

    print(f"Training samples: {len(train_texts)}")
    print(f"Validation samples: {len(val_texts)}")

    # Fine-tune the model
    print("Starting fine-tuning...")
    fine_tune_mlm(
        model=model,
        tokenizer=tokenizer,
        train_texts=train_texts,
        val_texts=val_texts,
        output_dir=FINE_TUNED_DIR,
        batch_size=8,
        epochs=3,
        learning_rate=2e-5
    )

    # Load fine-tuned model for embedding generation (reloading from disk also
    # exercises the files that were just saved)
    print("Loading fine-tuned model...")
    fine_tuned_model = AutoModelForMaskedLM.from_pretrained(FINE_TUNED_DIR)
    fine_tuned_model.to(device)
    fine_tuned_model.eval()

    # Generate embeddings with fine-tuned model
    def process_with_fine_tuned(text_dir, output_file="fine_tuned_embeddings.csv"):
        """Embed every .txt file in ``text_dir`` and save the table in BASE_DIR.

        Rows follow the order returned by ``os.listdir``; empty files are
        skipped silently and files that raise are reported and skipped.
        """
        text_files = [f for f in os.listdir(text_dir) if f.endswith('.txt')]
        embeddings_data = []

        for filename in tqdm(text_files, desc="Generating fine-tuned embeddings"):
            file_path = os.path.join(text_dir, filename)

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    transcript = f.read().strip()

                if not transcript:
                    continue

                # Get embeddings using fine-tuned model
                mean_embedding = get_fine_tuned_embeddings(transcript, fine_tuned_model, tokenizer, "mean")
                cls_embedding = get_fine_tuned_embeddings(transcript, fine_tuned_model, tokenizer, "cls")

                embeddings_data.append({
                    'filename': filename,
                    'text_length': len(transcript),
                    'mean_embedding': mean_embedding.tolist(),
                    'cls_embedding': cls_embedding.tolist(),
                    'embedding_dim': len(mean_embedding)
                })

            except Exception as e:
                # Best effort: report the failing file and carry on
                print(f"Error processing {filename}: {str(e)}")
                continue

        df = pd.DataFrame(embeddings_data)
        output_path = os.path.join(BASE_DIR, output_file)
        df.to_csv(output_path, index=False)
        print(f"Fine-tuned embeddings saved to {output_path}")
        return df

    # Generate embeddings
    print("Generating embeddings with fine-tuned model...")
    process_with_fine_tuned(TEXT_DIR)

    print("Fine-tuning and embedding generation completed!")

# Run fine-tuning and the follow-up embedding export when this cell (or script) is executed directly.
if __name__ == "__main__":
    main()

Using device: cuda
Original BETO model loaded successfully!
Preparing data for fine-tuning...
Training samples: 83
Validation samples: 21
Starting fine-tuning...
Epoch 1/3


Training: 100%|██████████| 11/11 [00:11<00:00,  1.07s/it]


Average training loss: 4.2870


Validation: 100%|██████████| 3/3 [00:01<00:00,  1.99it/s]


Average validation loss: 1.5284
Epoch 2/3


Training: 100%|██████████| 11/11 [00:11<00:00,  1.04s/it]


Average training loss: 3.8599


Validation: 100%|██████████| 3/3 [00:01<00:00,  2.00it/s]


Average validation loss: 1.5122
Epoch 3/3


Training: 100%|██████████| 11/11 [00:11<00:00,  1.05s/it]


Average training loss: 3.7113


Validation: 100%|██████████| 3/3 [00:01<00:00,  2.06it/s]


Average validation loss: 1.3707
Fine-tuned model saved to /content/drive/Shareddrives/NLP/transcriptions/fine_tuned_beto
Loading fine-tuned model...
Generating embeddings with fine-tuned model...


Generating fine-tuned embeddings: 100%|██████████| 104/104 [00:21<00:00,  4.82it/s]


Fine-tuned embeddings saved to /content/drive/Shareddrives/NLP/transcriptions/fine_tuned_embeddings.csv
Fine-tuning and embedding generation completed!


Pre-trained vs fine-tuned

In [None]:
def compare_embeddings(original_csv, fine_tuned_csv):
    """Compare original and fine-tuned mean embeddings file by file.

    Rows are paired on the ``filename`` column when both CSVs have one, so the
    comparison stays correct even if the two files list the transcripts in a
    different order or one of them is missing a transcript. Without that column
    the rows are paired by position and must be equal in number.

    Embedding cells are parsed with ``ast.literal_eval`` rather than ``eval``,
    so a tampered CSV cannot execute code.

    Args:
        original_csv (str): CSV with the pre-trained ``mean_embedding`` column.
        fine_tuned_csv (str): CSV with the fine-tuned ``mean_embedding`` column.

    Returns:
        list: Cosine similarity per paired transcript (0.0 for a zero vector).

    Raises:
        ValueError: If no rows can be paired, or if positional pairing is
            needed and the row counts differ.
    """
    import ast  # local import keeps this cell self-contained

    orig_df = pd.read_csv(original_csv)
    fine_df = pd.read_csv(fine_tuned_csv)

    if 'filename' in orig_df.columns and 'filename' in fine_df.columns:
        # Inner join keeps only transcripts present in both files, in the
        # order of the original CSV.
        paired = orig_df[['filename', 'mean_embedding']].merge(
            fine_df[['filename', 'mean_embedding']],
            on='filename',
            how='inner',
            suffixes=('_orig', '_fine'),
        )
        orig_cells = paired['mean_embedding_orig']
        fine_cells = paired['mean_embedding_fine']
    else:
        if len(orig_df) != len(fine_df):
            raise ValueError(
                "Cannot pair rows by position: the CSVs have different lengths "
                f"({len(orig_df)} vs {len(fine_df)}) and no 'filename' column"
            )
        orig_cells = orig_df['mean_embedding']
        fine_cells = fine_df['mean_embedding']

    if len(orig_cells) == 0:
        raise ValueError("No embeddings in common between the two CSV files")

    # Convert embeddings to arrays (one row per transcript)
    orig_matrix = np.array([ast.literal_eval(cell) for cell in orig_cells], dtype=float)
    fine_matrix = np.array([ast.literal_eval(cell) for cell in fine_cells], dtype=float)

    # Row-wise cosine similarity; rows with a zero norm yield 0.0
    dots = np.sum(orig_matrix * fine_matrix, axis=1)
    norms = np.linalg.norm(orig_matrix, axis=1) * np.linalg.norm(fine_matrix, axis=1)
    similarities = list(np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0))

    print(f"Average cosine similarity between original and fine-tuned: {np.mean(similarities):.4f}")
    print(f"Similarity std: {np.std(similarities):.4f}")

    return similarities

# Usage: compare the pre-trained and fine-tuned embedding CSVs written by the cells above.
similarities = compare_embeddings(
     "/content/drive/Shareddrives/NLP/transcriptions/beto_embeddings.csv",
     "/content/drive/Shareddrives/NLP/transcriptions/fine_tuned_embeddings.csv"
 )

Average cosine similarity between original and fine-tuned: 0.9126
Similarity std: 0.0099
