# 🧠 BERT 

## 📦 Dependencias
- Importa librerías: transformers, torch, sklearn, pandas, numpy.

In [61]:
from transformers import BertTokenizer, BertForSequenceClassification,TrainerCallback,Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
import warnings
import time
import psutil
import threading
from tqdm.auto import tqdm
warnings.filterwarnings('ignore')

## 📦 Dependencias
- Importa librerías: transformers.

In [2]:
from transformers import logging
from tqdm.auto import tqdm
logging.set_verbosity_info()

## 🎲 Reproducibilidad y dispositivo
- Configura **dispositivo** (CPU/GPU).

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Usando dispositivo:", device)

Usando dispositivo: cuda


## 🏷️ Preparación de etiquetas
- Mapea/convierte etiquetas y define `num_labels` si aplica.

In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=3)
model.to(device)

loading file vocab.txt from cache at C:\Users\swart\.cache\huggingface\hub\models--bert-base-multilingual-cased\snapshots\3f076fdb1ab68d5b2880cb87a0886f315b8146f8\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\swart\.cache\huggingface\hub\models--bert-base-multilingual-cased\snapshots\3f076fdb1ab68d5b2880cb87a0886f315b8146f8\tokenizer_config.json
loading file tokenizer.json from cache at C:\Users\swart\.cache\huggingface\hub\models--bert-base-multilingual-cased\snapshots\3f076fdb1ab68d5b2880cb87a0886f315b8146f8\tokenizer.json
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at C:\Users\swart\.cache\huggingface\hub\models--bert-base-multilingual-cased\snapshots\3f076fdb1ab68d5b2880cb87a0886f315b8146f8\config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropo

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

## 📥 Carga de datos
- Lee dataset: dataset_multilingue_10idiomas_balanceado.csv.

In [5]:
df = pd.read_csv('dataset_multilingue_10idiomas_balanceado.csv')

## 🧼 Limpieza de texto
- Normaliza texto (lower/regex), quita URLs/símbolos, etc.

In [6]:
def detect_language_safe(text):
    """Detecta idioma de forma segura"""
    try:
        if len(str(text).strip()) < 3:
            return 'unknown'
        lang = detect(str(text)[:100])  # Solo primeros 100 chars
        return lang
    except (LangDetectException, Exception):
        return 'unknown'

## 🛠️ Helpers / funciones auxiliares
- Funciones de apoyo (tokenización, métricas, etc.).

In [7]:
def analyze_multilingual_distribution(df, sample_size=1000):
    """Analiza distribución de idiomas en el dataset"""
    print(f"\nANÁLISIS MULTILINGÜE (muestra de {sample_size})...")
    
    # Tomar muestra para análisis de idiomas
    sample_df = df.sample(n=min(sample_size, len(df)), random_state=42)
    sample_df['language'] = sample_df['Texto'].apply(detect_language_safe)
    
    # Mostrar distribución por idioma
    lang_dist = sample_df['language'].value_counts()
    print(f"\nDISTRIBUCIÓN POR IDIOMAS:")
    
    lang_names = {
        'es': 'Español', 'en': 'Inglés', 'pt': 'Portugués', 
        'fr': 'Francés', 'it': 'Italiano', 'de': 'Alemán',
        'ca': 'Catalán', 'ro': 'Rumano', 'pl': 'Polaco',
        'nl': 'Neerlandés', 'unknown': 'Desconocido'
    }
    
    for lang, count in lang_dist.head(10).items():
        pct = (count / len(sample_df)) * 100
        lang_name = lang_names.get(lang, lang.upper())
        print(f"  {lang_name:12} ({lang}): {count:3d} ({pct:.1f}%)")
    
    # Análisis por sentimiento e idioma
    print(f"\nDISTRIBUCIÓN POR SENTIMIENTO:")
    sentiment_dist = df["Label"].value_counts().sort_index()
    label_names = ["Malo", "Neutro", "Bueno"]
    total = len(df)
    
    for label, count in sentiment_dist.items():
        pct = (count / total) * 100
        print(f"  {label_names[int(label)]:8}: {count:6,} ({pct:.1f}%)")
    
    return sample_df

- Salida de un script de diagnóstico que imprime: Una muestra para la distribución por idiomas, sdemás, los rótulos “Malo/Neutro/Bueno” son solo alias de la columna sentimiento (probablemente negativo/neutro/positivo).

In [8]:
sample_with_langs = analyze_multilingual_distribution(df)


ANÁLISIS MULTILINGÜE (muestra de 1000)...

DISTRIBUCIÓN POR IDIOMAS:
  Inglés       (en): 684 (68.4%)
  Español      (es): 208 (20.8%)
  Francés      (fr):  45 (4.5%)
  Italiano     (it):  13 (1.3%)
  Catalán      (ca):  11 (1.1%)
  Alemán       (de):   8 (0.8%)
  Portugués    (pt):   8 (0.8%)
  Polaco       (pl):   7 (0.7%)
  Neerlandés   (nl):   5 (0.5%)
  Rumano       (ro):   3 (0.3%)

DISTRIBUCIÓN POR SENTIMIENTO:
  Malo    : 101,310 (33.3%)
  Neutro  : 101,308 (33.3%)
  Bueno   : 101,307 (33.3%)


## 🧼 Limpieza de texto
- Normaliza texto (lower/regex), quita URLs/símbolos, etc.

In [9]:
def preprocess_multilingual_text(text):
    
    if pd.isna(text) or text is None:
        return ""
    
    text = str(text).strip()
    
    text = text.replace('\n', ' ').replace('\r', ' ')
    text = ' '.join(text.split())  # Normalizar espacios
    
    return text

In [10]:
df['Texto'] = df['Texto'].apply(preprocess_multilingual_text)

## 🧼 Limpieza de texto
- Normaliza texto (lower/regex), quita URLs/símbolos, etc.

In [11]:
df = df[df['Texto'].str.len() > 0]
print(f"Muestras después del preprocessamiento: {len(df):,}")

Muestras después del preprocessamiento: 303,925


## 🧭 División train/valid/test
- Separa datos para entrenamiento y validación/prueba.

In [12]:
def create_multilingual_splits(df, test_size=0.2):
    """
    Crea splits considerando tanto sentimiento como diversidad lingüística
    """
    print(f"\nCREANDO SPLITS ESTRATIFICADOS MULTILINGÜES...")
    
    # Detectar idioma en una muestra más grande para splits
    df_lang_sample = df.sample(n=min(2000, len(df)), random_state=42)
    df_lang_sample['language'] = df_lang_sample['Texto'].apply(detect_language_safe)
    
    # Crear estratificación combinada (sentimiento + idioma principal)
    # Identificar idiomas principales (>5% del dataset)
    lang_dist = df_lang_sample['language'].value_counts()
    main_languages = lang_dist[lang_dist >= len(df_lang_sample) * 0.05].index.tolist()
    
    print(f"Idiomas principales detectados: {main_languages}")
    
    # Para el dataset completo, usar solo estratificación por sentimiento
    # (más estable que intentar estratificar por idioma en datasets grandes)
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df["Texto"],
        df["Label"],
        test_size=test_size,
        stratify=df["Label"],
        random_state=42
    )
    
    return train_texts, val_texts, train_labels, val_labels

In [13]:
train_texts, val_texts, train_labels, val_labels = create_multilingual_splits(df)


CREANDO SPLITS ESTRATIFICADOS MULTILINGÜES...
Idiomas principales detectados: ['en', 'es']


- Imprime cuántas muestras hay en cada conjunto:

In [14]:
print(f"  Entrenamiento: {len(train_texts):,} muestras")
print(f"  Validación: {len(val_texts):,} muestras")

  Entrenamiento: 243,140 muestras
  Validación: 60,785 muestras


## Balance de clases:
- Hace tres cosas sencillas con tus etiquetas (pandas):
- Cuenta cuántas veces aparece cada clase
- val_dist = val_labels.value_counts().sort_index(): Lo mismo pero para el conjunto de validación.
- label_names = ["Malo", "Neutro", "Bueno"], Una lista de nombres bonitos para mostrar en gráficos/tablas.

In [15]:
train_dist = train_labels.value_counts().sort_index()
val_dist = val_labels.value_counts().sort_index()
label_names = ["Malo", "Neutro", "Bueno"]

## Balance de clases para train y validación
- Muestra clases para train y validación, con conteos y porcentajes, formateada en columnas.

In [16]:
print(f"{'Clase':<8} {'Train':<8} {'Val':<8} {'Train%':<8} {'Val%':<8}")
print("-" * 45)
for i, (train_count, val_count) in enumerate(zip(train_dist, val_dist)):
    train_pct = (train_count / len(train_texts)) * 100
    val_pct = (val_count / len(val_texts)) * 100
    print(f"{label_names[i]:<8} {train_count:<8,} {val_count:<8,} {train_pct:<7.1f}% {val_pct:<7.1f}%")

Clase    Train    Val      Train%   Val%    
---------------------------------------------
Malo     81,048   20,262   33.3   % 33.3   %
Neutro   81,046   20,262   33.3   % 33.3   %
Bueno    81,046   20,261   33.3   % 33.3   %


## 🔤 Tokenizador BERT
- Configura el tokenizador (padding/truncation/max_len).

In [17]:
train_encodings = tokenizer(
    list(train_texts), 
    truncation=True, 
    padding=True, 
    max_length=128,  # Suficiente para la mayoría de reviews
    return_tensors=None  # Evitar tensores grandes en memoria
)

## 🔤 Tokenizador BERT
- Configura el tokenizador (padding/truncation/max_len).

In [18]:
val_encodings = tokenizer(
    list(val_texts), 
    truncation=True, 
    padding=True, 
    max_length=128,
    return_tensors=None
)

- Calcula pesos de clase para lidiar con desbalance y los prepara para usarlos en PyTorch.

In [19]:
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

- Calcular pesos por clase (sklearn)

In [20]:
for i, weight in enumerate(class_weights):
    label_name = label_names[i]
    print(f"  {label_name}: {weight:.3f}")

  Malo: 1.000
  Neutro: 1.000
  Bueno: 1.000


- Resume los pesos de clase
- Calcula el promedio
- Calcula la varianza
- Prints con formato a 3 decimales

In [21]:
avg_weight = np.mean(class_weights)
weight_variance = np.var(class_weights)
print(f"  Peso promedio: {avg_weight:.3f}")
print(f"  Varianza de pesos: {weight_variance:.3f}")

  Peso promedio: 1.000
  Varianza de pesos: 0.000


In [22]:
if weight_variance < 0.1:
    print("Dataset bien balanceado (varianza < 0.1)")
    USE_CLASS_WEIGHTS = False  # No necesario usar pesos
else:
    print("Dataset con desbalance residual")
    USE_CLASS_WEIGHTS = True

Dataset bien balanceado (varianza < 0.1)


## 🛠️ Helpers / funciones auxiliares
- Funciones de apoyo (tokenización, métricas, etc.).

In [23]:
class MultilingualReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(int(self.labels.iloc[idx]))
        return item

train_dataset = MultilingualReviewDataset(train_encodings, train_labels)
val_dataset = MultilingualReviewDataset(val_encodings, val_labels)

## 🏷️ Preparación de etiquetas
- Mapea/convierte etiquetas y define `num_labels` si aplica.

In [24]:
class MultilingualWeightedTrainer(Trainer):
    def __init__(self, class_weights=None, use_weights=True, **kwargs):
        super().__init__(**kwargs)
        self.class_weights = class_weights
        self.use_weights = use_weights
    
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # Usar pesos solo si es necesario
        if self.use_weights and self.class_weights is not None:
            loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)
        else:
            loss_fct = torch.nn.CrossEntropyLoss()
        
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

## 📊 Evaluación
- Calcula métricas (accuracy/F1/precision/recall) y reportes/confusión.

In [28]:
def compute_multilingual_metrics(eval_pred):
    """Métricas optimizadas para evaluación multilingüe"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    # Calcular métricas detalladas
    report = classification_report(
        labels, predictions, 
        target_names=["Malo", "Neutro", "Bueno"], 
        output_dict=True,
        zero_division=0
    )
    
    return {
        'accuracy': report['accuracy'],
        'macro_f1': report['macro avg']['f1-score'],
        'weighted_f1': report['weighted avg']['f1-score'],
        'malo_f1': report['Malo']['f1-score'],
        'neutro_f1': report['Neutro']['f1-score'],
        'bueno_f1': report['Bueno']['f1-score'],
        'malo_precision': report['Malo']['precision'],
        'malo_recall': report['Malo']['recall'],
        'neutro_precision': report['Neutro']['precision'],
        'neutro_recall': report['Neutro']['recall'],
        'bueno_precision': report['Bueno']['precision'],
        'bueno_recall': report['Bueno']['recall']
    }

## 🎲 Reproducibilidad y dispositivo
- Configura **dispositivo** (CPU/GPU).

In [55]:
class ResourceMonitor:
    def __init__(self):
        self.monitoring = False
        self.thread = None
        
    def start_monitoring(self):
        self.monitoring = True
        self.thread = threading.Thread(target=self._monitor_resources, daemon=True)
        self.thread.start()
        
    def stop_monitoring(self):
        self.monitoring = False
        
    def _monitor_resources(self):
        while self.monitoring:
            cpu_percent = psutil.cpu_percent(interval=1)
            memory = psutil.virtual_memory()
            memory_percent = memory.percent
            
            gpu_info = ""
            if torch.cuda.is_available():
                try:
                    gpu_mem_used = torch.cuda.memory_allocated() / 1e9
                    gpu_mem_total = torch.cuda.get_device_properties(0).total_memory / 1e9
                    gpu_mem_percent = (gpu_mem_used / gpu_mem_total) * 100
                    gpu_info = f" | GPU: {gpu_mem_percent:.1f}% ({gpu_mem_used:.1f}/{gpu_mem_total:.1f}GB)"
                except:
                    gpu_info = " | GPU: Monitoring"
            
            print(f"\r[RECURSOS] CPU: {cpu_percent:.1f}% | RAM: {memory_percent:.1f}%{gpu_info}", end="", flush=True)
            time.sleep(5)

## 📊 Evaluación
- Calcula métricas (accuracy/F1/precision/recall) y reportes/confusión.

In [63]:
class DetailedProgressCallback(TrainerCallback):
    def __init__(self):
        self.start_time = None
        self.epoch_start_time = None
        self.last_log_time = None
        
    def on_train_begin(self, args, state, control, **kwargs):
        self.start_time = time.time()
        print(f"\nENTRENAMIENTO INICIADO")
        print(f"Total pasos: {state.max_steps:,}")
        print(f"Épocas: {args.num_train_epochs}")
        print(f"Batch size efectivo: {args.per_device_train_batch_size * args.gradient_accumulation_steps}")
        print(f"Usando FP16: {args.fp16}")
        print(f"Métrica objetivo: {args.metric_for_best_model}")
        print("=" * 70)
        
    def on_epoch_begin(self, args, state, control, **kwargs):
        self.epoch_start_time = time.time()
        current_epoch = int(state.epoch) + 1
        print(f"\n📈 ÉPOCA {current_epoch}/{args.num_train_epochs} - Paso {state.global_step:,}/{state.max_steps:,}")
        print("-" * 50)
        
    def on_step_end(self, args, state, control, logs=None, **kwargs):
        # Mostrar progreso cada 100 pasos
        if state.global_step % 100 == 0 or state.global_step <= 50:
            current_time = time.time()
            
            # Calcular ETA
            if self.start_time:
                elapsed = current_time - self.start_time
                if state.global_step > 0:
                    avg_time_per_step = elapsed / state.global_step
                    remaining_steps = state.max_steps - state.global_step
                    eta_seconds = remaining_steps * avg_time_per_step
                    eta_formatted = time.strftime('%H:%M:%S', time.gmtime(eta_seconds))
                else:
                    eta_formatted = "Calculando..."
            else:
                eta_formatted = "Calculando..."
            
            # Progreso global
            progress_percent = (state.global_step / state.max_steps) * 100
            progress_bar = "█" * int(progress_percent // 2.5) + "░" * (40 - int(progress_percent // 2.5))
            
            print(f"\n[{progress_bar}] {progress_percent:.1f}% - Paso {state.global_step:,}/{state.max_steps:,} | ETA: {eta_formatted}")
    
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and 'loss' in logs:
            current_time = time.time()
            
            # Evitar spam de logs
            if self.last_log_time is None or current_time - self.last_log_time > 30:  # Cada 30 segundos max
                print(f"Paso {state.global_step:,}: Loss = {logs['loss']:.4f}")
                
                # Mostrar learning rate si está disponible
                if 'learning_rate' in logs:
                    print(f"Learning Rate: {logs['learning_rate']:.2e}")
                
                # Mostrar velocidad de entrenamiento
                if self.start_time and state.global_step > 0:
                    elapsed = current_time - self.start_time
                    steps_per_second = state.global_step / elapsed
                    print(f"Velocidad: {steps_per_second:.2f} pasos/seg")
                
                self.last_log_time = current_time
                    
    def on_evaluate(self, args, state, control, logs=None, **kwargs):
        if logs:
            print(f"\n🔍 EVALUACIÓN - Época {int(state.epoch)}")
            print("-" * 30)
            for key, value in logs.items():
                if isinstance(value, (int, float)) and not key.startswith('eval_runtime'):
                    print(f"   {key}: {value:.4f}")
            print("-" * 30)
    
    def on_epoch_end(self, args, state, control, **kwargs):
        if self.epoch_start_time:
            epoch_time = time.time() - self.epoch_start_time
            current_epoch = int(state.epoch)
            print(f"\nÉpoca {current_epoch} completada en {epoch_time/60:.1f} minutos")
            
    def on_train_end(self, args, state, control, **kwargs):
        if self.start_time:
            total_time = time.time() - self.start_time
            print(f"\nENTRENAMIENTO COMPLETADO!")
            print(f"Tiempo total: {total_time/3600:.2f} horas ({total_time/60:.1f} minutos)")
            print(f"Pasos completados: {state.global_step:,}")
            print("=" * 70)

## 🚂 Entrenamiento
- Épocas: 3.
- Batch size: 16.
- Configura y ejecuta el **entrenamiento**.

In [75]:
training_args = TrainingArguments(
    output_dir="./results_multilingual_optimized",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=18,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,  
    gradient_checkpointing=True,
    dataloader_num_workers=0,  
    dataloader_pin_memory=False,
    warmup_steps=500,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    save_total_limit=2,
    report_to="none",
    disable_tqdm=False,
    logging_strategy="steps",
    seed=42,
    data_seed=42,
)

PyTorch: setting up devices
average_tokens_across_devices is True but world size is 1. Setting it to False automatically.


## 🚂 Entrenamiento
- Configura y ejecuta el **entrenamiento**.

In [77]:
trainer = MultilingualWeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    class_weights=class_weights_tensor if USE_CLASS_WEIGHTS else None,
    use_weights=USE_CLASS_WEIGHTS,
    compute_metrics=compute_multilingual_metrics,
    callbacks=[DetailedProgressCallback()]  
)

Using auto half precision backend


In [79]:
trainer.train()

***** Running training *****
  Num examples = 243,140
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 45,591
  Number of trainable parameters = 177,855,747



ENTRENAMIENTO INICIADO
Total pasos: 45,591
Épocas: 3
Batch size efectivo: 16
Usando FP16: True
Métrica objetivo: macro_f1

📈 ÉPOCA 1/3 - Paso 0/45,591
--------------------------------------------------

[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.0% - Paso 1/45,591 | ETA: 00:12:04


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Weighted F1,Malo F1,Neutro F1,Bueno F1,Malo Precision,Malo Recall,Neutro Precision,Neutro Recall,Bueno Precision,Bueno Recall
1,0.5147,0.528436,0.740479,0.706063,0.706066,0.902672,0.739781,0.475735,0.826899,0.993732,0.623983,0.908351,0.932411,0.319333
2,0.4963,0.508545,0.745315,0.708174,0.708178,0.911697,0.746892,0.465934,0.840058,0.996693,0.62287,0.932583,0.969569,0.306648
3,0.4877,0.507309,0.750679,0.714934,0.714937,0.917414,0.751833,0.475554,0.849208,0.997532,0.627057,0.938604,0.961683,0.315878



[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.0% - Paso 2/45,591 | ETA: 11:47:29

[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.0% - Paso 3/45,591 | ETA: 10:35:42

[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.0% - Paso 4/45,591 | ETA: 10:25:35

[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.0% - Paso 5/45,591 | ETA: 08:16:56

[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.0% - Paso 6/45,591 | ETA: 08:16:12

[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.0% - Paso 7/45,591 | ETA: 08:17:50

[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.0% - Paso 8/45,591 | ETA: 08:22:51

[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.0% - Paso 9/45,591 | ETA: 07:23:14

[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.0% - Paso 10/45,591 | ETA: 07:39:41

[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.0% - Paso 11/45,591 | ETA: 07:46:30

[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.0% - Paso 12/45,591 | ETA: 07:49:36

[░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 0.0% - Paso 13/45,591 | ETA: 07:51:07

[░░░░░░░░░░░░░░░░░░░░░░


***** Running Evaluation *****
  Num examples = 60785
  Batch size = 18



Época 1 completada en 644.9 minutos


Saving model checkpoint to ./results_multilingual_optimized\checkpoint-15197
Configuration saved in ./results_multilingual_optimized\checkpoint-15197\config.json
Model weights saved in ./results_multilingual_optimized\checkpoint-15197\model.safetensors



📈 ÉPOCA 2/3 - Paso 15,197/45,591
--------------------------------------------------

[█████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░] 33.3% - Paso 15,200/45,591 | ETA: 22:31:44
Paso 15,200: Loss = 0.4960
Learning Rate: 1.35e-05
Velocidad: 0.37 pasos/seg
Paso 15,250: Loss = 0.4846
Learning Rate: 1.35e-05
Velocidad: 0.37 pasos/seg

[█████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░] 33.6% - Paso 15,300/45,591 | ETA: 22:26:53
Paso 15,300: Loss = 0.4881
Learning Rate: 1.34e-05
Velocidad: 0.37 pasos/seg
Paso 15,350: Loss = 0.5204
Learning Rate: 1.34e-05
Velocidad: 0.37 pasos/seg

[█████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░] 33.8% - Paso 15,400/45,591 | ETA: 22:22:05
Paso 15,400: Loss = 0.4995
Learning Rate: 1.34e-05
Velocidad: 0.37 pasos/seg
Paso 15,450: Loss = 0.5168
Learning Rate: 1.34e-05
Velocidad: 0.37 pasos/seg

[█████████████░░░░░░░░░░░░░░░░░░░░░░░░░░░] 34.0% - Paso 15,500/45,591 | ETA: 22:17:14
Paso 15,500: Loss = 0.4843
Learning Rate: 1.34e-05
Velocidad: 0.38 pasos/seg
Paso 15,550: Loss = 0.5171
L


***** Running Evaluation *****
  Num examples = 60785
  Batch size = 18



Época 2 completada en 646.5 minutos


Saving model checkpoint to ./results_multilingual_optimized\checkpoint-30394
Configuration saved in ./results_multilingual_optimized\checkpoint-30394\config.json
Model weights saved in ./results_multilingual_optimized\checkpoint-30394\model.safetensors



📈 ÉPOCA 3/3 - Paso 30,394/45,591
--------------------------------------------------

[██████████████████████████░░░░░░░░░░░░░░] 66.7% - Paso 30,400/45,591 | ETA: 11:16:29
Paso 30,400: Loss = 0.5096
Learning Rate: 6.74e-06
Velocidad: 0.37 pasos/seg
Paso 30,450: Loss = 0.4504
Learning Rate: 6.72e-06
Velocidad: 0.37 pasos/seg

[██████████████████████████░░░░░░░░░░░░░░] 66.9% - Paso 30,500/45,591 | ETA: 11:11:56
Paso 30,500: Loss = 0.4698
Learning Rate: 6.70e-06
Velocidad: 0.37 pasos/seg
Paso 30,550: Loss = 0.4730
Learning Rate: 6.68e-06
Velocidad: 0.37 pasos/seg

[██████████████████████████░░░░░░░░░░░░░░] 67.1% - Paso 30,600/45,591 | ETA: 11:07:23
Paso 30,600: Loss = 0.4622
Learning Rate: 6.66e-06
Velocidad: 0.37 pasos/seg
Paso 30,650: Loss = 0.5179
Learning Rate: 6.63e-06
Velocidad: 0.37 pasos/seg

[██████████████████████████░░░░░░░░░░░░░░] 67.3% - Paso 30,700/45,591 | ETA: 11:02:50
Paso 30,700: Loss = 0.4342
Learning Rate: 6.61e-06
Velocidad: 0.37 pasos/seg
Paso 30,750: Loss = 0.4829
L


***** Running Evaluation *****
  Num examples = 60785
  Batch size = 18



Época 3 completada en 650.4 minutos


Saving model checkpoint to ./results_multilingual_optimized\checkpoint-45591
Configuration saved in ./results_multilingual_optimized\checkpoint-45591\config.json
Model weights saved in ./results_multilingual_optimized\checkpoint-45591\model.safetensors
Deleting older checkpoint [results_multilingual_optimized\checkpoint-15197] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results_multilingual_optimized\checkpoint-45591 (score: 0.7149335357427283).



ENTRENAMIENTO COMPLETADO!
Tiempo total: 33.94 horas (2036.2 minutos)
Pasos completados: 45,591


TrainOutput(global_step=45591, training_loss=0.5019875658818491, metrics={'train_runtime': 122171.1049, 'train_samples_per_second': 5.97, 'train_steps_per_second': 0.373, 'total_flos': 4.798004728978944e+16, 'train_loss': 0.5019875658818491, 'epoch': 3.0})

## 🧼 Limpieza de texto
- Normaliza texto (lower/regex), quita URLs/símbolos, etc.

In [81]:
metrics = trainer.evaluate()
for key, value in metrics.items():
    if key.startswith('eval_'):
        clean_key = key.replace('eval_', '')
        print(f"  {clean_key}: {value:.4f}")


***** Running Evaluation *****
  Num examples = 60785
  Batch size = 18


  loss: 0.5073
  accuracy: 0.7507
  macro_f1: 0.7149
  weighted_f1: 0.7149
  malo_f1: 0.9174
  neutro_f1: 0.7518
  bueno_f1: 0.4756
  malo_precision: 0.8492
  malo_recall: 0.9975
  neutro_precision: 0.6271
  neutro_recall: 0.9386
  bueno_precision: 0.9617
  bueno_recall: 0.3159
  runtime: 1996.1024
  samples_per_second: 30.4520
  steps_per_second: 1.6920


## 🧩 Construcción del dataset tokenizado
- Aplica el tokenizador y construye Dataset/DataLoader.

In [82]:
model_save_path = "./modelo_sentimientos_multilingual_balanced"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

Saving model checkpoint to ./modelo_sentimientos_multilingual_balanced
Configuration saved in ./modelo_sentimientos_multilingual_balanced\config.json
Model weights saved in ./modelo_sentimientos_multilingual_balanced\model.safetensors
tokenizer config file saved in ./modelo_sentimientos_multilingual_balanced\tokenizer_config.json
Special tokens file saved in ./modelo_sentimientos_multilingual_balanced\special_tokens_map.json


('./modelo_sentimientos_multilingual_balanced\\tokenizer_config.json',
 './modelo_sentimientos_multilingual_balanced\\special_tokens_map.json',
 './modelo_sentimientos_multilingual_balanced\\vocab.txt',
 './modelo_sentimientos_multilingual_balanced\\added_tokens.json')

## 📦 Dependencias
- Importa librerías: torch.

In [162]:
import gc
torch.cuda.empty_cache()
gc.collect()

4520

## 🎲 Reproducibilidad y dispositivo
- Configura **dispositivo** (CPU/GPU).

In [84]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = False

## 🔤 Tokenizador BERT
- Configura el tokenizador (padding/truncation/max_len).

In [86]:
try:
    tokenizer = BertTokenizer.from_pretrained(model_save_path)
    model = BertForSequenceClassification.from_pretrained(model_save_path)
    model.to(device)
    model.eval()
except Exception as e:
    model_save_path = "./results_multilingual_balanced"

loading file vocab.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json
loading file chat_template.jinja
loading configuration file ./modelo_sentimientos_multilingual_balanced\config.json
Model config BertConfig {
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "dtype": "float32",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_nu

## 🎲 Reproducibilidad y dispositivo
- Configura **dispositivo** (CPU/GPU).

In [87]:
def evaluate_in_batches_multilingual(texts, labels, model, tokenizer, batch_size=16):
    all_predictions = []
    all_probabilities = []
    
    model.eval()
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = list(texts.iloc[i:i+batch_size])
            
            # Tokenizar con configuración multilingüe
            inputs = tokenizer(
                batch_texts, 
                padding=True, 
                truncation=True, 
                max_length=128, 
                return_tensors="pt"
            ).to(device)
            
            # Predicción
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=1)
            probabilities = torch.softmax(outputs.logits, dim=1)
            
            all_predictions.extend(predictions.cpu().numpy())
            all_probabilities.extend(probabilities.cpu().numpy())
            
            # Limpieza de memoria
            del inputs, outputs, predictions, probabilities
            torch.cuda.empty_cache()
            
            if (i // batch_size + 1) % 20 == 0:
                print(f"    Procesados {i + batch_size}/{len(texts)} muestras")
    
    return np.array(all_predictions), np.array(all_probabilities)


## 🧩 Construcción del dataset tokenizado
- Aplica el tokenizador y construye Dataset/DataLoader.

In [88]:
val_predictions, val_probabilities = evaluate_in_batches_multilingual(
    val_texts, val_labels, model, tokenizer, batch_size=32
)

    Procesados 640/60785 muestras
    Procesados 1280/60785 muestras
    Procesados 1920/60785 muestras
    Procesados 2560/60785 muestras
    Procesados 3200/60785 muestras
    Procesados 3840/60785 muestras
    Procesados 4480/60785 muestras
    Procesados 5120/60785 muestras
    Procesados 5760/60785 muestras
    Procesados 6400/60785 muestras
    Procesados 7040/60785 muestras
    Procesados 7680/60785 muestras
    Procesados 8320/60785 muestras
    Procesados 8960/60785 muestras
    Procesados 9600/60785 muestras
    Procesados 10240/60785 muestras
    Procesados 10880/60785 muestras
    Procesados 11520/60785 muestras
    Procesados 12160/60785 muestras
    Procesados 12800/60785 muestras
    Procesados 13440/60785 muestras
    Procesados 14080/60785 muestras
    Procesados 14720/60785 muestras
    Procesados 15360/60785 muestras
    Procesados 16000/60785 muestras
    Procesados 16640/60785 muestras
    Procesados 17280/60785 muestras
    Procesados 17920/60785 muestras
    Proc

## 🧪 Bloque de código
- Ejecución auxiliar del flujo.

In [89]:
accuracy = np.mean(val_predictions == val_labels)
print(f"\nACCURACY FINAL: {accuracy:.4f}")


ACCURACY FINAL: 0.7507


## 📊 Evaluación
- Calcula métricas (accuracy/F1/precision/recall) y reportes/confusión.

In [90]:
print(f"\nMATRIZ DE CONFUSIÓN:")
cm = confusion_matrix(val_labels, val_predictions)
print("         Predicho:")
print("         Malo  Neutro Bueno")
for i, row in enumerate(cm):
    label_name = label_names[i]
    print(f"{label_name:7} {row[0]:4d}   {row[1]:4d}  {row[2]:4d}")


MATRIZ DE CONFUSIÓN:
         Predicho:
         Malo  Neutro Bueno
Malo    20212     42     8
Neutro   997   19018   247
Bueno   2592   11268  6401


## 📊 Evaluación
- Calcula métricas (accuracy/F1/precision/recall) y reportes/confusión.

In [91]:
report = classification_report(val_labels, val_predictions, target_names=label_names, output_dict=True)
print(f"\nREPORTE DETALLADO:")
print(classification_report(val_labels, val_predictions, target_names=label_names))


REPORTE DETALLADO:
              precision    recall  f1-score   support

        Malo       0.85      1.00      0.92     20262
      Neutro       0.63      0.94      0.75     20262
       Bueno       0.96      0.32      0.48     20261

    accuracy                           0.75     60785
   macro avg       0.81      0.75      0.71     60785
weighted avg       0.81      0.75      0.71     60785



## 🧪 Bloque de código
- Ejecución auxiliar del flujo.

In [93]:
print(f"\nANÁLISIS DE SESGO:")
bias_metrics = {}
for i, label_name in enumerate(label_names):
    precision = report[label_name]['precision']
    recall = report[label_name]['recall']
    f1 = report[label_name]['f1-score']
    bias = abs(precision - recall)
    
    bias_metrics[label_name] = {
        'precision': precision,
        'recall': recall, 
        'f1': f1,
        'bias': bias
    }
    
    print(f"  {label_name}: F1={f1:.3f}, P={precision:.3f}, R={recall:.3f}, Bias={bias:.3f}")

avg_bias = np.mean([m['bias'] for m in bias_metrics.values()])
print(f"\nBias promedio: {avg_bias:.4f}")

if avg_bias < 0.05:
    print("Excelente balance (bias < 0.05)")
elif avg_bias < 0.1:
    print("Buen balance (bias < 0.1)")
elif avg_bias < 0.15:
    print("Balance moderado (0.1 ≤ bias < 0.15)")
else:
    print("Sesgo significativo (bias ≥ 0.15)")


ANÁLISIS DE SESGO:
  Malo: F1=0.917, P=0.849, R=0.998, Bias=0.148
  Neutro: F1=0.752, P=0.627, R=0.939, Bias=0.312
  Bueno: F1=0.476, P=0.962, R=0.316, Bias=0.646

Bias promedio: 0.3685
Sesgo significativo (bias ≥ 0.15)


## 🧪 Bloque de código
- Ejecución auxiliar del flujo.

In [95]:
multilingual_examples = [
    # Español
    "El servicio fue excelente y el personal muy amable",
    "No me gustó para nada el producto, fue terrible", 
    "El producto está bien, nada especial",
    
    # Inglés
    "The service was excellent and the staff very friendly",
    "I didn't like the product at all, it was terrible",
    "The product is fine, nothing special",
    
    # Francés
    "Le service était excellent et le personnel très sympathique",
    "Je n'ai pas du tout aimé le produit, c'était terrible",
    "Le produit est bien, rien d'exceptionnel",
    
    # Italiano
    "Il servizio era eccellente e il personale molto gentile",
    "Non mi è piaciuto per niente il prodotto, è stato terribile",
    "Il prodotto va bene, niente di speciale"
]

## 🔤 Tokenizador BERT
- Configura el tokenizador (padding/truncation/max_len).

In [97]:
test_inputs = tokenizer(
    multilingual_examples, 
    padding=True, 
    truncation=True, 
    max_length=128, 
    return_tensors="pt"
).to(device)

## 🔮 Predicción / inferencia
- Aplica el modelo a nuevos textos y obtiene probabilidades/labels.

In [99]:
with torch.no_grad():
    test_outputs = model(**test_inputs)
    test_predictions = torch.argmax(test_outputs.logits, dim=1)
    test_probabilities = torch.softmax(test_outputs.logits, dim=1)

## 🧪 Bloque de código
- Ejecución auxiliar del flujo.

In [101]:
example_results = []
for text in tqdm(multilingual_examples, desc="Detectando idiomas", unit="texto"):
    try:
        lang = detect(text)
        lang_name = {
            'es': 'ESP', 'en': 'ENG', 'fr': 'FRA', 'it': 'ITA', 
            'de': 'DEU', 'pt': 'POR', 'ca': 'CAT', 'ro': 'ROM',
            'pl': 'POL', 'nl': 'NLD'
        }.get(lang, lang.upper())
    except:
        lang_name = 'UNK'
    example_results.append(lang_name)

for i, (text, pred, probs, lang_name) in enumerate(zip(multilingual_examples, test_predictions, test_probabilities, example_results)):
    pred_label = label_names[pred.item()]
    confidence = probs[pred].item() * 100
    
    print(f"\n{i+1:2d}. [{lang_name}] '{text[:60]}{'...' if len(text) > 60 else ''}'")
    print(f"     → {pred_label} ({confidence:.1f}%)")
    print(f"     M:{probs[0]:.2f} | N:{probs[1]:.2f} | B:{probs[2]:.2f}")

Detectando idiomas:   0%|          | 0/12 [00:00<?, ?texto/s]


 1. [ESP] 'El servicio fue excelente y el personal muy amable'
     → Bueno (99.8%)
     M:0.00 | N:0.00 | B:1.00

 2. [ESP] 'No me gustó para nada el producto, fue terrible'
     → Malo (99.9%)
     M:1.00 | N:0.00 | B:0.00

 3. [ESP] 'El producto está bien, nada especial'
     → Neutro (71.1%)
     M:0.27 | N:0.71 | B:0.02

 4. [ENG] 'The service was excellent and the staff very friendly'
     → Bueno (99.9%)
     M:0.00 | N:0.00 | B:1.00

 5. [ENG] 'I didn't like the product at all, it was terrible'
     → Malo (99.9%)
     M:1.00 | N:0.00 | B:0.00

 6. [ENG] 'The product is fine, nothing special'
     → Neutro (97.1%)
     M:0.00 | N:0.97 | B:0.03

 7. [FRA] 'Le service était excellent et le personnel très sympathique'
     → Bueno (99.8%)
     M:0.00 | N:0.00 | B:1.00

 8. [FRA] 'Je n'ai pas du tout aimé le produit, c'était terrible'
     → Malo (99.9%)
     M:1.00 | N:0.00 | B:0.00

 9. [FRA] 'Le produit est bien, rien d'exceptionnel'
     → Neutro (96.5%)
     M:0.00 | N:0.96 |

## 🧪 Bloque de código
- Ejecución auxiliar del flujo.

In [103]:
print(f"\nMEJORES MÉTRICAS POR CLASE:")
for label_name in label_names:
    f1 = bias_metrics[label_name]['f1']
    print(f"  {label_name}: F1 = {f1:.4f}")


MEJORES MÉTRICAS POR CLASE:
  Malo: F1 = 0.9174
  Neutro: F1 = 0.7518
  Bueno: F1 = 0.4756


## 🧪 Bloque de código
- Ejecución auxiliar del flujo.

In [107]:
df

Unnamed: 0,Texto,Rating,Ciudad,Categoria,Fecha,Fuente,Sentimiento,Label
0,I do not have any good memories of the store b...,Malo,,,,,,0.0
1,local experiences in tenerife the best part of...,Neutro,Tenerife,Ocio,2024-11-02,TUI,,1.0
2,tickets and events in seville too crowded but ...,Neutro,Seville,Ocio,2022-09-08,TUI,,1.0
3,"L’expérience était très sympa , mais l’hélicop...",Neutro,Barcelona,Desconocido,2024-05-12,Booking,,1.0
4,local experiences in malaga spectacular scener...,Neutro,Malaga,Desconocido,2023-11-29,TUI,,1.0
...,...,...,...,...,...,...,...,...
303920,Esta entrada no tiene comentarios,Malo,,,,,,0.0
303921,tickets and events in palma de mallorca an unf...,Neutro,Palma de Mallorca,Desconocido,2023-09-06,TUI,,1.0
303922,We somehow missed the importance of having our...,Malo,,,,,,0.0
303923,Esta entrada no tiene comentarios,Malo,,,,,,0.0


## 📥 Carga de datos
- Lee dataset: dataset_maestro.csv.

In [109]:
df_maestro = pd.read_csv('dataset_maestro.csv')

In [117]:
df_maestro.head()

Unnamed: 0,Texto,Rating,Ciudad,Categoria,Fecha,Fuente,Sentimiento
0,"É stato veloce, arrivati e passati subito. Con...",Bueno,Barcelona,Tour,2025-07-18,Booking,
1,Fantastisk oplevelse og det gik enormt hurtigt...,Bueno,Barcelona,Tour,2025-07-17,Booking,
2,"La Sagrada Familia is a ""must"" in terms of par...",Bueno,Barcelona,Tour,2025-07-17,Booking,
3,Nous sommes arrivés en retard et le monsieur à...,Bueno,Barcelona,Playa,2025-07-17,Booking,
4,"j’ai choisi d’y aller vers 15h un mardi, il y ...",Bueno,Barcelona,Playa,2025-07-17,Booking,


In [123]:
texto = df_maestro['Texto'].fillna('').astype(str).tolist()

## 📦 Dependencias
- Importa librerías: torch.

In [141]:
import torch.nn.functional as F

## 🎲 Reproducibilidad y dispositivo
- Configura **dispositivo** (CPU/GPU).

In [153]:
def predecir_con_bert_gpu(df, columna_texto, ruta_modelo, batch_size=8):
    # Limpiar caché inicial
    gc.collect()
    torch.cuda.empty_cache()
    
    # Configurar dispositivo (GPU si está disponible)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Usando dispositivo: {device}")
    
    if device.type == 'cpu':
        print("GPU no disponible, usando CPU (será más lento)")
    else:
        print(f"GPU disponible: {torch.cuda.get_device_name(0)}")
        print(f"Memoria GPU libre: {torch.cuda.get_device_properties(0).total_memory // 1024**3} GB")
    
    # Cargar modelo y tokenizador
    print("Cargando modelo...")
    modelo = BertForSequenceClassification.from_pretrained(ruta_modelo)
    tokenizador = BertTokenizer.from_pretrained(ruta_modelo)
    
    # Mover modelo a GPU
    modelo = modelo.to(device)
    modelo.eval()
    
    # Preparar textos
    textos = df[columna_texto].fillna('').astype(str).tolist()
    total_textos = len(textos)
    
    predicciones = []
    confianzas = []
    
    print(f"Procesando {total_textos} textos en batches de {batch_size}...")
    print("Progreso:")
    
    try:
        for i in range(0, total_textos, batch_size):
            # Mostrar progreso cada 50 batches
            if i % (batch_size * 50) == 0:
                porcentaje = (i / total_textos) * 100
                print(f"  {porcentaje:.1f}% - Procesado: {i}/{total_textos}")
                
                # Mostrar uso de memoria GPU si está disponible
                if device.type == 'cuda':
                    memoria_usada = torch.cuda.memory_allocated() / 1024**3
                    print(f"Memoria GPU usada: {memoria_usada:.2f} GB")
            
            batch_textos = textos[i:i+batch_size]
            
            # Tokenizar
            inputs = tokenizador(
                batch_textos,
                truncation=True,
                padding=True,
                max_length=512,
                return_tensors='pt'
            )
            
            # Mover inputs a GPU
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            # Predecir
            with torch.no_grad():
                outputs = modelo(**inputs)
                probs = F.softmax(outputs.logits, dim=-1)
                preds = torch.argmax(outputs.logits, dim=-1)
                
                # Mover resultados de vuelta a CPU y guardar
                predicciones.extend(preds.cpu().numpy())
                confianzas.extend(probs.max(dim=-1)[0].cpu().numpy())
            
            # Limpiar caché GPU cada 100 batches para evitar sobrecarga
            if i % (batch_size * 100) == 0 and i > 0:
                if device.type == 'cuda':
                    torch.cuda.empty_cache()
                gc.collect()
                print("Caché limpiado")
        
        print("Procesamiento completado!")
        
    except RuntimeError as e:
        if "out of memory" in str(e):
            print(f"Error de memoria GPU. Reduce batch_size a {batch_size//2} e intenta de nuevo")
            print("   O usa device='cpu' para procesar en CPU")
            torch.cuda.empty_cache()
            return None, None
        else:
            print(f"Error durante la predicción: {e}")
            return None, None
    
    finally:
        # Limpieza final
        torch.cuda.empty_cache()
        gc.collect()
    
    return predicciones, confianzas

## 🎲 Reproducibilidad y dispositivo
- Configura **dispositivo** (CPU/GPU).

In [147]:
def verificar_gpu():
    """Función para verificar el estado de la GPU"""
    if torch.cuda.is_available():
        print(f"GPU disponible: {torch.cuda.get_device_name(0)}")
        print(f"Memoria total: {torch.cuda.get_device_properties(0).total_memory // 1024**3} GB")
        print(f"Memoria libre: {torch.cuda.memory_reserved(0) // 1024**3} GB")
        return True
    else:
        print("GPU no disponible")
        return False

In [149]:
verificar_gpu()

GPU disponible: NVIDIA GeForce RTX 3050 Ti Laptop GPU
Memoria total: 3 GB
Memoria libre: 5 GB


True

In [164]:
batch_size_inicial = 16

preds, conf = predecir_con_bert_gpu(
    df_maestro, 
    'Texto', 
    './modelo_sentimientos_multilingual_balanced',
    batch_size=batch_size_inicial
)

loading configuration file ./modelo_sentimientos_multilingual_balanced\config.json
Model config BertConfig {
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "dtype": "float32",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "tran

Usando dispositivo: cuda
GPU disponible: NVIDIA GeForce RTX 3050 Ti Laptop GPU
Memoria GPU libre: 3 GB
Cargando modelo...


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at ./modelo_sentimientos_multilingual_balanced.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
loading file vocab.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json
loading file chat_template.jinja


Procesando 855800 textos en batches de 16...
Progreso:
  0.0% - Procesado: 0/855800
Memoria GPU usada: 5.32 GB
  0.1% - Procesado: 800/855800
Memoria GPU usada: 5.32 GB
  0.2% - Procesado: 1600/855800
Memoria GPU usada: 5.32 GB
Caché limpiado
  0.3% - Procesado: 2400/855800
Memoria GPU usada: 5.32 GB
  0.4% - Procesado: 3200/855800
Memoria GPU usada: 5.32 GB
Caché limpiado
  0.5% - Procesado: 4000/855800
Memoria GPU usada: 5.32 GB
  0.6% - Procesado: 4800/855800
Memoria GPU usada: 5.32 GB
Caché limpiado
  0.7% - Procesado: 5600/855800
Memoria GPU usada: 5.32 GB
  0.7% - Procesado: 6400/855800
Memoria GPU usada: 5.32 GB
Caché limpiado
  0.8% - Procesado: 7200/855800
Memoria GPU usada: 5.32 GB
  0.9% - Procesado: 8000/855800
Memoria GPU usada: 5.32 GB
Caché limpiado
  1.0% - Procesado: 8800/855800
Memoria GPU usada: 5.32 GB
  1.1% - Procesado: 9600/855800
Memoria GPU usada: 5.32 GB
Caché limpiado
  1.2% - Procesado: 10400/855800
Memoria GPU usada: 5.32 GB
  1.3% - Procesado: 11200/855800

KeyboardInterrupt: 

## 💾 Guardado / exportación
- Exporta resultados/tablas.
- Archivos: df_final.csv.

In [None]:
if preds is not None:
    # Mapear a etiquetas
    mapeo = {0: 'negativo', 1: 'neutro', 2: 'positivo'}
    df['sentimiento'] = [mapeo[p] for p in preds]
    df['confianza'] = conf
    
    # Guardar resultado
    df.to_csv('df_final.csv', index=False)

## 📦 Dependencias
- Importa librerías: transformers, torch.

In [3]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import gc

# Limpiar caché inicial
gc.collect()
torch.cuda.empty_cache()

# Configurar dispositivo
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Usando dispositivo: {device}")

# Cargar SOLO el modelo
print("Cargando modelo...")
modelo = BertForSequenceClassification.from_pretrained('./modelo_sentimientos_multilingual_balanced')
tokenizador = BertTokenizer.from_pretrained('./modelo_sentimientos_multilingual_balanced')

# Mover a GPU
modelo = modelo.to(device)
modelo.eval()

print(f"Memoria GPU usada: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

Usando dispositivo: cuda
Cargando modelo...
Memoria GPU usada: 0.66 GB


## 📦 Dependencias
- Importa librerías: torch, pandas.

In [5]:
import pandas as pd
import torch.nn.functional as F
import os
from tqdm import tqdm

## 🎲 Reproducibilidad y dispositivo
- Configura **dispositivo** (CPU/GPU).

In [35]:
def procesar_por_chunks(ruta_csv, columna_texto, ruta_modelo, chunk_size=50000, batch_size=16):
    """Procesa dataset por chunks para optimizar memoria"""
    
    # Cargar modelo una vez
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Cargando modelo en {device}...")
    
    modelo = BertForSequenceClassification.from_pretrained(ruta_modelo)
    tokenizador = BertTokenizer.from_pretrained(ruta_modelo)
    modelo = modelo.to(device)
    modelo.eval()
    
    # Contar filas totales
    total_filas = sum(1 for line in open(ruta_csv, encoding='utf-8')) - 1
    total_chunks = (total_filas // chunk_size) + (1 if total_filas % chunk_size else 0)
    
    print(f"Total filas: {total_filas:,}")
    print(f"Total chunks: {total_chunks}")
    
    archivo_final = 'resultado_completo.csv'
    primera_vez = True
    
    # Procesar chunk por chunk
    for chunk_num in range(total_chunks):
        print(f"\nProcesando chunk {chunk_num + 1}/{total_chunks}")
        
        inicio = chunk_num * chunk_size
        
        # Cargar solo este chunk
        if chunk_num == 0:
            chunk_df = pd.read_csv(ruta_csv, nrows=chunk_size, encoding='utf-8')
        else:
            chunk_df = pd.read_csv(ruta_csv, skiprows=range(1, inicio + 1), nrows=chunk_size, encoding='utf-8')
        
        # Procesar chunk
        textos = chunk_df[columna_texto].fillna('').astype(str).tolist()
        predicciones = []
        confianzas = []
        
        # Procesar en batches
        for i in range(0, len(textos), batch_size):
            batch_textos = textos[i:i+batch_size]
            
            inputs = tokenizador(
                batch_textos,
                truncation=True,
                padding=True,
                max_length=512,
                return_tensors='pt'
            )
            
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            with torch.no_grad():
                outputs = modelo(**inputs)
                probs = F.softmax(outputs.logits, dim=-1)
                preds = torch.argmax(outputs.logits, dim=-1)
                
                predicciones.extend(preds.cpu().numpy())
                confianzas.extend(probs.max(dim=-1)[0].cpu().numpy())
        
        # Agregar resultados al chunk
        mapeo = {0: 'negativo', 1: 'neutro', 2: 'positivo'}
        chunk_df['sentimiento'] = [mapeo[p] for p in predicciones]
        chunk_df['confianza'] = confianzas
        
        # Guardar chunk
        modo = 'w' if primera_vez else 'a'
        header = primera_vez
        chunk_df.to_csv(archivo_final, mode=modo, header=header, index=False)
        primera_vez = False
        
        print(f"Chunk {chunk_num + 1} guardado")
        
        # Limpiar memoria
        del chunk_df, predicciones, confianzas, textos
        gc.collect()
        torch.cuda.empty_cache()
    
    print(f"\nCompletado! Resultado en: {archivo_final}")

## 🧪 Bloque de código
- Ejecución auxiliar del flujo.

In [37]:
ruta_csv = 'dataset_maestro.csv'     
columna_texto = 'Texto'           
ruta_modelo = './modelo_sentimientos_multilingual_balanced'      

procesar_por_chunks(ruta_csv, columna_texto, ruta_modelo, chunk_size=50000, batch_size=16)

Cargando modelo en cuda...
Total filas: 855,800
Total chunks: 18

Procesando chunk 1/18
Chunk 1 guardado

Procesando chunk 2/18
Chunk 2 guardado

Procesando chunk 3/18
Chunk 3 guardado

Procesando chunk 4/18
Chunk 4 guardado

Procesando chunk 5/18
Chunk 5 guardado

Procesando chunk 6/18
Chunk 6 guardado

Procesando chunk 7/18
Chunk 7 guardado

Procesando chunk 8/18
Chunk 8 guardado

Procesando chunk 9/18
Chunk 9 guardado

Procesando chunk 10/18
Chunk 10 guardado

Procesando chunk 11/18
Chunk 11 guardado

Procesando chunk 12/18
Chunk 12 guardado

Procesando chunk 13/18
Chunk 13 guardado

Procesando chunk 14/18
Chunk 14 guardado

Procesando chunk 15/18
Chunk 15 guardado

Procesando chunk 16/18
Chunk 16 guardado

Procesando chunk 17/18
Chunk 17 guardado

Procesando chunk 18/18
Chunk 18 guardado

Completado! Resultado en: resultado_completo.csv
