In [None]:
# =============================================================================
# SECCIÃ“N 1: INSTALACIÃ“N DE DEPENDENCIAS CON UV
# =============================================================================
import shutil

# Instalamos 'uv' en el sistema si no existe
if shutil.which("uv") is None:
    !pip install uv -q

# Utilizamos uv para instalar las librerÃ­as del proyecto de forma ultra rÃ¡pida
!uv pip install --system pandas numpy torch transformers tqdm joblib

[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m6 packages[0m [2min 87ms[0m[0m


In [None]:
# =============================================================================
# SECTION 2: CONFIGURATION AND DATA LOADING
# =============================================================================

import torch
import pandas as pd

# Device selection (GPU when available, CPU otherwise)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Procesando en: {device}")

# URL of the dataset on GitHub
GITHUB_URL = "https://raw.githubusercontent.com/berniehans/ProjectEvo-Callao-Safety/main/data/v2_dataset_callao.csv"

# Column names are declared before the read so the same constants drive both
# `usecols` and every later column access.
TEXT_COL = 'DESCRIPCION_OCURRENCIA'
LABEL_COL = 'TIPOLOGIA'

# The CSV is ';'-separated and decoded as latin1; only the two columns that
# the pipeline needs are loaded.
df = pd.read_csv(GITHUB_URL, sep=';', encoding='latin1', usecols=[TEXT_COL, LABEL_COL])

print(f"✅ Registros cargados: {len(df)}")

ðŸš€ Procesando en: cuda
âœ… Registros cargados: 98458


In [None]:
# =============================================================================
# SECTION 3: BERT INITIALISATION (BETO - SPANISH)
# =============================================================================

from transformers import BertTokenizer, BertModel

print("📦 Descargando modelo BETO y Tokenizer...")
model_name = "dccuchile/bert-base-spanish-wwm-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Only `last_hidden_state` is read downstream, so the pooler head is not built.
# The checkpoint ships no pooler weights, so keeping the head would only add
# randomly initialised parameters and a "newly initialized" warning.
bert_model = BertModel.from_pretrained(model_name, add_pooling_layer=False).to(device)

# Evaluation mode: BERT is used as a frozen feature extractor (no training,
# dropout disabled).
bert_model.eval()
print("✅ Modelo listo para extracción de características.")

ðŸ“¦ Descargando modelo BETO y Tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

âœ… Modelo listo para extracciÃ³n de caracterÃ­sticas.


In [None]:
# =============================================================================
# SECTION 4: EMBEDDING EXTRACTION FUNCTION WITH PROGRESS BAR
# =============================================================================

from tqdm.auto import tqdm  # Auto selecciona la versiÃ³n para notebook o terminal

def get_bert_embeddings(text_list, batch_size=32):
    """Encode texts with BETO and return one [CLS] vector per text.

    Uses the notebook-level ``tokenizer``, ``bert_model`` and ``device``
    objects. Texts are tokenized in batches (padded, truncated to 128 tokens),
    run through the model without gradient tracking, and the first-position
    hidden state of the last layer is collected on the CPU.

    Args:
        text_list: Sliceable sequence of strings (e.g. a list).
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        A CPU tensor of shape ``(len(text_list), hidden_size)``. For an empty
        ``text_list`` the result has shape ``(0, hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # torch.cat() rejects an empty list, so empty input is answered directly.
    if len(text_list) == 0:
        return torch.empty((0, bert_model.config.hidden_size))

    all_embeddings = []
    batch_starts = range(0, len(text_list), batch_size)

    # The context manager closes the bar even when a batch raises.
    with tqdm(total=len(batch_starts), desc="✨ Extrayendo Embeddings de BETO") as pbar:
        for start in batch_starts:
            batch_texts = text_list[start : start + batch_size]

            inputs = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt"
            ).to(device)

            with torch.no_grad():
                outputs = bert_model(**inputs)
                # The [CLS] token is the first vector of the last hidden layer;
                # it is moved to the CPU right away to free accelerator memory.
                cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu()
                all_embeddings.append(cls_embeddings)

            pbar.update(1)  # one step per processed batch

    return torch.cat(all_embeddings, dim=0)

# Status message confirming the extraction function is defined (the original
# literal was mis-encoded; the text is restored to proper UTF-8).
print("⚙️ Función de extracción configurada con soporte para TQDM.")

âš™ï¸ FunciÃ³n de extracciÃ³n configurada con soporte para TQDM.


In [None]:
# =============================================================================
# SECTION 5: EMBEDDING GENERATION (INTENSIVE STEP)
# =============================================================================

print("⏳ Iniciando transformación semántica...")

# The tokenizer needs plain strings. Missing descriptions are replaced with ""
# first, because astype(str) alone would turn NaN into the literal text "nan"
# and embed that word. Rows are kept (not dropped) so labels stay aligned.
texts = df[TEXT_COL].fillna("").astype(str).tolist()

# Run the extraction over the whole corpus
embeddings = get_bert_embeddings(texts, batch_size=128)

# One embedding row per dataframe row is required: labels are matched to
# embeddings by position later on.
assert embeddings.shape[0] == len(df), "embedding rows do not match dataframe rows"

print("\n✅ Proceso terminado.")
print(f"Estructura de salida: {embeddings.shape} (Vectores de {embeddings.shape[1]} dimensiones)")

â³ Iniciando transformaciÃ³n semÃ¡ntica...


âœ¨ Extrayendo Embeddings de BETO:   0%|          | 0/770 [00:00<?, ?it/s]


âœ… Proceso terminado.
Estructura de salida: torch.Size([98458, 768]) (Vectores de 768 dimensiones)


In [None]:
# =============================================================================
# SECTION 6: PERSISTENCE WITH SHARDING (FILES SPLIT FOR GITHUB)
# =============================================================================

from sklearn.preprocessing import LabelEncoder
import os
import math

# 1. Encode the labels as integers
le = LabelEncoder()
y_encoded = le.fit_transform(df[LABEL_COL])

# 2. Sharding configuration
os.makedirs('data_processed', exist_ok=True)

# A 768-dim float32 vector weighs ~3KB, so 25,000 records weigh ~75MB.
# A conservative chunk size of 25,000 keeps every file under 100MB.
CHUNK_SIZE = 25000
total_samples = embeddings.shape[0]
num_chunks = math.ceil(total_samples / CHUNK_SIZE)

# Shards pair embeddings and labels by position, so both must have equal length.
assert len(y_encoded) == total_samples, "labels and embeddings are misaligned"

print(f"📦 Iniciando fragmentación: {total_samples} muestras divididas en {num_chunks} archivos.")

# 3. Save the metadata (classes)
metadata = {
    # Stored as a plain list: a NumPy object array cannot be read back by
    # torch.load() when weights_only=True.
    'class_names': le.classes_.tolist(),
    'total_chunks': num_chunks,
    'total_samples': total_samples,
    'feature_dim': embeddings.shape[1]
}
torch.save(metadata, 'data_processed/metadata.pt')

# 4. Sharded save loop
for i in range(num_chunks):
    start_idx = i * CHUNK_SIZE
    end_idx = min((i + 1) * CHUNK_SIZE, total_samples)

    # IMPORTANT: .clone() detaches the slice from the full tensor's storage;
    # without it torch.save would serialise the whole tensor into every shard.
    chunk_data = {
        'embeddings': embeddings[start_idx:end_idx].clone(),
        # Explicit int64 so the labels have the same dtype on every platform.
        'labels': torch.tensor(y_encoded[start_idx:end_idx], dtype=torch.long)
    }

    file_name = f'data_processed/bert_embeddings_part_{i+1}.pt'
    torch.save(chunk_data, file_name)

    # Report the size on disk in MB
    file_size = os.path.getsize(file_name) / (1024 * 1024)
    print(f"✅ Guardado: {file_name} | Tamaño: {file_size:.2f} MB")

print("\n🚀 Fase 2 completada exitosamente. Datos listos para subir a GitHub.")

ðŸ“¦ Iniciando fragmentaciÃ³n: 98458 muestras divididas en 4 archivos.
âœ… Guardado: data_processed/bert_embeddings_part_1.pt | TamaÃ±o: 73.43 MB
âœ… Guardado: data_processed/bert_embeddings_part_2.pt | TamaÃ±o: 73.43 MB
âœ… Guardado: data_processed/bert_embeddings_part_3.pt | TamaÃ±o: 73.43 MB
âœ… Guardado: data_processed/bert_embeddings_part_4.pt | TamaÃ±o: 68.91 MB

ðŸš€ Fase 2 completada exitosamente. Datos listos para subir a GitHub.
