# Configuración del Pipeline

In [1]:
# Utilities: cargar modelo local con fallback a Hugging Face Hub
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline


def load_local_or_hub(model_path: str, hub_model: str = None, device=None, use_pipeline: bool = True, local_files_only: bool = True):
    """Load a sequence-classification model from disk, falling back to the Hugging Face Hub.

    The local directory `model_path` is tried first. If it does not exist or
    loading from it fails, and `hub_model` is given, the tokenizer and model
    are loaded from `hub_model` instead.

    Args:
        model_path: Path to the local model directory (if it exists).
        hub_model: Model name on the HF Hub used as fallback
            (e.g. 'distilbert-base-uncased-finetuned-sst-2-english').
            If None, no fallback is attempted.
        device: Device index handed to the pipeline: 0..n for GPU, -1 for CPU.
            If None, 0 is used when `torch.cuda.is_available()` and -1 otherwise.
        use_pipeline: If True, a "text-classification" pipeline is built and
            returned as well.
        local_files_only: Forwarded to `from_pretrained` for the local attempt
            only; the hub attempt does not pass it.

    Returns:
        (tokenizer, model, pipeline) if `use_pipeline` is True,
        otherwise (tokenizer, model).

    Raises:
        RuntimeError: If the model can be loaded neither locally nor from the
            hub. The underlying exception is attached as `__cause__`; when the
            hub attempt also failed, the local error is included in the message.
    """
    if device is None:
        device = 0 if torch.cuda.is_available() else -1

    tokenizer = None
    model = None
    local_exc = None

    # Local attempt first.
    if Path(model_path).exists():
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=local_files_only)
            model = AutoModelForSequenceClassification.from_pretrained(model_path, local_files_only=local_files_only)
        except Exception as exc:
            # Deliberately broad: any failure here should trigger the fallback.
            local_exc = exc
    else:
        local_exc = FileNotFoundError(f"Local model path {model_path} not found.")

    if local_exc is not None:
        if hub_model is None:
            # No fallback configured: surface the local failure with its traceback.
            raise RuntimeError(
                f"Failed to load model locally ({model_path}) and hub ({hub_model}): {local_exc}"
            ) from local_exc

        try:
            tokenizer = AutoTokenizer.from_pretrained(hub_model)
            model = AutoModelForSequenceClassification.from_pretrained(hub_model)
        except Exception as hub_exc:
            # Report both failures; previously the local error was overwritten.
            raise RuntimeError(
                f"Failed to load model locally ({model_path}) and hub ({hub_model}): {hub_exc} "
                f"(local error: {local_exc})"
            ) from hub_exc

    if use_pipeline:
        nlp = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)
        return tokenizer, model, nlp

    return tokenizer, model


def predict_sentiment(texts, nlp_pipeline, truncation=True, top_k=None):
    """Run the given pipeline on `texts` and return whatever it returns.

    Args:
        texts: str or list[str] passed to the pipeline unchanged.
        nlp_pipeline: callable pipeline (e.g. a transformers pipeline).
        truncation: forwarded as the `truncation` keyword argument.
        top_k: optional; forwarded as `top_k` only when it is not None.
    """
    call_kwargs = {"truncation": truncation}
    if top_k is not None:
        # Only pass top_k when requested so the pipeline keeps its own default.
        call_kwargs["top_k"] = top_k
    return nlp_pipeline(texts, **call_kwargs)


  from .autonotebook import tqdm as notebook_tqdm
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Smoke-test cell: load the model (local directory first, hub as fallback)
# and run a few example predictions through the pipeline.
MODEL_DIR = "./models/sentiment"
HUB_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"

try:
    tokenizer, model, nlp = load_local_or_hub(MODEL_DIR, hub_model=HUB_MODEL)
    print("Modelo cargado (local o hub). Ejecutando pruebas...\n")

    # Example inputs; the accented characters were previously mis-encoded
    # (mojibake), which fed garbage tokens to the model.
    examples = [
        "I am absolutely loving the new features of this game!",
        "Me encanta este producto! Es genial y lo recomiendo.",
        "No me gustó, es una pérdida de tiempo."
    ]

    for t in examples:
        print('Texto:', t)
        print('Resultado:', predict_sentiment(t, nlp))
        print()

except Exception as e:
    # Best-effort cell: report the failure instead of aborting the notebook run.
    print('Error al cargar/ejecutar:', type(e).__name__, e)
    print('Asegúrate de tener internet para descargar desde el hub o de que MODEL_DIR exista con un modelo guardado.')


Device set to use cpu


Modelo cargado (local o hub). Ejecutando pruebas...

Texto: I am absolutely loving the new features of this game!
Resultado: [{'label': 'POSITIVE', 'score': 0.9998846054077148}]

Texto: Me encanta este producto! Es genial y lo recomiendo.
Resultado: [{'label': 'POSITIVE', 'score': 0.97633296251297}]

Texto: No me gustó, es una pérdida de tiempo.
Resultado: [{'label': 'NEGATIVE', 'score': 0.9901206493377686}]



# Procesamiento de Dataset

In [4]:
# Load up to 20 random tweets from `twitter_validation.csv` and classify them with the pipeline
from pathlib import Path
import pandas as pd

# Candidate locations; the first one that exists is used.
CSV_PATHS = [Path("../datasets/twitter_validation.csv"), Path("./datasets/twitter_validation.csv")]
csv_file = next((p for p in CSV_PATHS if p.exists()), None)
if csv_file is None:
    raise FileNotFoundError("No se encontró `twitter_validation.csv` en ../datasets o ./datasets. Coloca el archivo o ajusta la ruta.")

print(f"Leyendo datos desde: {csv_file}")
# First attempt: read assuming the file has a header row
try:
    df = pd.read_csv(csv_file, low_memory=False)
except Exception as e:
    # Chain the original error so the parser traceback is not lost
    raise RuntimeError(f"Error leyendo {csv_file}: {e}") from e

# Expected schema
expected_cols = ['ID', 'Entity', 'Sentiment', 'Tweet']

# If no column is named 'Tweet' (case-insensitive), re-read without a header and assign the schema
if 'tweet' not in {str(c).lower() for c in df.columns}:
    print('No se detectó columna Tweet en el CSV con header; intentando leer sin cabecera y asignar columnas...')
    df_no_header = pd.read_csv(csv_file, header=None, low_memory=False)
    # With at least as many columns as expected, name the surplus extra_1, extra_2, ...;
    # with fewer, assign only as many expected names as there are columns
    if df_no_header.shape[1] >= len(expected_cols):
        extra = df_no_header.shape[1] - len(expected_cols)
        new_cols = expected_cols + [f'extra_{i+1}' for i in range(extra)]
    else:
        new_cols = expected_cols[:df_no_header.shape[1]]
    df_no_header.columns = new_cols
    df = df_no_header
    print(f'Columnas asignadas: {df.columns.tolist()}')
else:
    # Normalize column names that match expected_cols case-insensitively
    cols_lower = {str(c).lower(): c for c in df.columns}
    rename_map = {}
    for ec in expected_cols:
        actual = cols_lower.get(ec.lower())
        # Skip columns that already carry the canonical name so the log only
        # reports real renames
        if actual is not None and actual != ec:
            rename_map[actual] = ec
    if rename_map:
        df = df.rename(columns=rename_map)
        print(f'Renombradas columnas: {rename_map}')

# Verify that the Tweet column now exists
if 'Tweet' not in df.columns:
    raise KeyError(f"No se pudo localizar la columna 'Tweet' tras los intentos. Columnas disponibles: {list(df.columns)}")

tweet_col = 'Tweet'
print(f"Usando columna: {tweet_col}")

# Reproducible sample of 20 rows (all rows if there are fewer)
n = min(20, len(df))
sample_df = df.sample(n=n, random_state=42).reset_index(drop=True)
texts = sample_df[tweet_col].fillna("").astype(str).tolist()

# Make sure the pipeline `nlp` exists; otherwise load it (uses MODEL_DIR/HUB_MODEL if already defined)
try:
    nlp  # pragma: no cover
except NameError:
    # Reuse values defined earlier in the notebook, or fall back to defaults
    MODEL_DIR = globals().get('MODEL_DIR', './models/sentiment')
    HUB_MODEL = globals().get('HUB_MODEL', 'distilbert-base-uncased-finetuned-sst-2-english')
    print('Pipeline `nlp` no encontrada en el entorno; intentando cargar modelo...')
    tokenizer, model, nlp = load_local_or_hub(MODEL_DIR, hub_model=HUB_MODEL)

# Run predictions as one batch (the pipeline accepts a list); skip the call for an empty sample
print(f"Ejecutando predicciones para {len(texts)} tweets...")
results = nlp(texts, truncation=True) if texts else []

# Extract the top prediction per text. Each result may be a dict or a list of
# dicts; an empty list yields None for both label and score.
pred_labels = []
pred_scores = []
for r in results:
    top = r if isinstance(r, dict) else (r[0] if r else None)
    pred_labels.append(top.get('label') if top is not None else None)
    pred_scores.append(top.get('score') if top is not None else None)

sample_df['pred_label'] = pred_labels
sample_df['pred_score'] = pred_scores

# Compact per-tweet listing; a missing score is printed as "n/a" instead of
# raising TypeError on the float format
for i, (tweet, label, score) in enumerate(zip(sample_df[tweet_col], pred_labels, pred_scores), start=1):
    score_txt = f"{score:.3f}" if score is not None else "n/a"
    print(f"[{i}] ({label} {score_txt}) -> {tweet}")

# Also show a table with the relevant columns (`display` is provided by IPython/Jupyter)
display(sample_df[[tweet_col, 'pred_label', 'pred_score']])


Leyendo datos desde: ../datasets/twitter_validation.csv
No se detectó columna Tweet en el CSV con header; intentando leer sin cabecera y asignar columnas...
Columnas asignadas: ['ID', 'Entity', 'Sentiment', 'Tweet']
Usando columna: Tweet
Ejecutando predicciones para 20 tweets...
[1] (NEGATIVE 0.995) -> Remote working and an increase in cloud-based data is spurring cyber-attacks: Today sees the publication of the Verizon Business 2020 Data Breach Investigations Report which shows that network security is even more important ... dlvr.it/RX3fdK EXETLOS
[2] (POSITIVE 0.993) -> I actually quite like the design of the ps5. It truly feels like the next generation of a console rather than just being a bulkier box with more power
[3] (NEGATIVE 0.991) -> New York charges Johnson & Johnson with insurance fraud over opioid claims

pageone.ng/2020/09/17/new…

$JNJ
[4] (POSITIVE 0.999) -> Chris loves me in borderlands one and two.
[5] (NEGATIVE 0.881) -> Check out my video! #LeagueofLegends | Ca

Leyendo datos desde: ../datasets/twitter_validation.csv
No se detectó columna Tweet en el CSV con header; intentando leer sin cabecera y asignar columnas...
Columnas asignadas: ['ID', 'Entity', 'Sentiment', 'Tweet']
Usando columna: Tweet
Ejecutando predicciones para 20 tweets...
[1] (NEGATIVE 0.995) -> Remote working and an increase in cloud-based data is spurring cyber-attacks: Today sees the publication of the Verizon Business 2020 Data Breach Investigations Report which shows that network security is even more important ... dlvr.it/RX3fdK EXETLOS
[2] (POSITIVE 0.993) -> I actually quite like the design of the ps5. It truly feels like the next generation of a console rather than just being a bulkier box with more power
[3] (NEGATIVE 0.991) -> New York charges Johnson & Johnson with insurance fraud over opioid claims

pageone.ng/2020/09/17/new…

$JNJ
[4] (POSITIVE 0.999) -> Chris loves me in borderlands one and two.
[5] (NEGATIVE 0.881) -> Check out my video! #LeagueofLegends | Ca

Unnamed: 0,Tweet,pred_label,pred_score
0,Remote working and an increase in cloud-based ...,NEGATIVE,0.995162
1,I actually quite like the design of the ps5. I...,POSITIVE,0.992933
2,New York charges Johnson & Johnson with insura...,NEGATIVE,0.991299
3,Chris loves me in borderlands one and two.,POSITIVE,0.999385
4,Check out my video! #LeagueofLegends | Capture...,NEGATIVE,0.880977
5,Amazing deal for you!\n\nLenovo Legion Y540 9t...,NEGATIVE,0.552346
6,[PS4] | Assassins Creed Syndicate First Playth...,POSITIVE,0.995777
7,@EAMaddenNFL servers down?,NEGATIVE,0.999021
8,@SpeakerPelosi this is VERY INTERESTING 🧐,POSITIVE,0.998414
9,So good I had to share! Check out all the item...,POSITIVE,0.99882


## Comparativa

In [7]:
# Evaluation (simple)
# Run the dataset-processing cell first: it creates `sample_df` with the
# 'pred_label' and 'Sentiment' columns used here.
try:
    sample_df
except NameError:
    # `from None` hides the uninformative NameError; the message says what to do
    raise RuntimeError("`sample_df` no encontrado. Ejecuta la celda de procesamiento de dataset antes de esta celda.") from None

if 'Sentiment' not in sample_df.columns:
    raise KeyError("La columna 'Sentiment' no está presente en `sample_df`. Asegúrate de que el CSV contiene esa columna.")

# Normalize labels (lower-case, trimmed) so 'POSITIVE' and 'Positive' compare equal.
# NOTE(review): the recorded run shows true labels that include 'irrelevant' and
# 'neutral' while the predictions shown are only POSITIVE/NEGATIVE, so exact-match
# accuracy is capped by the label-set mismatch — consider mapping or filtering classes.
y_true = sample_df['Sentiment'].fillna('').astype(str).str.lower().str.strip()
y_pred = sample_df['pred_label'].fillna('').astype(str).str.lower().str.strip()

# Use scikit-learn for a detailed report when it is installed. Only a missing
# package triggers the manual fallback; genuine metric errors are not masked
# behind an "install scikit-learn" hint.
try:
    from sklearn.metrics import classification_report, accuracy_score
except ImportError:
    acc = (y_true == y_pred).mean()
    print(f'Accuracy (manual): {acc:.3f} (instala scikit-learn para un informe más completo: pip install scikit-learn)')
else:
    print('Accuracy:', accuracy_score(y_true, y_pred))
    print('\nClassification report:')
    print(classification_report(y_true, y_pred, zero_division=0))


Accuracy: 0.3

Classification report:
              precision    recall  f1-score   support

  irrelevant       0.00      0.00      0.00         3
    negative       0.21      1.00      0.35         3
     neutral       0.00      0.00      0.00         8
    positive       0.50      0.50      0.50         6

    accuracy                           0.30        20
   macro avg       0.18      0.38      0.21        20
weighted avg       0.18      0.30      0.20        20

