# In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold, RandomizedSearchCV, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier, SGDRegressor, Ridge
from sklearn.metrics import roc_auc_score, r2_score, mean_absolute_error, mean_squared_error

# df esperado (tú lo construyes desde FoLiA):
# df columns: chat_id, speaker, timestamp (datetime64), text (str)

def _seconds_to_reply(timestamps, speakers):
    """Return, per message, the seconds until the next message by another speaker.

    Both arguments are equal-length arrays describing ONE conversation,
    already in chronological order. Positions that have no later message
    from a different speaker are left as NaN.
    """
    n = len(speakers)
    seconds = np.full(n, np.nan, dtype="float64")
    one_second = np.timedelta64(1, "s")

    # Walk backwards carrying the index of the closest later message whose
    # speaker differs from the current one; `n` means "no such message".
    # This visits each message once instead of rescanning forward from
    # every position.
    reply_at = n
    for i in range(n - 2, -1, -1):
        if speakers[i + 1] != speakers[i]:
            reply_at = i + 1
        # Otherwise messages i and i + 1 share a speaker, so they also share
        # the same "next different speaker" position and reply_at carries over.
        if reply_at < n:
            seconds[i] = (timestamps[reply_at] - timestamps[i]) / one_second
    return seconds


def build_targets(df: pd.DataFrame, fast_minutes: float = 5) -> pd.DataFrame:
    """Add reply-time targets to a chat-message table.

    Parameters
    ----------
    df:
        One row per message, with at least the columns ``chat_id``,
        ``speaker`` and ``timestamp``.
    fast_minutes:
        Threshold, in minutes, at or below which a reply counts as "fast".

    Returns
    -------
    A new frame sorted by ``chat_id`` then ``timestamp`` (the input is not
    modified) with these extra columns:

    - ``next_speaker`` / ``next_time``: speaker and timestamp of the
      immediately following message in the same chat (NaN/NaT for the last).
    - ``reply_seconds``: seconds until the next message in the same chat
      written by a *different* speaker; NaN when there is none.
    - ``y_cont``: continuous target, identical to ``reply_seconds``.
    - ``y_disc``: 1 when ``reply_seconds <= fast_minutes * 60``, else 0.
      Rows with NaN ``reply_seconds`` compare as False and therefore get 0,
      so drop them first if "no reply" must not count as "slow".
    """
    df = df.sort_values(["chat_id", "timestamp"]).copy()

    # Immediate successor within the same conversation.
    df["next_speaker"] = df.groupby("chat_id")["speaker"].shift(-1)
    df["next_time"] = df.groupby("chat_id")["timestamp"].shift(-1)

    # The frame is sorted by chat_id, so iterating groups in order of
    # appearance and concatenating yields values aligned with the rows.
    per_chat = [
        _seconds_to_reply(g["timestamp"].to_numpy(), g["speaker"].to_numpy())
        for _, g in df.groupby("chat_id", sort=False)
    ]
    # np.concatenate rejects an empty list, which is what an empty frame yields.
    df["reply_seconds"] = (
        np.concatenate(per_chat) if per_chat else np.empty(0, dtype="float64")
    )

    # Continuous target.
    df["y_cont"] = df["reply_seconds"]

    # Discrete target using a fixed, interpretable threshold. An alternative
    # is a threshold taken from the training mean, computed after the split.
    df["y_disc"] = (df["reply_seconds"] <= fast_minutes * 60).astype(int)

    return df

def make_features(df):
    """Return a copy of ``df`` with simple hand-crafted feature columns.

    Added columns:

    - ``len_chars``: character count of ``text`` (missing text counts as "").
    - ``n_qmark``: number of ``?`` characters in ``text``.
    - ``hour``: hour component of ``timestamp``.
    - ``dow``: day of week of ``timestamp`` as given by ``.dt.dayofweek``.

    The input frame is left untouched.
    """
    text = df["text"].fillna("")
    when = df["timestamp"].dt
    return df.assign(
        len_chars=text.str.len(),
        n_qmark=text.str.count(r"\?"),
        hour=when.hour,
        dow=when.dayofweek,
    )

# =========================
# 1) Prepare the data
# =========================
# Placeholder: assign here the DataFrame extracted from FoLiA, following the
# expected schema. While it is left as Ellipsis the next call fails.
df = ...
df = make_features(
    build_targets(df)
    # Regression cannot be trained on rows lacking a continuous target.
    .dropna(subset=["text", "y_cont"])
    # Sanity check: discard negative reply times.
    .loc[lambda frame: frame["y_cont"] >= 0]
)

X = df[["text", "len_chars", "n_qmark", "hour", "dow"]]
y_cls = df["y_disc"].values
# log1p of the reply time is the recommended regression target.
y_reg = np.log1p(df["y_cont"].values)
groups = df["chat_id"].values

# Shared preprocessing: TF-IDF over the message text plus scaled numeric
# features; every other column is dropped.
pre = ColumnTransformer(
    [
        (
            "txt",
            TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9),
            "text",
        ),
        (
            "num",
            StandardScaler(with_mean=False),
            ["len_chars", "n_qmark", "hour", "dow"],
        ),
    ],
    remainder="drop",
)

# =========================
# 2) Classification (SGDClassifier)
# =========================
clf = SGDClassifier(random_state=777)
pipe_cls = Pipeline(steps=[("pre", pre), ("model", clf)])

# Search space for the randomized search; the "model__" prefix routes each
# setting to the pipeline's "model" step.
param_dist_cls = dict(
    model__loss=["log_loss", "modified_huber", "hinge"],
    model__penalty=["l2", "l1", "elasticnet"],
    model__alpha=np.logspace(-6, -2, 30),
    model__l1_ratio=np.linspace(0, 1, 11),
)

# Group-aware folds; the groups are supplied at fit time.
cv = GroupKFold(n_splits=4)

search_cls = RandomizedSearchCV(
    estimator=pipe_cls,
    param_distributions=param_dist_cls,
    n_iter=40,
    scoring="roc_auc",
    cv=cv,
    random_state=777,
    # A failing candidate is scored -1000 instead of aborting the search.
    error_score=-1000,
    n_jobs=-1,
    verbose=1,
)
search_cls.fit(X, y_cls, groups=groups)

best_cls = search_cls.best_estimator_
print("Best ROC-AUC (CV):", search_cls.best_score_)

# =========================
# 3) Regression (Ridge or SGDRegressor)
# =========================
pipe_reg = Pipeline(steps=[("pre", pre), ("model", Ridge())])

# Exhaustive grid over the Ridge regularisation strength.
param_grid_reg = dict(model__alpha=np.logspace(-2, 3, 20))

search_reg = GridSearchCV(
    estimator=pipe_reg,
    param_grid=param_grid_reg,
    scoring="r2",
    cv=cv,
    # A failing candidate is scored -1000 instead of aborting the search.
    error_score=-1000,
    n_jobs=-1,
    verbose=1,
)
search_reg.fit(X, y_reg, groups=groups)

best_reg = search_reg.best_estimator_
print("Best R2 (CV):", search_reg.best_score_)


# Output when the cell is run with `df` still set to the `...` placeholder:
# AttributeError: 'ellipsis' object has no attribute 'sort_values'