# Día 8: Clasificación y Cierre del Curso

**Introducción a Python para ML** | EAE Business School | 11 febrero 2026

**¡Último día!** Hoy vamos a aprender clasificación con regresión logística.

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import plotly.express as px
import plotly.graph_objects as go

## Parte 0: Visualización de la regresión logística

In [None]:
# Simulate 500 random bids to study how the probability of winning depends on
# the bid amount.
n_bids = 500
bids = np.random.uniform(0, 10, n_bids)

# Hidden "true" winning level, drawn once between 5 and 9. A bid counts as a
# win (1.0) when it exceeds that level plus per-bid standard-normal noise,
# otherwise it is a loss (0.0).
real_win = np.random.uniform(5, 9, 1)[0]
wins = np.where(bids > real_win + np.random.normal(size=n_bids), 1.0, 0.0)

# Fit a logistic regression on the single feature to learn the coefficients.
# The bids are reshaped into a one-column 2-D array for the estimator.
lr = LogisticRegression()
lr.fit(bids.reshape(-1, 1), wins)
beta_0 = lr.intercept_[0]
beta_1 = lr.coef_[0, 0]

pred_win = lr.predict(bids.reshape(-1, 1))

# Collect the results for plotting.
# The builtin `bool` is used as the dtype: the `np.bool` alias was removed in
# NumPy 1.24 and raises AttributeError there.
df = pd.DataFrame({
    "bids": bids,
    "wins": wins,
    "pred": pred_win.astype(bool),
})

# Fitted sigmoid evaluated on a regular grid over the bid range; only needed
# by the optional overlay trace below.
xs = np.linspace(0.0, 10.0, 100)
sgs = 1 / (1 + np.exp(- beta_0 - beta_1 * xs))

fig = px.scatter(
    df,
    x="bids",
    y="wins",
    # Uncomment to colour each point by the model's predicted class.
    #color="pred",
    title=f"real = {real_win:.2f}, beta_0 = {beta_0:.2f}, beta_1 = {beta_1:.2f}"
)
# Optional overlays, left disabled: the fitted sigmoid curve, and a vertical
# line at the bid where the fitted probability equals 0.5 (-beta_0 / beta_1).
#fig.add_trace(go.Scatter(x=xs, y=sgs, name="Logistic Reg"))
#fig.add_vline(x=-beta_0/beta_1)
fig.update_layout(showlegend=False)
fig.show()

## Parte 1: Cargar Datos Hotel Bookings

In [None]:
# Read the hotel bookings CSV straight from the course's GitHub repository.
url = 'https://raw.githubusercontent.com/ber2/eae-python/main/data/hotel_bookings.csv'
df = pd.read_csv(
    url,
)

# Report the (rows, columns) shape, then preview the first rows.
bookings_shape = df.shape
print(f'Shape: {bookings_shape}')
df.head()

In [None]:
# Inspect the target column: count of each class, then the mean of the flag
# printed as a percentage (the cancellation rate).
target_column = df['is_canceled']
class_counts = target_column.value_counts()
print(class_counts)
print(f'\nTasa de cancelación: {target_column.mean():.1%}')

## Parte 2: Preparar Datos

In [None]:
# Numeric columns used as model inputs.
features = [
    'lead_time',
    'stays_weekend_nights',
    'stays_week_nights',
    'adults',
    'previous_cancellations',
    'booking_changes',
    'days_in_waiting_list',
]

# Fill missing values in each selected column with that column's median.
selected_columns = df[features]
column_medians = selected_columns.median()
X = selected_columns.fillna(column_medians)

# The target is the cancellation flag.
y = df['is_canceled']

print(f'Features: {X.shape[1]}')
print(f'Samples: {len(X)}')

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
n_train_rows = len(X_train)
n_test_rows = len(X_test)
print(f'Train: {n_train_rows}, Test: {n_test_rows}')

## Parte 3: Baseline Model

In [None]:
# Baseline: predict class 0 ("does not cancel") for every test row.
# NOTE(review): this is the majority-class baseline only if class 0 is the
# more frequent one - confirm against the value counts printed earlier.
n_test_samples = y_test.shape[0]
baseline_pred = np.zeros(n_test_samples)
baseline_acc = accuracy_score(y_test, baseline_pred)
print(f'Baseline Accuracy: {baseline_acc:.3f}')
print('Nuestro modelo debe superar este baseline!')

## Parte 4: Entrenar Regresión Logística

In [None]:
# Build the classifier with an iteration cap of 1000 and a fixed seed, then
# fit it on the training split (`fit` returns the estimator itself).
model = LogisticRegression(
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
print('✓ Modelo entrenado')

In [None]:
# Hard class predictions for the test split.
y_pred = model.predict(X_test)

# Column 1 of predict_proba holds the probability of class 1 (cancellation).
class_probabilities = model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

# Show probability, predicted label and true label for the first 10 test rows.
print('Primeras 10 probabilidades:')
for row in range(10):
    probability = y_proba[row]
    predicted = y_pred[row]
    actual = y_test.iloc[row]
    print(f'Sample {row}: P(cancelar) = {probability:.2f}, Predicción: {predicted}, Real: {actual}')

## Parte 5: Evaluar Modelo

In [None]:
# Confusion matrix of true labels against predicted labels on the test split.
cm = confusion_matrix(y_test, y_pred)
print('Matriz de Confusión:')
print(cm)

# Render the matrix as an annotated heatmap: predicted class on the x axis,
# true class on the y axis, with the cell counts drawn on top.
class_names = ['No Cancela', 'Cancela']
axis_titles = {'x': 'Predicho', 'y': 'Real'}
fig = px.imshow(
    cm,
    text_auto=True,
    labels=axis_titles,
    x=class_names,
    y=list(class_names),
    title='Matriz de Confusión',
)
fig.show()

In [None]:
# Score the test-set predictions with the four standard classification
# metrics, evaluated in the same order as they are unpacked.
acc, prec, rec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f'Accuracy:  {acc:.3f}')
print(f'Precision: {prec:.3f}')
print(f'Recall:    {rec:.3f}')
print(f'F1-Score:  {f1:.3f}')

# Accuracy gain of the model over the always-predict-0 baseline.
gain_over_baseline = acc - baseline_acc
print(f'\nMejora sobre baseline: {gain_over_baseline:.3f}')

## Parte 6: Interpretar Coeficientes

In [None]:
# Pair each feature name with its fitted coefficient (first row of coef_).
coef_table = pd.DataFrame({
    'Feature': features,
    'Coeficiente': model.coef_[0],
})

# Rank by absolute coefficient value, largest first, so the features with
# the strongest effect (of either sign) appear at the top.
coefs = coef_table.sort_values(
    by='Coeficiente',
    key=abs,
    ascending=False,
)

print('\nFeatures más influyentes:')
print(coefs)

## Parte 7: Ajustar Umbral

In [None]:
# Re-label the test set at several probability cut-offs and compare how
# accuracy, precision and recall change with each one.
for threshold in (0.3, 0.5, 0.7):
    # A row is predicted as class 1 when its probability reaches the cut-off.
    y_pred_thresh = (y_proba >= threshold).astype(int)
    acc_t, prec_t, rec_t = (
        score(y_test, y_pred_thresh)
        for score in (accuracy_score, precision_score, recall_score)
    )
    print(f'\nUmbral {threshold}:')
    print(f'  Accuracy: {acc_t:.3f}, Precision: {prec_t:.3f}, Recall: {rec_t:.3f}')

## Ejercicio: workflow completo con el dataset original

1. Explorar datos
2. Limpiar y preparar
3. Train/test split (o cross validation si se quiere rizar el rizo)
4. Feature engineering: fabricar nuevas features, añadir features categóricas
5. Entrenar `LogisticRegression`
6. Evaluar y comparar contra baseline (dummy, predecir que nunca habrá cancelación)
7. Interpretar coeficientes
8. Ajustar umbral

In [None]:
# Read the larger hotel bookings CSV used for the exercise from the course's
# GitHub repository.
url = 'https://raw.githubusercontent.com/ber2/eae-python/main/data/hotel_bookings_large.csv'
df_large = pd.read_csv(
    url,
)

# Report the (rows, columns) shape, then preview the first rows.
large_shape = df_large.shape
print(f'Shape: {large_shape}')
df_large.head()