In [None]:
from ml_pipeline_e2e_practica.preprocesamiento_seguro import cargar_dataframe_limpio, preparar_matrices
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, accuracy_score

# Load the working dataset through the project helper (any cleaning happens inside
# cargar_dataframe_limpio, which is not visible here). Both the regression and the
# classification cells below start from this `df`.
df = cargar_dataframe_limpio()
# Quick sanity check on the load: row count, formatted with thousands separators.
print(f'Registros totales tras limpieza: {len(df):,}')

## Regresión: demanda continua
Comparamos un modelo lineal y un árbol de decisión para estimar `demand`, evaluando ambos con el error cuadrático medio (MSE) sobre el conjunto de prueba.

In [None]:
# Regression experiment: fit each candidate on the training split and report its
# mean squared error on the held-out test split.
# preparar_matrices is called with its default target here — presumably `demand`; TODO confirm.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = preparar_matrices(df)

# Keys are the estimator class names; they double as the labels printed below.
modelos_reg = {
    type(estimador).__name__: estimador
    for estimador in (LinearRegression(), DecisionTreeRegressor(random_state=42))
}

print('MSE en el set de prueba:')
for nombre, modelo in modelos_reg.items():
    # fit() returns the estimator itself, so training and prediction can be chained.
    y_pred_reg = modelo.fit(X_train_reg, y_train_reg).predict(X_test_reg)
    mse = mean_squared_error(y_test_reg, y_pred_reg)
    print(f'- {nombre}: {mse:.4f}')

## Clasificación: alta vs baja demanda
Etiquetamos `is_high_demand` usando la mediana de `demand` como umbral (1 si la demanda supera la mediana, 0 en caso contrario) y medimos la exactitud (*accuracy*) de dos clasificadores.

In [None]:
# Classification experiment: binarise `demand` at its median and compare two
# classifiers by test-set accuracy.
mediana = df['demand'].median()
# Rows strictly above the median get label 1; rows at or below it get 0.
# assign() returns a new frame, so `df` itself is left untouched.
df_clf = df.assign(is_high_demand=(df['demand'] > mediana).astype(int))

# NOTE(review): df_clf still carries the `demand` column the label is derived from.
# Confirm that preparar_matrices keeps it out of the feature matrix; if it does not,
# the classifiers can read the answer directly and the accuracy below is inflated.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = preparar_matrices(df_clf, target_col='is_high_demand')

# Keys are the estimator class names; they double as the labels printed below.
modelos_clf = {
    type(estimador).__name__: estimador
    for estimador in (LogisticRegression(max_iter=1000), KNeighborsClassifier(n_neighbors=5))
}

print('Accuracy en el set de prueba:')
for nombre, modelo in modelos_clf.items():
    # fit() returns the estimator itself, so training and prediction can be chained.
    y_pred_clf = modelo.fit(X_train_clf, y_train_clf).predict(X_test_clf)
    acc = accuracy_score(y_test_clf, y_pred_clf)
    print(f'- {nombre}: {acc:.4f}')

In [None]:
# Plot the regression results: a bar chart of test-set MSE per model, followed by
# one predicted-vs-actual scatter panel per model.
# Relies on `modelos_reg` (already fitted), `X_test_reg` and `y_test_reg` from the
# regression cell above.
# NOTE(review): consider moving this import to the notebook's first cell with the others.
import matplotlib.pyplot as plt

# Predict once per model and reuse the result for both figures.
predicciones_reg = {name: modelo.predict(X_test_reg) for name, modelo in modelos_reg.items()}

# --- Figure 1: bar chart of MSE ---
model_names_reg = list(predicciones_reg)
mse_values = [mean_squared_error(y_test_reg, predicciones_reg[name]) for name in model_names_reg]

fig_mse, ax_mse = plt.subplots(figsize=(10, 6))
bars = ax_mse.bar(model_names_reg, mse_values, color=['#3498db', '#e74c3c'], edgecolor='black', linewidth=1.5)

# Annotate each bar with its value, centred just above the top edge.
for bar, value in zip(bars, mse_values):
    ax_mse.text(bar.get_x() + bar.get_width() / 2., bar.get_height(),
                f'{value:.4f}',
                ha='center', va='bottom', fontsize=11, fontweight='bold')

ax_mse.set_ylabel('Mean Squared Error (MSE)', fontsize=12)
ax_mse.set_xlabel('Modelo', fontsize=12)
ax_mse.set_title('Comparativa de Modelos de Regresión - Predicción de Demanda', fontsize=14, fontweight='bold')
ax_mse.grid(axis='y', alpha=0.3, linestyle='--')
fig_mse.tight_layout()
plt.show()

# --- Figure 2: predictions vs. actual values, one panel per model ---
# squeeze=False always yields a 2-D array of Axes, so this also works when
# `modelos_reg` holds a single model (the default would return a bare Axes and
# break indexing). ravel() flattens it back to the 1-D layout used before.
fig, axes = plt.subplots(1, len(modelos_reg), figsize=(15, 5), sharey=True, squeeze=False)
axes = axes.ravel()

# End points of the dashed y = x reference line; identical for every panel.
real_min, real_max = y_test_reg.min(), y_test_reg.max()

for ax, (name, y_pred) in zip(axes, predicciones_reg.items()):
    ax.scatter(y_test_reg, y_pred, alpha=0.6, color='#3498db', edgecolor='black', linewidth=0.5)
    # Points on this line are perfect predictions.
    ax.plot([real_min, real_max], [real_min, real_max], 'r--', linewidth=2)
    ax.set_xlabel('Valores Reales (y_test_reg)', fontsize=12)
    ax.set_ylabel('Predicciones', fontsize=12)
    ax.set_title(f'{name}: Predicciones vs Reales', fontsize=14, fontweight='bold')
    ax.grid(alpha=0.3, linestyle='--')

fig.tight_layout()
plt.show()