# Día 7: Evaluación de Modelos de Regresión

**Introducción a Python para ML** | EAE Business School | 10 de febrero de 2026

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Load the Barcelona housing sample and keep only the rows where every
# modelling column is present.
url = 'https://raw.githubusercontent.com/ber2/eae-python/main/data/Houses_Barcelona_samp.csv'
df = pd.read_csv(url)

model_columns = ['price', 'sqrmts', 'rooms', 'bathrooms', 'floor']
df_clean = df[model_columns].dropna()
df_clean.head()  # last expression of the cell: the notebook renders it as a preview

## Parte 1: Regresión Simple y Evaluación

In [None]:
# Simple regression: a single feature (`sqrmts`) predicting `price`.
y = df_clean['price']
X = df_clean[['sqrmts']]  # double brackets keep X two-dimensional, as scikit-learn expects

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator
y_pred = model.predict(X_test)

In [None]:
# Error metrics of the simple model on the held-out test set.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root brings the error back to the target's units
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae:.0f}€')
# MSE averages *squared* errors, so its unit is the target's unit squared (€²), not €.
print(f'MSE: {mse:.0f}€²')
print(f'RMSE: {rmse:.0f}€')
print(f'R²: {r2:.3f}')

In [None]:
# Gather actual prices, predictions and residuals (actual - predicted) in one
# frame so the plots below can use it directly.
residuals = y_test - y_pred
df_viz = pd.DataFrame({'Real': y_test, 'Predicho': y_pred, 'Residual': residuals})

In [None]:
# Predicted vs actual prices, with the y = x diagonal as the "perfect prediction" reference.
max_val = max(df_viz['Real'].max(), df_viz['Predicho'].max())
perfect_line = go.Scatter(
    x=[0, max_val],
    y=[0, max_val],
    mode='lines',
    name='Perfecto',
)

fig = px.scatter(df_viz, x='Real', y='Predicho', title='Predicciones vs Reales')
fig.add_trace(perfect_line)
fig.show()

In [None]:
# Residuals against the actual price; the dashed line at 0 marks a zero-error prediction.
fig = px.scatter(
    df_viz, x='Real', y='Residual', title='Residuales vs Reales'
).add_hline(y=0, line_dash="dash")  # add_hline returns the same figure, so it can be chained
fig.show()

## Parte 2: Regresión Múltiple

In [None]:
# Multiple regression: several numeric features predicting `price`.
feature_names = ['sqrmts', 'rooms', 'bathrooms', 'floor']
y = df_clean['price']
X = df_clean[feature_names]

# Same 80/20 split and seed as the simple model, so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator
y_pred = model.predict(X_test)

In [None]:
# Error metrics of the multiple-regression model on the held-out test set.
# Uses `X`, `model` and `y_pred` from the multiple-regression cell; the previous
# `*_multi` names were never defined and raised NameError.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print('Regresión Múltiple:')
print(f'MAE: {mae:.0f}€')
print(f'RMSE: {rmse:.0f}€')
print(f'R²: {r2:.3f}')

# One coefficient per feature, in the same order as the columns used for fitting.
print('\nCoeficientes:')
for feat, coef in zip(X.columns, model.coef_):
    print(f'{feat}: {coef:.0f}€')

print(f"\nIntercept: {model.intercept_:.0f}€")


## Parte 3: One-Hot Encoding

In [None]:
# Bring in the categorical `neighborhood` column and one-hot encode it.
categorical_columns = ['price', 'sqrmts', 'rooms', 'neighborhood']
df_with_cat = df[categorical_columns].dropna()

# drop_first=True removes the first level's dummy column (k-1 dummies for k levels).
df_encoded = pd.get_dummies(
    df_with_cat,
    columns=['neighborhood'],
    drop_first=True,
)
df_encoded.head()

In [None]:
# Regression on the numeric features plus the neighborhood dummy columns.
y = df_encoded['price']
X = df_encoded.drop(columns='price')  # everything except the target is a feature

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator
y_pred = model.predict(X_test)

r2_neighborhood = r2_score(y_test, y_pred)
print(f'R² con neighborhood: {r2_neighborhood:.3f}')

## Parte 4: Detectar Overfitting

In [None]:
# Compare fit quality on the training data against the held-out test data,
# using the most recently fitted `model` and its test predictions `y_pred`.
y_train_pred = model.predict(X_train)

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_pred)

# These are R² scores (unitless), so they are labelled as R² and carry no € unit.
print(f'R² Train: {r2_train:.3f}')
print(f'R² Test: {r2_test:.3f}')

# Overfitting shows up as a training score clearly ABOVE the test score;
# 0.05 is the tolerated gap before raising the warning.
if r2_train > r2_test + 0.05:
    print('⚠️ Posible overfitting')
else:
    print('✓ Modelo generaliza bien')

## Ejercicio: Cars Dataset

Usad el dataset de coches para construir un modelo de regresión múltiple que prediga el precio (`price`).

In [None]:
# Download the cars dataset (cars_1990.csv) used in the exercise below.
url_cars = 'https://raw.githubusercontent.com/ber2/eae-python/main/data/cars_1990.csv'
df_cars = pd.read_csv(url_cars)
# Last expression of the cell: the notebook renders the first rows as a preview.
df_cars.head()

In [None]:
# EJERCICIO:
# 1. Seleccionar features: engine_size, horsepower, weight
# 2. Target: price
# 3. Train/test split
# 4. Entrenar modelo
# 5. Calcular MAE, RMSE, R²
# 6. Comparar con modelo simple (solo horsepower)
# Vuestra solución aquí
