# Regression metrics

En este notebook se comparan varios modelos de regresión, entrenados sobre el conjunto de datos de diabetes de scikit-learn, usando distintas métricas de evaluación.

- Mean Squared Error (MSE)

- Mean Absolute Error (MAE)

- Root Mean Squared Error (RMSE)

- Coeficiente de determinación (R²)

- Mean Absolute Percentage Error (MAPE)

- Root Mean Squared Log Error (RMSLE)

## Setup

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load the diabetes regression dataset. The `diabetes` object is kept because
# a later cell indexes its feature matrix directly.
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

# Hold out 25% of the samples for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# (display name, unfitted estimator) pairs compared throughout the notebook.
models = [
    ("Linear Regression", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("Decision Tree", DecisionTreeRegressor(random_state=42)),
    ("Random Forest", RandomForestRegressor(random_state=42)),
    ("SVR", SVR()),
]

# Fit each estimator once on the training split and cache its test-set
# predictions by display name, so the metric cells never need to refit.
preds = {}
for name, model in models:
    preds[name] = model.fit(X_train, y_train).predict(X_test)

print("Modelos entrenados y predicciones listas")

## Mean Squared Error

In [None]:
# Report the mean squared error of every cached prediction on the test split.
print("=== MSE (más bajo es mejor) ===")
for name in preds:
    y_pred = preds[name]
    mse = mean_squared_error(y_test, y_pred)
    print(f"{name}: {mse:.2f}")

## Mean Absolute Error

In [None]:
# Keep only feature column 2 ('bmi') as a 2-D matrix so the fitted models can
# be drawn as lines on a single-feature scatter plot.
X_bmi = diabetes.data[:, [2]]
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    X_bmi,
    y,
    test_size=0.25,
    random_state=42,
)

# Only the linear estimators are used in this single-feature comparison.
line_models = [
    ("Linear Regression", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
]

print("=== MAE (más bajo es mejor) ===")
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(Xb_test, yb_test, s=20, c="k", label="True")

# Sort the test points by BMI so every fitted line is drawn left to right.
order = np.argsort(Xb_test[:, 0])
Xb_sorted = Xb_test[order]

for name, model in line_models:
    model.fit(Xb_train, yb_train)
    # The MAE is computed on the test points in their original order.
    yhat = model.predict(Xb_test)
    mae = mean_absolute_error(yb_test, yhat)
    print(f"{name}: {mae:.2f}")
    ax.plot(Xb_sorted, model.predict(Xb_sorted), label=name)

ax.set_title("Ajuste con una sola variable (BMI) y MAE")
ax.set_xlabel("BMI (característica estandarizada)")
ax.set_ylabel("Progression")
ax.legend()
fig.tight_layout()
plt.show()

## Root Mean Squared Error

In [None]:
# Print and collect the RMSE (square root of the MSE) of every cached
# prediction, then compare the models in a bar chart.
print("=== RMSE (más bajo es mejor) ===")
names = list(preds)
scores = []
for name in names:
    y_pred = preds[name]
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(rmse)
    print(f"{name}: {rmse:.2f}")

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(names, scores)
ax.set_title("Comparación de RMSE")
ax.set_ylabel("RMSE")
# Tilt the model names so the longer labels do not overlap.
plt.xticks(rotation=30, ha="right")
fig.tight_layout()
plt.show()

## Coeficiente de determinación

In [None]:
from sklearn.metrics import r2_score

# Print and collect the R² score of every cached prediction, then plot one
# marker per model.
print("=== R2 (más alto es mejor, máximo 1.0) ===")
names = list(preds)
scores = []
for name in names:
    y_pred = preds[name]
    r2 = r2_score(y_test, y_pred)
    scores.append(r2)
    print(f"{name}: {r2:.2f}")

fig, ax = plt.subplots(figsize=(7, 4))
ax.scatter(names, scores, s=100)
# Dashed reference line at R² = 0.
ax.axhline(0, color="red", linestyle="--", linewidth=1)
ax.set_title("Comparación de R²")
ax.set_ylabel("R²")
# Leave a small margin under the worst score; the upper limit is fixed at 1.0.
ax.set_ylim(min(scores) - 0.1, 1.0)
plt.xticks(rotation=30, ha="right")
fig.tight_layout()
plt.show()

## Mean Absolute Percentage Error

In [None]:
def mean_absolute_percentage_error(y_true, y_pred, eps=1e-9):
    """Return the mean absolute percentage error (MAPE), in percent.

    Samples whose true value satisfies ``|y_true| <= eps`` are skipped,
    because the relative error would divide by (almost) zero for them.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; must have the same shape as ``y_true``.
    eps : float, default 1e-9
        Magnitude at or below which a true value is treated as zero.

    Returns
    -------
    float
        ``100 * mean(|y_true - y_pred| / |y_true|)`` over the kept samples,
        or ``nan`` when no sample has ``|y_true| > eps``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # Cast to float so unsigned integer inputs cannot wrap around on subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Without this check, (n,) vs (n, 1) inputs would silently broadcast into
    # an (n, n) error matrix and produce a meaningless score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    mask = np.abs(y_true) > eps
    # With no usable sample the metric is undefined: return NaN explicitly
    # rather than letting np.mean warn about an empty slice.
    if not mask.any():
        return float("nan")

    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

# Print and collect the MAPE of every cached prediction, then compare the
# models in a bar chart.
print("=== MAPE (%) — más bajo es mejor ===")
names = list(preds)
scores = []
for name in names:
    y_pred = preds[name]
    mape = mean_absolute_percentage_error(y_test, y_pred)
    scores.append(mape)
    print(f"{name}: {mape:.2f}%")

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(names, scores, color="skyblue")
ax.set_title("Comparación de MAPE (%)")
ax.set_ylabel("MAPE (%)")
# Tilt the model names so the longer labels do not overlap.
plt.xticks(rotation=30, ha="right")
fig.tight_layout()
plt.show()

## Root Mean Squared Log Error

In [None]:
from sklearn.metrics import mean_squared_log_error

def rmsle(y_true, y_pred, eps=1e-9):
    """Return the root mean squared logarithmic error of the predictions.

    Both inputs are floored at ``eps`` before scoring, so values below
    ``eps`` (including negatives) are scored as ``eps`` instead of being
    passed through to ``mean_squared_log_error`` unchanged.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values.
    eps : float, default 1e-9
        Lower bound applied to both arrays.

    Returns
    -------
    float
        Square root of ``mean_squared_log_error`` on the clipped arrays.
    """
    true_floored = np.clip(y_true, eps, None)
    pred_floored = np.clip(y_pred, eps, None)
    msle = mean_squared_log_error(true_floored, pred_floored)
    return np.sqrt(msle)

# Print and collect the RMSLE of every cached prediction, then compare the
# models in a bar chart.
print("=== RMSLE — más bajo es mejor ===")
names = list(preds)
scores = []
for name in names:
    y_pred = preds[name]
    val = rmsle(y_test, y_pred)
    scores.append(val)
    print(f"{name}: {val:.4f}")

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(names, scores, color="salmon")
ax.set_title("Comparación de RMSLE")
ax.set_ylabel("RMSLE")
# Tilt the model names so the longer labels do not overlap.
plt.xticks(rotation=30, ha="right")
fig.tight_layout()
plt.show()