In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
from sklearn.datasets import fetch_california_housing

# Fetch the California housing data, then expose the feature matrix as a
# DataFrame (columns named after housing.feature_names) and the target as a Series.
housing = fetch_california_housing()
X = pd.DataFrame(data=housing.data, columns=housing.feature_names)
y = pd.Series(data=housing.target)



In [3]:

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the linear model on the training rows (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# Mean Absolute Error (MAE)
Description: MAE measures the average magnitude of the errors in a set of predictions, without considering their direction. It is the average over the test sample of the absolute differences between prediction and actual observation where all individual differences have equal weight.

Lower MAE indicates better model performance. It is intuitive and easy to understand as it gives an average of the absolute errors. However, it does not penalize larger errors as much as MSE.

In [4]:
# Mean of the absolute differences between held-out targets and predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Absolute Error (MAE): {mae}')

Mean Absolute Error (MAE): 0.5332001304956553


# Mean Squared Error (MSE)
Description: MSE measures the average of the squares of the errors. It is the average squared difference between the estimated values and the actual values.

Lower MSE indicates better model performance. MSE is more sensitive to outliers than MAE because it squares the errors, which means that larger errors have a disproportionately larger effect on the metric.

In [5]:
# Mean of the squared differences between held-out targets and predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error (MSE): {mse}')

Mean Squared Error (MSE): 0.5558915986952444


# Root Mean Squared Error (RMSE)
Description: RMSE is the square root of the mean squared error. It provides an error estimate in the same units as the target variable, making it more interpretable than MSE.

Lower RMSE indicates better model performance. RMSE is particularly useful when large errors are undesirable.


In [6]:
# RMSE is derived from the `mse` value computed in the MSE cell, so that cell
# must have been run first. Taking the square root brings the error back to
# the units of the target variable.
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

Root Mean Squared Error (RMSE): 0.7455813830127764


# R-squared (R²)
Description: R-squared, or the coefficient of determination, indicates the proportion of the variance in the dependent variable that is predictable from the independent variables.

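Typically ranges from 0 to 1, but it can be negative on held-out data when the model predicts worse than simply using the mean of the target. Higher values indicate better model performance, with 1 indicating that the model explains all the variability of the response data around its mean. However, R² can be misleading for models with many predictors, because on the training data it never decreases when a predictor is added, even an irrelevant one.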

In [7]:
# Coefficient of determination of the predictions on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R-squared (R²): {r2}')

R-squared (R²): 0.5757877060324508


# Adjusted R-squared
Description: Adjusted R-squared adjusts the R-squared value based on the number of predictors in the model, providing a more accurate measure for multiple regression models. It accounts for the diminishing returns of adding more predictors.

Higher values indicate better model performance. Adjusted R-squared is useful for comparing models with different numbers of predictors as it penalizes the addition of irrelevant predictors.

In [8]:
def adjusted_r2_score(y_true, y_pred, n, p):
    """Return R-squared adjusted for the number of predictors.

    Computes ``1 - (1 - R²) * (n - 1) / (n - p - 1)``, where R² is
    ``r2_score(y_true, y_pred)``.

    Parameters
    ----------
    y_true : array-like
        Observed target values, passed unchanged to ``r2_score``.
    y_pred : array-like
        Predicted target values, passed unchanged to ``r2_score``.
    n : int
        Number of samples the score is evaluated on.
    p : int
        Number of predictors used by the model.

    Returns
    -------
    float
        The adjusted R-squared value.

    Raises
    ------
    ValueError
        If ``n - p - 1 <= 0``. The denominator of the adjustment would be
        zero (division by zero) or negative (sign-flipped, meaningless result).
    """
    residual_dof = n - p - 1
    # Validate before scoring so a bad (n, p) pair fails with a clear message
    # instead of a ZeroDivisionError/inf or a silently wrong number.
    if residual_dof <= 0:
        raise ValueError(
            f'adjusted R-squared requires n > p + 1 (got n={n}, p={p})'
        )
    r2 = r2_score(y_true, y_pred)
    return 1 - (1 - r2) * (n - 1) / residual_dof



# n: number of held-out samples; p: number of feature columns.
n, p = len(y_test), X_test.shape[1]
adjusted_r2 = adjusted_r2_score(y_true=y_test, y_pred=y_pred, n=n, p=p)
print(f'Adjusted R-squared: {adjusted_r2}')

Adjusted R-squared: 0.5749637928613558


# Mean Absolute Percentage Error (MAPE)
Description: MAPE measures the accuracy of predictions as a percentage. It is useful for understanding the error in relative terms.

Lower MAPE indicates better model performance. It is expressed as a percentage, making it easier to interpret in some contexts. However, it can be problematic when actual values are very close to zero.

In [9]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100`` element by element.

    Both inputs are converted to float NumPy arrays first, so values are
    compared by position. Without this, two pandas Series carrying different
    indexes would be aligned by label and produce NaN, and plain Python lists
    would not support the subtraction at all.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    float
        MAPE as a percentage (e.g. ``10.0`` means 10%).

    Notes
    -----
    Zero entries in ``y_true`` are not special-cased: the division follows
    NumPy semantics, so the result is not meaningful when actual values are
    zero or very close to zero.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100

# Relative error on the held-out rows, reported as a percentage.
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Absolute Percentage Error (MAPE): {mape}%')

Mean Absolute Percentage Error (MAPE): 31.95218741361489%


# Median Absolute Error
Description: Median absolute error provides a robust measure of the central tendency of errors, less influenced by outliers than MAE. It is the median of all absolute errors.

Lower median absolute error indicates better model performance. It is particularly useful when the dataset contains outliers that might skew the mean error metrics.


In [10]:
# Median of the absolute differences between held-out targets and predictions.
median_ae = median_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'Median Absolute Error: {median_ae}')

Median Absolute Error: 0.41023300084958947
