In [15]:
# Import necessary libraries
import numpy as np  # For numerical operations
from sklearn.datasets import fetch_california_housing  # To fetch the California housing dataset
from sklearn.model_selection import train_test_split  # For splitting data into training and test sets
from sklearn.preprocessing import StandardScaler  # For standardizing the data
from sklearn.linear_model import Lasso, Ridge, ElasticNet  # For L1, L2, and Elastic Net regression models
from sklearn.metrics import mean_squared_error  # To calculate the performance metric

import warnings

# Blanket filter: every warning category is silenced from here on, which keeps
# the cell outputs short but also hides any warnings raised by later model fits.
warnings.filterwarnings(action="ignore")

# Fetch the California Housing dataset.
# It provides 8 numeric predictive attributes per block group and one target,
# the median house value.
data = fetch_california_housing()

# Show the description that ships with the dataset so the reader has context
# on the attributes and the units of the target.
dataset_description = data.DESCR
print(dataset_description)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [4]:
# Pull the predictors and the target out of the loaded dataset object.
X = data.data    # feature matrix (one row per block group)
y = data.target  # median house value for each row of X

# Sanity check: column names plus the first five samples of X and y.
print("Feature names:", data.feature_names)
print("First 5 rows of features:\n", X[:5])
print("First 5 target values:\n", y[:5])

Feature names: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
First 5 rows of features:
 [[ 8.32520000e+00  4.10000000e+01  6.98412698e+00  1.02380952e+00
   3.22000000e+02  2.55555556e+00  3.78800000e+01 -1.22230000e+02]
 [ 8.30140000e+00  2.10000000e+01  6.23813708e+00  9.71880492e-01
   2.40100000e+03  2.10984183e+00  3.78600000e+01 -1.22220000e+02]
 [ 7.25740000e+00  5.20000000e+01  8.28813559e+00  1.07344633e+00
   4.96000000e+02  2.80225989e+00  3.78500000e+01 -1.22240000e+02]
 [ 5.64310000e+00  5.20000000e+01  5.81735160e+00  1.07305936e+00
   5.58000000e+02  2.54794521e+00  3.78500000e+01 -1.22250000e+02]
 [ 3.84620000e+00  5.20000000e+01  6.28185328e+00  1.08108108e+00
   5.65000000e+02  2.18146718e+00  3.78500000e+01 -1.22250000e+02]]
First 5 target values:
 [4.526 3.585 3.521 3.413 3.422]


In [5]:
# Hold out 20% of the samples for testing and train on the remaining 80%.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Standardize the features to zero mean and unit variance.
# The L1/L2 penalties act on coefficient magnitudes, so features must share a
# common scale for the penalty to treat them evenly (it also helps the solvers
# converge).
#
# The scaler learns its statistics from the training split only; the test split
# is transformed with those same statistics so no test information leaks in.
#
# NOTE: this cell overwrites X_train / X_test with their scaled versions, so
# re-running it without re-running the split cell re-fits on scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# Lasso regression (L1 penalty).
# The penalty is proportional to the absolute value of each coefficient, which
# can push some coefficients to exactly zero and so yields a sparse model.
# `alpha` sets how strong that penalty is.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)

# Score the fitted model on the held-out split.
y_pred_lasso = lasso.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

In [8]:
# Ridge regression (L2 penalty).
# The penalty is proportional to the squared coefficients: it shrinks them
# toward zero but does not eliminate them outright.
# `alpha` sets how strong that penalty is.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)

# Score the fitted model on the held-out split.
y_pred_ridge = ridge.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

In [9]:
# Elastic Net regression (blend of the L1 and L2 penalties).
# `alpha` is the overall penalty strength; `l1_ratio` is the share given to the
# L1 term, so 0.5 weights the L1 and L2 parts equally.
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X_train, y_train)

# Score the fitted model on the held-out split.
y_pred_elastic = elastic_net.predict(X_test)
mse_elastic = mean_squared_error(y_test, y_pred_elastic)

In [10]:
# Report the test-set MSE of each model so they can be compared;
# a smaller value means the predictions are closer to the true targets.
print(f"Mean Squared Error (Lasso): {mse_lasso}")

Mean Squared Error (Lasso): 0.6796290284328821


In [11]:
# Test-set MSE of the Ridge model.
print(f"Mean Squared Error (Ridge): {mse_ridge}")

Mean Squared Error (Ridge): 0.5558548589435969


In [12]:
# Test-set MSE of the Elastic Net model.
print(f"Mean Squared Error (Elastic Net): {mse_elastic}")

Mean Squared Error (Elastic Net): 0.6358566879910775


In [16]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for Elastic Net.
#   alpha    -> overall regularization strength (larger = stronger penalty)
#   l1_ratio -> mix between the penalties (0 = pure L2, 1 = pure L1)
param_grid = {
    # NOTE(review): 0.75 breaks the otherwise increasing sequence and may be a
    # typo for 0.075. It is kept unchanged so the search results stay the same;
    # confirm the intended value.
    'alpha': [0.025, 0.05, 0.75, 0.1, 0.25],  # Regularization strength
    # Each value appears once: a repeated entry only re-fits identical
    # candidates (5 extra candidates x 5 folds) without changing the winner.
    'l1_ratio': [0, 0.25, 0.5, 0.1],  # Mixing ratio for L1/L2
}

# Exhaustive 5-fold cross-validated search over every alpha / l1_ratio pair.
# GridSearchCV works on clones of `elastic_net`, so the model fitted earlier is
# only used as a template and is not modified.
# Scoring is the *negative* MSE because GridSearchCV always maximizes its score.
grid_search = GridSearchCV(
    estimator=elastic_net,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Hyperparameters of the best-scoring candidate
print("Best Parameters:", grid_search.best_params_)

# Best candidate, refit on the full training split (refit=True is the default)
best_model = grid_search.best_estimator_

# Final check on the held-out test split, which played no part in the search
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Test Mean Squared Error:", mse)

Best Parameters: {'alpha': 0.1, 'l1_ratio': 0}
Test Mean Squared Error: 0.5822654306706845
