# Importing Packages and Reading Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Read the California housing data into a DataFrame; header=0 tells pandas
# to take the column names from the first row of the file.
csv_path = 'California_Houses.csv'
dataset = pd.read_csv(csv_path, header=0)

# Report the (rows, columns) shape of the frame exactly as loaded.
print("Shape Before: ", dataset.shape)



Shape Before:  (20640, 14)


# Splitting the Data
## 70% Training
## 15% Validation
## 15% Test

In [2]:
# Hold-out split in two stages: 30% of the rows are set aside first, then
# that 30% is cut in half, giving 70% train / 15% validation / 15% test.
# Both stages use the same fixed seed so the partition is reproducible.
split_seed = 42
features = dataset.drop(columns=['Median_House_Value'])
target = dataset['Median_House_Value']

# Stage 1: training set vs. the combined validation+test remainder.
x_train, x_val_test, y_train, y_val_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=split_seed,
)

# Stage 2: split the remainder evenly into validation and test sets.
x_val, x_test, y_val, y_test = train_test_split(
    x_val_test,
    y_val_test,
    test_size=0.5,
    random_state=split_seed,
)

# Linear Regression
### Including Error Calculations:
*   Mean Squared Error
*   Mean Absolute Error



In [3]:
# Applying Linear Regression
# Baseline model: fitted on the training split only, then evaluated on the
# training, validation and test splits with the same fitted coefficients.
lr_model = LinearRegression().fit(x_train, y_train)

# Predictions for each split, in train / validation / test order.
y_prediction_lr_train, y_prediction_lr_val, y_prediction_lr_test = (
    lr_model.predict(split_features)
    for split_features in (x_train, x_val, x_test)
)

# (actual, predicted) pairs shared by both error metrics below.
lr_actual_vs_predicted = (
    (y_train, y_prediction_lr_train),
    (y_val, y_prediction_lr_val),
    (y_test, y_prediction_lr_test),
)

# Mean Squared Error for Linear Regression
mean_sqr_error_lr_train, mean_sqr_error_lr_val, mean_sqr_error_lr_test = (
    mean_squared_error(actual, predicted)
    for actual, predicted in lr_actual_vs_predicted
)

# Mean Absolute Error for Linear Regression
mean_abs_error_lr_train, mean_abs_error_lr_val, mean_abs_error_lr_test = (
    mean_absolute_error(actual, predicted)
    for actual, predicted in lr_actual_vs_predicted
)

# Report both metrics, one line per split, each block closed by a separator.
split_labels = ("Training Set:", "Validation Set:", "Test Set:")

print("\nMean Squared Error for Linear Regression:")
for split_label, split_error in zip(
    split_labels,
    (mean_sqr_error_lr_train, mean_sqr_error_lr_val, mean_sqr_error_lr_test),
):
    print(split_label, split_error)
print("=======================\n\n")

print("\nMean Absolute Error for Linear Regression:")
for split_label, split_error in zip(
    split_labels,
    (mean_abs_error_lr_train, mean_abs_error_lr_val, mean_abs_error_lr_test),
):
    print(split_label, split_error)
print("=======================\n\n")


Mean Squared Error for Linear Regression:
Training Set: 4730358742.500039
Validation Set: 4907211997.374879
Test Set: 4400953150.614249



Mean Absolute Error for Linear Regression:
Training Set: 49927.00384503198
Validation Set: 50790.060271051945
Test Set: 48782.031080858054




# Lasso Regression
### Including Error Calculations:

*   Mean Squared Error
*   Mean Absolute Error

In [4]:
# Applying Lasso Regression
# L1-penalised linear model (alpha = 1.0, at most 5000 solver iterations),
# fitted on the training split only and scored on all three splits.
# NOTE(review): no feature-scaling step is visible before this fit; confirm
# whether the features should be standardized before applying the penalty.
lasso_model = Lasso(alpha=1.0, max_iter=5000)
lasso_model.fit(x_train, y_train)

# Predictions for each split, in train / validation / test order.
y_prediction_lasso_train, y_prediction_lasso_val, y_prediction_lasso_test = (
    lasso_model.predict(split_features)
    for split_features in (x_train, x_val, x_test)
)

# (actual, predicted) pairs shared by both error metrics below.
lasso_actual_vs_predicted = (
    (y_train, y_prediction_lasso_train),
    (y_val, y_prediction_lasso_val),
    (y_test, y_prediction_lasso_test),
)
split_labels = ("Training Set:", "Validation Set:", "Test Set:")

# Mean Squared Error for Lasso Regression
mean_sqr_error_lasso_train, mean_sqr_error_lasso_val, mean_sqr_error_lasso_test = (
    mean_squared_error(actual, predicted)
    for actual, predicted in lasso_actual_vs_predicted
)

print("\nMean Squared Error for Lasso Regression:")
for split_label, split_error in zip(
    split_labels,
    (mean_sqr_error_lasso_train, mean_sqr_error_lasso_val, mean_sqr_error_lasso_test),
):
    print(split_label, split_error)
print("=======================\n\n")

# Mean Absolute Error for Lasso Regression
mean_abs_error_lasso_train, mean_abs_error_lasso_val, mean_abs_error_lasso_test = (
    mean_absolute_error(actual, predicted)
    for actual, predicted in lasso_actual_vs_predicted
)

print("\nMean Absolute Error for Lasso Regression:")
for split_label, split_error in zip(
    split_labels,
    (mean_abs_error_lasso_train, mean_abs_error_lasso_val, mean_abs_error_lasso_test),
):
    print(split_label, split_error)
print("=======================\n\n")


Mean Squared Error for Lasso Regression:
Training Set: 4730358780.904339
Validation Set: 4907219718.486601
Test Set: 4400960661.573396



Mean Absolute Error for Lasso Regression:
Training Set: 49927.18298952366
Validation Set: 50790.27347325689
Test Set: 48782.20650021449




# Ridge Regression
### Including Error Calculations:
*   Mean Squared Error
*   Mean Absolute Error

In [5]:
# Applying Ridge Regression
# L2-penalised linear model (alpha = 1.0), fitted on the training split only
# and scored on the training, validation and test splits.
# NOTE(review): no feature-scaling step is visible before this fit; confirm
# whether the features should be standardized before applying the penalty.
ridge_model = Ridge(alpha=1.0).fit(x_train, y_train)

# Predictions for each split, in train / validation / test order.
y_prediction_ridge_train, y_prediction_ridge_val, y_prediction_ridge_test = (
    ridge_model.predict(split_features)
    for split_features in (x_train, x_val, x_test)
)

# (actual, predicted) pairs shared by both error metrics below.
ridge_actual_vs_predicted = (
    (y_train, y_prediction_ridge_train),
    (y_val, y_prediction_ridge_val),
    (y_test, y_prediction_ridge_test),
)

# Mean Squared Error for Ridge Regression
mean_sqr_error_ridge_train, mean_sqr_error_ridge_val, mean_sqr_error_ridge_test = (
    mean_squared_error(actual, predicted)
    for actual, predicted in ridge_actual_vs_predicted
)

# Mean Absolute Error for Ridge Regression
mean_abs_error_ridge_train, mean_abs_error_ridge_val, mean_abs_error_ridge_test = (
    mean_absolute_error(actual, predicted)
    for actual, predicted in ridge_actual_vs_predicted
)

split_labels = ("Training Set:", "Validation Set:", "Test Set:")

# The MSE block is followed by a separator line; the MAE block is not.
print("Mean Squared Error for Ridge Regression:")
for split_label, split_error in zip(
    split_labels,
    (mean_sqr_error_ridge_train, mean_sqr_error_ridge_val, mean_sqr_error_ridge_test),
):
    print(split_label, split_error)
print("=======================\n\n")

print("\nMean Absolute Error for Ridge Regression:")
for split_label, split_error in zip(
    split_labels,
    (mean_abs_error_ridge_train, mean_abs_error_ridge_val, mean_abs_error_ridge_test),
):
    print(split_label, split_error)

Mean Squared Error for Ridge Regression:
Training Set: 4730359058.048789
Validation Set: 4907226928.247801
Test Set: 4400963939.986329



Mean Absolute Error for Ridge Regression:
Training Set: 49927.48824981005
Validation Set: 50790.607314504
Test Set: 48782.50871135058


# Models Comparison

In [6]:
# Comparing Models:
# The table reports each model's error on the held-out test split. The "best"
# model, however, is chosen from the validation-split errors: picking the
# winner by test error would reuse the test data for model selection and make
# the reported test error optimistic. The 15% validation split exists for
# exactly this purpose.
print("\nComparing Models:")
print("------------------")
print("Linear Regression:")
print(f"  - MSE (Test): {mean_sqr_error_lr_test:.4f}")
print(f"  - MAE (Test): {mean_abs_error_lr_test:.4f}")

print("\nLasso Regression:")
print(f"  - MSE (Test): {mean_sqr_error_lasso_test:.4f}")
print(f"  - MAE (Test): {mean_abs_error_lasso_test:.4f}")

print("\nRidge Regression:")
print(f"  - MSE (Test): {mean_sqr_error_ridge_test:.4f}")
print(f"  - MAE (Test): {mean_abs_error_ridge_test:.4f}")

# Test-split metrics keyed by model name (reporting only, not selection).
models_mse = {
    "Linear Regression": mean_sqr_error_lr_test,
    "Lasso Regression": mean_sqr_error_lasso_test,
    "Ridge Regression": mean_sqr_error_ridge_test
}

models_mae = {
    "Linear Regression": mean_abs_error_lr_test,
    "Lasso Regression": mean_abs_error_lasso_test,
    "Ridge Regression": mean_abs_error_ridge_test
}

# Validation-split metrics keyed by model name (used to pick the best model).
models_mse_val = {
    "Linear Regression": mean_sqr_error_lr_val,
    "Lasso Regression": mean_sqr_error_lasso_val,
    "Ridge Regression": mean_sqr_error_ridge_val
}

models_mae_val = {
    "Linear Regression": mean_abs_error_lr_val,
    "Lasso Regression": mean_abs_error_lasso_val,
    "Ridge Regression": mean_abs_error_ridge_val
}

# min() over the dict iterates its keys; key=<dict>.get ranks them by error.
min_mse_model = min(models_mse_val, key=models_mse_val.get)
min_mae_model = min(models_mae_val, key=models_mae_val.get)

print(f"\nThe model with the minimum validation MSE is: {min_mse_model}")
print(f"The model with the minimum validation MAE is: {min_mae_model}")



Comparing Models:
------------------
Linear Regression:
  - MSE (Test): 4400953150.6142
  - MAE (Test): 48782.0311

Lasso Regression:
  - MSE (Test): 4400960661.5734
  - MAE (Test): 48782.2065

Ridge Regression:
  - MSE (Test): 4400963939.9863
  - MAE (Test): 48782.5087

The model with the minimum MSE is: Linear Regression
The model with the minimum MAE is: Linear Regression


### Comments on Results

1. **Error Calculation**: All models have similar Mean Squared Error (roughly 4.4–4.9 billion across the training, validation, and test sets) and Mean Absolute Error (roughly 48,800–50,800), showing that predictions differ significantly from actual house values. This level of error might be large, depending on the typical value of houses in the dataset.

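2. **Regularization Impact**: Lasso and Ridge didn’t improve on Linear Regression (their errors are marginally higher), suggesting that the features have little redundancy and that few of them are irrelevant. All features seem useful, and because the training and validation errors are close to each other, overfitting doesn’t appear to be an issue.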

3. **Model Choice**: Since all models perform similarly, Linear Regression is the best choice due to its simplicity and interpretability. Regularization isn’t needed here.
