# California Housing Price Prediction 
**Linear Regression, Lasso Regression, and Ridge Regression** 

## 1. Import Required Libraries

In [28]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


## 2. Load and Explore the Dataset

In [30]:

# Load dataset
data = pd.read_csv('California_Houses.csv')  

# Display basic information
print(data.info())
print(data.describe())

# Check for missing values
print(data.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Median_House_Value        20640 non-null  float64
 1   Median_Income             20640 non-null  float64
 2   Median_Age                20640 non-null  int64  
 3   Tot_Rooms                 20640 non-null  int64  
 4   Tot_Bedrooms              20640 non-null  int64  
 5   Population                20640 non-null  int64  
 6   Households                20640 non-null  int64  
 7   Latitude                  20640 non-null  float64
 8   Longitude                 20640 non-null  float64
 9   Distance_to_coast         20640 non-null  float64
 10  Distance_to_LA            20640 non-null  float64
 11  Distance_to_SanDiego      20640 non-null  float64
 12  Distance_to_SanJose       20640 non-null  float64
 13  Distance_to_SanFrancisco  20640 non-null  float64
dtypes: flo

## 3. Data Preprocessing

In [32]:

# Drop rows with missing values (if any)
data = data.dropna()

# Define features and target variable
X = data.drop(columns=['Median_House_Value'])
y = data['Median_House_Value']

# Split data into training and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

## 4. Train Linear Regression Model

In [34]:

# Train Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred_linear_train = linear_model.predict(X_train_scaled)
y_pred_linear_val = linear_model.predict(X_val_scaled)
y_pred_linear_test = linear_model.predict(X_test_scaled)

print('Linear Regression Performance:')
print('Training Set:')
print('  - MAE:', mean_absolute_error(y_train, y_pred_linear_train))
print('  - MSE:', mean_squared_error(y_train, y_pred_linear_train))
print('  - R² Score:', r2_score(y_train, y_pred_linear_train))
print('Validation Set:')
print('  - MAE:', mean_absolute_error(y_val, y_pred_linear_val))
print('  - MSE:', mean_squared_error(y_val, y_pred_linear_val))
print('  - R² Score:', r2_score(y_val, y_pred_linear_val))
print('Test Set:')
print('  - MAE:', mean_absolute_error(y_test, y_pred_linear_test))
print('  - MSE:', mean_squared_error(y_test, y_pred_linear_test))
print('  - R² Score:', r2_score(y_test, y_pred_linear_test))

Linear Regression Performance:
Training Set:
  - MAE: 49927.00384503118
  - MSE: 4730358742.50004
  - R² Score: 0.6469080431646521
Validation Set:
  - MAE: 50790.060271050934
  - MSE: 4907211997.374781
  - R² Score: 0.6233241175944966
Test Set:
  - MAE: 48782.03108085671
  - MSE: 4400953150.613741
  - R² Score: 0.6671770047345611


## 5. Train Lasso Regression Model

In [36]:

# Train Lasso Regression model
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred_lasso_train = lasso_model.predict(X_train_scaled)
y_pred_lasso_val = lasso_model.predict(X_val_scaled)
y_pred_lasso_test = lasso_model.predict(X_test_scaled)

print('Lasso Regression Performance:')
print('Training Set:')
print('  - MAE:', mean_absolute_error(y_train, y_pred_lasso_train))
print('  - MSE:', mean_squared_error(y_train, y_pred_lasso_train))
print('  - R² Score:', r2_score(y_train, y_pred_lasso_train))
print('Validation Set:')
print('  - MAE:', mean_absolute_error(y_val, y_pred_lasso_val))
print('  - MSE:', mean_squared_error(y_val, y_pred_lasso_val))
print('  - R² Score:', r2_score(y_val, y_pred_lasso_val))
print('Test Set:')
print('  - MAE:', mean_absolute_error(y_test, y_pred_lasso_test))
print('  - MSE:', mean_squared_error(y_test, y_pred_lasso_test))
print('  - R² Score:', r2_score(y_test, y_pred_lasso_test))

Lasso Regression Performance:
Training Set:
  - MAE: 49927.72937227726
  - MSE: 4730365753.095632
  - R² Score: 0.6469075198671497
Validation Set:
  - MAE: 50790.82762033131
  - MSE: 4907228147.161598
  - R² Score: 0.6233228779424898
Test Set:
  - MAE: 48782.15410025204
  - MSE: 4400631541.595689
  - R² Score: 0.667201326483289


  model = cd_fast.enet_coordinate_descent(


## 6. Train Ridge Regression Model

In [38]:

# Train Ridge Regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred_ridge_train = ridge_model.predict(X_train_scaled)
y_pred_ridge_val = ridge_model.predict(X_val_scaled)
y_pred_ridge_test = ridge_model.predict(X_test_scaled)

print('Ridge Regression Performance:')
print('Training Set:')
print('  - MAE:', mean_absolute_error(y_train, y_pred_ridge_train))
print('  - MSE:', mean_squared_error(y_train, y_pred_ridge_train))
print('  - R² Score:', r2_score(y_train, y_pred_ridge_train))
print('Validation Set:')
print('  - MAE:', mean_absolute_error(y_val, y_pred_ridge_val))
print('  - MSE:', mean_squared_error(y_val, y_pred_ridge_val))
print('  - R² Score:', r2_score(y_val, y_pred_ridge_val))
print('Test Set:')
print('  - MAE:', mean_absolute_error(y_test, y_pred_ridge_test))
print('  - MSE:', mean_squared_error(y_test, y_pred_ridge_test))
print('  - R² Score:', r2_score(y_test, y_pred_ridge_test))


Ridge Regression Performance:
Training Set:
  - MAE: 49930.60200604057
  - MSE: 4730387587.217243
  - R² Score: 0.6469058900853201
Validation Set:
  - MAE: 50793.610268198856
  - MSE: 4907281049.444644
  - R² Score: 0.6233188171816835
Test Set:
  - MAE: 48784.33026729786
  - MSE: 4400540039.597478
  - R² Score: 0.6672082463408915


7. Model Comparison


In [40]:
# Compare models based on Test Set performance
models_mse = {
    "Linear Regression": mean_squared_error(y_test, y_pred_linear_test),
    "Lasso Regression": mean_squared_error(y_test, y_pred_lasso_test),
    "Ridge Regression": mean_squared_error(y_test, y_pred_ridge_test)
}

models_mae = {
    "Linear Regression": mean_absolute_error(y_test, y_pred_linear_test),
    "Lasso Regression": mean_absolute_error(y_test, y_pred_lasso_test),
    "Ridge Regression": mean_absolute_error(y_test, y_pred_ridge_test)
}

models_r2 = {
    "Linear Regression": r2_score(y_test, y_pred_linear_test),
    "Lasso Regression": r2_score(y_test, y_pred_lasso_test),
    "Ridge Regression": r2_score(y_test, y_pred_ridge_test)
}

# Identify the best model for each metric
min_mse_model = min(models_mse, key=models_mse.get)
min_mae_model = min(models_mae, key=models_mae.get)
max_r2_model = max(models_r2, key=models_r2.get)

print('\nModel Comparison:')
print('------------------')
print(f"The model with the minimum MSE is: {min_mse_model}")
print(f"The model with the minimum MAE is: {min_mae_model}")
print(f"The model with the maximum R² Score is: {max_r2_model}")


Model Comparison:
------------------
The model with the minimum MSE is: Ridge Regression
The model with the minimum MAE is: Linear Regression
The model with the maximum R² Score is: Ridge Regression


 8. Comments on Results

In [42]:
print('\nAnalysis of Results:')
print('1. **Error Metrics**: The models exhibit comparable performance on the test set, with minor differences in MSE, MAE, and R² Score.')
print('2. **Effect of Regularization**: Lasso and Ridge Regression offer little advantage over Linear Regression, indicating that overfitting is not a major concern.')
print('3. **Model Selection**: Linear Regression is preferred due to its simplicity and similar performance. Regularization techniques (Lasso/Ridge) do not provide significant gains in this scenario.')
print('4. **Feature Scaling Influence**: Standardizing the features enhanced model performance, particularly for Lasso and Ridge Regression, which are sensitive to feature scaling.')
print('5. **Convergence Issue**: The Lasso Regression model required an increase in `max_iter` to achieve convergence, emphasizing the need for proper hyperparameter tuning.')



Analysis of Results:
1. **Error Metrics**: The models exhibit comparable performance on the test set, with minor differences in MSE, MAE, and R² Score.
2. **Effect of Regularization**: Lasso and Ridge Regression offer little advantage over Linear Regression, indicating that overfitting is not a major concern.
3. **Model Selection**: Linear Regression is preferred due to its simplicity and similar performance. Regularization techniques (Lasso/Ridge) do not provide significant gains in this scenario.
4. **Feature Scaling Influence**: Standardizing the features enhanced model performance, particularly for Lasso and Ridge Regression, which are sensitive to feature scaling.
5. **Convergence Issue**: The Lasso Regression model required an increase in `max_iter` to achieve convergence, emphasizing the need for proper hyperparameter tuning.
