In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load California housing dataset
data = fetch_california_housing()

In [3]:
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

In [4]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [5]:
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Length: 20640, dtype: float64

In [6]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Define models and parameters
model_params = {
    'Linear Regression': {
        'model': LinearRegression(),
        'param_grid': {}
    },
    'Decision Tree': {
        'model': DecisionTreeRegressor(),
        'param_grid': {
            'max_depth': [3, 5, 10],
            'min_samples_split': [2, 5, 10]
        }
    },
    'Random Forest': {
        'model': RandomForestRegressor(),
        'param_grid': {
            'n_estimators': [50, 100],
            'max_depth': [3, 5, 10]
        }
    },
    'XGBoost': {
        'model': XGBRegressor(objective='reg:squarederror', random_state=42),
        'param_grid': {
            'n_estimators': [50, 100],
            'max_depth': [3, 5],
            'learning_rate': [0.01, 0.1]
        }
    }
}

In [9]:
# Run GridSearchCV for each model
results = []

for name, mp in model_params.items():
    grid = GridSearchCV(mp['model'], param_grid=mp['param_grid'], cv=3, scoring='r2', n_jobs=-1)
    grid.fit(X_train, y_train)
    
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results.append({
        'Model': name,
        'Best Params': grid.best_params_,
        'MSE': mse,
        'R2 Score': r2
    })

# Output results
results_df = pd.DataFrame(results)
print("\nSummary Results:")
print(results_df)


Summary Results:
               Model                                        Best Params  \
0  Linear Regression                                                 {}   
1      Decision Tree         {'max_depth': 10, 'min_samples_split': 10}   
2      Random Forest             {'max_depth': 10, 'n_estimators': 100}   
3            XGBoost  {'learning_rate': 0.1, 'max_depth': 5, 'n_esti...   

        MSE  R2 Score  
0  0.555892  0.575788  
1  0.417733  0.681219  
2  0.296245  0.773929  
3  0.243600  0.814104  
