In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Fetch the California housing bundle, then expose the features as a
# labelled DataFrame (X) and keep the target values as-is (y).
data = fetch_california_housing()
X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = data["target"]
print("Dataset Shape:", X.shape)

Dataset Shape: (20640, 8)


In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each feature partition.
for label, part in (("Train Shape:", X_train), ("Test Shape:", X_test)):
    print(label, part.shape)

Train Shape: (16512, 8)
Test Shape: (4128, 8)


In [5]:
# Baseline candidates, keyed by the display name used in the results table.
models = {}
models["Linear Regression"] = LinearRegression()
models["Decision Tree"] = DecisionTreeRegressor(random_state=42)

# results maps model name -> {"RMSE": ..., "R2": ...}, both measured on the
# held-out test split.
results = {}

for name, model in models.items():
    # fit() returns the fitted estimator, so prediction can be chained.
    preds = model.fit(X_train, y_train).predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    results[name] = dict(RMSE=rmse, R2=r2)

print("\nBaseline Results:")
# Transpose so each model is a row and each metric a column.
print(pd.DataFrame(results).T)


Baseline Results:
                       RMSE        R2
Linear Regression  0.745581  0.575788
Decision Tree      0.703729  0.622076


In [6]:
print("\nCross Validation Scores:")

# Score every baseline model with cv=5 on the training split only.
# The scorer reports *negated* RMSE, so the mean is sign-flipped before
# printing to show a conventional (positive) RMSE.
for name, model in models.items():
    scores = cross_val_score(
        model, X_train, y_train, cv=5, scoring="neg_root_mean_squared_error"
    )
    print(f"{name} CV RMSE:", -scores.mean())


Cross Validation Scores:
Linear Regression CV RMSE: 0.7205271873526421
Decision Tree CV RMSE: 0.7240092833935426


In [7]:
# Fit a decision tree with default hyperparameters and compare its R2 on
# the data it was trained on against its R2 on the held-out split; a large
# gap between the two indicates overfitting.
tree = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

train_pred, test_pred = tree.predict(X_train), tree.predict(X_test)

print("\nOverfitting Check:")
print("Train R2:", r2_score(y_train, train_pred))
print("Test R2:", r2_score(y_test, test_pred))


Overfitting Check:
Train R2: 1.0
Test R2: 0.622075845135081


In [8]:
# Hyperparameter candidates for the decision tree; every combination of
# these values is evaluated by the grid search below.
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Grid search with cv=5 on the training split, ranked by negated RMSE
# (higher is better). fit() returns the fitted search object.
grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

print("\nBest Parameters:", grid_search.best_params_)

# Keep a handle on the best estimator found by the search for later cells.
best_model = grid_search.best_estimator_


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [9]:
# Evaluate the tuned tree on the held-out test split with the same two
# metrics used for the baselines (RMSE and R2).
final_preds = best_model.predict(X_test)

final_r2 = r2_score(y_test, final_preds)
final_rmse = np.sqrt(mean_squared_error(y_test, final_preds))

print("\nTuned Model Performance:")
for label, value in (("RMSE:", final_rmse), ("R2:", final_r2)):
    print(label, value)


Tuned Model Performance:
RMSE: 0.6390654005312799
R2: 0.6883380738855668


In [10]:
# Build the final comparison table: the baseline metrics plus one extra row
# for the tuned tree.
#
# The tuned row is added as a metric-keyed dict rather than a positional
# list, so each value is matched to its column by name ("RMSE" / "R2") and
# cannot silently land under the wrong metric if the layout of `results`
# changes. The table is also built in a single expression instead of being
# mutated after creation, so re-running the cell always yields the same frame.
comparison_df = pd.DataFrame(
    {
        **results,
        "Tuned Decision Tree": {"RMSE": final_rmse, "R2": final_r2},
    }
).T  # transpose: one row per model, one column per metric

print("\nFinal Model Comparison:")
print(comparison_df)


Final Model Comparison:
                         RMSE        R2
Linear Regression    0.745581  0.575788
Decision Tree        0.703729  0.622076
Tuned Decision Tree  0.639065  0.688338
