In [1]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Load California housing dataset
california_housing = fetch_california_housing()
X = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
y = pd.Series(california_housing.target)
housing_df = pd.concat([X, y], axis=1)
housing_df = housing_df.rename(columns={0:'Price'})
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [3]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape[0])
print("Testing set size:", X_test.shape[0])

Training set size: 16512
Testing set size: 4128


In [4]:
# Define base models
base_models = [
    ('dt', DecisionTreeRegressor(random_state=42)),
    ('svr', SVR()),
    ('rf', RandomForestRegressor(random_state=42, n_estimators=100))
]

# Define meta-model
meta_model = LinearRegression()

In [5]:
# Train base models and show individual results
for name, model in base_models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error of {name}: {mse:.2f}')

Mean Squared Error of dt: 0.50
Mean Squared Error of svr: 1.33
Mean Squared Error of rf: 0.26


In [6]:
# Create stacking regressor
stacked_model = StackingRegressor(estimators = base_models, final_estimator = meta_model)

In [8]:
# Fit the stacking model
stacked_model.fit(X_train, y_train)

In [9]:
# Check stacked model Mean Squared Error
y_stacked_pred = stacked_model.predict(X_test)
stacked_mse = mean_squared_error(y_test, y_stacked_pred)

print(f'\nMean Squared Error of Stacking Model: {stacked_mse:.2f}')


Mean Squared Error of Stacking Model: 0.25


In [10]:
# Displaying individual model predictions
for name, model in base_models:
    y_pred = model.predict(X_test)
    print(f'Predictions from {name}:', y_pred[:5])  # Print first 5 predictions
print('Stacking Model Predictions:', y_stacked_pred[:5])  # Print first 5 predictions


Predictions from dt: [0.414   1.203   5.00001 2.17    2.257  ]
Predictions from svr: [1.74839687 1.77561334 1.88771095 1.76104909 1.84286835]
Predictions from rf: [0.5095    0.74161   4.9232571 2.52961   2.27369  ]
Stacking Model Predictions: [0.46352386 0.70833247 4.9715121  2.52030711 2.267117  ]
