In [16]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import numpy as np
import pandas as pd

In [28]:
housing = fetch_california_housing()
X, y = housing.data, housing.target
# print(X)
print("**********")
# print(y)


# Convert to DataFrame and add column names
df = pd.DataFrame(X, columns=housing.feature_names)

# Add the target variable (house price)
df["Target"] = y

# Display the first few rows
print(df.head())  # Show first 5 records


**********
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  Target  
0    -122.23   4.526  
1    -122.22   3.585  
2    -122.24   3.521  
3    -122.25   3.413  
4    -122.25   3.422  


In [29]:
# Load the California Housing dataset
#loading the dataset and obtaining info about columns
df=pd.read_csv("california_housing.csv")

# let's create 2 more columns with the total bedrooms and rooms per population in the same block.
df['average_rooms']=df['total_rooms']/df['population']
df['average_bedrooms']=df['total_bedrooms']/df['population']

#Choosing the dependant variable and the regressors. In this case we want to predict the housing price
X=df[['longitude', #
 'latitude', #
 'housing_median_age', #
 'population', #
 'median_income', # 
 # 'ocean_proximity',
 'average_rooms', #
 'average_bedrooms']] #

y=df['median_house_value']

print(X)
# print("*************")
# print(df.columns)



       longitude  latitude  housing_median_age  population  median_income  \
0        -122.23     37.88                  41         322         8.3252   
1        -122.22     37.86                  21        2401         8.3014   
2        -122.24     37.85                  52         496         7.2574   
3        -122.25     37.85                  52         558         5.6431   
4        -122.25     37.85                  52         565         3.8462   
...          ...       ...                 ...         ...            ...   
20635    -121.09     39.48                  25         845         1.5603   
20636    -121.21     39.49                  18         356         2.5568   
20637    -121.22     39.43                  17        1007         1.7000   
20638    -121.32     39.43                  18         741         1.8672   
20639    -121.24     39.37                  16        1387         2.3886   

       average_rooms  average_bedrooms  
0           2.732919          0.40

In [19]:
# Print key information about the dataset
print(f"Dataset shape: {X.shape}")
print(f"Features: {housing.feature_names}")

Dataset shape: (20640, 7)
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [20]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [22]:
# This takes around 1-2 minutes

# Define parameter grid
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.05],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Create XGBRegressor
model = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1)

# Perform grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print best score and parameters
print(f"Best score: {-grid_search.best_score_:.3f} (MSE)")
print(f"Best parameters: {grid_search.best_params_}")


Best score: 2147759573.333 (MSE)
Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 1.0}


In [25]:
# Access best model
best_model = grid_search.best_estimator_

# Save best model
best_model.save_model('best_model_housing.ubj')

In [30]:
# Load saved model
loaded_model = XGBRegressor()
loaded_model.load_model('best_model_housing.ubj')

# Use loaded model for predictions
predictions = loaded_model.predict(X_test)

# Print evaluation metrics
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f"Mean Absolute Error: {mae:.3f}") # Not very common
print(f"Mean Squared Error: {mse:.3f}") # not very common
print(f"R^2 Score: {r2:.3f}") # very common


from sklearn.metrics import mean_absolute_percentage_error
mape = mean_absolute_percentage_error(y_test, predictions) * 100
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%") # very common

Mean Absolute Error: 32174.545
Mean Squared Error: 2409476352.000
R^2 Score: 0.816
Mean Absolute Percentage Error (MAPE): 19.11%


Example: If R² = 0.816, we say "81.6% of the variance in the target variable is explained by the model."

For the most part I focus on R square and MAPE:
- if MAPE < 10%, model is excellant
- if MAPE is in between 10 - 20 %, model is good
- If mape is in between 20 - 50%, model is moderate
- If MAPE > 50%, model is highly inaccurate.