In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from ngboost import NGBRegressor

Objective: Fit various regression models such as linear regression, random forests, extra random trees, AdaBoost, XGBoost, and NGBoost on the data and compute the RMSE and MAE for each model, compare their results and present an analysis.

In [3]:
file = r"C:\Users\Abhim\Downloads\california_housing\housing.csv"
df = pd.read_csv(file)
df = df.dropna()
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
#drop ocean_proximity

df = df.drop('ocean_proximity', axis=1)

In [5]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [6]:
scaler = StandardScaler()

X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

X_scaled = scaler.fit_transform(X)
y_scaled = scaler.fit_transform(y.values.reshape(-1,1)).flatten()

In [7]:
X.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462


In [8]:
#pca to identify top 3 most important features

pca = PCA(n_components=3)
pca = pca.fit_transform(X_scaled)

pca = pd.DataFrame(pca, columns=['PC1', 'PC2', 'PC3'])

pca

Unnamed: 0,PC1,PC2,PC3
0,-2.091848,1.395074,2.051422
1,2.936717,1.934054,2.154508
2,-1.968320,1.464482,1.183152
3,-1.917650,1.505893,0.386435
4,-1.775537,1.564319,-0.466925
...,...,...,...
20428,-1.048033,1.733655,-0.795678
20429,-1.864341,1.637843,0.026344
20430,-0.449721,1.807140,-0.529281
20431,-0.861387,1.795404,-0.423595


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
#linear regression

lr = LinearRegression()
lr = lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred))
mae_lr = mean_absolute_error(y_test, y_pred)

print(rmse_lr, mae_lr)


70156.12045736378 51372.67217050005


In [11]:
#random forest

rf = RandomForestRegressor()
rf = rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred))
mae_rf = mean_absolute_error(y_test, y_pred)

print(rmse_rf, mae_rf)

49214.06444403339 32004.85874235381


In [12]:
#extra trees

et = ExtraTreesRegressor()
et = et.fit(X_train, y_train)

y_pred = et.predict(X_test)

rmse_et = np.sqrt(mean_squared_error(y_test, y_pred))
mae_et = mean_absolute_error(y_test, y_pred)

print(rmse_et, mae_et)

53244.824233591535 35018.954274528995


In [13]:
#adaboost

adaboost = AdaBoostRegressor()
adaboost = adaboost.fit(X_train, y_train)

y_pred = adaboost.predict(X_test)

rmse_adaboost = np.sqrt(mean_squared_error(y_test, y_pred))
mae_adaboost = mean_absolute_error(y_test, y_pred)

print(rmse_adaboost, mae_adaboost)

92047.10225751337 79515.39632603253


In [14]:
#xgboost

xgb = XGBRegressor()
xgb = xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)

rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred))
mae_xgb = mean_absolute_error(y_test, y_pred)

print(rmse_xgb, mae_xgb)

49025.401198716805 32445.779015877324


In [15]:
ngb = NGBRegressor()
ngb = ngb.fit(X_train, y_train)

y_pred = ngb.predict(X_test)

rmse_ngb = np.sqrt(mean_squared_error(y_test, y_pred))
mae_ngb = mean_absolute_error(y_test, y_pred)

print(rmse_ngb, mae_ngb)

[iter 0] loss=13.0721 val_loss=0.0000 scale=1.0000 norm=90732.8760
[iter 100] loss=12.6838 val_loss=0.0000 scale=1.0000 norm=60466.4845
[iter 200] loss=12.4906 val_loss=0.0000 scale=2.0000 norm=101356.0795
[iter 300] loss=12.3646 val_loss=0.0000 scale=2.0000 norm=91206.4028
[iter 400] loss=12.2823 val_loss=0.0000 scale=1.0000 norm=42734.9913
60462.48854344614 42706.437005765045


In [16]:
#comparison of all used regression models and their metrics

models = ['Linear Regression', 'Random Forest', 'Extra Trees', 'AdaBoost', 'XGBoost', 'NGBoost']
rmse = [rmse_lr, rmse_rf, rmse_et, rmse_adaboost, rmse_xgb, rmse_ngb]
mae = [mae_lr, mae_rf, mae_et, mae_adaboost, mae_xgb, mae_ngb]

results = pd.DataFrame({'Model': models, 'RMSE': rmse, 'MAE': mae})

results

Unnamed: 0,Model,RMSE,MAE
0,Linear Regression,70156.120457,51372.672171
1,Random Forest,49214.064444,32004.858742
2,Extra Trees,53244.824234,35018.954275
3,AdaBoost,92047.102258,79515.396326
4,XGBoost,49025.401199,32445.779016
5,NGBoost,60462.488543,42706.437006


Conclusion: XGBoost Regression yields the lowest RMSE value and the second lowest MAE, showing that it can handle larger errors between the target and predicted data, and generalize well on the dataset. Random forest regression has the lowest MAE value and second lowest RMSE, which shows that it gives more consistent predictions of house prices than XGBoost, but might be sensitive to outliers in the data.