<a href="https://colab.research.google.com/github/adityayadav4507/100Days_of_ML/blob/main/64_Day_Bagging_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The `load_boston` function was removed from scikit-learn in version 1.2, so calling it now raises an error.

This notebook loads the California housing dataset instead:

In [1]:
# Load the California housing dataset, which this notebook uses in place of
# the removed `load_boston`.
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Put the feature matrix and the target into a single frame; `assign`
# appends 'target' as the last column.
housing_df = pd.DataFrame(
    data=housing.data,
    columns=housing.feature_names,
).assign(target=housing.target)

# Preview six randomly chosen rows.
housing_df.sample(n=6)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
5513,3.6538,36.0,5.065116,1.088372,632.0,2.939535,33.99,-118.41,2.761
1283,2.2361,46.0,4.4,1.075,101.0,2.525,38.02,-121.82,0.938
19806,0.8362,17.0,8.471698,2.849057,141.0,2.660377,40.07,-123.41,0.875
6130,3.4327,31.0,4.342939,1.063401,1548.0,4.461095,34.07,-117.99,1.472
1536,4.8088,32.0,5.285546,1.012926,1818.0,2.13631,37.9,-122.05,3.212
17343,4.375,15.0,5.373626,1.027473,407.0,2.236264,34.86,-120.41,1.58


In [4]:
# Show (rows, columns) of the frame; the column count includes 'target'.
housing_df.shape

(20640, 9)

In [5]:
# Separate predictors from the response: 'target' is the last column of
# housing_df, so dropping it leaves exactly the feature columns.
X = housing_df.drop(columns="target")
y = housing_df["target"]

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [8]:
# Baseline regressors that the bagging section of this notebook is compared
# against.
#
# The two tree-based models are randomised, so they get a fixed seed (the
# same value used for the train/test split) to make their scores repeatable
# after a kernel restart. LinearRegression and KNeighborsRegressor have no
# `random_state` parameter.
#
# NOTE(review): KNN is distance-based and the features are not scaled in this
# notebook; consider a StandardScaler pipeline if the KNN score matters.
lr = LinearRegression()
knn = KNeighborsRegressor()
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)

In [9]:
# Train every baseline on the same training split, in the original order.
for model in (lr, knn, dt, rf):
    model.fit(X_train, y_train)

# `fit` returns the estimator itself, so ending on `rf` displays the same
# object the original cell showed as its last expression.
rf

In [10]:
# Test-set predictions, one array per baseline (lr, knn, dt, rf in that order).
y_pred1, y_pred2, y_pred3, y_pred4 = (
    estimator.predict(X_test) for estimator in (lr, knn, dt, rf)
)

In [11]:
from sklearn.metrics import r2_score

# Report the test-set R^2 of each baseline, one line per model.
for label, predictions in (
    ("R2 of lr =", y_pred1),
    ("R2 of knn =", y_pred2),
    ("R2 of dt =", y_pred3),
    ("R2 of rf =", y_pred4),
):
    print(label, r2_score(y_test, predictions))

R2 of lr = 0.5757877060324508
R2 of knn = 0.14631049965900345
R2 of dt = 0.6167126762521926
R2 of rf = 0.8058340527809869


# bagging

In [12]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble with all default settings apart from a fixed seed, so
# the result is repeatable.
bg = BaggingRegressor(random_state=1)

# Train on the same split as the baseline models.
bg.fit(X=X_train, y=y_train)

In [13]:
# Score the bagging model. The first line uses r2_score on explicit
# predictions; the next two use the estimator's own `score` method on the
# training and test splits.
y_pred = bg.predict(X_test)
print("R2 of bg =", r2_score(y_test, y_pred))

for label, (features, targets) in (
    (" train coeff of R^2 =", (X_train, y_train)),
    (" test coeff of R^2 =", (X_test, y_test)),
):
    print(label, bg.score(features, targets))


R2 of bg = 0.7953630448417408
 train coeff of R^2 = 0.961947552748356
 test coeff of R^2 = 0.7953630448417408


In [14]:
# Re-display (rows, columns) of the full frame ahead of the grid search.
housing_df.shape

(20640, 9)

In [15]:
%%time
# give total time taking

n_samples=housing_df.shape[0]
n_featurs=housing_df.shape[1]

params={
    'estimator':[None], # Removed LinearRegression and KNeighborsRegressor
    'n_estimators':[2,5,10,20],
    'max_samples':[0.5,1.0],
    'max_features':[0.5,1.0],
    'bootstrap':[True,False],
    'bootstrap_features':[True,False]
}

from sklearn.model_selection import GridSearchCV

bg_grid=GridSearchCV(
    bg,
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1
)

bg_grid.fit(X_train,y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits
CPU times: user 6.99 s, sys: 2.48 s, total: 9.47 s
Wall time: 2min 18s


In [16]:

# `best_estimator_` is the model refit with the winning parameters
# (GridSearchCV's default refit behaviour); score it on both splits.
best_bg = bg_grid.best_estimator_
print(" train coeff of R^2 =", best_bg.score(X_train, y_train))
print(" test coeff of R^2 =", best_bg.score(X_test, y_test))

# `best_score_` is the mean cross-validated score of the winning candidate,
# not a score on the held-out test split.
print(" best R^2 =", bg_grid.best_score_)
print(" best para=" , bg_grid.best_params_)

 train coeff of R^2 = 1.0
 test coeff of R^2 = 0.8047378980182737
 best R^2 = 0.8007662849676324
 best para= {'bootstrap': False, 'bootstrap_features': True, 'estimator': None, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 10}
