In [8]:
import numpy as np
import pandas as pd
# Load the California housing dataset (the loader called below is
# fetch_california_housing, not the Boston loader the old comment mentioned).
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

In [14]:
# Pull out the feature matrix and the target vector, then summarise the dataset.
X, y = housing.data, housing.target
print(f"Dataset feature names:{housing.feature_names}")
print(f"Dataset feature size:{X.shape}")
print(f"Dataset target size:{y.shape}")

Dataset feature names:['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
Dataset feature size:(20640, 8)
Dataset target size:(20640,)


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Training  and the testing size:", x_train.shape, x_test.shape)

Training  and the testing size: (16512, 8) (4128, 8)


In [18]:
# Baseline regressors that the following cells fit and score.
lr=LinearRegression()
# Seed the tree so its score is repeatable across kernel restarts: scikit-learn
# permutes candidate features at every split, so an unseeded tree can differ
# between runs. The seed matches the one used for the train/test split.
clf=DecisionTreeRegressor(random_state=42)
# NOTE(review): KNN is distance-based and receives the raw, unscaled features
# here, which likely explains its low recorded R2 — consider wrapping it in a
# StandardScaler pipeline; confirm before changing.
knn=KNeighborsRegressor()

In [19]:
# Train the three baseline regressors on the training split, in the order lr, clf, knn.
for model in (lr, clf):
    model.fit(x_train, y_train)
# Left as the cell's final expression so the notebook still echoes the fitted KNN.
knn.fit(x_train, y_train)

In [20]:
# Predict on the held-out rows; results are assigned in the order (lr, clf, knn).
y_pred1, y_pred2, y_pred3 = (
    fitted.predict(x_test) for fitted in (lr, clf, knn)
)

In [21]:
# Held-out R2 per baseline, printed in the order: linear regression,
# decision tree, k-nearest neighbours.
for predictions in (y_pred1, y_pred2, y_pred3):
    print("R2 score:", r2_score(y_test, predictions))

R2 score: 0.5757877060324521
R2 score: 0.6217907696244005
R2 score: 0.14631049965900345


In [22]:
from sklearn.ensemble import BaggingRegressor
# Bagging ensemble with default settings apart from a fixed seed
# (random_state=1), fitted on the training split.
bag=BaggingRegressor(random_state=1)
bag.fit(x_train,y_train)

In [23]:
# Predictions of the bagging ensemble on the held-out rows.
# NOTE(review): Y_pred is not read by any cell visible here — confirm it is used later.
Y_pred=bag.predict(x_test)

In [24]:
# Score the bagging ensemble on the rows it was trained on and on the held-out
# rows; the gap between the two indicates how much it overfits.
for label, features, targets in (
    ("Training coefficient of R2:", x_train, y_train),
    ("Testing coefficient of R2:", x_test, y_test),
):
    print(label, bag.score(features, targets))

Training coefficient of R2: 0.961947552748356
Testing coefficient of R2: 0.7953630448417408


In [26]:
%%time
n_samples=housing.data.shape[0]
n_features=housing.data.shape[1]
params={
    'base_estimator':[None,LinearRegression(),KNeighborsRegressor()],
    'n_estimators':[20,50,100],
    'max_samples':[0.5,1.0],
    'max_features':[0.5,1.0],
    'bootstrap':[True,False],
    'bootstrap_features':[True,False]
}
bag_grid=GridSearchCV(BaggingRegressor(),param_grid=params,n_jobs=-1,cv=5)
bag_grid.fit(x_train,y_train)
print("Train R2 score:",bag_grid.best_estimator_.score(x_train,y_train))
print("Test R2 score:",bag_grid.best_estimator_.score(x_test,y_test))
print("Best r2 score through Gridsearch:",bag_grid.best_score_)
print("Best parameters:",bag_grid.best_params_)



Train R2 score: 0.9726842989440826
Test R2 score: 0.8028874935841951
Best r2 score through Gridsearch: 0.8061861218782156
Best parameters: {'base_estimator': None, 'bootstrap': True, 'bootstrap_features': True, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 100}
CPU times: total: 29.9 s
Wall time: 7min
