# Regression using Bagging and Boosting

In this colab, we will use the auto_mpg data set for regression and apply different bagging and boosting techniques. In particular, we will apply the following techniques:

**Bagging:**
* `sklearn.ensemble.BaggingRegressor`
* `sklearn.ensemble.RandomForestRegressor`

**Boosting:**
* `sklearn.ensemble.GradientBoostingRegressor`
* `sklearn.ensemble.AdaBoostRegressor`

We will also apply **VotingRegressor** that is implemented in sklearn as:

`sklearn.ensemble.VotingRegressor`



# Load the dataset

Features:

* cylinders: multi-valued discrete
* displacement: continuous
* horsepower: continuous
* weight: continuous
* acceleration: continuous
* model year: multi-valued discrete
* origin: multi-valued discrete
* car name: string (unique for each instance)

Number of samples: 398

Target variable (continuous): mpg

The target (y) is defined as the miles per gallon (mpg).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Column labels for the header-less UCI auto-mpg file, in file order.
column_names = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin', 'car_name',
]
# Fields are separated by runs of whitespace. sep=r'\s+' is the supported
# spelling; delim_whitespace=True is deprecated in recent pandas releases.
auto_mpg_data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data',
    sep=r'\s+',
    header=None,
    names=column_names,
)


In [None]:
# Dimensions of the loaded frame as (rows, columns).
auto_mpg_data.shape

(398, 9)

In [None]:
# Preview the first ten rows to check that the columns were parsed as expected.
auto_mpg_data.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
5,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220.0,4354.0,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215.0,4312.0,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225.0,4425.0,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190.0,3850.0,8.5,70,1,amc ambassador dpl


In [None]:
# Summary statistics for the numeric columns. horsepower does not appear
# because it was read with object dtype.
auto_mpg_data.describe()


Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model_year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


In [None]:
# Inspect the dtype pandas inferred for each column.
auto_mpg_data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model_year        int64
origin            int64
car_name         object
dtype: object

In [None]:
# Flag, per column, whether any value is NaN (use .isnull().sum() instead to
# get per-column counts). Placeholder strings such as '?' are not NaN, so
# they are not detected by this check.
auto_mpg_data.isnull().any()


mpg             False
cylinders       False
displacement    False
horsepower      False
weight          False
acceleration    False
model_year      False
origin          False
car_name        False
dtype: bool

In [None]:
# List the distinct horsepower values to see why the column is not numeric.
auto_mpg_data['horsepower'].unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)


We see that there is a '?', which needs to be handled.

In [None]:
# Show the rows whose horsepower is the '?' placeholder. Dropping these rows
# would be an alternative; this notebook keeps them and imputes the value
# later in the preprocessing pipeline.
auto_mpg_data[auto_mpg_data['horsepower']=='?']


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
32,25.0,4,98.0,?,2046.0,19.0,71,1,ford pinto
126,21.0,6,200.0,?,2875.0,17.0,74,1,ford maverick
330,40.9,4,85.0,?,1835.0,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,?,2905.0,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,?,2320.0,15.8,81,2,renault 18i
374,23.0,4,151.0,?,3035.0,20.5,82,1,amc concord dl


In [None]:
# Swap the '?' placeholder for the numeric sentinel -1 so that the later
# conversion of horsepower to float does not fail on a non-numeric string.
auto_mpg_data = auto_mpg_data.replace(to_replace='?', value=-1)

In [None]:
# Sanity check: no row should still carry the '?' placeholder.
auto_mpg_data[auto_mpg_data['horsepower']=='?']

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name


In [None]:
# The rows that previously held '?' should now hold the -1 sentinel.
auto_mpg_data[auto_mpg_data['horsepower']==-1]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
32,25.0,4,98.0,-1,2046.0,19.0,71,1,ford pinto
126,21.0,6,200.0,-1,2875.0,17.0,74,1,ford maverick
330,40.9,4,85.0,-1,1835.0,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,-1,2905.0,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,-1,2320.0,15.8,81,2,renault 18i
374,23.0,4,151.0,-1,3035.0,20.5,82,1,amc concord dl


In [None]:
# Distinct values of the discrete 'cylinders' feature.
auto_mpg_data['cylinders'].unique()

array([8, 4, 6, 3, 5])

In [None]:
# Distinct values of the discrete 'model_year' feature.
auto_mpg_data['model_year'].unique()

array([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82])

In [None]:
# Distinct values of the discrete 'origin' feature.
auto_mpg_data['origin'].unique()

array([1, 3, 2])

In [None]:
# Distinct car names, to judge whether this string column is a usable feature.
auto_mpg_data['car_name'].unique()

array(['chevrolet chevelle malibu', 'buick skylark 320',
       'plymouth satellite', 'amc rebel sst', 'ford torino',
       'ford galaxie 500', 'chevrolet impala', 'plymouth fury iii',
       'pontiac catalina', 'amc ambassador dpl', 'dodge challenger se',
       "plymouth 'cuda 340", 'chevrolet monte carlo',
       'buick estate wagon (sw)', 'toyota corona mark ii',
       'plymouth duster', 'amc hornet', 'ford maverick', 'datsun pl510',
       'volkswagen 1131 deluxe sedan', 'peugeot 504', 'audi 100 ls',
       'saab 99e', 'bmw 2002', 'amc gremlin', 'ford f250', 'chevy c20',
       'dodge d200', 'hi 1200d', 'chevrolet vega 2300', 'toyota corona',
       'ford pinto', 'plymouth satellite custom', 'ford torino 500',
       'amc matador', 'pontiac catalina brougham', 'dodge monaco (sw)',
       'ford country squire (sw)', 'pontiac safari (sw)',
       'amc hornet sportabout (sw)', 'chevrolet vega (sw)',
       'pontiac firebird', 'ford mustang', 'mercury capri 2000',
       'opel 1900'

In [None]:
# Count the distinct car names (missing values, if any, count as one value).
auto_mpg_data['car_name'].nunique(dropna=False)

305

There are 305 unique car_names out of 398 samples, so this feature is unlikely to help with prediction. Let's drop it.

In [None]:
# Remove the near-unique car_name column; it is not used as a feature.
auto_mpg_data = auto_mpg_data.drop(columns=['car_name'])

In [None]:
# Confirm that the frame has one column fewer after dropping car_name.
auto_mpg_data.shape

(398, 8)

In [None]:
# Column-by-column overview: non-null counts and dtypes.
auto_mpg_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [None]:
# Re-check the dtypes; horsepower is still stored as object at this point.
auto_mpg_data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model_year        int64
origin            int64
dtype: object

We see that horsepower is an object. Let us convert it into float.

In [None]:
# Cast horsepower from object to float. This only works because the '?'
# placeholder was replaced by -1 earlier; the raw '?' string would raise here.
auto_mpg_data['horsepower'] = auto_mpg_data['horsepower'].astype(float)

In [None]:
# Target: miles per gallon (the first column). Features: every other column.
y = auto_mpg_data['mpg']
X = auto_mpg_data.drop(columns=['mpg'])


In [None]:
# Preview the feature matrix.
X.head()


Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504.0,12.0,70,1
1,8,350.0,165.0,3693.0,11.5,70,1
2,8,318.0,150.0,3436.0,11.0,70,1
3,8,304.0,150.0,3433.0,12.0,70,1
4,8,302.0,140.0,3449.0,10.5,70,1


In [None]:
# Preview the target vector.
y.head()


0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Column groups, one per preprocessing branch of the ColumnTransformer below.
# Continuous columns: imputed and scaled.
numeric_features = ['displacement', 'horsepower', 'weight', 'acceleration']
# Discrete columns that are one-hot encoded.
categorical_features = ['model_year', 'origin']
# Discrete column that is ordinal-encoded.
ordinal_features = ['cylinders']


In [None]:
# Continuous features: replace the -1 missing-value sentinel with the column
# mean, then standardise with StandardScaler.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=-1, strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)


In [None]:
# One-hot encode the categorical columns. handle_unknown="ignore" means a
# category that was not seen during fit does not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")



In [None]:
# Ordinal-encode the 'cylinders' column. Rare cylinder counts (3 and 5 occur
# in only a handful of rows) can be missing from a training fold; with the
# default handle_unknown="error" the encoder would then raise at transform
# time. Unseen values are mapped to -1 instead, which cannot collide with the
# fitted codes (0..n_categories-1) and mirrors the lenient handling used by
# the one-hot encoder.
ordinal_transformer = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

In [None]:
# Route each column group to its own transformer. Columns not listed in any
# group are handled by ColumnTransformer's default `remainder` setting.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("ord", ordinal_transformer, ordinal_features),
    ]
)


In [None]:
from sklearn.metrics import r2_score

### BaggingRegressor

In [None]:
# Baseline bagging model: the shared preprocessing followed by a
# BaggingRegressor with default settings. random_state is fixed (as it is for
# the train/test split) so the reported R^2 is reproducible across runs.
reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", BaggingRegressor(random_state=0)),
    ]
)

In [None]:
# Fit on the training split, predict the held-out split, and report test R^2.
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(r2_score(y_test, y_pred))

0.7761783992798316




---



In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated R^2 of the bagging pipeline on the training split.
r2 = cross_val_score(estimator = reg, X = X_train, y = y_train, scoring='r2', cv = 10)
print(type(r2))
print('r2 of each fold ', list(r2))
# R^2 is a unitless score, not a percentage, so no '%' suffix is printed.
print("Mean r2: {:.2f}".format(r2.mean()))

<class 'numpy.ndarray'>
r2 of each fold  [0.8297890892297034, 0.812470593202439, 0.7786981517461266, 0.7157879817075014, 0.8416620657925006, 0.8264936707882156, 0.788497615358106, 0.8438285115704216, 0.7906401486955608, 0.6009881557412352]
Mean r2: 0.78 %


In [None]:
# Preprocess the training features once; the grid searches below are run on
# this transformed matrix rather than on the full pipeline.
# NOTE(review): the preprocessor is fitted on all of X_train before
# cross-validation, so imputation/scaling statistics are shared across CV
# folds. Consider searching over the whole pipeline instead -- TODO confirm.
X_train_new = preprocessor.fit_transform(X_train)


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate BaggingRegressor settings; every combination is cross-validated.
tuned_parameters = [{
    "n_estimators": [50, 100, 200],
    "max_features": [1, 2, 4, 6, 8],
    "max_samples": [0.5, 0.1],
    "bootstrap": [True, False],
    "bootstrap_features": [True, False],
}]

# Seed the estimator so the selected parameters do not change between runs.
Bag_model_GS = GridSearchCV(BaggingRegressor(random_state=0), param_grid=tuned_parameters)
Bag_model_GS.fit(X_train_new, y_train)

Bag_model_GS.best_params_


{'bootstrap': False,
 'bootstrap_features': False,
 'max_features': 8,
 'max_samples': 0.5,
 'n_estimators': 100}

In [None]:
# Bagging pipeline with the hyper-parameters chosen by the grid search.
# random_state is fixed so the test R^2 is reproducible.
reg2 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            BaggingRegressor(
                bootstrap=False,
                bootstrap_features=False,
                max_features=8,
                max_samples=0.5,
                n_estimators=100,
                random_state=0,
            ),
        ),
    ]
)

reg2.fit(X_train, y_train)
y_pred = reg2.predict(X_test)
print(r2_score(y_test, y_pred))

0.8029201244160029


### RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest with default settings behind the shared
# preprocessing. random_state is fixed so the test R^2 is reproducible.
RF_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=0)),
    ]
)

RF_reg.fit(X_train, y_train)
y_pred = RF_reg.predict(X_test)
print(r2_score(y_test, y_pred))

0.7958701979084384


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate RandomForestRegressor settings.
# max_features: 1.0 (use all features) replaces the old "auto" option, which
# meant the same thing for regressors but has been removed from recent
# scikit-learn releases and now raises an error.
tuned_parameters = [{
    "n_estimators": [10, 20, 30],
    "max_features": [1.0, "sqrt", "log2"],
    "min_samples_split": [2, 4, 8],
    "bootstrap": [True, False],
}]

# Seed the estimator so the selected parameters do not change between runs.
RF_model_GS = GridSearchCV(RandomForestRegressor(random_state=0), param_grid=tuned_parameters)
RF_model_GS.fit(X_train_new, y_train)

RF_model_GS.best_params_

{'bootstrap': False,
 'max_features': 'log2',
 'min_samples_split': 2,
 'n_estimators': 30}

In [None]:
# Random forest pipeline with the hyper-parameters chosen by the grid search.
# random_state is fixed so the test R^2 is reproducible.
RF_reg2 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            RandomForestRegressor(
                bootstrap=False,
                max_features='log2',
                min_samples_split=2,
                n_estimators=30,
                random_state=0,
            ),
        ),
    ]
)

RF_reg2.fit(X_train, y_train)
y_pred = RF_reg2.predict(X_test)
print(r2_score(y_test, y_pred))

0.8169119516140646


### GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor



In [None]:
# Baseline gradient boosting with default settings behind the shared
# preprocessing. random_state is fixed so the test R^2 is reproducible.
GB_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", GradientBoostingRegressor(random_state=0)),
    ]
)

GB_reg.fit(X_train, y_train)
y_pred = GB_reg.predict(X_test)
print(r2_score(y_test, y_pred))

0.8126371473291412


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate GradientBoostingRegressor settings.
# NOTE(review): max_depth values of 80-110 are far deeper than the shallow
# trees usually used for boosting -- confirm this range is intentional.
tuned_parameters = [{
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000],
}]

# max_features < n_features makes each split sample features at random, so
# seed the estimator to keep the selected parameters stable between runs.
GB_model_GS = GridSearchCV(GradientBoostingRegressor(random_state=0), param_grid=tuned_parameters)
GB_model_GS.fit(X_train_new, y_train)

GB_model_GS.best_params_

{'max_depth': 110,
 'max_features': 2,
 'min_samples_leaf': 5,
 'min_samples_split': 8,
 'n_estimators': 100}

In [None]:
# Gradient boosting pipeline with the hyper-parameters chosen by the grid
# search. random_state is fixed so the test R^2 is reproducible.
GB_reg2 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            GradientBoostingRegressor(
                max_depth=110,
                max_features=2,
                min_samples_leaf=5,
                min_samples_split=8,
                n_estimators=100,
                random_state=0,
            ),
        ),
    ]
)

GB_reg2.fit(X_train, y_train)
y_pred = GB_reg2.predict(X_test)
print(r2_score(y_test, y_pred))

0.8579349754456154


### AdaBoostRegressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor


In [None]:
# Baseline AdaBoost with default settings behind the shared preprocessing.
# random_state is fixed so the test R^2 is reproducible.
AB_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", AdaBoostRegressor(random_state=0)),
    ]
)

AB_reg.fit(X_train, y_train)
y_pred = AB_reg.predict(X_test)
print(r2_score(y_test, y_pred))

0.7244591931163092


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate AdaBoostRegressor settings; every combination is cross-validated.
tuned_parameters = [{
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'loss': ['linear', 'square', 'exponential'],
}]

# Seed the estimator so the selected parameters do not change between runs.
AB_model_GS = GridSearchCV(AdaBoostRegressor(random_state=0), param_grid=tuned_parameters)
AB_model_GS.fit(X_train_new, y_train)

AB_model_GS.best_params_

{'learning_rate': 0.1, 'loss': 'linear', 'n_estimators': 100}

In [None]:
# AdaBoost pipeline with the hyper-parameters chosen by the grid search.
# random_state is fixed so the test R^2 is reproducible.
AB_reg2 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            AdaBoostRegressor(
                learning_rate=0.1,
                loss='linear',
                n_estimators=100,
                random_state=0,
            ),
        ),
    ]
)

AB_reg2.fit(X_train, y_train)
y_pred = AB_reg2.predict(X_test)
print(r2_score(y_test, y_pred))

0.7272851731933592


### VotingRegressor

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Base regressors combined by the voting ensemble.
lr = LinearRegression()
# The tree is seeded so the ensemble's test R^2 is reproducible.
dt = DecisionTreeRegressor(random_state=0)
svm= SVR()
knn= KNeighborsRegressor()

# Append the voting regressor to the preprocessing pipeline to obtain a full
# prediction pipeline. The step key "classifier" and the estimator key 'svc'
# are kept unchanged for compatibility, although both refer to regressors.
VR_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "classifier",
            VotingRegressor(estimators=[('lr', lr), ('dt', dt), ('svc', svm), ('knn', knn)]),
        ),
    ]
)

VR_reg.fit(X_train, y_train)
y_pred = VR_reg.predict(X_test)
print(r2_score(y_test, y_pred))


0.830224296322489
