# **Bagging Regressor**

## **Importing Libararies**

In [57]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge

## **Data Overview**

In [58]:
df = pd.read_csv('Car_Data.csv')
df.head()

Unnamed: 0,ID,Brand,Model,Year,Color,Mileage,Price,Condition
0,1,Mazda,Generic Model 3,2003,Blue,22932,65732,New
1,2,Jaguar,Generic Model 2,2003,Silver,89937,58620,New
2,3,Land Rover,Generic Model 2,2022,Green,36616,50574,New
3,4,Porsche,Generic Model 2,1997,Black,82812,35436,Used
4,5,Land Rover,Generic Model 3,2000,Black,184616,63880,Used


In [59]:
df.columns

Index(['ID', 'Brand', 'Model', 'Year', 'Color', 'Mileage', 'Price',
       'Condition'],
      dtype='object')

In [60]:
df.shape

(100000, 8)

In [61]:
df.isnull().sum()

ID           0
Brand        0
Model        0
Year         0
Color        0
Mileage      0
Price        0
Condition    0
dtype: int64

In [62]:
df.describe(include='all')

Unnamed: 0,ID,Brand,Model,Year,Color,Mileage,Price,Condition
count,100000.0,100000,100000,100000.0,100000,100000.0,100000.0,100000
unique,,20,21,,6,,,2
top,,Ford,Generic Model 1,,Blue,,,Used
freq,,5133,28356,,16786,,,50033
mean,50000.5,,,2006.03409,,99819.395,42534.51937,
std,28867.657797,,,9.52722,,57710.087058,21632.296692,
min,1.0,,,1990.0,,2.0,5000.0,
25%,25000.75,,,1998.0,,49996.25,23826.75,
50%,50000.5,,,2006.0,,99807.0,42563.5,
75%,75000.25,,,2014.0,,149841.25,61197.5,


In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ID         100000 non-null  int64 
 1   Brand      100000 non-null  object
 2   Model      100000 non-null  object
 3   Year       100000 non-null  int64 
 4   Color      100000 non-null  object
 5   Mileage    100000 non-null  int64 
 6   Price      100000 non-null  int64 
 7   Condition  100000 non-null  object
dtypes: int64(4), object(4)
memory usage: 6.1+ MB


In [64]:
df['Price'].mean()

42534.51937

## **Train Test Split**

In [65]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Price', 'ID']), df['Price'],
                                                    test_size=0.2,
                                                    random_state=42)

In [66]:
X_train.head()

Unnamed: 0,Brand,Model,Year,Color,Mileage,Condition
75220,Chevrolet,Generic Model 2,2004,Red,184450,New
48955,Subaru,Generic Model 3,2003,Red,68276,Used
44966,Subaru,Generic Model 2,2002,Red,150827,New
13568,Audi,Generic Model 3,1990,Silver,78381,New
92727,Tesla,Generic Model 3,1992,Red,20103,Used


In [67]:
numerical_columns = [2,4]
categorical_columns = [0, 1, 3, 5]

## **Data Preprocessing**

In [68]:
handle_numerical = Pipeline(steps=[
    ('impute_numerical', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

In [69]:
handle_categorical = Pipeline(steps=[
    ('impute_numerical', SimpleImputer(strategy='most_frequent')),
    ('encode_categorical', OneHotEncoder())
])

In [70]:
preprocessing = ColumnTransformer(transformers=[
    ('numerical', handle_numerical, numerical_columns),
    ('categorical', handle_categorical, categorical_columns)
], remainder='passthrough')

In [71]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

## **Separate Models**

In [72]:
model_lr = Ridge(alpha=0.1)

In [73]:
pipe_lr = make_pipeline(preprocessing, model_lr)

In [74]:
pipe_lr.fit(X_train, y_train)

In [75]:
y_pred_lr = pipe_lr.predict(X_test)

In [76]:
print(r2_score(y_test, y_pred_lr))

-0.00039707261851074094


## **Bagging Regressor**

In [77]:
bagging = BaggingRegressor(estimator=pipe_lr)

In [78]:
bagging.fit(X_train, y_train)

In [79]:
y_pred_bagging = bagging.predict(X_test)

In [80]:
print(r2_score(y_pred_bagging, y_test))

-1825.561738626045
