### Graded assignment week 8

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer 

from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.pipeline import Pipeline

In [2]:

# Read the cars dataset from a CSV file located relative to the working directory.
data_df = pd.read_csv('cars_data.csv')

In [3]:
# Preview the first rows to check column names and raw value formats.
data_df.head()

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,"$36,945","$33,337",3.5,6.0,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,"$23,820","$21,761",2.0,4.0,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,"$26,990","$24,647",2.4,4.0,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,"$33,195","$30,299",3.2,6.0,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,"$43,755","$39,014",3.5,6.0,225,18,24,3880,115,197


In [4]:
# Count the missing values in each column.
data_df.isna().sum()

Make           0
Model          0
Type           0
Origin         0
DriveTrain     0
MSRP           0
Invoice        0
EngineSize     0
Cylinders      2
Horsepower     0
MPG_City       0
MPG_Highway    0
Weight         0
Wheelbase      0
Length         0
dtype: int64

In [5]:
# Drop every row that contains a missing value.
# Rebinding the name (rather than inplace=True) keeps the cell chainable and
# avoids mutating a frame that an earlier cell already displayed.
data_df = data_df.dropna()

In [6]:
# Re-count missing values per column after dropping incomplete rows.
data_df.isna().sum()

Make           0
Model          0
Type           0
Origin         0
DriveTrain     0
MSRP           0
Invoice        0
EngineSize     0
Cylinders      0
Horsepower     0
MPG_City       0
MPG_Highway    0
Weight         0
Wheelbase      0
Length         0
dtype: int64

In [7]:
# Remove the 'Invoice' and 'Model' columns so they are not used as features.
# errors='ignore' makes the cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
data_df = data_df.drop(columns=['Invoice', 'Model'], errors='ignore')


In [8]:
# Strip the currency formatting ("$36,945" -> "36945") so MSRP can be cast to a number.
# regex=False forces literal matching: '$' is a regex end-of-string anchor, and the
# default value of `regex` differs between pandas versions.
data_df['MSRP'] = (
    data_df['MSRP']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)


In [9]:
# Convert the cleaned MSRP strings to integers so the column can serve as a numeric target.
data_df['MSRP'] = data_df['MSRP'].astype(int) 

In [10]:
# Separate the predictors from the regression target (MSRP).
features = data_df.drop(columns='MSRP')
labels = data_df['MSRP']

In [11]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=1,
)

In [12]:
# One-hot encode the categorical columns and pass every other column through unchanged.
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising, so a rare 'Make' that ends up only in the test
# split (or a different random split) cannot crash scoring.
one_hot = OneHotEncoder(handle_unknown='ignore')
features_scaling = ColumnTransformer([
    ('make_t', one_hot, ['Make']),
    ('type_t', one_hot, ['Type']),
    ('origin_t', one_hot, ['Origin']),
    ('drive_t', one_hot, ['DriveTrain']),
], remainder='passthrough')

In [13]:
def train_regressor(estimator, x_train, y_train,  features_scaling, name, x_test, y_test):
    """Fit a preprocessing + estimator pipeline and print its train/test scores.

    Args:
        estimator: Final pipeline step; fitted on the transformed training data.
        x_train: Training features.
        y_train: Training targets.
        features_scaling: Transformer used as the first pipeline step.
        name: Label that prefixes the printed score lines.
        x_test: Held-out features.
        y_test: Held-out targets.

    Returns:
        None. The scores are printed using the pipeline's ``score`` method
        (for scikit-learn regressors this is R^2 by default).
    """
    model = Pipeline([
        ('features_scaling', features_scaling),
        ('estimator', estimator),
    ])
    model.fit(x_train, y_train)

    # Score and report one split at a time so the output order is train, then test.
    train_score = model.score(x_train, y_train)
    print(f'{name} train score: {train_score}')
    test_score = model.score(x_test, y_test)
    print(f'{name} test score: {test_score}')


In [14]:
# Bagging ensemble with default hyperparameters (only the seed is fixed).
bagging_reg = BaggingRegressor(random_state=1)
train_regressor(
    estimator=bagging_reg,
    x_train=x_train,
    y_train=y_train,
    features_scaling=features_scaling,
    name='Bagging',
    x_test=x_test,
    y_test=y_test,
)

Bagging train score: 0.9720063408475845
Bagging test score: 0.7877504295794473


In [15]:
# Random forest with default hyperparameters (only the seed is fixed).
random_forest_reg = RandomForestRegressor(random_state=1)
train_regressor(
    estimator=random_forest_reg,
    x_train=x_train,
    y_train=y_train,
    features_scaling=features_scaling,
    name='Random Forest',
    x_test=x_test,
    y_test=y_test,
)

Random Forest train score: 0.9776067452283685
Random Forest test score: 0.8324407902795642


In [16]:
# Gradient boosting with default hyperparameters (only the seed is fixed).
gradient_boosting_reeg = GradientBoostingRegressor(random_state=1)
train_regressor(
    estimator=gradient_boosting_reeg,
    x_train=x_train,
    y_train=y_train,
    features_scaling=features_scaling,
    name='Gradient Boosting',
    x_test=x_test,
    y_test=y_test,
)

Gradient Boosting train score: 0.9861216097667501
Gradient Boosting test score: 0.8282499105058316


In [17]:
# AdaBoost with default hyperparameters (only the seed is fixed).
ada_boost_reg = AdaBoostRegressor(random_state=1)
train_regressor(
    estimator=ada_boost_reg,
    x_train=x_train,
    y_train=y_train,
    features_scaling=features_scaling,
    name='Ada Boost',
    x_test=x_test,
    y_test=y_test,
)

Ada Boost train score: 0.9092628520143711
Ada Boost test score: 0.7206602823900117


In [18]:
# Candidate AdaBoost settings: 3 ensemble sizes x 3 learning rates = 9 combinations.
params = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.5, 1, 2],
}

# Exhaustive search over the grid with 4-fold cross-validation; train scores are
# recorded alongside the validation scores.
ada_gv = GridSearchCV(
    estimator=AdaBoostRegressor(random_state=1),
    param_grid=params,
    cv=4,
    return_train_score=True,
)

In [19]:
# Run the grid search as the final pipeline step and report train/test scores.
train_regressor(
    estimator=ada_gv,
    x_train=x_train,
    y_train=y_train,
    features_scaling=features_scaling,
    name='Ada Boost Grid Search',
    x_test=x_test,
    y_test=y_test,
)

Ada Boost Grid Search train score: 0.9143723419572314
Ada Boost Grid Search test score: 0.7132402279121517


In [20]:
# Show the hyperparameter combination selected by the grid search fitted in the previous cell.
# NOTE(review): the recorded winner sits on the edge of the grid (smallest learning_rate,
# largest n_estimators), so a wider grid may be worth trying.
ada_gv.best_params_

{'learning_rate': 0.5, 'n_estimators': 1000}

In [None]:
gv = Gr