In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()

In [2]:
raw_data = pd.read_csv('Car_price_cleaned.csv')

In [3]:
raw_data.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,curbweight_squared,enginesize_squared,boreratio_squared,stroke_squared,compressionratio_squared,horsepower_squared,peakrpm_squared,citympg_squared,highwaympg_squared,log_enginesize
0,1,3,alfa-romero giulia,1,0,1,0,2,0,-1.690772,...,6492304,16900,12.0409,7.1824,81.0,12321,25000000,441,729,4.875197
1,2,3,alfa-romero stelvio,1,0,1,0,2,0,-1.690772,...,6492304,16900,12.0409,7.1824,81.0,12321,25000000,441,729,4.875197
2,3,1,alfa-romero Quadrifoglio,1,0,1,2,2,0,-0.708596,...,7969329,23104,7.1824,12.0409,81.0,23716,25000000,361,676,5.030438
3,4,2,audi 100 ls,1,0,0,3,1,0,0.173698,...,5461569,11881,10.1761,11.56,100.0,10404,30250000,576,900,4.70048
4,5,2,audi 100ls,1,0,0,3,0,0,0.10711,...,7974976,18496,10.1761,11.56,64.0,13225,30250000,324,484,4.919981


In [4]:
raw_data.drop(['car_ID','CarName'],axis=1,inplace=True)

In [5]:
raw_data.head()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,curbweight_squared,enginesize_squared,boreratio_squared,stroke_squared,compressionratio_squared,horsepower_squared,peakrpm_squared,citympg_squared,highwaympg_squared,log_enginesize
0,3,1,0,1,0,2,0,-1.690772,-0.426521,-0.844782,...,6492304,16900,12.0409,7.1824,81.0,12321,25000000,441,729,4.875197
1,3,1,0,1,0,2,0,-1.690772,-0.426521,-0.844782,...,6492304,16900,12.0409,7.1824,81.0,12321,25000000,441,729,4.875197
2,1,1,0,1,2,2,0,-0.708596,-0.231513,-0.190566,...,7969329,23104,7.1824,12.0409,81.0,23716,25000000,361,676,5.030438
3,2,1,0,0,3,1,0,0.173698,0.207256,0.136542,...,5461569,11881,10.1761,11.56,100.0,10404,30250000,576,900,4.70048
4,2,1,0,0,3,0,0,0.10711,0.207256,0.230001,...,7974976,18496,10.1761,11.56,64.0,13225,30250000,324,484,4.919981


In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
inputs = raw_data.drop('price',axis=1)
targets = raw_data['price']

In [8]:
scaler = StandardScaler()

In [9]:
scaler.fit(inputs)

In [10]:
inputs_scaled = scaler.transform(inputs)

In [12]:
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [13]:
models = {
    'LinearRegression':LinearRegression(),
    'XGBRegressor': XGBRegressor(random_state=42),
    'LGBMRegressor':LGBMRegressor(random_state=42, verbose=-1),
    'RandomForestRegressor':RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor':GradientBoostingRegressor(random_state=42),
    'AdaBoostRegressor':AdaBoostRegressor(random_state=42),
    'ExtraTreesRegressor':ExtraTreesRegressor(random_state=42),
    'BaggingRegressor':BaggingRegressor(random_state=42),
    'SVR':SVR()
}


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x_train,x_test,y_train,y_test = train_test_split(inputs_scaled,targets,test_size=0.2,random_state=42)

In [17]:
for name,model in models.items():
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    mae = mean_absolute_error(y_test,y_pred)
    mse = mean_squared_error(y_test,y_pred)
    r2 = r2_score(y_test,y_pred)
    print(f"{name}:MAE={mae:.2f}, MSE={mse:.2f},R2={r2:.2f}")

LinearRegression:MAE=2040.88, MSE=10840614.78,R2=0.86
XGBRegressor:MAE=1654.81, MSE=5569883.13,R2=0.93
LGBMRegressor:MAE=1925.24, MSE=7944466.52,R2=0.90
RandomForestRegressor:MAE=1308.00, MSE=3492642.92,R2=0.96
GradientBoostingRegressor:MAE=1630.75, MSE=5188631.87,R2=0.93
AdaBoostRegressor:MAE=1980.58, MSE=6442492.56,R2=0.92
ExtraTreesRegressor:MAE=1282.47, MSE=3463370.57,R2=0.96
BaggingRegressor:MAE=1454.55, MSE=4335083.33,R2=0.95
SVR:MAE=5695.85, MSE=86845041.43,R2=-0.10


In [18]:
param_grid_rf = {
    'n_estimators':[100,200,300],
    'max_depth':[10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_et = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}

param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]

}



models = {
    'XGBRegressor': (XGBRegressor(random_state=42),param_grid_xgb),
    'RandomForestRegressor':(RandomForestRegressor(random_state=42),param_grid_rf),
    'GradientBoostingRegressor':(GradientBoostingRegressor(random_state=42),param_grid_gb),
    'ExtraTreesRegressor':(ExtraTreesRegressor(random_state=42),param_grid_et)
}

In [19]:
best_models={}
best_scores={}

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
for model_name, (model,param_grid) in models.items():
    grid_search = GridSearchCV(model,param_grid,cv=5,scoring = 'neg_mean_squared_error',n_jobs=-1,verbose=1)
    grid_search.fit(x_train,y_train)
    best_models[model_name] = grid_search.best_estimator_
    best_scores[model_name] = grid_search.best_score_


best_scores

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits


{'XGBRegressor': -5740822.9361735415,
 'RandomForestRegressor': -5631296.887081678,
 'GradientBoostingRegressor': -5318699.347217174,
 'ExtraTreesRegressor': -4793342.662672036}

In [22]:
best_models

{'XGBRegressor': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...),
 'RandomForestRegressor': RandomForestRegressor(max_depth=20, n_estimators=200, random_state=42),
 'GradientBoostingRegressor': GradientBoostingRegressor(learning_rate=0.05, n_estimators=200, random_state=42),
 'ExtraTreesRegressor'

In [23]:
for name,model in best_models.items():
    y_pred = model.predict(x_test)
    mae = mean_absolute_error(y_test,y_pred)
    mse = mean_squared_error(y_test,y_pred)
    r2 = r2_score(y_test,y_pred)
    print(f"{name}: MAE = {mae:.2f},MSE={mse:.2f},R2={r2:.2f}")

XGBRegressor: MAE = 1670.19,MSE=5618588.15,R2=0.93
RandomForestRegressor: MAE = 1287.08,MSE=3484267.04,R2=0.96
GradientBoostingRegressor: MAE = 1640.72,MSE=5192955.07,R2=0.93
ExtraTreesRegressor: MAE = 1338.74,MSE=4081645.91,R2=0.95
