# Ensemble Technique Regressor

## Problem Statement
* Predict the price of used cars listed on CarDekho using their features. The model can help sellers set competitive prices based on market conditions.
* Data scraped from CarDekho website, 13 features and  15,411 records

In [33]:
# Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [34]:
# Dataset
df = pd.read_csv('cardekho_imputated.csv',index_col=[0])
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


### Data Cleaning


In [35]:
# Missing Values
df.isnull().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [36]:
# Removing unnecessary columns
df = df.drop(['car_name', 'brand'],axis=1)

In [37]:
df.info()

<class 'pandas.DataFrame'>
Index: 15411 entries, 0 to 19543
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   model              15411 non-null  str    
 1   vehicle_age        15411 non-null  int64  
 2   km_driven          15411 non-null  int64  
 3   seller_type        15411 non-null  str    
 4   fuel_type          15411 non-null  str    
 5   transmission_type  15411 non-null  str    
 6   mileage            15411 non-null  float64
 7   engine             15411 non-null  int64  
 8   max_power          15411 non-null  float64
 9   seats              15411 non-null  int64  
 10  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(5), str(4)
memory usage: 1.4 MB


In [38]:
# Handling Categorical Feature
df['model'].unique()

<StringArray>
[        'Alto',        'Grand',          'i20',     'Ecosport',
      'Wagon R',          'i10',        'Venue',        'Swift',
        'Verna',       'Duster',
 ...
     'Panamera',      'Alturas',       'Altroz',           'NX',
     'Carnival',            'C',           'RX',        'Ghost',
 'Quattroporte',       'Gurkha']
Length: 120, dtype: str

In [39]:
# Get all numerical features
num_features = [col for col in df.columns if df[col].dtype != 'str']
print('Number of Numerical Features: ',len(num_features))
# Get all categorical features
cat_features = [col for col in df.columns if df[col].dtype == 'str']
print('Number of Categorical Features: ',len(cat_features))
# Get all discrete features
discrete_features = [col for col in num_features if len(df[col].unique()) <= 25 ]
print('Number of Discrete Features: ',len(discrete_features))
# Get all continuous features
continuous_features = [col for col in num_features if col not in discrete_features ]
print('Number of Continuous Features: ',len(continuous_features))

Number of Numerical Features:  7
Number of Categorical Features:  4
Number of Discrete Features:  2
Number of Continuous Features:  5


In [40]:
df.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


### Feature selection and encoding

In [47]:
# Independent and dependent features
X = df.drop(['selling_price'],axis=1)
y = df['selling_price']


In [48]:
# Categorical Features Type
for col in cat_features:
    print(f"{col}: ",len(df[col].unique()))

model:  120
seller_type:  3
fuel_type:  5
transmission_type:  2


In [49]:
# Encoding for model
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['model']=le.fit_transform(X['model'])

In [50]:
# Encoding and scaling independent features
num_features = X.select_dtypes(exclude='object').columns
oh_col = ['seller_type','fuel_type','transmission_type']
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
num_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    [
        ('StandardScaler',num_transformer,num_features),
        ('OneHotEncoder',oh_transformer,oh_col)
    ],remainder='passthrough'
)
X=preprocessor.fit_transform(X)


### Model Training

In [51]:
# Train, test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [52]:
# Model Libraries
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [53]:
##Create a Function to Evaluate Model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [54]:
## Beginning Model Training
models = {

    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Adaboost Regressor":AdaBoostRegressor(),
    "Gradient BoostRegressor":GradientBoostingRegressor(),
    "Xgboost Regressor":XGBRegressor()
   
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Decision Tree
Model performance for Training set
- Root Mean Squared Error: 20797.2352
- Mean Absolute Error: 5164.8199
- R2 Score: 0.9995
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 304550.3941
- Mean Absolute Error: 124642.0072
- R2 Score: 0.8768


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 125935.0789
- Mean Absolute Error: 39624.8108
- R2 Score: 0.9804
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 228381.5506
- Mean Absolute Error: 101656.6269
- R2 Score: 0.9307


Adaboost Regressor
Model performance for Training set
- Root Mean Squared Error: 419529.9991
- Mean Absolute Error: 306567.1246
- R2 Score: 0.7830
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 456498.7184
- Mean Absolute Error: 325411.2771
- R2 Score: 0.7232


Gradient BoostRegressor
Model performance for Training set
- Root Mean Squared E

### Hyper Parameter Tuning

In [58]:
# Parameters
rf_params = {
    "max_depth":[5,8,15,None,10],
    "max_features":[5,7,"auto",8],
    "min_samples_split":[2,8,15,20],
    "n_estimators":[100,200,500,1000]}
adaboost_params = {
    "n_estimators":[50,100,200],
    "learning_rate":[0.1,0.5,1],
}
gradient_params = {
    "loss":['squared_error','absolute_error'],
    "learning_rate":[0.1,0.01],
    "min_samples_split":[2,5,10],
    "n_estimators":[100,300],
    "max_depth":[3,5,8]
}
xgboost_params = {
    "learning_rate":[0.1,0.01],
    "n_estimators":[100,200,300],
    "max_depth":[5,8,12,20,30],
    "colsample_bytree":[0.5,0.8,1,0.3,0.4]
}

In [59]:
# Model list for HyperParameter Tuning
randomcv_models = [
    ("RF", RandomForestRegressor(),rf_params), ("AdaBoost",AdaBoostRegressor(),adaboost_params),("Gradient Boosting",GradientBoostingRegressor(),gradient_params),("XGBoost",XGBRegressor(),xgboost_params)
]

In [60]:
from sklearn.model_selection import RandomizedSearchCV
model_param = {}
for name,model,params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,param_distributions=params,n_iter=100,cv=3,verbose=2,n_jobs=-1)
    random.fit(X_train,y_train)
    model_param[name]=random.best_params_
for model_name in model_param:
    print(f"------------ Best Params for {model_name} ------------")
    print(model_param[model_name])
    

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 72 candidates, totalling 216 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
------------ Best Params for RF ------------
{'n_estimators': 100, 'min_samples_split': 2, 'max_features': 7, 'max_depth': 15}
------------ Best Params for AdaBoost ------------
{'n_estimators': 200, 'learning_rate': 0.1}
------------ Best Params for Gradient Boosting ------------
{'n_estimators': 300, 'min_samples_split': 2, 'max_depth': 5, 'loss': 'squared_error', 'learning_rate': 0.1}
------------ Best Params for XGBoost ------------
{'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.5}


In [61]:
## Retraining the models with best parameters
models = {
    "Random Forest Regressor": RandomForestRegressor(n_estimators=200, min_samples_split=2, max_features=5, max_depth=None, 
                                                     n_jobs=-1),
    "AdaBoost Regressor":AdaBoostRegressor(n_estimators= 200, learning_rate= 0.1),
    "Gradient Boosting Regressor":GradientBoostingRegressor(n_estimators= 300, min_samples_split= 2, max_depth= 5, loss= 'squared_error', learning_rate= 0.1),
     "Xgboost Regressor":XGBRegressor(n_estimators= 300,learning_rate=0.1,
                                     max_depth=5,colsample_bytree=0.5)
    
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 124520.2965
- Mean Absolute Error: 39133.7454
- R2 Score: 0.9809
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 207097.2780
- Mean Absolute Error: 98194.6776
- R2 Score: 0.9430


AdaBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 378703.8785
- Mean Absolute Error: 253856.8291
- R2 Score: 0.8232
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 462152.7322
- Mean Absolute Error: 275858.8875
- R2 Score: 0.7163


Gradient Boosting Regressor
Model performance for Training set
- Root Mean Squared Error: 99818.3930
- Mean Absolute Error: 68815.4381
- R2 Score: 0.9877
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 215967.9578
- Mean Absolute Error: 97098.5652
- R2 Score: 0.9380


Xgboost Regressor
Model performance for Training set
- Root Mean Sq