In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("scout_cleaned.csv", index_col = False)

In [4]:
df.shape

(15915, 160)

In [5]:
X = df.drop(["price", "Unnamed: 0"], axis=1)

In [6]:
X

Unnamed: 0,km,prev_owner,hp_kw,displacement_cc,gears,num_door,num_seat,cylinders,co2_emission,cc_air conditioning,...,body_color_Violet,body_color_White,body_color_Yellow,paint_type_Basic,paint_type_Metallic,upholstery_Cloth,upholstery_Part/Full Leather,drive_chain_4WD,drive_chain_front,drive_chain_rear
0,56013,2.0,66.0,1422.0,6.0,5.0,5.0,3.0,99.0,1,...,0,0,0,0,1,1,0,0,1,0
1,80000,1.0,141.0,1798.0,7.0,3.0,4.0,4.0,129.0,1,...,0,0,0,0,1,1,0,0,1,0
2,83450,1.0,85.0,1598.0,6.0,4.0,4.0,3.0,99.0,1,...,0,0,0,0,1,1,0,0,1,0
3,73000,1.0,66.0,1422.0,6.0,3.0,4.0,3.0,99.0,0,...,0,0,0,0,1,1,0,0,1,0
4,16200,1.0,66.0,1422.0,6.0,5.0,5.0,3.0,109.0,1,...,0,0,0,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15910,0,1.0,147.0,1997.0,6.0,5.0,5.0,4.0,139.0,1,...,0,0,0,0,1,1,0,0,1,0
15911,9900,1.0,165.0,1798.0,7.0,5.0,5.0,4.0,168.0,1,...,0,0,0,0,1,1,0,0,1,0
15912,15,1.0,146.0,1997.0,6.0,5.0,7.0,4.0,139.0,1,...,0,1,0,0,1,0,1,0,1,0
15913,10,1.0,147.0,1997.0,6.0,5.0,7.0,4.0,139.0,1,...,0,0,0,0,1,0,1,0,1,0


In [7]:
y = df["price"]

In [9]:
# from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [8]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    mse = mean_squared_error(actual, pred)
    score = r2_score(actual, pred)
    return print("r2_score:", score, "\n","mae:", mae, "\n","mse:",mse, "\n","rmse:",rmse)

## Random Forest

In [14]:
# Gridsearch:
rf = RandomForestRegressor()

# hyper parameters
# "n_estimators" : number of trees - takes features every time random.
# "max_depth" : the size of tree
# "max_features" : number of columns
# "min_samples_split": the number of minimum sample to divide the node
rf_params = {"n_estimators": [50, 100, 300],
             "max_depth": [3, 5, 7],
            "max_features": [2, 4, 6, 8],
             "min_samples_split": [2, 4, 6]
            }

rf_cv_model = GridSearchCV(rf, rf_params, cv = 5, n_jobs = -1, verbose = 2).fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  4.4min finished


In [15]:
rf_cv_model.best_params_

{'max_depth': 7,
 'max_features': 8,
 'min_samples_split': 2,
 'n_estimators': 100}

In [18]:
# rf_model = RandomForestClassifier(n_estimators = 300).fit(X_train, y_train)
rf_tuned = RandomForestRegressor(max_depth = 33,
                                  max_features = 50,
                                  min_samples_split = 2,
                                  n_estimators = 500).fit(X_train, y_train)
y_pred = rf_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9781947297400577 
 mae: 541.1015105395688 
 mse: 1176777.6781572078 
 rmse: 1084.793841316039


## XGBoost

In [10]:
from xgboost import XGBRegressor

In [20]:
xgb = XGBRegressor()

xgb_params = {"n_estimators": [50, 100, 300],
             "subsample":[0.5,0.8,1],
             "max_depth":[3,5,7],
             "learning_rate":[0.1,0.01,0.3]}

In [21]:
xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 5, n_jobs = -1, verbose = 2).fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 36.1min finished


In [22]:
xgb_cv_model.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}

In [23]:
# xgb_tuned = XGBClassifier().fit(X_train, y_train)
xgb_tuned = XGBRegressor(n_estimators=300,
                         subsample=0.8,
                         max_depth=7,
                         learning_rate=0.1).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9833655891915621 
 mae: 513.9736128701304 
 mse: 897718.9044350978 
 rmse: 947.4802923729326


In [24]:
# xgb_tuned = XGBClassifier().fit(X_train, y_train)
xgb_tuned = XGBRegressor(n_estimators=1000,
                         subsample=0.8,
                         max_depth=7,
                         learning_rate=0.1).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9847499426753916 
 mae: 451.89089391518417 
 mse: 823008.6963510273 
 rmse: 907.1982673875801


In [25]:
# xgb_tuned = XGBClassifier().fit(X_train, y_train)
xgb_tuned = XGBRegressor(n_estimators=1000,
                         subsample=0.8,
                         max_depth=33,
                         learning_rate=0.1).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9853421820988452 
 mae: 405.09885662724827 
 mse: 791046.9675883662 
 rmse: 889.4082120086177


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15915 entries, 0 to 15914
Columns: 160 entries, Unnamed: 0 to drive_chain_rear
dtypes: float64(10), int64(150)
memory usage: 19.4 MB


### Recursive Feature Elimination (RFE)

*Feature selection refers to techniques that select a subset of the most relevant features (columns) for a dataset. Fewer features can allow machine learning algorithms to run more efficiently (less space or time complexity) and be more effective. Some machine learning algorithms can be misled by irrelevant input features, resulting in worse predictive performance.*

In [12]:
# check scikit-learn version
import sklearn
from sklearn.feature_selection import RFE
print(sklearn.__version__)

0.22.1


*The RFE method is available via the RFE class in scikit-learn.*

*RFE is a transform. To use it, first the class is configured with the chosen algorithm specified via the “estimator” argument and the number of features to select via the “n_features_to_select” argument.*

In [49]:
# define the method
rfe = RFE(estimator=XGBRegressor(), n_features_to_select=10)
# fit the model
rfe.fit(X, y)

RFE(estimator=XGBRegressor(base_score=None, booster=None,
                           colsample_bylevel=None, colsample_bynode=None,
                           colsample_bytree=None, gamma=None, gpu_id=None,
                           importance_type='gain', interaction_constraints=None,
                           learning_rate=None, max_delta_step=None,
                           max_depth=None, min_child_weight=None, missing=nan,
                           monotone_constraints=None, n_estimators=100,
                           n_jobs=None, num_parallel_tree=None,
                           objective='reg:squarederror', random_state=None,
                           reg_alpha=None, reg_lambda=None,
                           scale_pos_weight=None, subsample=None,
                           tree_method=None, validate_parameters=None,
                           verbosity=None),
    n_features_to_select=10, step=1, verbose=0)

In [50]:
X.columns

Index(['km', 'prev_owner', 'hp_kw', 'displacement_cc', 'gears', 'num_door',
       'num_seat', 'cylinders', 'co2_emission', 'cc_air conditioning',
       ...
       'body_color_Violet', 'body_color_White', 'body_color_Yellow',
       'paint_type_Basic', 'paint_type_Metallic', 'upholstery_Cloth',
       'upholstery_Part/Full Leather', 'drive_chain_4WD', 'drive_chain_front',
       'drive_chain_rear'],
      dtype='object', length=158)

In [64]:
# transform the data
X = rfe.transform(X)
# y = rfe.transform(y)

In [67]:
X.shape

(15915, 10)

In [69]:
rfe.ranking_

array([  6, 133,   1,  18,   2,  37,  55,  43,  34, 104, 126,  19,  14,
       123,  80, 140,  11,  76,  20, 121,   5, 106,  61,  75, 134,  90,
        54,  86,  27,  85,  36,  45,   4,  22,  30,  46,  59, 113,  94,
        35, 100,  63,  41,  16, 112, 107, 149,  13,  91,  72, 108,  96,
        60, 105,  79,  93,  74,  44, 103,  64,  73,  52, 115, 109,  42,
        99, 110, 101,  78,  57,  50,  71,  89,  49,  56,   7,  29, 132,
        84,   9,  38,  66,  65,  70,  88,   8,  58, 116,  69, 131, 142,
        77,  51, 118, 147,  39,  31,  62,  53, 119, 125,  68, 122,  21,
         1,   1,  97,   1,   1,   1,   1,   1, 136,   1,  67,  95, 139,
       127,  32,  92, 124,  12,  87, 148,  10, 114,  26,  23,  47,  17,
         1,   3,  28,  33,  48, 146, 138, 128,  81,  40,  98, 120, 111,
       141, 102,  25, 130, 117, 129,  82, 135,  83, 143,  24, 145,  15,
       144, 137])

In [70]:
print("Optimal number of features: {}".format(rfe.n_features_))

Optimal number of features: 10


In [71]:
rfe.estimator_.feature_importances_

array([0.13244079, 0.09388006, 0.06200097, 0.16130538, 0.04983974,
       0.03891333, 0.02493072, 0.06173361, 0.34586877, 0.02908659],
      dtype=float32)

In [61]:
X_df =pd.DataFrame(X) 
dset = pd.DataFrame()
dset['attr'] = X_df.columns
dset['importance'] = rfe.ranking_
dset = dset.sort_values(by='importance')

In [63]:
dset.head(20)

Unnamed: 0,attr,importance
113,make_model_Renault Espace,1
107,make_model_Audi A3,1
105,make_model_Audi A1,1
110,make_model_Opel Insignia,1
111,make_model_Renault Clio,1
104,register_age,1
108,make_model_Opel Astra,1
109,make_model_Opel Corsa,1
2,hp_kw,1
130,gear_type_Manual,1


In [65]:
rfe.support_

array([False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True, False,  True,
        True,  True,  True,  True, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [72]:
# from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [73]:
xgb_tuned = XGBRegressor(n_estimators=1000,
                         subsample=0.8,
                         max_depth=33,
                         learning_rate=0.1).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9166309586294097 
 mae: 1318.0282989895047 
 mse: 4499225.451679176 
 rmse: 2121.1377729132014


In [74]:
X.shape

(15915, 10)

In [75]:
y.shape

(15915,)

#### 2.Method (without designated number of features)

In [14]:
# define the method
rfe = RFE(estimator=XGBRegressor())

In [15]:
# fit the model in 10 min
rfe.fit(X_train, y_train)

RFE(estimator=XGBRegressor(base_score=None, booster=None,
                           colsample_bylevel=None, colsample_bynode=None,
                           colsample_bytree=None, gamma=None, gpu_id=None,
                           importance_type='gain', interaction_constraints=None,
                           learning_rate=None, max_delta_step=None,
                           max_depth=None, min_child_weight=None, missing=nan,
                           monotone_constraints=None, n_estimators=100,
                           n_jobs=None, num_parallel_tree=None,
                           objective='reg:squarederror', random_state=None,
                           reg_alpha=None, reg_lambda=None,
                           scale_pos_weight=None, subsample=None,
                           tree_method=None, validate_parameters=None,
                           verbosity=None),
    n_features_to_select=None, step=1, verbose=0)

In [17]:
print("Num Features: %d" % rfe.n_features_)

Num Features: 79


In [18]:
print("Selected Features: %s" % rfe.support_)

Selected Features: [ True False  True  True  True  True False  True  True False  True  True
  True False False False False False  True False  True  True  True  True
 False False False  True  True  True  True  True  True  True  True  True
  True False False  True False False False  True False False False  True
 False  True False False False False False False  True  True  True False
 False  True False False  True False False False False  True False  True
 False  True False  True  True False  True  True  True False  True False
 False  True  True False False False False False  True False False  True
  True False  True False False  True False  True  True  True False  True
  True  True  True  True False  True  True False False False  True  True
 False  True  True False  True False  True  True  True  True  True  True
  True False  True False False False  True  True False False  True False
 False  True False False False False False False False  True False  True
 False False]


In [19]:
print("Feature Ranking: %s" % rfe.ranking_)

Feature Ranking: [ 1 58  1  1  1  1 26  1  1 43  1  1  1 37 41 70 28 13  1 21  1  1  1  1
 64 12 39  1  1  1  1  1  1  1  1  1  1  7 18  1 30 51  2  1 57 56 75  1
 29  1 14 15 36 27 19 23  1  1  1 24  4  1 22  6  1 45 25 32 44  1 54  1
  5  1 46  1  1  8  1  1  1 17  1 10 52  1  1 48 20 67 68  3  1 38 74  1
  1 11  1 35 42  1 34  1  1  1 80  1  1  1  1  1 69  1  1  9 71 59  1  1
 47  1  1 77  1 53  1  1  1  1  1  1  1 16  1 73 78 65  1  1 55 61  1 60
 33  1 31 49 63 50 62 40 72  1 76  1 79 66]


In [32]:
len(rfe.ranking_)

158

In [26]:
X_df =pd.DataFrame(X) 
dset = pd.DataFrame()
dset['attr'] = X_df.columns
dset['importance'] = rfe.ranking_
dset = dset.sort_values(by='importance')

In [34]:
dset.head(79)

Unnamed: 0,attr,importance
0,km,1
105,make_model_Audi A1,1
104,register_age,1
103,cons_comb,1
101,ex_voice control,1
...,...,...
5,num_door,1
49,em_digital radio,1
7,cylinders,1
47,em_bluetooth,1


In [22]:
X_train_rfe = rfe.transform(X_train)  # election
X_test_rfe = rfe.transform(X_test)

In [24]:
# Fitting our baseline model with the transformed data
xgb_tuned = XGBRegressor(n_estimators=1000,
                         subsample=0.8,
                         max_depth=33,
                         learning_rate=0.1).fit(X_train_rfe, y_train)
y_pred = xgb_tuned.predict(X_test_rfe)
eval_metrics(y_test, y_pred)

r2_score: 0.9651183527464098 
 mae: 796.8764295610961 
 mse: 1882478.1062579334 
 rmse: 1372.034294854882


In [37]:
X_train_rfe

array([[6.3320e+03, 5.6000e+01, 8.9800e+02, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [5.0000e+02, 8.5000e+01, 9.9900e+02, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [7.2000e+04, 7.4000e+01, 1.3980e+03, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       ...,
       [1.4405e+04, 6.6000e+01, 1.4220e+03, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [0.0000e+00, 1.4700e+02, 1.9970e+03, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [2.8285e+04, 9.2000e+01, 1.3990e+03, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00]])