In [94]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'w'
matplotlib.rcParams['figure.figsize'] = (12,6)
%matplotlib inline

In [2]:
df = pd.read_csv('UsedCarFinal.csv')

In [4]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,price
0,12000,merc,GLS Class,2017,Automatic,12046,Diesel,150.0,37.2,3.0,38000
1,12001,vw,Amarok,2017,Automatic,37683,Diesel,260.0,36.2,3.0,23495
2,12004,merc,GLS Class,2019,Automatic,10000,Diesel,145.0,34.0,3.0,59999
3,12013,skoda,Scala,2019,Manual,3257,Petrol,145.0,49.6,1.0,16713
4,12017,audi,RS6,2015,Semi-Auto,20982,Petrol,325.0,29.4,4.0,46000


Get Dummies and Train/Test Split

In [6]:
df_dummies = pd.get_dummies(df, columns=['brand', 'model', 'transmission', 'fuelType'], drop_first=True)

In [7]:
df_dummies.head(3)

Unnamed: 0,carID,year,mileage,tax,mpg,engineSize,price,brand_bmw,brand_ford,brand_hyundi,...,model_Zafira Tourer,model_i3,model_i8,transmission_Manual,transmission_Other,transmission_Semi-Auto,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,12000,2017,12046,150.0,37.2,3.0,38000,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,12001,2017,37683,260.0,36.2,3.0,23495,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12004,2019,10000,145.0,34.0,3.0,59999,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
scaler = MinMaxScaler()

In [14]:
X = df_dummies.drop('price', axis=1)
y = df_dummies.price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_test = scaler.fit_transform(X_test)
X_train = scaler.fit_transform(X_train)

xgb = XGBRegressor()

In [15]:
results = cross_val_score(xgb, X_train, y_train, cv=25, verbose=1, scoring='neg_mean_squared_error')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   12.2s finished


In [18]:
MSE = -1 * (results[results.argmin()])
RMSE = np.sqrt(MSE)
print("Cross Validation Results using Root Mean Squared Error: ")
print(RMSE)

Cross Validation Results using Root Mean Squared Error: 
7917.027204788411


#### Base Model training and testing

In [19]:
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [21]:
pred = xgb.predict(X_test)
xgb_base_rmse = np.sqrt(mean_squared_error(y_test, pred))
xgb_base_mae = mean_absolute_error(y_test, pred)

In [23]:
print("XGBRegressor base model RMSE: ")
print(xgb_base_rmse)
print("------------------------------")
print("XGBRegressor base model MAE: ")
print(xgb_base_mae)

XGBRegressor base model RMSE: 
7328.675539041397
------------------------------
XGBRegressor base model MAE: 
5500.13680733469


Parameter Tuning:

In [24]:
params = {
    'learning_rate': np.linspace(0, 1, 10),
    'max_depth': np.arange(2, 10, 2),
    'n_estimators': np.arange(0, 300, 25),
}

In [25]:
cv = GridSearchCV(xgb, param_grid=params, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=3, verbose=1)
cv.fit(X_train, y_train)
print(cv.best_params_)
print(cv.best_score_)
best_score = str(cv.best_score_)

Fitting 3 folds for each of 480 candidates, totalling 1440 fits
{'learning_rate': 0.4444444444444444, 'max_depth': 2, 'n_estimators': 250}
-4779.469742335114


In [26]:
with open('params_list.txt', 'a') as file:
    for key,value in cv.best_params_.items():
        file.write(key + ": " + str(value))
        file.write("\n")
    file.write("Best RMSE Score: " + best_score)
    file.write("\n")
    file.write("----------------------------------")
    file.write("\n")
    file.close()

In [31]:
print(cv.best_score_)

-4779.469742335114


In [32]:
with open('params_list.txt', 'r') as file:
    for line in file.readlines():
        print(line)

learning_rate: 0.2222222222222222

max_depth: 4

n_estimators: 150

Best RMSE Score: -4869.4467198619905

----------------------------------



learning_rate: 0.4444444444444444

max_depth: 2

n_estimators: 250

Best RMSE Score: -4779.469742335114

----------------------------------



We'll try a model with these parameters on a test set

In [33]:
xgb_test = XGBRegressor(learning_rate=0.4444444444444444, max_depth=2, n_estimators=250)

In [34]:
xgb_test.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.4444444444444444, max_delta_step=0, max_depth=2,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=250, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [35]:
pred = xgb_test.predict(X_test)

In [39]:
test_RMSE = np.sqrt(mean_squared_error(y_test, pred))
test_MAE = mean_absolute_error(y_test, pred)

In [41]:
print(test_RMSE)
print(test_MAE)

6686.591014943902
4986.487137639909


In [43]:
xgb_test_2 = XGBRegressor(learning_rate= 0.2222222222222222, max_depth=4, n_estimators=150)

In [44]:
xgb_test_2.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2222222222222222, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=150, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [45]:
pred = xgb_test_2.predict(X_test)

In [46]:
test_RMSE2 = np.sqrt(mean_squared_error(y_test, pred))
test_MAE2 = mean_absolute_error(y_test, pred)

In [47]:
print(test_RMSE2)
print(test_MAE2)

6612.515908678946
4849.533673750195


Let's try a different model

In [48]:
from sklearn.ensemble import RandomForestRegressor

In [49]:
rf = RandomForestRegressor()

In [51]:
rf_cv = cross_val_score(rf, X_train, y_train, scoring='neg_root_mean_squared_error', cv=10, n_jobs=-1, verbose=1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.0s finished


In [56]:
rf_cv[rf_cv.argmax()]

-3187.386356650125

In [57]:
rf.fit(X_train, y_train)

RandomForestRegressor()

In [58]:
pred = rf.predict(X_test)

In [59]:
rf_RMSE = np.sqrt(mean_squared_error(y_test, pred))

In [61]:
print(rf_RMSE)

7003.859200565527


In [99]:
print(test_RMSE2)

6612.515908678946


The tuned XGB model did slightly better than the un-tuned RandomForestRegressor

In [100]:
xgb_test_2.save_model('XGB_Tuned.json')