## Train tuned xgb model to predict prices

In [35]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR, MODELS_DIR

In [37]:
data = pd.read_csv(TRANSFORMED_DATA_DIR / 'transformed_data.csv')

In [38]:
data.shape

(4817, 23)

In [39]:
# Change data types from object to categorical
from src.data import convert_object_columns_to_category

data = convert_object_columns_to_category(data)

Split data into train and test

In [40]:
from src.data import get_train_test_data

In [41]:
data.columns

Index(['maker_key', 'model_key', 'mileage', 'engine_power',
       'registration_date', 'fuel', 'paint_color', 'car_type', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6',
       'feature_7', 'feature_8', 'price', 'sold_at', 'age_in_months_when_sold',
       'month_sold_at', 'season_sold_at', 'model_initial',
       'mileage_per_month'],
      dtype='object')

In [42]:
features = ['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color', 'car_type', 'feature_1',
    'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6',
    'feature_7', 'feature_8', 'age_in_months_when_sold',
    'month_sold_at', 'season_sold_at', 'model_initial', 'mileage_per_month']
target = 'price'

In [43]:
# Save the features and target to models directory
import joblib


features_path = MODELS_DIR / 'features.pkl'
target_path = MODELS_DIR / 'target.pkl'

joblib.dump(features, features_path)
joblib.dump(target, target_path)

['C:\\Users\\ciroalfonsom\\learning\\cars-price\\models\\target.pkl']

In [44]:
X, y, X_train, X_test, y_train, y_test = get_train_test_data(data, features, target)

In [45]:
print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")
print(f"Train shape: {y_train.shape}")
print(f"Test shape: {y_test.shape}")

Train shape: (3853, 19)
Test shape: (964, 19)
Train shape: (3853,)
Test shape: (964,)


Train model

In [46]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

In [47]:
hyperparameters = {
    "max_depth": [1, 2, 3, 4, 5, 6],
    "n_estimators": [1, 5, 10, 50, 100],
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    "random_state": [42],
    "max_delta_step" : [0, 0.5, 1]
}

In [48]:
xgb_estimator = xgb.XGBRegressor(enable_categorical=True)

In [49]:
regressor = RandomizedSearchCV(
    xgb_estimator, hyperparameters, n_iter=20, scoring='neg_mean_squared_error', random_state=0, cv=4, n_jobs=-1, verbose=3
    )

In [50]:
regressor.fit(X_train, y_train)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


In [51]:
regressor.best_params_

{'random_state': 42,
 'n_estimators': 10,
 'max_depth': 6,
 'max_delta_step': 0,
 'learning_rate': 0.3}

In [52]:
pred_train = regressor.predict(X_train)

In [53]:
pred_test = regressor.predict(X_test)

Calculate accuracy metrics

In [54]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error

In [55]:
# Calculate metrics on train set
mse_train = mean_squared_error(y_train, pred_train)
rmse_train = root_mean_squared_error(y_train, pred_train)
mae_train = mean_absolute_error(y_train, pred_train)
r2_train = r2_score(y_train, pred_train)

In [56]:
# Calculate metrics on test set
mse_test = mean_squared_error(y_test, pred_test)
rmse_test = root_mean_squared_error(y_test, pred_test)
mae_test = mean_absolute_error(y_test, pred_test)
r2_test = r2_score(y_test, pred_test)

In [57]:
# Print metrics on train set
print(f"Train MSE: {mse_train}")
print(f"Train RMSE: {rmse_train}")
print(f"Train MAE: {mae_train}")
print(f"Train R2: {r2_train}")

Train MSE: 7223111.155648625
Train RMSE: 2687.5846322764655
Train MAE: 1524.5627098608227
Train R2: 0.9177988535933496


In [58]:
# Print metrics on test set
print(f"Test MSE: {mse_test}")
print(f"Test RMSE: {rmse_test}")
print(f"Test MAE: {mae_test}")
print(f"Test R2: {r2_test}")

Test MSE: 8705488.272227302
Test RMSE: 2950.506443346176
Test MAE: 1914.0243094669834
Test R2: 0.8679980331910278


Find feature importances

In [59]:
# Find feature importance
features = regressor.best_estimator_.get_booster().feature_names

In [60]:
importances = regressor.best_estimator_.feature_importances_

In [61]:
# Create a DataFrame with feature names and importances
feature_importance_df = pd.DataFrame(zip(features, importances), columns=['feature', 'importance'])

In [62]:
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

In [63]:
feature_importance_df

Unnamed: 0,feature,importance
0,model_key,0.29245
12,feature_7,0.136117
14,age_in_months_when_sold,0.118553
13,feature_8,0.076528
2,engine_power,0.059833
1,mileage,0.049596
17,model_initial,0.041771
7,feature_2,0.035169
8,feature_3,0.033045
18,mileage_per_month,0.031212


In [64]:
# Save feature importances to model directory
joblib.dump(feature_importance_df, MODELS_DIR / 'feature_importance_df.pkl')

['C:\\Users\\ciroalfonsom\\learning\\cars-price\\models\\feature_importance_df.pkl']

Save the model

In [65]:
# Retrain the model with all the data
regressor.fit(X, y)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


In [66]:
regressor.best_params_

{'random_state': 42,
 'n_estimators': 50,
 'max_depth': 6,
 'max_delta_step': 0,
 'learning_rate': 0.1}

In [67]:
# Save the model
joblib.dump(regressor.best_estimator_, MODELS_DIR / 'model.pkl')

['C:\\Users\\ciroalfonsom\\learning\\cars-price\\models\\model.pkl']

In [68]:
# Save metadata about the model
metadata = {
    'mse_train': mse_train,
    'rmse_train': rmse_train,
    'mae_train': mae_train,
    'r2_train': r2_train,
    'mse_test': mse_test,
    'rmse_test': rmse_test,
    'mae_test': mae_test,
    'r2_test': r2_test,
    'features': features,
    'target': target,
    'name': 'XGBoost with trees tuned with RandomizedSearchCV',
}
import json
with open(MODELS_DIR / 'metadata.json', 'w') as f:
    json.dump(metadata, f)