## Train a tuned XGBoost model with fewer features to predict prices

In [21]:
# Reload imported project modules automatically before each cell runs (mode 2 = all modules),
# so edits to the src package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR, MODELS_DIR

In [23]:
# Load the transformed dataset from the transformed-data directory.
transformed_csv = TRANSFORMED_DATA_DIR / 'transformed_data.csv'
data = pd.read_csv(transformed_csv)

In [24]:
# Sanity check: (rows, columns) of the loaded dataset.
data.shape

(4817, 22)

In [25]:
# Convert object-dtype columns to categorical dtype.
# The XGBRegressor further down is created with enable_categorical=True, which expects
# categorical columns rather than raw object/string columns.
from src.data import convert_object_columns_to_category

data = convert_object_columns_to_category(data)

Split data into train and test

In [26]:
from src.data import get_train_test_data

In [27]:
# List the available columns before choosing the feature subset.
data.columns

Index(['maker_key', 'model_key', 'mileage', 'engine_power',
       'registration_date', 'fuel', 'paint_color', 'car_type', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6',
       'feature_7', 'feature_8', 'price', 'sold_at', 'age_in_months_when_sold',
       'month_sold_at', 'season_sold_at', 'model_initial'],
      dtype='object')

Exclude `model_key` from the features to reduce overfitting.

In [28]:
# Predictor columns used for training (model_key is intentionally left out) and the
# column to predict. The order of `features` defines the column order of X.
features = [
    'mileage', 'engine_power', 'fuel', 'paint_color', 'car_type',
    *(f'feature_{i}' for i in range(1, 9)),  # feature_1 ... feature_8
    'age_in_months_when_sold', 'month_sold_at', 'season_sold_at', 'model_initial',
]
target = 'price'

In [29]:
# Save the features and target to models directory
import joblib


# Make sure the output directory exists so the dumps below do not fail with
# FileNotFoundError on a fresh checkout; this is a no-op when it is already there.
# NOTE(review): assumes MODELS_DIR is a pathlib.Path (it is combined with `/` below) - confirm in src.paths.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

features_path = MODELS_DIR / 'features.pkl'
target_path = MODELS_DIR / 'target.pkl'

# Persist the selected feature names and the target name in the models directory.
joblib.dump(features, features_path)
joblib.dump(target, target_path)

['C:\\Users\\ciroalfonsom\\learning\\cars-price\\models\\target.pkl']

In [30]:
# Build the full X / y plus the train and test partitions from the selected columns.
# NOTE(review): the split ratio and any seed live inside get_train_test_data and are not
# visible here; the recorded shapes (3853 vs 964 rows) suggest roughly 80/20 - confirm in src.data.
X, y, X_train, X_test, y_train, y_test = get_train_test_data(data, features, target)

In [31]:
# Report the dimensions of each split: feature matrices first (X), then targets (y).
print(
    f"Train shape: {X_train.shape}",
    f"Test shape: {X_test.shape}",
    f"Train shape: {y_train.shape}",
    f"Test shape: {y_test.shape}",
    sep="\n",
)

Train shape: (3853, 17)
Test shape: (964, 17)
Train shape: (3853,)
Test shape: (964,)


Train model

In [33]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

In [34]:
# Candidate values for the randomized search; each key is an XGBRegressor parameter name.
hyperparameters = dict(
    max_depth=list(range(1, 7)),
    n_estimators=[1, 5, 10, 50, 100],
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    random_state=[42],  # single option, so every candidate uses the same seed
    max_delta_step=[0, 0.5, 1],
)

In [35]:
# Base estimator handed to the hyperparameter search.
# enable_categorical=True turns on XGBoost's handling of categorical-dtype columns.
xgb_estimator = xgb.XGBRegressor(enable_categorical=True)

In [36]:
# Randomized search over `hyperparameters`: 20 sampled candidates, 4-fold cross-validation,
# ranked by negative mean squared error, run on all available cores with verbose progress output.
regressor = RandomizedSearchCV(
    estimator=xgb_estimator,
    param_distributions=hyperparameters,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=4,
    random_state=0,
    n_jobs=-1,
    verbose=3,
)

In [37]:
# Run the search on the training split only; the test split is kept aside for evaluation.
regressor.fit(X_train, y_train)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


In [38]:
# Show the hyperparameter combination selected by the search.
regressor.best_params_

{'random_state': 42,
 'n_estimators': 50,
 'max_depth': 6,
 'max_delta_step': 0,
 'learning_rate': 0.1}

In [39]:
# Predictions on the training split, used to compare against the test metrics below.
pred_train = regressor.predict(X_train)

In [40]:
# Predictions on the held-out test split.
pred_test = regressor.predict(X_test)

Calculate evaluation metrics (MSE, RMSE, MAE, R²)

In [41]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error

In [42]:
# Score the training-set predictions: MSE, RMSE, MAE and R2, in that order.
mse_train, rmse_train, mae_train, r2_train = (
    metric(y_train, pred_train)
    for metric in (mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score)
)

In [43]:
# Score the test-set predictions: MSE, RMSE, MAE and R2, in that order.
mse_test, rmse_test, mae_test, r2_test = (
    metric(y_test, pred_test)
    for metric in (mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score)
)

In [44]:
# Report the training-set metrics, one per line.
print(
    f"Train MSE: {mse_train}",
    f"Train RMSE: {rmse_train}",
    f"Train MAE: {mae_train}",
    f"Train R2: {r2_train}",
    sep="\n",
)

Train MSE: 6649829.003278325
Train RMSE: 2578.726236590136
Train MAE: 1451.4761329886512
Train R2: 0.9243229744498392


In [45]:
# Report the test-set metrics, one per line.
print(
    f"Test MSE: {mse_test}",
    f"Test RMSE: {rmse_test}",
    f"Test MAE: {mae_test}",
    f"Test R2: {r2_test}",
    sep="\n",
)

Test MSE: 9939688.304513928
Test RMSE: 3152.727121796926
Test MAE: 1923.3444972374627
Test R2: 0.8492837662133469


Find feature importances

In [46]:
# Read the feature names stored in the fitted booster of the best estimator.
# NOTE(review): this rebinds `features`, which earlier in the notebook held the hand-written
# column list passed to get_train_test_data; a distinct name would be safer on partial re-runs.
features = regressor.best_estimator_.get_booster().feature_names

In [47]:
# Importance score per feature from the best estimator; paired with `features` by position below.
importances = regressor.best_estimator_.feature_importances_

In [48]:
# Pair each feature name with its importance score and tabulate the result.
name_importance_pairs = zip(features, importances)
feature_importance_df = pd.DataFrame(name_importance_pairs, columns=['feature', 'importance'])

In [49]:
# Rank features from most to least important.
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

In [50]:
# Display the ranked importance table.
feature_importance_df

Unnamed: 0,feature,importance
1,engine_power,0.300262
12,feature_8,0.145404
16,model_initial,0.124069
13,age_in_months_when_sold,0.113622
0,mileage,0.057269
9,feature_5,0.047976
5,feature_1,0.030138
3,paint_color,0.02926
8,feature_4,0.02918
4,car_type,0.018999


In [51]:
# Persist the ranked importance table in the models directory.
feature_importance_path = MODELS_DIR / 'feature_importance_df.pkl'
joblib.dump(feature_importance_df, feature_importance_path)

['C:\\Users\\ciroalfonsom\\learning\\cars-price\\models\\feature_importance_df.pkl']

Save the model

In [52]:
# Persist the fitted search object (the whole RandomizedSearchCV wrapper, not only
# its best_estimator_) in the models directory.
model_path = MODELS_DIR / 'model.pkl'
joblib.dump(regressor, model_path)

['C:\\Users\\ciroalfonsom\\learning\\cars-price\\models\\model.pkl']