In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from hyperopt import hp
from hyperopt import fmin, tpe
import shap

In [2]:
def mse(y_pred, y):
    return ((y_pred - y)**2).mean()

In [3]:
FEATURES = ["category_gry i konsole", "category_komputery", "category_sprzęt rtv", "category_telefony i akcesoria", "city_Gdynia", "city_Konin", "city_Kutno", "city_Mielec", "city_Police", "city_Radom", "city_Szczecin", "city_Warszawa", "delivery_company_360", "delivery_company_516", "delivery_company_620", "price", "purchase_day", "purchase_day_cos", "purchase_day_sin", "purchase_dayofweek", "purchase_dayofweek_cos", "purchase_dayofweek_sin", "purchase_hour", "purchase_hour_cos", "purchase_hour_sin", "purchase_minute", "purchase_minute_cos", "purchase_minute_sin", "purchase_month", "purchase_month_cos", "purchase_month_sin", "purchase_second", "purchase_second_cos", "purchase_second_sin", "purchase_year"]
TARGET = "hours"

In [4]:
train = pd.read_csv("../data_preprocessed/train.csv")
dev = pd.read_csv("../data_preprocessed/dev.csv")
test = pd.read_csv("../data_preprocessed/test.csv")

In [5]:
def main(args):
    model = xgb.XGBRegressor(**args).fit(train[FEATURES], train[TARGET])

    y = dev[TARGET]
    y_pred = model.predict(dev[FEATURES])

    return mse(y_pred, y)

In [6]:
space = {
    "booster": "gbtree",
    "colsample_bylevel": hp.uniform("colsample_bylevel", 0, 1),
    "colsample_bynode": hp.uniform("colsample_bynode", 0, 1),
    "colsample_bytree": hp.uniform("colsample_bytree", 0, 1),
    "learning_rate": hp.loguniform("learning_rate", np.log(1e-3), np.log(1e-1)),
    "max_depth": hp.choice("max_depth", [2, 3, 4, 5, 6, 7]),
    "min_child_weight": hp.choice("min_child_weight", [2, 3, 4, 5, 6, 7]),
    "n_estimators": hp.randint("n_estimators", 50, 2000),
    "random_state": hp.randint("random_state", 1000000),
    "reg_alpha": hp.uniform("reg_alpha", 0, 1),
    "reg_lambda": hp.uniform("reg_lambda", 0, 1),
    "subsample": hp.uniform("subsample", 0, 1),
    "objective": "reg:squarederror",
}

In [7]:
#best = fmin(main, space, algo=tpe.suggest, max_evals=100)

In [8]:
#best

In [9]:
args = {'colsample_bylevel': 0.6622004166419073,
 'colsample_bynode': 0.4190634491187126,
 'colsample_bytree': 0.844418831863196,
 'learning_rate': 0.0031913240654254365,
 'max_depth': 5,
 'min_child_weight': 1,
 'n_estimators': 1707,
 'random_state': 727732,
 'reg_alpha': 0.06890745402725085,
 'reg_lambda': 0.36490973396841186,
 'subsample': 0.796384735989931,
       "booster": "gbtree",
       "objective": "reg:squarederror"}

In [10]:
model = xgb.XGBRegressor(**args).fit(train[FEATURES], train[TARGET])

In [11]:
model.save_model("xgb_model.json")

In [12]:
train["hours_pred"] = model.predict(train[FEATURES])
dev["hours_pred"] = model.predict(dev[FEATURES])
test["hours_pred"] = model.predict(test[FEATURES])

In [13]:
mse(train["hours_pred"].to_numpy(), train["hours"].to_numpy())

103.03487344902906

In [14]:
mse(dev["hours_pred"].to_numpy(), dev["hours"].to_numpy())

123.46375825417827

In [15]:
mse(test["hours_pred"].to_numpy(), test["hours"].to_numpy())

127.32264872582734

In [16]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(test[FEATURES])

ntree_limit is deprecated, use `iteration_range` or model slicing instead.


In [17]:
shap.initjs()

In [18]:
shap.plots.force(explainer.expected_value, shap_values[0], feature_names=FEATURES)

In [19]:
shap.plots.force(explainer.expected_value, shap_values[1], feature_names=FEATURES)

In [20]:
shap.plots.force(explainer.expected_value, shap_values[2], feature_names=FEATURES)

In [21]:
shap.plots.force(explainer.expected_value, shap_values, feature_names=FEATURES)