# XGBoost model

In [1]:
from pathlib import Path
import warnings

# Ignore FutureWarning messages. The filter is installed before the pandas /
# prefect / nbaspa imports below, so it is already active when they are imported.
warnings.simplefilter(action="ignore", category=FutureWarning)

import pandas as pd
from prefect import Flow, unmapped

from nbaspa.model.tasks import (
    SurvivalData,
    SegmentData,
    CollapseData,
    XGBoostTuning,
    PlotTuning,
    FitXGBoost,
    Predict,
    ConcordanceIndex,
    AUROC,
    PlotMetric,
    WinProbability,
    PlotProbability,
)

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the session.
# NOTE(review): presumably needed because the nbaspa tasks assign into DataFrame
# slices -- confirm before relying on it.
pd.options.mode.chained_assignment = None

In [2]:
# Load every pipe-delimited ``data_*.csv`` file for the 2018-19 season into a
# single frame.
# The paths are sorted because directory iteration order is arbitrary; a fixed
# file order gives the same row order in ``df`` on every run and every machine.
model_data_dir = Path("..", "nba-data", "2018-19", "model-data")
model_data_files = sorted(model_data_dir.glob("data_*.csv"))
if not model_data_files:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No data_*.csv files found in {model_data_dir}")

df = pd.concat(
    # GAME_ID is read as a string; the first column of each file is used as the
    # index and then discarded by reset_index(drop=True) below.
    pd.read_csv(fpath, sep="|", dtype={"GAME_ID": str}, index_col=0)
    for fpath in model_data_files
).reset_index(drop=True)

In [4]:
# Data preparation tasks: formatting, segmentation, and one CollapseData task
# per dataset used downstream
format_data = SurvivalData(name="Convert input data to range form")
segdata = SegmentData(name="Split data")
train_last, tune_data, stopping_last, test_benchmark, test_data = (
    CollapseData(name=task_name)
    for task_name in (
        "Create training dataset",
        "Create tuning data",
        "Create stopping dataset",
        "Create final test data",
        "Create metric test data",
    )
)
# Model training tasks: hyperparameter search, its plots, and the model fit
tuning = XGBoostTuning(name="Hyperparameter tuning")
tuning_plots = PlotTuning(name="Hyperparameter plotting")
trained = FitXGBoost(name="Fit XGBoost")
# Evaluation tasks
hazpred = Predict(name="Predict partial hazard function")
concord = ConcordanceIndex(name="Calculate Concordance index")
# Win probability (model and benchmark) and the AUROC computed from each
predict, benchmark_prob = (
    WinProbability(name=task_name)
    for task_name in ("Get win probability", "Get benchmark win probability")
)
auroc, aurocb = (
    AUROC(name=task_name)
    for task_name in ("Calculate model AUROC", "Calculate benchmark AUROC")
)
metricplot = PlotMetric(name="Plot AUROC")
# Win probability plotted against the margin
survpredict = WinProbability(name="Predict win probability")
probplot = PlotProbability(name="Plot the survival probability")

# Timesteps at which the test data is collapsed and AUROC is evaluated:
# every 360 units from 0 through 2880 inclusive.
# The third entry used to be 730, the only value that broke the uniform
# 360 spacing; it is 720 here.
# NOTE(review): presumably seconds of game time (2880 s = 48 min) -- confirm.
times = list(range(0, 2881, 360))

In [5]:
# Boosting settings shared by the hyperparameter search and the final fit,
# kept in one place so the two calls cannot drift apart.
boosting_kwargs = {
    "num_boost_round": 1000,
    "early_stopping_rounds": 5,
    "verbose_eval": True,
}

# Build the Prefect flow. Nothing is executed here; the task calls below only
# register tasks and their dependencies on ``flow``.
with Flow(name="XGBoost pipeline") as flow:
    # Format and segment the data into train, tune, stop, test.
    # Three split fractions are given for four keys.
    # NOTE(review): presumably the last key ("test") receives the remainder -- confirm.
    alldata = format_data(df)
    data = segdata(alldata, splits=[0.7, 0.15, 0.075], keys=["train", "tune", "stop", "test"])
    tune = tune_data(data["tune"])
    train_lr = train_last(data["train"])
    stop_lr = stopping_last(data["stop"])
    test_lr = test_benchmark(data["test"])
    # Create test data for time steps: the test split is passed whole
    # (unmapped) while the task is mapped over each entry of ``times``
    test = test_data.map(data=unmapped(data["test"]), timestep=times)
    # Hyperparameter tuning (with plots)
    params = tuning(
        train_data=train_lr,
        tune_data=tune,
        stopping_data=stop_lr,
        **boosting_kwargs,
    )
    # Bound to its own name rather than ``tuning``: rebinding ``tuning`` would
    # overwrite the XGBoostTuning task used just above, so re-running this cell
    # would call the plotting task in its place.
    tuning_fig = tuning_plots(params["trials"])
    # Fit the model with the best parameters found by the search
    fitted = trained(
        params=params["best"],
        train_data=train_lr,
        stopping_data=stop_lr,
        **boosting_kwargs,
    )
    # Evaluate
    predt = hazpred(model=fitted, data=test_lr)
    cind = concord(data=test_lr, predt=predt)
    # Predict the win probability throughout the game, once per timestep dataset,
    # for both the fitted model and the "nba" benchmark
    surv = predict.map(model=unmapped(fitted), data=test)
    benchmark = benchmark_prob.map(model=unmapped("nba"), data=test)
    # Get the AUROC
    metric = auroc.map(data=surv)
    metric_bench = aurocb.map(data=benchmark, mode=unmapped("benchmark"))
    plots = metricplot(times=times, metric="AUROC", survival=metric, nba=metric_bench)
    # Plot the final survival probability against the margin of the game
    survpred = survpredict(model=fitted, data=test_lr)
    prob = probplot(data=survpred)

In [6]:
output = flow.run()

[2021-02-18 09:24:17+0000] INFO - prefect.FlowRunner | Beginning Flow run for 'XGBoost pipeline'
[2021-02-18 09:24:18+0000] INFO - prefect.TaskRunner | Task 'Convert input data to range form': Starting task run...
[2021-02-18 09:24:44+0000] INFO - prefect.TaskRunner | Task 'Convert input data to range form': Finished task run for task with final state: 'Success'
[2021-02-18 09:24:44+0000] INFO - prefect.TaskRunner | Task 'Split data': Starting task run...
[2021-02-18 09:24:44+0000] INFO - prefect.Split data | Setting the seed to 42
[2021-02-18 09:24:44+0000] INFO - prefect.Split data | Dataset ``train`` has 861 games with 101533 rows
[2021-02-18 09:24:44+0000] INFO - prefect.Split data | Dataset ``tune`` has 184 games with 21920 rows
[2021-02-18 09:24:44+0000] INFO - prefect.Split data | Dataset ``stop`` has 92 games with 10901 rows
[2021-02-18 09:24:44+0000] INFO - prefect.Split data | Dataset ``test`` has 93 games with 10973 rows
[2021-02-18 09:24:44+0000] INFO - prefect.TaskRunner |

job exception: Invalid Parameter format for max_depth expect int but value='8.0'



  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]
[2021-02-18 09:24:45+0000] ERROR - prefect.TaskRunner | Unexpected error: XGBoostError("Invalid Parameter format for max_depth expect int but value='8.0'")
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/site-packages/prefect/engine/runner.py", line 48, in inner
    new_state = method(self, state, *args, **kwargs)
  File "/usr/local/lib/python3.8/site-packages/prefect/engine/task_runner.py", line 856, in get_task_run_state
    value = prefect.utilities.executors.run_task_with_timeout(
  File "/usr/local/lib/python3.8/site-packages/prefect/utilities/executors.py", line 298, in run_task_with_timeout
    return task.run(*args, **kwargs)  # type: ignore
  File "/app/Documents/GitHub/nba_survival/nbaspa/model/tasks/tuning.py", line 184, in run
    best = fmin(
  File "/usr/local/lib/python3.8/site-packages/hyperopt/fmin.py", line 507, in fmin
    return trials.fmin(
  File "/usr/local/lib/python3.8/site-packages/hyp