# Cox time-varying proportional hazards model with hyperparameter tuning

In [1]:
from datetime import timedelta
import sys
import logging
from pathlib import Path
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

from hyperopt import hp
import pandas as pd
from prefect import Flow

from nbaspa.model.tasks import (
    SurvivalData,
    SegmentData,
    InitializeLifelines,
    FitLifelinesModel,
    CollapseData,
    PredictLifelines,
    ConcordanceIndex,
    HyperparameterTuning
)
from nbaspa.model.tasks.meta import META

## Load the data

In [2]:
# Collect every pipe-delimited model-data file for the 2018-19 season.
# ``Path.glob`` yields matches in arbitrary, filesystem-dependent order, so the
# paths are sorted first: the row order of ``df`` is then the same on every
# machine and on every Restart & Run All.
data_files = sorted(
    Path("..", "nba-data", "2018-19", "model-data").glob("data_*.csv")
)

# GAME_ID is read as a string so that it is not parsed as an integer; the first
# CSV column is the saved index, which is discarded once the frames are stacked.
df = pd.concat(
    pd.read_csv(fpath, sep="|", dtype={"GAME_ID": str}, index_col=0)
    for fpath in data_files
).reset_index(drop=True)

In [3]:
# Preview the first five rows, restricted to the columns named by META's
# "id", "dynamic" and "static" entries.
preview_columns = [META["id"], *META["dynamic"], *META["static"]]
df.loc[:, preview_columns].head(5)

Unnamed: 0,GAME_ID,SCOREMARGIN,HOME_LINEUP_PLUS_MINUS,VISITOR_LINEUP_PLUS_MINUS,HOME_NET_RATING,VISITOR_NET_RATING,HOME_W_PCT,VISITOR_W_PCT,LAST_GAME_WIN,HOME_GAMES_IN_LAST_3_DAYS,HOME_GAMES_IN_LAST_5_DAYS,HOME_GAMES_IN_LAST_7_DAYS,VISITOR_GAMES_IN_LAST_3_DAYS,VISITOR_GAMES_IN_LAST_5_DAYS,VISITOR_GAMES_IN_LAST_7_DAYS
0,21800001,0,0.1,-0.1,5.1,3.3,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0
1,21800001,0,0.1,-0.1,5.1,3.3,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0
2,21800001,0,0.1,-0.1,5.1,3.3,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0
3,21800001,0,0.1,-0.1,5.1,3.3,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0
4,21800001,0,0.1,-0.1,5.1,3.3,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0


## Build the model with hyperparameter tuning

In this example, both the `penalizer` and the `l1_ratio` hyperparameters are drawn from a $\mathrm{Unif}(0, 1)$ distribution.

In [4]:
# One shared value for the ``cache_for`` argument of the data-preparation tasks.
CACHE_TTL = timedelta(hours=1)

# Data-preparation tasks (each one receives ``cache_for=CACHE_TTL``).
format_data = SurvivalData(
    name="Convert input data to range form",
    cache_for=CACHE_TTL,
)
segdata = SegmentData(
    name="Split data",
    cache_for=CACHE_TTL,
)
tune_data = CollapseData(
    name="Create tuning data",
    cache_for=CACHE_TTL,
)
test_data = CollapseData(
    name="Create test data",
    cache_for=CACHE_TTL,
)

# Tuning, model construction, fitting and evaluation tasks (no ``cache_for``).
tuning = HyperparameterTuning(name="Hyperparameter tuning")
model = InitializeLifelines(name="Initialize Cox model")
trained = FitLifelinesModel(name="Fit Cox model")
predict = PredictLifelines(name="Predict partial hazard")
concord = ConcordanceIndex(name="Calculate C-Index")

In [5]:
# Wire the task objects into a single Prefect flow. The calls inside the
# ``with`` block describe the pipeline; the tasks are executed by
# ``flow.run()``.
with Flow(name="My model pipeline") as flow:
    # Task "Convert input data to range form", applied to the raw frame.
    alldata = format_data(df)
    # Split into three named segments. Two fractions are given for three keys;
    # presumably the last segment receives the remainder (confirm against
    # SegmentData).
    data = segdata(alldata, splits=[0.6, 0.25], keys=["train", "tune", "test"])
    # Collapse the tune and test segments with ``tail=True``.
    # NOTE(review): presumably this keeps the final row of each game; confirm
    # against CollapseData.
    tune = tune_data(data["tune"], tail=True)
    test = test_data(data["test"], tail=True)
    # Tune hyperparameters from the train segment and the collapsed tune data.
    params = tuning(data["train"], tune)
    # Build the model from the "best" entry of the tuning output.
    model_obj = model(params["best"])
    # Fit on the (uncollapsed) train segment.
    fitted = trained(model=model_obj, data=data["train"])
    # Predict on the collapsed test data, then score those predictions with
    # the "Calculate C-Index" task.
    predt = predict(model=fitted, data=test)
    cind = concord(data=test, predt=predt)

In [6]:
# Execute the flow and keep whatever ``flow.run()`` returns in ``output``.
# In the logged run below, the flow took roughly five minutes end to end.
output = flow.run()

[2021-02-13 05:02:52+0000] INFO - prefect.FlowRunner | Beginning Flow run for 'My model pipeline'
[2021-02-13 05:02:53+0000] INFO - prefect.TaskRunner | Task 'Convert input data to range form': Starting task run...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv[duration_col] += delay


[2021-02-13 05:03:19+0000] INFO - prefect.TaskRunner | Task 'Convert input data to range form': Finished task run for task with final state: 'Cached'
[2021-02-13 05:03:19+0000] INFO - prefect.TaskRunner | Task 'Split data': Starting task run...
[2021-02-13 05:03:19+0000] INFO - prefect.Split data | Setting the seed to 42
[2021-02-13 05:03:19+0000] INFO - prefect.Split data | Dataset ``train`` has 737 games with 84607 rows
[2021-02-13 05:03:19+0000] INFO - prefect.Split data | Dataset ``tune`` has 307 games with 35766 rows
[2021-02-13 05:03:19+0000] INFO - prefect.Split data | Dataset ``test`` has 185 games with 20939 rows
[2021-02-13 05:03:19+0000] INFO - prefect.TaskRunner | Task 'Split data': Finished task run for task with final state: 'Cached'
[2021-02-13 05:03:19+0000] INFO - prefect.TaskRunner | Task 'GetItem': Starting task run...
[2021-02-13 05:03:19+0000] INFO - prefect.TaskRunner | Task 'GetItem': Finished task run for task with final state: 'Success'
[2021-02-13 05:03:19+000

0,1
model,lifelines.CoxTimeVaryingFitter
event col,'WIN'
penalizer,0.072731
number of subjects,737
number of periods,84607
number of events,442
partial log-likelihood,-2645.81
time fit was run,2021-02-13 05:08:07 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
HOME_GAMES_IN_LAST_3_DAYS,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,-0.0,1.00,0.0
HOME_GAMES_IN_LAST_5_DAYS,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,-0.0,1.00,0.0
HOME_GAMES_IN_LAST_7_DAYS,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,-0.0,1.00,0.0
HOME_LINEUP_PLUS_MINUS,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.01,0.99,0.01
HOME_NET_RATING,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.03,0.98,0.03
HOME_W_PCT,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.02,0.98,0.02
LAST_GAME_WIN,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.01,0.99,0.01
SCOREMARGIN,0.0,1.0,0.0,0.0,0.01,1.0,1.01,3.98,<0.005,13.85
VISITOR_GAMES_IN_LAST_3_DAYS,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.01,1.00,0.01
VISITOR_GAMES_IN_LAST_5_DAYS,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,1.00,0.0

0,1
Partial AIC,5319.63
log-likelihood ratio test,15.82 on 14 df
-log2(p) of ll-ratio test,1.62


[2021-02-13 05:08:10+0000] INFO - prefect.TaskRunner | Task 'Fit Cox model': Finished task run for task with final state: 'Success'
[2021-02-13 05:08:10+0000] INFO - prefect.TaskRunner | Task 'Predict partial hazard': Starting task run...
[2021-02-13 05:08:10+0000] INFO - prefect.TaskRunner | Task 'Predict partial hazard': Finished task run for task with final state: 'Success'
[2021-02-13 05:08:10+0000] INFO - prefect.TaskRunner | Task 'Calculate C-Index': Starting task run...
[2021-02-13 05:08:10+0000] INFO - prefect.Calculate C-Index | Model has a C-index of 0.757
[2021-02-13 05:08:10+0000] INFO - prefect.TaskRunner | Task 'Calculate C-Index': Finished task run for task with final state: 'Success'
[2021-02-13 05:08:10+0000] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded
