<a href="https://www.kaggle.com/code/inzeyun/notebook4407d644ef?scriptVersionId=168497117" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Time to Event (ToE)
* Trial details: https://classic.clinicaltrials.gov/ct2/show/record/NCT00000625

In [None]:
import numpy as np
import pandas as pd
import scipy.stats

from sklearn.model_selection import ShuffleSplit
import xgboost as xgb

In [None]:
# Read the AIDS Clinical Trials Group Study 175 CSV from the Kaggle input
# mount and print the column/dtype overview.
csvPath="/kaggle/input/aids-clinical-trials/AIDS_ClinicalTrial_GroupStudy175.csv"
data=pd.read_csv(csvPath)
data.info()

In [None]:
# Descriptive statistics of the `time` column, laid out as a one-row table.
timeSummary=data["time"].agg(["mean","median","min","max","std"])
timeSummary.to_frame().T.round(2)

In [None]:
# Change in cell counts between the two measurements of each marker
# (the `*20` column minus the `*0` column), stored as new columns on `data`.
data["cd4"]=data["cd420"].sub(data["cd40"])
data["cd8"]=data["cd820"].sub(data["cd80"])

# Box plot of both difference columns side by side.
data[["cd4","cd8"]].plot.box(figsize=(3,3),xlabel="Cell Count Diff.")

In [None]:
# Two-sample t-tests of the CD4 / CD8 count changes between label==0 and
# label==1 rows (scipy defaults: equal variances, two-sided).
byLabel=data[["label","cd4","cd8"]]
label0Rows=byLabel[byLabel["label"]==0]
label1Rows=byLabel[byLabel["label"]==1]

byLabelCd4=scipy.stats.ttest_ind(label0Rows["cd4"],label1Rows["cd4"])
byLabelCd8=scipy.stats.ttest_ind(label0Rows["cd8"],label1Rows["cd8"])

# Group means first, then one report line per marker:
# CI of the mean difference, t statistic, p-value.
print(data.groupby("label")[["cd4","cd8"]].mean().T)
for heading,outcome in (
    ("CD4 Difference (vs. failure) 95% CI:",byLabelCd4),
    ("CD8 Difference (vs. failure) 95% CI:",byLabelCd8),
):
    interval=np.round(outcome.confidence_interval(),2)
    print(heading,interval,outcome.statistic.round(2),"p:",round(outcome.pvalue,2))

In [None]:
# Two-sample t-test of the CD4 count change between the trt==0 and trt==1 arms.
byTrt=data[["trt","cd4","cd8"]]
trt0Cd4=byTrt.loc[byTrt["trt"]==0,"cd4"]
trt1Cd4=byTrt.loc[byTrt["trt"]==1,"cd4"]
byTrtCd4=scipy.stats.ttest_ind(trt0Cd4,trt1Cd4)

# Per-arm means of both markers, followed by the CI of the CD4 mean difference.
trtMeans=data.groupby("trt")[["cd4","cd8"]].mean()
print(trtMeans.T)

trtInterval=np.round(byTrtCd4.confidence_interval(),2)
print("CD4 Difference (vs. better regimen) 95% CI:",trtInterval,"p:",round(byTrtCd4.pvalue,2))

In [None]:
# Pairwise correlations between weight, Karnofsky score, age and the two
# cell-count change columns, rounded to two decimals.
baselineCols=["wtkg","karnof","age","cd4","cd8"]
byBaseline=data[baselineCols]
byBaseline.corr().round(2)

In [None]:
# Interval targets for the AFT model:
#   survival0 - lower bound, always the recorded `time`
#   survival1 - upper bound, the recorded `time` when label != 0 and +inf
#               when label == 0 (no event observed, i.e. right-censored)
# The upper bound is built vectorised with a real float infinity. The previous
# row-wise apply returned the string "inf", which left an object-dtype column
# of mixed str/int values that only worked if a later step coerced it to float.
data["survival0"]=data["time"].copy()
data["survival1"]=data["time"].where(data["label"]!=0,np.inf)

In [None]:
# Targets are the two survival bound columns; features are everything else
# except the label, the raw time and the raw cell-count measurements.
targetCols=["survival0","survival1"]
excludedCols=["label","time","cd40","cd420","cd80","cd820"]+targetCols

x=data.drop(columns=excludedCols)
y=data[targetCols]
print("Features:",x.shape)

In [None]:
def _boundedMatrix(rows):
    """Return a DMatrix of the feature rows at `rows` with both survival bounds attached."""
    matrix=xgb.DMatrix(x.values[rows,:])
    matrix.set_float_info("label_lower_bound",y.survival0.values[rows])
    matrix.set_float_info("label_upper_bound",y.survival1.values[rows])
    return matrix

# Seeded 60% train split; only the first of the generated splits is used.
splitter=ShuffleSplit(n_splits=10,train_size=.6,random_state=15222330)
trainIdx,testIdx=next(splitter.split(x))

train=_boundedMatrix(trainIdx)

test=_boundedMatrix(testIdx)

In [None]:
# Accelerated-failure-time objective with a normal error distribution,
# evaluated with the AFT negative log-likelihood.
param={
    "objective":"survival:aft",
    "aft_loss_distribution_scale":.8,
    "aft_loss_distribution":"normal",
    "eval_metric":"aft-nloglik",
    "learning_rate":.01,
    "verbosity":0
}

# Early stopping monitors the last entry in `evals` (the test matrix) and
# halts once it has not improved for 100 consecutive rounds.
model=xgb.train(
    param,train,
    num_boost_round=5000,
    evals=[(train,"train"),(test,"test")],
    early_stopping_rounds=100,
    verbose_eval=100,
)

# xgb.train hands back the booster from the LAST round, which can include up
# to `early_stopping_rounds` trees past the best one. Restrict prediction to
# the best iteration explicitly rather than relying on the default of the
# installed XGBoost version.
bestRounds=model.best_iteration+1

# Predictions cover every row of `x` (train and test rows alike), shown next
# to the survival bounds used as targets.
result=pd.DataFrame({
    "lower":y.survival0.values,
    "upper":y.survival1.values,
    "prediction":model.predict(xgb.DMatrix(x.values),iteration_range=(0,bestRounds))
})

In [None]:
# Place the features and the bounds/prediction table side by side, then show
# a random handful of rows.
tot=pd.concat((x,result),axis="columns")
tot.sample(n=5)

In [None]:
# Mean predicted value for each (trt, symptom) combination, as a one-row table.
meanPrediction=tot.groupby(["trt","symptom"])["prediction"].mean()
meanPrediction.to_frame().T

* In terms of the better regimen, trt==1 (ZDV+ddI) is the best
* Having no symptoms prior to the baseline visit is an indicator of a longer ToE