In [1]:
# import libraries
import numpy as np
import pandas as pd
import optuna
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

  from pandas import MultiIndex, Int64Index


In [2]:
# load the training table and the test table from the local data folder
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './hivprogression/training_data.csv',
        './hivprogression/test_data_mod.csv',
    )
)

In [3]:
# preview the first five rows of the test table
test_data.head(n=5)

Unnamed: 0,PatientID,Resp,PR Seq,RT Seq,VL-t0,CD4-t0
0,1,1,NCTCTATTAGATACAGGAGCAGATGACACAGTATTAGAAGARATGG...,CCTATTAGTCCTATTGAAACTGTACCAGTRAAATTAAAGCCAGGAA...,5.6,69
1,2,0,NCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGA...,CCCATCAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.3,119
2,3,1,GGGCAAATAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.7,41
3,4,0,GGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAG...,CCTATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.2,48
4,5,1,GGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATA...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.5,311


In [4]:
# Missing-value counts of the sequence columns can be inspected with, e.g.,
# train_data['PR Seq'].isnull().sum() (same for 'RT Seq' and for test_data).

# keep only the training rows with a non-null 'PR Seq'
# (original note: this drops 80 rows because of NaNs)
train_data = train_data.dropna(subset=['PR Seq'])

In [5]:
# prepare X and y

# The raw sequence columns are removed; this is needed when the model can not
# handle categorical data.
# Re-assignment with errors='ignore' (instead of an in-place drop) keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
seq_columns = ['PR Seq', 'RT Seq']
train_data = train_data.drop(columns=seq_columns, errors='ignore')
test_data = test_data.drop(columns=seq_columns, errors='ignore')

# features: every remaining column except the identifier and the target
X = train_data.drop(columns=['PatientID', 'Resp'])
# target: the 'Resp' column as a plain array
Y = train_data['Resp'].values

In [6]:
# split sequences into multiple columns
# TODO: 

In [7]:
# hold out 30% of the training rows for validation; the fixed random_state
# makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.3,
)
x_train.tail(n=5)

Unnamed: 0,VL-t0,CD4-t0
106,5.4,319
270,5.4,56
860,4.2,496
435,3.9,327
102,4.7,546


In [8]:
# hyperparameters recorded as the best ones found with optuna
param = {
    'loss_function': 'CrossEntropy',
    'learning_rate': 0.5956840896672528,
    'l2_leaf_reg': 0.24629732316062503,
    'colsample_bylevel': 0.09221224327044178,
    'depth': 6,
    'boosting_type': 'Plain',
    'min_data_in_leaf': 3,
    'one_hot_max_size': 6,
}

# cat_features=['PR Seq', 'RT Seq'] is intentionally not passed here
catboost = CatBoostClassifier(verbose=False, **param)
catboost.fit(x_train, y_train)

# accuracy on the held-out partition of the training data
y_pred = catboost.predict(x_test)
accuracy_score(y_pred=y_pred, y_true=y_test)

0:	learn: 0.5336523	total: 139ms	remaining: 2m 18s
1:	learn: 0.4651043	total: 140ms	remaining: 1m 9s
2:	learn: 0.4635248	total: 141ms	remaining: 47s
3:	learn: 0.4632709	total: 144ms	remaining: 35.7s
4:	learn: 0.4632297	total: 144ms	remaining: 28.8s
5:	learn: 0.4632230	total: 146ms	remaining: 24.1s
6:	learn: 0.4632219	total: 147ms	remaining: 20.9s
7:	learn: 0.4632217	total: 148ms	remaining: 18.4s
8:	learn: 0.4632217	total: 149ms	remaining: 16.4s
9:	learn: 0.4632216	total: 150ms	remaining: 14.9s
10:	learn: 0.4632216	total: 151ms	remaining: 13.6s
11:	learn: 0.4632216	total: 152ms	remaining: 12.5s
12:	learn: 0.4632216	total: 155ms	remaining: 11.7s
13:	learn: 0.4490962	total: 161ms	remaining: 11.4s
14:	learn: 0.4490942	total: 163ms	remaining: 10.7s
15:	learn: 0.4490939	total: 164ms	remaining: 10.1s
16:	learn: 0.4490938	total: 165ms	remaining: 9.54s
17:	learn: 0.4393509	total: 167ms	remaining: 9.09s
18:	learn: 0.4342242	total: 168ms	remaining: 8.68s
19:	learn: 0.4342148	total: 170ms	remainin

0.7572463768115942

In [10]:
# 1. Define an objective function to be maximized.
def objective(trial):
    """Train a CatBoost classifier with trial-suggested hyperparameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy of the fitted model on ``(x_test, y_test)``.

    Notes
    -----
    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``. The same hold-out partition is used both for early stopping
    and for the returned score, so the reported accuracy is an optimistic
    estimate.
    """

    # 2. Suggest values of the hyperparameters using a trial object.
    # `suggest_float(..., log=True)` is the replacement for the deprecated
    # `suggest_loguniform`; the sampled distribution is the same.
    param = {
        'loss_function': trial.suggest_categorical('loss_function', ['Logloss', 'CrossEntropy']),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e0, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 1e0, log=True),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        'depth': trial.suggest_int('depth', 6, 10),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),
    }

    catboost = CatBoostClassifier(**param) #, cat_features=['PR Seq', 'RT Seq'])
    # training stops early when the eval-set metric has not improved for 100 rounds
    catboost.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=False, early_stopping_rounds=100)
    # on test partition of train data
    y_pred = catboost.predict(x_test)

    return accuracy_score(y_true=y_test, y_pred=y_pred)

# Set to True to (re-)run the hyperparameter search; it is skipped by default.
optune = False
if optune:
    # 3. Create a study object and optimize the objective function.
    # TPE is optuna's default sampler; seeding it explicitly makes the sequence
    # of suggested hyperparameters repeatable between runs.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    # the search ends after 10 trials or 60 seconds, whichever comes first
    study.optimize(objective, n_trials=10, timeout=60)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2022-05-11 10:19:44,946][0m A new study created in memory with name: no-name-ae9c7fc1-620c-433d-8eb2-b2aedc96e812[0m
[32m[I 2022-05-11 10:19:46,157][0m Trial 0 finished with value: 0.8152173913043478 and parameters: {'loss_function': 'Logloss', 'learning_rate': 0.00932882418626806, 'l2_leaf_reg': 0.9513388943642612, 'colsample_bylevel': 0.0869742928063322, 'depth': 6, 'boosting_type': 'Plain', 'min_data_in_leaf': 14, 'one_hot_max_size': 19}. Best is trial 0 with value: 0.8152173913043478.[0m
[32m[I 2022-05-11 10:19:47,439][0m Trial 1 finished with value: 0.8152173913043478 and parameters: {'loss_function': 'CrossEntropy', 'learning_rate': 0.0007359086224772867, 'l2_leaf_reg': 0.21591626812425035, 'colsample_bylevel': 0.03215775751722875, 'depth': 10, 'boosting_type': 'Plain', 'min_data_in_leaf': 15, 'one_hot_max_size': 17}. Best is trial 0 with value: 0.8152173913043478.[0m
[32m[I 2022-05-11 10:19:50,438][0m Trial 2 finished with value: 0.8152173913043478 and paramete

Number of finished trials: 10
Best trial:
  Value: 0.8152173913043478
  Params: 
    loss_function: Logloss
    learning_rate: 0.00932882418626806
    l2_leaf_reg: 0.9513388943642612
    colsample_bylevel: 0.0869742928063322
    depth: 6
    boosting_type: Plain
    min_data_in_leaf: 14
    one_hot_max_size: 19


In [17]:
# optuna.visualization.plot_param_importances(study)
# optuna.visualization.plot_optimization_history(study)
# optuna.visualization.plot_slice(study, params=['depth', 'learning_rate'])

In [11]:
# some responses hold an 'H' instead of 0/1; show only the rows that do not
test_data.loc[test_data['Resp'] != 'H']

Unnamed: 0,PatientID,Resp,VL-t0,CD4-t0
0,1,1,5.60,69
1,2,0,5.30,119
2,3,1,5.70,41
3,4,0,5.20,48
4,5,1,5.50,311
...,...,...,...,...
687,688,1,5.30,366
688,689,1,4.90,151
689,690,1,4.90,411
690,691,1,4.50,268


In [13]:
# refit on the complete training set, then score the actual test set
catboost.fit(X, Y, verbose=False)
yt_true = test_data['Resp'].values
Xt = test_data.drop(columns=['PatientID', 'Resp'])
Yt = catboost.predict(Xt)
accuracy_score(y_pred=Yt, y_true=yt_true)

0.5375722543352601

In [None]:
# submissions = pd.DataFrame({
#     'PatientID': np.arange(1, Yt.shape[0] + 1),
#     'ResponderStatus': Yt
# })
# submissions.to_csv('submission.csv', index=False)