## HIV Progression

### Libraries

In [1]:
# import libraries
import re
import numpy as np
import pandas as pd
import optuna
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

  from pandas import MultiIndex, Int64Index


### Preprocessing

In [2]:
# read data into memory
train_data = pd.read_csv('./hivprogression/training_data.csv')
test_data = pd.read_csv('./hivprogression/test_data_mod.csv')

# drop 80 rows because of NaNs
train_data.dropna(subset=['PR Seq'], inplace=True)

In [3]:
PR_splits = pd.DataFrame([re.findall('...', x) for x in train_data['PR Seq']])
PR_splits.columns = [f'PR_{x}' for x in range(0,PR_splits.shape[1])]
RT_splits = pd.DataFrame([re.findall('...', x) for x in train_data['RT Seq']])
RT_splits.columns = [f'RT_{x}' for x in range(0,RT_splits.shape[1])]
train_data = pd.concat([train_data, PR_splits, RT_splits.iloc[:, :300]], axis=1)
train_data.dropna(inplace=True)


In [4]:

PR_splits = pd.DataFrame([re.findall('...', x) for x in test_data['PR Seq']])
PR_splits.columns = [f'PR_{x}' for x in range(0,PR_splits.shape[1])]
RT_splits = pd.DataFrame([re.findall('...', x) for x in test_data['RT Seq']])
RT_splits.columns = [f'RT_{x}' for x in range(0,RT_splits.shape[1])]
test_data = pd.concat([test_data, PR_splits, RT_splits.iloc[:, :300]], axis=1)
test_data.dropna(inplace=True)

In [5]:
# prepare X and y

# needed when model can not handle categorical data
if True:
    train_data.drop(['VL-t0', 'CD4-t0', 'PR Seq', 'RT Seq'], axis=1, inplace=True)
    test_data.drop(['VL-t0', 'CD4-t0', 'PR Seq', 'RT Seq'], axis=1, inplace=True)

X = train_data.drop(['PatientID', 'Resp'], axis=1)
y = train_data['Resp'].values

In [6]:
categories = X.select_dtypes(exclude=[np.number]).keys()

In [7]:
# perform train test split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [8]:
x_train.head()

Unnamed: 0,PR_0,PR_1,PR_2,PR_3,PR_4,PR_5,PR_6,PR_7,PR_8,PR_9,...,RT_290,RT_291,RT_292,RT_293,RT_294,RT_295,RT_296,RT_297,RT_298,RT_299
456,CCT,CAG,ATC,ACT,CTT,TGG,CAA,CGA,CCC,CTC,...,GAA,ATA,GTA,CCA,CTA,ACA,GAA,GAA,GCA,GAG
175,CCT,CAG,ATC,ACT,CTT,TGG,CAA,CGA,CCC,ATC,...,GAA,GTA,GTA,CCT,TTA,ACA,GAA,GAA,GCA,GAG
89,CCT,CAA,ATC,ACT,CTT,TGG,CAA,CGA,CCC,TTC,...,GAA,GTA,ATA,CCA,CTA,ACA,GCA,GAA,GCA,GAG
883,CCT,CAR,ATC,ACT,CTT,TGG,CAA,CGA,CCC,MTC,...,GAA,GTA,ATA,CCT,CTA,ACA,AAA,GAA,GSA,GAG
461,CCT,CAG,ATC,ACT,CTT,TGG,CAA,CGA,CCC,CTC,...,GAA,GTA,GTA,CCA,CTA,ACA,GAA,GAA,GCA,GAG


### Modelling

In [9]:
# best found params with optuna
param = {'loss_function': 'CrossEntropy', 'learning_rate': 0.5956840896672528, 'l2_leaf_reg': 0.24629732316062503, 'colsample_bylevel': 0.09221224327044178, 'depth': 6, 'boosting_type': 'Plain', 'min_data_in_leaf': 3, 'one_hot_max_size': 6}

catboost = CatBoostClassifier(**param, verbose=False, cat_features=categories.values)
catboost.fit(x_train, y_train)

# on test partition of train data
y_pred = catboost.predict(x_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
#print(cross_val_score(catboost, X, y, cv=3))

0.7513513513513513


In [10]:
# apply it to the actual test set
# catboost.fit(X, y, verbose=False)
yt_true = test_data['Resp'].values
Xt = test_data.drop(['PatientID', 'Resp'], axis=1)
yt = catboost.predict(Xt)
accuracy_score(y_true=yt_true, y_pred=yt)

0.5119047619047619

### Misc and Tuning

In [None]:
# submissions = pd.DataFrame({
#     'PatientID': np.arange(1,yt_pred.shape[0]+1),
#     'ResponderStatus': yt_pred
# })
# submissions.to_csv('submission.csv', index=False)

In [None]:
# 1. Define an objective function to be maximized.
def objective(trial):
    
    # 2. Suggest values of the hyperparameters using a trial object.
    param = {
        'loss_function': trial.suggest_categorical('loss_function', ['Logloss', 'CrossEntropy']),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 1e0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-2, 1e0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        'depth': trial.suggest_int('depth', 6, 10),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),  
    }
    
    catboost = CatBoostClassifier(**param) #, cat_features=['PR Seq', 'RT Seq'])
    catboost.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=False, early_stopping_rounds=100)
    # on test partition of train data
    y_pred = catboost.predict(x_test)
    
    return accuracy_score(y_true=y_test, y_pred=y_pred)

optune = False
if optune:
    # 3. Create a study object and optimize the objective function.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=10, timeout=60)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
        
# optuna.visualization.plot_param_importances(study)
# optuna.visualization.plot_optimization_history(study)
# optuna.visualization.plot_slice(study, params=['depth', 'learning_rate'])