## HIV Progression

### Libraries

In [14]:
# import libraries
from helpers.needlemanwunsch import nw

import re
import numpy as np
import pandas as pd
import optuna
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTENC
# from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

### Preprocessing

In [15]:
# read data into memory
train_data = pd.read_csv('./hivprogression/training_data.csv')
test_data = pd.read_csv('./hivprogression/test_data_mod.csv')

# drop 80 rows because of NaNs
train_data.dropna(subset=['PR Seq'], inplace=True)

In [16]:
def split_sequences(data, seq_unit, seq_cutoff):
    pr_splits = pd.DataFrame([re.findall('.'*seq_unit, x) for x in data['PR Seq']])
    pr_splits.columns = [f'pr_{x}' for x in range(0, pr_splits.shape[1])]
    
    rt_splits = pd.DataFrame([re.findall('.'*seq_unit, x) for x in data['RT Seq']])
    rt_splits.columns = [f'rt_{x}' for x in range(0,rt_splits.shape[1])]
    data = pd.concat([data, pr_splits, rt_splits.iloc[:, :seq_cutoff]], axis=1)
    return data.dropna()

# data with RT seq starts to be NaN index 300 onwards
seq_unit = 3 # as of biological nature
seq_cutoff = 60
train_data = split_sequences(train_data, seq_unit=seq_unit, seq_cutoff=seq_cutoff)
test_data = split_sequences(test_data, seq_unit=seq_unit, seq_cutoff=seq_cutoff)

In [17]:
# prepare X and y

# needed when model can not handle categorical data
if False:
    train_data.drop(['PR Seq', 'RT Seq'], axis=1, inplace=True) 
    test_data.drop(['PR Seq', 'RT Seq'], axis=1, inplace=True)

X = train_data.drop(['PatientID', 'Resp'], axis=1)
y = train_data['Resp'].values

In [18]:
# combine train and test data
# X = pd.concat([train_data.drop(['PatientID', 'Resp'], axis=1), test_data.drop(['PatientID', 'Resp'], axis=1)])
# y = pd.concat([pd.DataFrame(train_data['Resp'].values), pd.DataFrame(test_data['Resp'].values)])

In [31]:
print(nw(X['PR Seq'][1][0:150], X['PR Seq'][2][0:150], gap=0))

CCTCAAATCACTCTTTGGCAACGACCCCTCGTCGCAATAAA-GATAGGGGGGCAACT-AAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGACATG-G-AATTGCCAGG-AAGATGGAAACCAAAAAT-AATAGGGGGAATT
CCTCAAATCACTCTTTGGCAACGACCCCTCGTCGCAATAAAGG-TAGGGGGGCAACTAAAA-GAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGACATGAGT--TTGCCAGGAAA-ATGGAAACCAAAAATG-ATAGGGGGAATT


In [None]:
categories = X.select_dtypes(exclude=[np.number]).keys()

# smote for all pipelines
cat_col_index = X.columns.isin(categories) 
smotenc = SMOTENC(categorical_features=cat_col_index, random_state=42)
X, y = smotenc.fit_resample(X, y)

In [None]:
# perform train test split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
x_train.head()

### Modelling

In [None]:
# on test partition of train data

# best found params with optuna
param = {'loss_function': 'CrossEntropy', 'learning_rate': 0.5956840896672528, 'l2_leaf_reg': 0.24629732316062503, 'colsample_bylevel': 0.09221224327044178, 'depth': 6, 'boosting_type': 'Plain', 'min_data_in_leaf': 3, 'one_hot_max_size': 6}

catboost = CatBoostClassifier(**param, verbose=False, cat_features=categories.values)
catboost.fit(x_train, y_train)

y_pred = catboost.predict(x_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
#print(cross_val_score(catboost, X, y, cv=3))

In [None]:
# apply it to the actual test set
# catboost.fit(X, y, verbose=False)
yt_true = test_data['Resp'].values
Xt = test_data.drop(['PatientID', 'Resp'], axis=1)
yt = catboost.predict(Xt)
accuracy_score(y_true=yt_true, y_pred=yt)

In [None]:
def display_top_features(clf, feature_list, n_features):
    # Get numerical feature importances
    importances = list(clf.feature_importances_)
    # List of tuples with features and respective importance
    clf_feature_importance = [
        (feature, round(importance, 5))
        for feature, importance in zip(feature_list, importances)
    ]
    # Sort the feature importances by most important first
    clf_feature_importance = sorted(
        clf_feature_importance, key=lambda x: x[1], reverse=True
    )

    # Print out the feature and importances
    [
        print("Variable: {:20} Importance: {}".format(*pair))
        for pair in clf_feature_importance[:n_features]
    ]
    
    return clf_feature_importance

display_top_features(catboost, x_train.columns, 15)

seq cutoff at 60

### seq_unit=1: (85% / 57%)
Variable: VL-t0                Importance: 13.86521
Variable: pr_5                 Importance: 10.39549
Variable: pr_27                Importance: 5.6937
Variable: pr_159               Importance: 3.41444
Variable: pr_109               Importance: 3.07486
Variable: CD4-t0               Importance: 2.74411

### seq_unit=3: (85% / 55%)
Variable: VL-t0                Importance: 9.28557
Variable: pr_9                 Importance: 7.53769
Variable: pr_1                 Importance: 7.27599
Variable: CD4-t0               Importance: 4.46829
Variable: pr_36                Importance: 3.19629
Variable: rt_59                Importance: 2.69794

### seq_unit=6: (83% / 55%)
Variable: VL-t0                Importance: 7.3187
Variable: pr_4                 Importance: 5.75921
Variable: pr_0                 Importance: 5.32162
Variable: pr_18                Importance: 3.63693
Variable: pr_32                Importance: 2.94053

### seq_unit=9 (87% / 53%)
Variable: VL-t0                Importance: 6.30741
Variable: pr_3                 Importance: 4.18024
Variable: pr_0                 Importance: 3.93211
Variable: rt_34                Importance: 3.06481
Variable: pr_12                Importance: 2.78733
Variable: pr_15                Importance: 2.54252

### seq_unit=12 (86% / 53%)
Variable: VL-t0                Importance: 7.67306
Variable: pr_0                 Importance: 6.37918
Variable: pr_23                Importance: 4.42358
Variable: rt_7                 Importance: 3.65446
Variable: rt_25                Importance: 3.19963
Variable: pr_20                Importance: 3.12038

### seq_unit=15 (87% / 52%)
Variable: VL-t0                Importance: 8.17894
Variable: pr_0                 Importance: 5.20007
Variable: rt_8                 Importance: 4.47401
Variable: pr_9                 Importance: 4.14473
Variable: rt_25                Importance: 3.44481
Variable: CD4-t0               Importance: 3.25052

### Misc and Tuning

seq cutoff at 300

### seq_unit=15: (55% / 59%)
Variable: rt_44                Importance: 18.45517
Variable: rt_60                Importance: 10.64342
Variable: pr_6                 Importance: 7.49245

### seq_unit=12:
Variable: rt_74                Importance: 23.79079
Variable: VL-t0                Importance: 23.77396
Variable: rt_36                Importance: 18.20179
Variable: rt_77                Importance: 18.20108

### seq_unit=9:
Variable: rt_70                Importance: 34.26852
Variable: rt_138               Importance: 33.70472
Variable: rt_91                Importance: 11.2332

### seq_unit=6:
Variable: pr_1                 Importance: 9.74413
Variable: VL-t0                Importance: 9.33733
Variable: pr_89                Importance: 4.40009
Variable: CD4-t0               Importance: 2.76098
Variable: rt_248               Importance: 1.8484

### seq_unit=3:
Variable: pr_1                 Importance: 9.74413
Variable: VL-t0                Importance: 9.33733
Variable: pr_89                Importance: 4.40009
Variable: CD4-t0               Importance: 2.76098
Variable: rt_248               Importance: 1.8484

### seq_unit=2:
Variable: VL-t0                Importance: 9.67627
Variable: pr_2                 Importance: 8.45288
Variable: CD4-t0               Importance: 6.85079
Variable: rt_274               Importance: 3.6759
Variable: pr_13                Importance: 3.59485

### seq_unit=1: (80% / 53%)
Variable: pr_5                 Importance: 16.89232
Variable: VL-t0                Importance: 12.00046
Variable: pr_27                Importance: 3.24595
Variable: rt_291               Importance: 2.66615
Variable: pr_211               Importance: 1.7225 

In [None]:
# submissions = pd.DataFrame({
#     'PatientID': np.arange(1,yt_pred.shape[0]+1),
#     'ResponderStatus': yt_pred
# })
# submissions.to_csv('submission.csv', index=False)

In [None]:
# 1. Define an objective function to be maximized.
def objective(trial):
    
    # 2. Suggest values of the hyperparameters using a trial object.
    param = {
        'loss_function': trial.suggest_categorical('loss_function', ['Logloss', 'CrossEntropy']),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 1e0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-2, 1e0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        'depth': trial.suggest_int('depth', 6, 10),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),  
    }
    
    catboost = CatBoostClassifier(**param) #, cat_features=['PR Seq', 'RT Seq'])
    catboost.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=False, early_stopping_rounds=100)
    # on test partition of train data
    y_pred = catboost.predict(x_test)
    
    return accuracy_score(y_true=y_test, y_pred=y_pred)

optune = False
if optune:
    # 3. Create a study object and optimize the objective function.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=10, timeout=60)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
        
# optuna.visualization.plot_param_importances(study)
# optuna.visualization.plot_optimization_history(study)
# optuna.visualization.plot_slice(study, params=['depth', 'learning_rate'])