In [35]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [36]:
# Read the labelled training set and the modified test set from the local data folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './hivprogression/training_data.csv',
        './hivprogression/test_data_mod.csv',
    )
)

In [37]:
# Peek at the first five rows of the test frame.
test_data.head(n=5)

Unnamed: 0,PatientID,Resp,PR Seq,RT Seq,VL-t0,CD4-t0
0,1,1,NCTCTATTAGATACAGGAGCAGATGACACAGTATTAGAAGARATGG...,CCTATTAGTCCTATTGAAACTGTACCAGTRAAATTAAAGCCAGGAA...,5.6,69
1,2,0,NCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGA...,CCCATCAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.3,119
2,3,1,GGGCAAATAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.7,41
3,4,0,GGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAG...,CCTATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.2,48
4,5,1,GGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATA...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.5,311


In [38]:
# Missing-value counts, one per line, in the order:
# train PR Seq, train RT Seq, test PR Seq, test RT Seq.
print(
    *(
        frame[column].isna().sum()
        for frame in (train_data, test_data)
        for column in ('PR Seq', 'RT Seq')
    ),
    sep='\n',
)

80
0
0
0


In [39]:
# Discard the training rows whose 'PR Seq' is NaN (80 of them, per the count above).
train_data = train_data.dropna(subset=['PR Seq'])

In [40]:

# Features are every column except the patient identifier and the label.
X = train_data.drop(columns=['PatientID', 'Resp'])
# Label vector taken from the 'Resp' column.
Y = train_data['Resp'].values

# Hold out 30% of the training rows for validation; fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [41]:
# Show the last five rows of the training partition.
x_train.tail(n=5)

Unnamed: 0,PR Seq,RT Seq,VL-t0,CD4-t0
106,CCTCAAATCACTCTTTGGCAGCGACCCATTGTTACAGTAAAGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.4,319
270,CCTCAGATCACTCTTTGGCAACGACCCCGCGTCACAGTAARGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.4,56
860,CCTCARATCACTCTTTGGCAACGACCCCTCGTCACAATAAGGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,4.2,496
435,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,CCCATAAGTCCTATTGAAACTGTACCAGTAAAATTRAAGCCAGGAA...,3.9,327
102,CCTCAAATCACTCTTTGGCAAMGACCCCTCGTCACAATAAAGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,4.7,546


In [42]:
# Baseline model: both sequence columns are passed to CatBoost as categorical features.
catboost = CatBoostClassifier(
    verbose=False,
    cat_features=['PR Seq', 'RT Seq'],
)
catboost.fit(x_train, y_train)

# Accuracy on the held-out partition of the training data.
y_pred = catboost.predict(x_test)
accuracy_score(y_test, y_pred)

0.8043478260869565

In [47]:
import optuna

# 1. Define an objective function to be maximized.
def objective(trial):
    """Optuna objective: validation accuracy of CatBoost for a sampled tree depth.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``
    produced by the train/validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy on the held-out partition of the training data (to be maximized).
    """
    # 2. Suggest values of the hyperparameters using a trial object.
    # Tree depth is an integer parameter, so sample it with suggest_int
    # (both bounds inclusive) rather than suggest_float.
    param = {
        'depth': trial.suggest_int('depth', 6, 10)
    }

    # Pass the sampled values to the model; otherwise every trial would train
    # the same default configuration and return the same score.
    model = CatBoostClassifier(
        cat_features=['PR Seq', 'RT Seq'],
        verbose=False,
        **param,
    )
    model.fit(x_train, y_train)

    # on test partition of train data
    y_pred = model.predict(x_test)
    return accuracy_score(y_true=y_test, y_pred=y_pred)

In [43]:
# The response column is reported to contain an 'H' in place of a 0/1 label;
# display the rows whose response is not 'H'.
test_data.loc[test_data['Resp'].ne('H')]

Unnamed: 0,PatientID,Resp,PR Seq,RT Seq,VL-t0,CD4-t0
0,1,1,NCTCTATTAGATACAGGAGCAGATGACACAGTATTAGAAGARATGG...,CCTATTAGTCCTATTGAAACTGTACCAGTRAAATTAAAGCCAGGAA...,5.60,69
1,2,0,NCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGA...,CCCATCAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.30,119
2,3,1,GGGCAAATAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.70,41
3,4,0,GGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAG...,CCTATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.20,48
4,5,1,GGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATA...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.50,311
...,...,...,...,...,...,...
687,688,1,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.30,366
688,689,1,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,4.90,151
689,690,1,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,CCTATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,4.90,411
690,691,1,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,4.50,268


In [44]:
# Score the already-fitted model on the actual test file.
# (Refitting on the full training data is left disabled: catboost.fit(X, Y))
# NOTE(review): the note on the filtering cell says 'Resp' may contain 'H';
# confirm the label dtype matches the predictions, since this accuracy is far
# below the hold-out score.
Xt = test_data.drop(columns=['PatientID', 'Resp'])
yt_true = test_data['Resp'].values
yt_pred = catboost.predict(Xt)
accuracy_score(yt_true, yt_pred)

0.5303468208092486

In [45]:
# submissions = pd.DataFrame({
#     'PatientID': np.arange(1,yt_pred.shape[0]+1),
#     'ResponderStatus': yt_pred
# })
# submissions.to_csv('submission.csv', index=False)