In [1]:
import sys
import os
import lzma
import random

In [2]:
import catboost
import pandas
import xgboost as xgb
from sklearn.metrics import precision_recall_curve, auc
from catboost import Pool, CatBoostClassifier

In [3]:
cancer_data_dir = '/home/leron/projects/cancer/data'
dump_dir = os.path.join(cancer_data_dir, 'bcDump/example15bmc')
merged_path = os.path.join(dump_dir, 'ex15bmcMerged.csv.xz')
bmc_all_path = os.path.join(dump_dir, 'bmc15mldata1.csv')

In [4]:
dtype = {'DFS': pandas.Int64Dtype(),
         'pCR': pandas.Int64Dtype(),
         'RFS': pandas.Int64Dtype(), 
         'DFS': pandas.Int64Dtype(), 
         'posOutcome': pandas.Int64Dtype()}

In [5]:
surgery_mapping = dict()

In [6]:
def convert_surgery(x):
    if x not in surgery_mapping:
        surgery_mapping[x] = len(surgery_mapping) + 1
    return surgery_mapping[x]

In [7]:
bmc = pandas.read_csv(bmc_all_path, dtype=dtype, converters=dict(surgery=convert_surgery))
bmc = bmc.sort_values(by='patient_ID')

In [8]:
bmc.dtypes

study         object
patient_ID     int64
radio          int64
surgery        int64
chemo          int64
hormone        int64
pCR            Int64
RFS            Int64
DFS            Int64
posOutcome     Int64
dtype: object

In [9]:
gene_expression = pandas.read_csv(lzma.open(merged_path))

In [10]:
gene_expression.head(5)

Unnamed: 0,patient_ID,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,GCGR,...,ZNF80,ZNF83,ZNF84,ZNF91,ZNHIT2,ZSCAN2,ZXDC,ZYX,ZZEF1,ZZZ3
0,22449,-0.118953,1.180345,0.252643,-0.262987,0.142903,0.167314,0.498846,0.774632,0.104353,...,-1.564143,0.466733,0.827552,-0.617981,0.303161,1.260602,-0.217995,0.219529,0.389849,1.313703
1,22450,0.423693,-0.922374,-1.202192,-0.105451,-0.061571,-0.093231,-0.09555,-0.481403,-0.214238,...,0.711752,0.358388,0.037911,2.304784,0.328942,-1.028791,-0.850002,-0.292574,-0.068982,0.722123
2,22451,-0.239183,-0.733389,0.523791,-0.081958,-0.004635,-0.008094,0.268636,-0.614192,0.027471,...,-0.011786,-0.474762,-0.349981,-0.097197,0.100946,-0.5547,-0.367363,0.094464,-0.372665,-0.790771
3,22452,0.500445,-0.177686,-0.216638,-0.13085,-0.261039,-0.048521,1.479664,-0.10012,0.233178,...,0.757255,0.590212,0.06015,2.287583,-0.108866,-1.1325,-0.106976,-0.216267,0.393671,-0.027349
4,22453,-0.609235,0.259494,-0.071802,0.027963,0.162509,0.112654,-0.239435,0.229737,-0.132271,...,0.407159,0.570637,0.851658,-0.41295,0.105692,-1.047445,0.08448,-0.224081,-0.021074,0.764555


In [11]:
genes_features = gene_expression[gene_expression.patient_ID.isin(bmc.patient_ID)]

In [12]:
genes_features = genes_features.sort_values(by='patient_ID')


In [13]:
feature_columns = genes_features.columns.to_list()[1:] + ['radio', 'surgery', 'chemo', 'hormone']
label_columns = ['pCR', 'RFS', 'DFS', 'posOutcome']
label_columns = ['posOutcome']

In [14]:
merged = pandas.merge(genes_features, bmc, left_on='patient_ID', right_on='patient_ID')

In [15]:
def split():
    for eval_study in set(bmc.study): 
        bmc_train = bmc[bmc.study != eval_study]
        bmc_val = bmc[bmc.study == eval_study]
        assert (not set(bmc_train.patient_ID).intersection(set(bmc_val.patient_ID)))

        train_split = merged[merged.patient_ID.isin(bmc_train.patient_ID)]
        val_split = merged[merged.patient_ID.isin(bmc_val.patient_ID)]
        assert val_split.patient_ID.to_list() == bmc_val.patient_ID.to_list()
        train_data = train_split[feature_columns].to_numpy()
        train_labels = train_split[label_columns].to_numpy().astype(int)
        val_data = val_split[feature_columns].to_numpy()
        val_labels = val_split[label_columns].to_numpy().astype(int)
        yield train_data, train_labels, val_data, val_labels

In [17]:


model = CatBoostClassifier(iterations=140,
                           depth=4,
                           learning_rate=0.5,
                           loss_function='Logloss',
                           verbose=False,
                           l2_leaf_reg=1)

total_xgboost = []
total_catboost = []
for train_data, train_labels, val_data, val_labels in split(): 
    catboost_pool = Pool(train_data, 
                        train_labels)

    test_data = Pool(val_data,
                     val_labels) 
    # train the model
    res = model.fit(train_data, train_labels, 
              eval_set=test_data,
              save_snapshot=False, snapshot_file='vasya')
    res.eval_metrics(catboost_pool, ['F1', 'Recall', 'Precision', 'AUC'])
    test_res = res.eval_metrics(test_data, ['F1', 'Recall', 'Precision', 'AUC'])
    total_catboost.append(max(test_res['AUC'])))
    print('catboost: {0}'.format(total_catboost[-1])
    
    
    clf = xgb.XGBClassifier()
    clf = clf.fit(train_data, train_labels)
    y_pred = clf.predict(val_data)
    precision, recall, thresholds = precision_recall_curve(y_pred, val_labels.flatten())
    total_xgboost.append(auc(recall, precision)))
    print('xgboost {0}'.format(total_xgboost[-1])

catboost: 0.6260775862068966


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


xgboost 0.9800924778966601


KeyboardInterrupt: 