In [2]:
import sys
import os
import lzma
import random
from collections import defaultdict

In [3]:
import numpy
import pandas

In [None]:
import xgboost as xgb

In [5]:
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import f1_score

In [None]:
from catboost import Pool, CatBoostClassifier
import catboost

In [4]:
cancer_data_dir = '/home/leron/projects/cancer/data'
dump_dir = os.path.join(cancer_data_dir, 'bcDump/example15bmc')
merged_path = os.path.join(dump_dir, 'ex15bmcMerged.csv.xz')
bmc_all_path = os.path.join(dump_dir, 'bmc15mldata1.csv')

In [5]:
dtype = {'DFS': pandas.Int64Dtype(),
         'pCR': pandas.Int64Dtype(),
         'RFS': pandas.Int64Dtype(), 
         'DFS': pandas.Int64Dtype(), 
         'posOutcome': pandas.Int64Dtype()}

In [6]:
surgery_mapping = dict()

In [7]:
def convert_surgery(x):
    if x not in surgery_mapping:
        surgery_mapping[x] = len(surgery_mapping) + 1
    return surgery_mapping[x]

In [8]:
bmc = pandas.read_csv(bmc_all_path, dtype=dtype, converters=dict(surgery=convert_surgery))
bmc = bmc.sort_values(by='patient_ID')

In [11]:
gene_expression = pandas.read_csv(lzma.open(merged_path))

In [12]:
gene_expression.head(5)

Unnamed: 0,patient_ID,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,GCGR,...,ZNF80,ZNF83,ZNF84,ZNF91,ZNHIT2,ZSCAN2,ZXDC,ZYX,ZZEF1,ZZZ3
0,22449,-0.118953,1.180345,0.252643,-0.262987,0.142903,0.167314,0.498846,0.774632,0.104353,...,-1.564143,0.466733,0.827552,-0.617981,0.303161,1.260602,-0.217995,0.219529,0.389849,1.313703
1,22450,0.423693,-0.922374,-1.202192,-0.105451,-0.061571,-0.093231,-0.09555,-0.481403,-0.214238,...,0.711752,0.358388,0.037911,2.304784,0.328942,-1.028791,-0.850002,-0.292574,-0.068982,0.722123
2,22451,-0.239183,-0.733389,0.523791,-0.081958,-0.004635,-0.008094,0.268636,-0.614192,0.027471,...,-0.011786,-0.474762,-0.349981,-0.097197,0.100946,-0.5547,-0.367363,0.094464,-0.372665,-0.790771
3,22452,0.500445,-0.177686,-0.216638,-0.13085,-0.261039,-0.048521,1.479664,-0.10012,0.233178,...,0.757255,0.590212,0.06015,2.287583,-0.108866,-1.1325,-0.106976,-0.216267,0.393671,-0.027349
4,22453,-0.609235,0.259494,-0.071802,0.027963,0.162509,0.112654,-0.239435,0.229737,-0.132271,...,0.407159,0.570637,0.851658,-0.41295,0.105692,-1.047445,0.08448,-0.224081,-0.021074,0.764555


In [13]:
genes_features = gene_expression[gene_expression.patient_ID.isin(bmc.patient_ID)]

In [14]:
genes_features = genes_features.sort_values(by='patient_ID')

In [15]:
feature_columns = genes_features.columns.to_list()[1:] + ['radio', 'surgery', 'chemo', 'hormone']
label_columns = ['pCR', 'RFS', 'DFS', 'posOutcome']
label_columns = ['posOutcome']
feature_columns = ['radio', 'surgery', 'chemo', 'hormone']

In [16]:
merged = pandas.merge(genes_features, bmc, left_on='patient_ID', right_on='patient_ID')

In [17]:
def split(study_name=None):
    for eval_study in set(bmc.study):
        if study_name:
            eval_study = study_name
        print(eval_study)
        bmc_train = bmc[bmc.study != eval_study]
        bmc_val = bmc[bmc.study == eval_study]
        assert (not set(bmc_train.patient_ID).intersection(set(bmc_val.patient_ID)))

        train_split = merged[merged.patient_ID.isin(bmc_train.patient_ID)]
        val_split = merged[merged.patient_ID.isin(bmc_val.patient_ID)]
        assert val_split.patient_ID.to_list() == bmc_val.patient_ID.to_list()
        train_data = train_split[feature_columns].to_numpy()
        train_labels = train_split[label_columns].to_numpy().astype(int)
        val_data = val_split[feature_columns].to_numpy()
        val_labels = val_split[label_columns].to_numpy().astype(int)
        yield train_data, train_labels, val_data, val_labels
        if study_name:
            break

In [18]:
def compute_metrics(result, y_true, y_pred, x_true, x_pred):
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    result['f1'].append(f1)
    result['auc'].append(auc(recall, precision))
    result['recall'].append(recall[1])
    result['precision'].append(precision[1])
    result['train_f1'].append(f1_score(x_true, x_pred))

In [17]:
model = CatBoostClassifier(iterations=150,
                           depth=5,
                           learning_rate=0.025,
                           loss_function='Logloss',
                           model_size_reg=2,
                           verbose=False,
                           scale_pos_weight=0.605,
                           l2_leaf_reg=1)

#total_xgboost = defaultdict(list)
total_catboost = defaultdict(list)

for train_data, train_labels, val_data, val_labels in split(): 
    catboost_pool = Pool(train_data, 
                        train_labels)

    test_data = Pool(val_data,
                     val_labels) 
    # train the model
    clf = model.fit(train_data, train_labels, 
              eval_set=test_data,
              save_snapshot=False, snapshot_file='vasya')
    y_pred = clf.predict(val_data)
    x_pred = clf.predict(train_data)
    compute_metrics(total_catboost, val_labels.flatten(), y_pred, train_labels, x_pred)
    for key in total_catboost:
        print('catboost  {0}: {1}'.format(key, total_catboost[key][-1]))
    
    
    #clf = xgb.XGBClassifier()
    #clf = clf.fit(train_data, train_labels)
    #y_pred = clf.predict(val_data)
    #x_pred = clf.predict(train_data)
    #compute_metrics(total_xgboost, val_labels.flatten(), y_pred, train_labels, x_pred)
    #for key in total_xgboost:
    #    print('xgboost  {0}: {1}'.format(key, total_xgboost[key][-1]))

study_17705_GPL96_JBI_Tissue_BC_Tamoxifen-bmc15


KeyboardInterrupt: 

In [None]:
for key in total_xgboost:
    print('{0}: {1}'.format(key, numpy.mean(total_xgboost[key])))

In [None]:
for key in total_catboost:
    print('{0}: {1}'.format(key, numpy.mean(total_catboost[key])))

In [18]:
res = defaultdict(list)
model = CatBoostClassifier(iterations=150,
                           depth=5,
                           learning_rate=0.025,
                           loss_function='Logloss',
                           model_size_reg=2,
                           verbose=True,
                           scale_pos_weight=0.605,
                           l2_leaf_reg=1)
train_data, train_labels, val_data, val_labels = next(split('study_16446_GPL570_all-bmc15'))
catboost_pool = Pool(train_data, 
                    train_labels)

test_data = Pool(val_data,
                 val_labels) 
# train the model
clf = model.fit(train_data, train_labels, 
          eval_set=test_data,
          save_snapshot=False, snapshot_file='vasya')
y_pred = clf.predict(val_data)
x_pred = clf.predict(train_data)
compute_metrics(res, val_labels.flatten(), y_pred, train_labels, x_pred)
res

study_16446_GPL570_all-bmc15
0:	learn: 0.6875831	test: 0.6977231	best: 0.6977231 (0)	total: 1.07s	remaining: 2m 39s
1:	learn: 0.6832383	test: 0.7021462	best: 0.6977231 (0)	total: 1.85s	remaining: 2m 17s
2:	learn: 0.6784103	test: 0.7018743	best: 0.6977231 (0)	total: 2.65s	remaining: 2m 9s
3:	learn: 0.6738339	test: 0.7031812	best: 0.6977231 (0)	total: 3.49s	remaining: 2m 7s
4:	learn: 0.6689830	test: 0.7014024	best: 0.6977231 (0)	total: 4.27s	remaining: 2m 3s
5:	learn: 0.6651231	test: 0.6992907	best: 0.6977231 (0)	total: 5.05s	remaining: 2m 1s
6:	learn: 0.6603499	test: 0.6975495	best: 0.6975495 (6)	total: 5.82s	remaining: 1m 58s
7:	learn: 0.6559719	test: 0.7013024	best: 0.6975495 (6)	total: 6.6s	remaining: 1m 57s
8:	learn: 0.6516918	test: 0.6985079	best: 0.6975495 (6)	total: 7.36s	remaining: 1m 55s
9:	learn: 0.6476093	test: 0.6971920	best: 0.6971920 (9)	total: 8.14s	remaining: 1m 53s
10:	learn: 0.6439463	test: 0.6943995	best: 0.6943995 (10)	total: 9s	remaining: 1m 53s
11:	learn: 0.6408989

93:	learn: 0.5042675	test: 0.6583167	best: 0.6533143 (81)	total: 1m 18s	remaining: 46.6s
94:	learn: 0.5032890	test: 0.6579454	best: 0.6533143 (81)	total: 1m 19s	remaining: 45.8s
95:	learn: 0.5023024	test: 0.6575665	best: 0.6533143 (81)	total: 1m 19s	remaining: 44.9s
96:	learn: 0.5016706	test: 0.6570660	best: 0.6533143 (81)	total: 1m 20s	remaining: 44.1s
97:	learn: 0.5005017	test: 0.6578499	best: 0.6533143 (81)	total: 1m 21s	remaining: 43.2s
98:	learn: 0.4989630	test: 0.6592116	best: 0.6533143 (81)	total: 1m 22s	remaining: 42.4s
99:	learn: 0.4980987	test: 0.6589632	best: 0.6533143 (81)	total: 1m 23s	remaining: 41.6s
100:	learn: 0.4972622	test: 0.6590636	best: 0.6533143 (81)	total: 1m 24s	remaining: 40.8s
101:	learn: 0.4964582	test: 0.6592480	best: 0.6533143 (81)	total: 1m 24s	remaining: 39.9s
102:	learn: 0.4954886	test: 0.6589985	best: 0.6533143 (81)	total: 1m 25s	remaining: 39.1s
103:	learn: 0.4947223	test: 0.6583046	best: 0.6533143 (81)	total: 1m 26s	remaining: 38.2s
104:	learn: 0.493

defaultdict(list,
            {'f1': [0.8541666666666666],
             'auc': [0.8894341641165202],
             'recall': [0.9213483146067416],
             'precision': [0.7961165048543689],
             'train_f1': [0.8671698113207548]})

In [19]:
from opencog.atomspace import AtomSpace
from opencog.pymoses import moses
from opencog.scheme_wrapper import scheme_eval

In [20]:
train_data, train_labels, val_data, val_labels = next(split('study_16446_GPL570_all-bmc15'))

study_16446_GPL570_all-bmc15


In [42]:
input_data = numpy.concatenate([train_labels, train_data], axis=1)

In [45]:
input_data[:,[0, 2]] = input_data[:,[2,0]]

dtype('int64')

In [43]:
mos = moses()

In [46]:
output = mos.run(input=input_data, python=True, args='--balance=1 -m 100000')

  """Entry point for launching an IPython kernel.


MosesException: Error: exception occurred calling C++ MOSES.

In [41]:
output[0].program

b'#!/usr/bin/env python\n\n#score: -814\nimport operator as op\nfrom functools import reduce\nfrom math import log, exp, sin\ndef l0(i): return 0 < i\ndef adds(*args): return sum(args)\ndef muls(*args): return reduce(op.mul, args)\ndef pdiv(a, b): return a / (b + 0.000001)\ndef moses_eval(i):\n    return True \n'

In [58]:
mos = moses()
input_data = [[0, 0, 0], [1, 1, 0], [1, 0, 1], [2, 1, 1]]
output = mos.run(input=input_data, python=True)
print (output[0].score) # Prints: 0
model = output[0].eval
print(model([0, 1]))  # Returns: True
print(model([1, 1]))  # Returns: False

0


SyntaxError: invalid syntax (<string>, line 12)