In [58]:
import os
import sys
from importlib import reload

# Put the notebook's parent directory at the front of the import search path
# (only once) so that packages living there can be imported below.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.insert(0, module_path)

import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support,roc_auc_score,auc,roc_curve, precision_recall_curve
from sklearn.model_selection import StratifiedKFold
import sklearn.preprocessing as pp
#alt.renderers.enable('notebook')
from IPython.display import display
from mord import LogisticAT

import stages_DE.stages_library
import importlib
importlib.reload(stages_DE.stages_library)

from networks.functionsDENet import loadPickle,savePickle
from stages_DE.stages_library import PHENOTYPES, PHENOTYPES_X, summary_classification, summary_classification_print_sort, scatter_catgory


In [5]:
# Machine-specific data locations. Set `proteus` to False when running on the
# local workstation instead of the server.
proteus=True
if proteus:
    dataRoot = '/home/khrovatin/timeTrajectoriesNet/data/'
else:
    dataRoot = '/home/karin/Documents/timeTrajectories/data/'

# Both paths are derived from the same root so that neither is left undefined
# when `proteus` is switched off (previously `dataPath` only existed on proteus).
# NOTE(review): the local RPKUM location is inferred from the directory layout
# of the other paths - confirm it exists on the workstation.
pathClassification = dataRoot + 'stages/classification/'
dataPath = dataRoot + 'RPKUM/'

In [6]:
# Read the expression table and the per-measurement annotation table
genes = pd.read_csv(dataPath + 'mergedGenes_RPKUM.tsv', sep='\t', index_col=0)
conditions = pd.read_csv(dataPath + 'conditions_mergedGenes.tsv', sep='\t', index_col=None)

# Keep only the measurements that carry at least one non-zero phenotype annotation
has_annotation = (conditions[PHENOTYPES] != 0).any(axis=1)
Y = conditions[has_annotation]
# Select the matching expression columns and transpose: one row of X per measurement
X = genes[Y.Measurment].T.to_numpy()

# Drop the target that has too few positive samples
order = PHENOTYPES.copy()
order.remove('tag_spore')
Y = Y[order].to_numpy()

# Discard features that are constant across the retained measurements
non_constant = X.std(axis=0) != 0
X = X[:, non_constant]

In [27]:
# Convert the multi-target problem into a single-target one:
#  - a sample is emitted once for every phenotype it is annotated with
#    (so samples with several annotations are repeated),
#  - the {0,1} indicator columns of Y are replaced by the ordered integer code
#    that PHENOTYPES_X assigns to the annotated phenotype.
# Pairs are collected sample by sample, phenotypes in the order given by `order`.
annotated_pairs = [
    (sample_idx, stage)
    for sample_idx in range(Y.shape[0])
    for stage_idx, stage in enumerate(order)
    if Y[sample_idx, stage_idx] == 1
]
X_transformed = np.array([X[sample_idx, :] for sample_idx, _ in annotated_pairs])
Y_transformed = np.array([PHENOTYPES_X[stage] for _, stage in annotated_pairs])

In [59]:
# Ordinal logistic regression (mord LogisticAT) evaluated with stratified 5-fold
# cross validation. Per-class, micro and macro precision/recall/F-score and ROC AUC
# plus the number of used features are collected for every fold and pickled.
#
# NOTE(review): X_transformed repeats a sample once per annotated stage, so copies
# of the same sample can end up in both the training and the test fold - consider
# a group-aware split.
X_model=X_transformed.copy()
Y_model=Y_transformed.copy()
order_model=order.copy()

# Regularisation strength passed to LogisticAT (1.0 is the mord default); it is also
# written to the 'params' column of every result row.
c = 1.0

# Integer class labels present in the data and the phenotype name behind each one.
# Y_model was built from PHENOTYPES_X, so the mapping can simply be inverted.
class_labels = np.unique(Y_model)
label_to_group = {PHENOTYPES_X[phenotype]: phenotype for phenotype in order_model}
groups = [label_to_group[label] for label in class_labels]

split = StratifiedKFold(n_splits=5)

# Result rows are gathered in plain lists and turned into DataFrames once at the end
# (DataFrame.append is quadratic in a loop and no longer exists in pandas 2).
prfs_records = []
rac_records = []
feats_records = []

# Cross validation
for fold, (train_index, test_index) in enumerate(split.split(X_model, Y_model), start=1):
    print(fold)
    X_train_fold, X_test_fold = X_model[train_index], X_model[test_index]
    Y_train_fold, Y_test_fold = Y_model[train_index], Y_model[test_index]

    # Scale X features to [0,1]; the scaler is fitted on the training fold only and
    # then reused for the test fold
    scaler = pp.MinMaxScaler()
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_test_fold = scaler.transform(X_test_fold)

    classifier = LogisticAT(alpha=c).fit(X_train_fold, Y_train_fold)

    # Quality metrics for the model
    Y_predict_fold = classifier.predict(X_test_fold)
    Y_p_fold = classifier.predict_proba(X_test_fold)
    # Probability columns are paired with class_labels by position below
    assert Y_p_fold.shape[1] == len(class_labels), \
        'predict_proba columns do not match the class labels'

    # Per-class scores; `labels` fixes the order so that it lines up with `groups`
    per_class = precision_recall_fscore_support(Y_test_fold, Y_predict_fold, labels=class_labels)
    for group, precision, recall, f_score, support in zip(groups, *per_class):
        prfs_records.append({'precision': precision, 'recall': recall, 'F_score': f_score,
                             'support': support, 'Group': group, 'params': c})
    for average in ['micro', 'macro']:
        precision, recall, f_score, support = precision_recall_fscore_support(
            Y_test_fold, Y_predict_fold, average=average)
        prfs_records.append({'precision': precision, 'recall': recall, 'F_score': f_score,
                             'support': support, 'Group': average, 'params': c})

    # ROC AUC is computed one-vs-rest: the integer labels are expanded into an
    # indicator matrix with one column per class, matching the probability columns
    Y_test_onehot = (Y_test_fold[:, None] == class_labels[None, :]).astype(int)
    per_class_auc = roc_auc_score(Y_test_onehot, Y_p_fold, average=None)
    for group, auc_value in zip(groups, per_class_auc):
        rac_records.append({'roc_auc': auc_value, 'Group': group, 'params': c})
    for average in ['micro', 'macro']:
        rac_records.append({'roc_auc': roc_auc_score(Y_test_onehot, Y_p_fold, average=average),
                            'Group': average, 'params': c})

    # N used features in the model. LogisticAT is a single model with one coefficient
    # vector shared by all stages (it has no per-stage `estimators_`), so only the
    # overall count is recorded.
    n_features_used = int(np.count_nonzero(classifier.coef_))
    feats_records.append({'N_features': n_features_used, 'params': c, 'Group': 'all'})

prfs_all = pd.DataFrame(prfs_records)
rac_all = pd.DataFrame(rac_records)
feats_all = pd.DataFrame(feats_records)
savePickle(pathClassification+'logisticOrdinalRegression.pkl',{'prfs':prfs_all,'rac':rac_all,'featsN':feats_all})

1


KeyboardInterrupt: 

In [60]:
# NOTE(review): this fits the model on the held-out fold left over from the CV loop
# (X_test_fold / Y_test_fold), not on the training fold - presumably a quick manual
# check on the smaller split; confirm before relying on this classifier.
classifier=LogisticAT().fit(X_test_fold,Y_test_fold)

In [63]:
# Predicted class probabilities for the first five rows of the scaled training fold
classifier.predict_proba(X_train_fold[:5,:])

array([[5.49539022e-12, 8.32295123e-12, 6.15387897e-11, 1.57287722e-09,
        2.67192562e-06, 2.21331836e-04, 3.29258868e-01, 6.17198312e-01,
        5.30921426e-02, 2.26671094e-04],
       [3.85478936e-14, 5.83820667e-14, 4.31669203e-13, 1.10330844e-11,
        1.87425049e-08, 1.55290099e-06, 3.43345967e-03, 1.07316221e-01,
        8.57939151e-01, 3.13095969e-02],
       [2.19997922e-13, 3.33194168e-13, 2.46359317e-12, 6.29672701e-11,
        1.06965942e-07, 8.86254476e-06, 1.92832690e-02, 3.96183602e-01,
        5.78892687e-01, 5.63147192e-03],
       [2.19997922e-13, 3.33194168e-13, 2.46359317e-12, 6.29672701e-11,
        1.06965942e-07, 8.86254476e-06, 1.92832690e-02, 3.96183602e-01,
        5.78892687e-01, 5.63147192e-03],
       [9.75086280e-11, 1.47680060e-10, 1.09192663e-09, 2.79086815e-08,
        4.74077585e-05, 3.91239866e-03, 8.93149091e-01, 9.97269321e-02,
        3.15136388e-03, 1.27774602e-05]])

In [62]:
# True labels of the first five training-fold rows
Y_train_fold[:5]

array([8, 8, 8, 9, 8])