# Ordinal logistic regression for phenotypes

In [130]:
import os
import sys
from importlib import reload

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.insert(0,module_path)

import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support,roc_auc_score
from sklearn.model_selection import StratifiedKFold
import sklearn.preprocessing as pp
#alt.renderers.enable('notebook')
from IPython.display import display
from mord import LogisticAT

import stages_DE.stages_library
import importlib
importlib.reload(stages_DE.stages_library)

from networks.functionsDENet import loadPickle,savePickle
from stages_DE.stages_library import PHENOTYPES, PHENOTYPES_X, summary_classification, summary_classification_print_sort, scatter_catgory


## Prepare data

In [5]:
# Select the machine the notebook runs on; both data paths are derived from one root
# directory so that neither branch can leave a path undefined.
proteus=True
if proteus:
    pathData = '/home/khrovatin/timeTrajectoriesNet/data/'
else:
    # NOTE(review): assumes the local data directory mirrors the server layout
    # (i.e. it also contains RPKUM/) - adjust if it does not.
    pathData = '/home/karin/Documents/timeTrajectories/data/'
# Directory where classification results are stored
pathClassification = pathData + 'stages/classification/'
# Directory with the expression and conditions tables
dataPath = pathData + 'RPKUM/'

In [6]:
# Read the expression table (indexed by its first column) and the conditions table
genes = pd.read_csv(dataPath + 'mergedGenes_RPKUM.tsv', sep='\t', index_col=0)
conditions = pd.read_csv(dataPath + 'conditions_mergedGenes.tsv', sep='\t', index_col=None)

# Keep only the samples that have at least one non-zero phenotype annotation
is_annotated = (conditions[PHENOTYPES] != 0).any(axis=1)
conditions_annotated = conditions[is_annotated]
# Select the columns of genes named in Measurment; after transposing, rows of X are samples
X = genes[conditions_annotated.Measurment].T.values

# Drop the target that has too few positive samples
order = PHENOTYPES.copy()
order.remove('tag_spore')
Y = conditions_annotated[order].values

# Discard features that are constant across the retained samples
is_variable = X.std(axis=0) != 0
X = X[:, is_variable]

As ordinal regression works on single-target multi-class data, the phenotype data (multi-target, each target with two classes - yes/no - i.e. multilabel data) must be transformed. For this, a single target feature (1-N) was created and each sample was assigned a single value (1-N). If a sample had multiple annotated phenotypes (n), it was used multiple times (n times), each time with a different assignment.

In [27]:
# Expand the multi-target {0,1} annotation matrix into single-target data:
# every (sample, phenotype) pair annotated with 1 yields one row, so a sample with
# several annotated phenotypes is repeated once per phenotype.
# The label of each row is the value PHENOTYPES_X assigns to that phenotype.
labelled_rows = [
    (X[row, :], PHENOTYPES_X[phenotype_name])
    for row in range(Y.shape[0])
    for column, phenotype_name in enumerate(order)
    if Y[row, column] == 1
]
X_transformed = np.array([features for features, _ in labelled_rows])
Y_transformed = np.array([label for _, label in labelled_rows])

## Train
Data is split using 5-fold cross validation, scaled to [0,1], and used for training/evaluation of ordinal logistic regression.

In [None]:
# Cross-validated training and evaluation of ordinal logistic regression (mord LogisticAT).
# Per-fold metrics are collected as plain records and converted to DataFrames once after
# the loop: DataFrame.append was removed in pandas 2.0 and grew the frames quadratically.
metric_names=['precision','recall','F_score','support']
prfs_records=[]
rac_records=[]
feats_records=[]

X_model=X_transformed.copy()
Y_model=Y_transformed.copy()
order_model=order.copy()
# Class labels listed in the same order as order_model. They are passed explicitly to the
# per-class metrics so that each row is matched to the right phenotype even when a fold
# lacks a class or when the sorted label order differs from order_model.
labels_model=[PHENOTYPES_X[phenotype] for phenotype in order_model]

split = StratifiedKFold(n_splits=5)
fold=0

# Cross validation
for fold, (train_index, test_index) in enumerate(split.split(X_model, Y_model), start=1):
    print(fold)
    scaler = pp.MinMaxScaler()
    # Scale X features to [0,1]; the scaler is fitted on the training fold only and then
    # reused to scale the test fold
    X_train_fold, X_test_fold = X_model[train_index], X_model[test_index]
    Y_train_fold, Y_test_fold = Y_model[train_index], Y_model[test_index]
    X_train_fold=scaler.fit_transform(X_train_fold)
    X_test_fold=scaler.transform(X_test_fold)

    classifier=LogisticAT().fit(X_train_fold,Y_train_fold)

    # Quality metrics for the model
    Y_predict_fold = classifier.predict(X_test_fold)
    Y_p_fold = classifier.predict_proba(X_test_fold)

    # Per-class precision/recall/F score/support, one record per phenotype
    per_class=precision_recall_fscore_support(Y_test_fold, Y_predict_fold, labels=labels_model)
    for group, values in zip(order_model, zip(*per_class)):
        record=dict(zip(metric_names, values))
        record['Group']=group
        prfs_records.append(record)

    # Averaged scores; called as before (without labels) so that the averaged values are
    # unchanged. Support is None for averaged scores.
    for average in ['micro','macro']:
        averaged=list(precision_recall_fscore_support(Y_test_fold, Y_predict_fold, average=average))
        averaged.append(average)
        prfs_records.append(dict(zip(metric_names+['Group'], averaged)))

    rac_records.append({
        'roc_auc':roc_auc_score(Y_test_fold, Y_p_fold,multi_class='ovr',average='weighted'),
        'Group':'weighted_ovr'})
    rac_records.append({
        'roc_auc':roc_auc_score(Y_test_fold, Y_p_fold,multi_class='ovo', average='macro'),
        'Group':'macro_ovo'})

    # N used features in the model (non-zero coefficients)
    feats_records.append({'N_features':(classifier.coef_ !=0).sum(), 'Group':'all'})

# Build each result table once from the collected records
prfs_all=pd.DataFrame(prfs_records, columns=metric_names+['Group'])
rac_all=pd.DataFrame(rac_records, columns=['roc_auc','Group'])
feats_all=pd.DataFrame(feats_records, columns=['N_features','Group'])
savePickle(pathClassification+'logisticOrdinalRegression.pkl',{'prfs':prfs_all,'rac':rac_all,'featsN':feats_all})

## Result

In [126]:
# Load the cross-validation results saved by the training cell
# (a dict with keys 'prfs', 'rac' and 'featsN').
result=loadPickle(pathClassification+'logisticOrdinalRegression.pkl')

In [131]:
# Summarise the 'F_score' column of the saved precision/recall/F score table by 'Group'.
# NOTE(review): summary_classification and summary_classification_print_sort come from
# stages_DE.stages_library; presumably they aggregate across folds and print the sorted
# summary - confirm the details in the library.
summary=summary_classification(result['prfs'],'F_score',"Group",print_df=False)
print('F score summary for ordinal logistic regression')
# Print the averaged ('macro', 'micro') rows and the individual phenotype rows
summary_classification_print_sort(summary,statistic='F score',averages=['macro','micro'],groups=order)
print('\n')

F score summary for ordinal logistic regression
Mean cross validation F score averaged across all phenotypes and standard error
macro       0.23  +- 0.03
micro       0.35  +- 0.04
Mean cross validation F score of individual phenotypes and standard error
no_agg      0.65  +- 0.06
mhat        0.28  +- 0.08
tip         0.27  +- 0.12
tag         0.23  +- 0.04
lag         0.22  +- 0.03
slug        0.16  +- 0.10
cul         0.15  +- 0.09
FB          0.14  +- 0.10
disappear   0.13  +- 0.08
stream      0.04  +- 0.04




## Conclusion
Ordinal regression cannot directly deal with multiple target features that are ordered. This explains the poor performance compared to the OvR classifier. When trying to convert multilabel data into single-label multi-class data, each sample was assigned a single label (with the same sample occurring multiple times with different labels). At prediction time, all repeated samples originating from the same original sample got the same predicted label - the one with the highest probability. Therefore, at most one of the repeated samples could receive the correct label; all the others necessarily got the wrong label. An option to mitigate this would be to use the original (non-repeated) samples in prediction, assigning them all classes with p > X. However, it is not clear how to select X.