# Phenotypic classification based on gene expression
Below are classification models for assignment of multiple phenotypes to expression samples (multi-label classification).

In [1]:
import os
import sys
from importlib import reload

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.insert(0,module_path)


import pandas as pd
import numpy as np
from skmultilearn.problem_transform import ClassifierChain, BinaryRelevance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support,roc_auc_score
from skmultilearn.model_selection import IterativeStratification
import sklearn.preprocessing as pp
import altair as alt
#alt.renderers.enable('notebook')
from IPython.display import display

from networks.functionsDENet import loadPickle,savePickle
from stages_DE.stages_library import PHENOTYPES

In [2]:
# Pick the directory holding the classification data for the machine in use:
# the first path is used when `proteus` is True, the second otherwise.
proteus = True
pathClassification = (
    '/home/khrovatin/timeTrajectoriesNet/data/stages/classification/'
    if proteus
    else '/home/karin/Documents/timeTrajectories/data/stages/classification/'
)

## Training models

Data was previously split into a training set and a test set (10%). For the cross validation below, only the training set is used. The split preserves the proportions of individual labels in the training and test sets. 

Phenotype tag_spore was not modeled as it has only 3 samples. Nevertheless, its data is still used in modeling other target features (all labels are 0).

In [3]:
# `order` is used further down to label the per-target (Stage) results.
order = loadPickle(pathClassification + 'target_order.pkl')
# The pickle stores the four parts of the pre-computed train/test split.
(X_train,
 Y_train,
 X_test,
 Y_test) = loadPickle(pathClassification + 'train_test.pkl')

Classification is based on logistic regression. To create multi-label models, problem transformation techniques are used: classifier chains and binary relevance. A classifier chain produces a model for each target in order, using the original features together with the labels of previously modeled targets as features. Binary relevance constructs a separate model for each target and then predicts the final label set as the union of predictions from the target-specific models. 

Features are selected with l1 regularisation. Five fold cross validation is used to compare results of different regularisation parameters (C), using the training set. 

In [None]:
# Five-fold cross validation of binary-relevance logistic regression (l1 penalty)
# over a grid of regularisation strengths C, using the training set only.
# For every fold and C the cell records:
#   * precision / recall / F score / support per target and micro-averaged ('all'),
#   * micro-averaged ROC AUC,
#   * number of features with non-zero coefficients per target and in their union ('all').
# Results are collected as plain records and turned into DataFrames once at the end;
# DataFrame.append (removed in pandas 2.0) is no longer used.

# Sample weights do not work in multilabel models. Thus oversample training data instances instead.
OVERSAMPLE = True
C_VALUES = [10, 5, 2, 0.9, 0.6, 0.3, 0.1]
PRFS_COLUMNS = ['precision', 'recall', 'F_score', 'support', 'Stage', 'C']

prfs_records = []
rac_records = []
feats_records = []
n_features = X_train.shape[1]

split = IterativeStratification(n_splits=5, order=1)
for fold, (train_index, test_index) in enumerate(split.split(X_train, Y_train), start=1):
    print(fold)
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    Y_train_fold, Y_test_fold = Y_train[train_index], Y_train[test_index]

    # The scaler is fitted on the training part of the fold only and then applied to the held-out part.
    scaler = pp.MinMaxScaler()
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_test_fold = scaler.transform(X_test_fold)

    if OVERSAMPLE:
        # Repeat each training row (max_labels + 1 - n_labels) times, so that samples with
        # fewer labels are repeated more often. Samples without any label are treated as if
        # they had the maximal number of labels, i.e. they are kept exactly once.
        # NOTE(review): assumes X/Y folds are dense numpy arrays - confirm against train_test.pkl.
        n_labels = np.asarray(Y_train_fold.sum(axis=1)).ravel()
        n_labels[n_labels == 0] = n_labels.max()
        repeats = ((n_labels.max() + 1) - n_labels).astype(int)
        X_train_fold = np.repeat(X_train_fold, repeats, axis=0)
        Y_train_fold = np.repeat(Y_train_fold, repeats, axis=0)

    for c in C_VALUES:
        print(c)
        # Order already ensured when selecting Y columns
        classifier = BinaryRelevance(
            classifier=LogisticRegression(penalty='l1', n_jobs=20, C=c, solver='saga', max_iter=1000)
        ).fit(X_train_fold, Y_train_fold)

        Y_predict_fold = classifier.predict(X_test_fold)
        Y_p_fold = classifier.predict_proba(X_test_fold)

        # Per-target scores: one row per target, labelled with the target name from `order`.
        prfs = pd.DataFrame(
            precision_recall_fscore_support(Y_test_fold, Y_predict_fold),
            index=['precision', 'recall', 'F_score', 'support']).T
        prfs['Stage'] = order
        prfs['C'] = c
        prfs_records.extend(prfs.to_dict('records'))

        # Micro-averaged scores across all targets; support is None for averaged scores.
        micro = list(precision_recall_fscore_support(Y_test_fold, Y_predict_fold, average='micro'))
        prfs_records.append(dict(zip(PRFS_COLUMNS, micro + ['all', c])))

        rac_records.append({
            'roc_auc': roc_auc_score(Y_test_fold, Y_p_fold.toarray(), average='micro'),
            'C': c,
        })

        # Count features with non-zero coefficients. Only the first n_features coefficients are
        # considered so that any extra (non-original) inputs of a model are ignored.
        feats_combined = set()
        for i, stage in enumerate(order):
            coefficients = classifier.classifiers_[i].coef_.ravel()[:n_features]
            feats_stage = set(np.flatnonzero(coefficients))
            feats_combined |= feats_stage
            feats_records.append({'N_features': len(feats_stage), 'C': c, 'Stage': stage})
        feats_records.append({'N_features': len(feats_combined), 'C': c, 'Stage': 'all'})

prfs_all = pd.DataFrame(prfs_records, columns=PRFS_COLUMNS)
rac_all = pd.DataFrame(rac_records, columns=['roc_auc', 'C'])
feats_all = pd.DataFrame(feats_records, columns=['N_features', 'C', 'Stage'])
savePickle(pathClassification+'logisticRegressionBinaryRelevanceWeighted.pkl',{'prfs':prfs_all,'rac':rac_all,'featsN':feats_all})

1
10




5




2




0.9




0.6


  _warn_prf(average, modifier, msg_start, len(result))


0.3


  _warn_prf(average, modifier, msg_start, len(result))


0.1


  _warn_prf(average, modifier, msg_start, len(result))


2
10




5




2




0.9


  _warn_prf(average, modifier, msg_start, len(result))


0.6


  _warn_prf(average, modifier, msg_start, len(result))


0.3


  _warn_prf(average, modifier, msg_start, len(result))


0.1


  _warn_prf(average, modifier, msg_start, len(result))


3
10




5




2




0.9




0.6




0.3




0.1


  _warn_prf(average, modifier, msg_start, len(result))


4
10


  _warn_prf(average, modifier, msg_start, len(result))


5


  _warn_prf(average, modifier, msg_start, len(result))


2


  _warn_prf(average, modifier, msg_start, len(result))


0.9


  _warn_prf(average, modifier, msg_start, len(result))


0.6


  _warn_prf(average, modifier, msg_start, len(result))


0.3




0.1


  _warn_prf(average, modifier, msg_start, len(result))


5
10




## Results
Five fold cross validation statistics for different models and regularization strengths (C). Stage: results for individual targets (phenotypes) or combined across all target features ('all'). 

The models are not able to reliably predict phenotypes. Besides the metrics below, problems with training were also indicated by the models not converging, even when max_iter was raised to 10x or 40x the default. 

In [66]:
def scatter_catgory(df,categories, Y, colour=None,shape=None,title=''):
    """
    Plot a scatter plot with multiple jittered categories (strip plot).

    From https://datavizpyr.com/stripplot-with-altair-in-python/

    :param df: Data passed to alt.Chart.
    :param categories: Field by which the chart is split into columns.
    :param Y: Field shown on the y axis.
    :param colour: Optional field for the colour encoding; omitted when None.
    :param shape: Optional field for the shape encoding; omitted when None.
    :param title: Chart title.
    :return: The configured altair chart.
    """
    # Only the encodings that were actually requested are passed on.
    optional_channels = {
        channel: encoding(field)
        for channel, encoding, field in (('color', alt.Color, colour), ('shape', alt.Shape, shape))
        if field is not None
    }

    jitter_axis = alt.X(
        'jitter:Q',
        title=None,
        axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
        scale=alt.Scale(),
    )
    value_axis = alt.Y(Y, axis=alt.Axis(grid=False))
    category_columns = alt.Column(
        categories,
        header=alt.Header(
            labelAngle=0,
            titleOrient='bottom',
            labelOrient='bottom',
            labelAlign='center',
            labelPadding=10,
        ),
    )

    points = alt.Chart(df, width=120, title=title).mark_point(size=20)
    encoded = points.encode(x=jitter_axis, y=value_axis, column=category_columns, **optional_channels)
    # Horizontal jitter: Box-Muller transform of two uniform random numbers.
    jittered = encoded.transform_calculate(jitter='sqrt(-2*log(random()))*cos(2*PI*random())')
    return jittered.configure_facet(spacing=0).configure_view(stroke=None)

In [153]:
def summary(df,statistic, split):
    """
    Print mean and standard error of scores from cross validation for each group.

    Only the `statistic` column is aggregated. Reducing the whole group frame
    (including the grouping column) fails on pandas >= 2.0 when the grouping
    column is non-numeric.

    :param df: DataFrame containing the columns `statistic` and `split`.
    :param statistic: Name of the column with the values to summarise.
    :param split: Name of the column by which rows are grouped.
    :return: None; one line per group is printed, groups sorted by key.
    """
    print(statistic,'mean and standard error for each group')
    group_stats = df.groupby(split)[statistic].agg(['mean', 'sem'])
    for group_name, stats in group_stats.iterrows():
        print('%-12s%-6.3f%-3s%-3.3f' % (group_name, stats['mean'], '+-', stats['sem']))

### Classifier Binary Relevance with logistic regression

In [155]:
# Cross-validation results of the binary relevance models.
# NOTE(review): the training cell in this notebook writes
# 'logisticRegressionBinaryRelevanceWeighted.pkl'; the file loaded here has a
# different name - confirm it is the intended result set.
data_binary = loadPickle(pathClassification + 'logisticRegressionBinaryRelevance.pkl')
# One strip plot per score, split by C, with targets distinguished by colour and shape.
for metric in ('F_score', 'precision', 'recall'):
    display(scatter_catgory(data_binary['prfs'], 'C', metric, colour='Stage', shape='Stage'))
# Number of selected features for each C.
display(scatter_catgory(data_binary['featsN'], 'C', 'N_features', colour='Stage', shape='Stage'))
#display(scatter_catgory(data_binary['rac'],'C', 'roc_auc',title='Micro ROC AUC (across all categories)'))
# Mean and standard error of the F score per target for C == 2.
summary(data_binary['prfs'].query('C ==2'), 'F_score', "Stage")

F_score mean and standard error for each group
FB          0.684 +- 0.089
all         0.741 +- 0.018
cul         0.791 +- 0.055
disappear   0.370 +- 0.112
lag         0.739 +- 0.040
mhat        0.831 +- 0.092
no_agg      0.873 +- 0.025
slug        0.720 +- 0.031
stream      0.515 +- 0.055
tag         0.779 +- 0.023
tip         0.474 +- 0.086


### Classifier Chain with logistic regression

In [156]:
# Cross-validation results of the classifier chain models.
data_chain = loadPickle(pathClassification + 'logisticRegressionChain.pkl')
# One strip plot per score, split by C, with targets distinguished by colour.
for metric in ('F_score', 'precision', 'recall'):
    display(scatter_catgory(data_chain['prfs'], 'C', metric, colour='Stage'))
# Number of selected features for each C.
display(scatter_catgory(data_chain['featsN'], 'C', 'N_features', colour='Stage'))
#display(scatter_catgory(data_chain['rac'],'C', 'roc_auc',title='Micro ROC AUC (across all categories)'))
# Mean and standard error of the F score per target for C == 2.
summary(data_chain['prfs'].query('C ==2'), 'F_score', "Stage")

F_score mean and standard error for each group
FB          0.683 +- 0.072
all         0.730 +- 0.013
cul         0.762 +- 0.079
disappear   0.230 +- 0.102
lag         0.717 +- 0.026
mhat        0.880 +- 0.049
no_agg      0.850 +- 0.023
slug        0.724 +- 0.028
stream      0.585 +- 0.065
tag         0.755 +- 0.040
tip         0.443 +- 0.131
