# OvR logistic regression with weights adjusted based on weights of other phenotypes

In [1]:
import os
import sys
from importlib import reload

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.insert(0,module_path)

import pandas as pd
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support,roc_auc_score
from skmultilearn.model_selection import iterative_train_test_split
import sklearn.preprocessing as pp
#alt.renderers.enable('notebook')
import matplotlib.pyplot as plt
from IPython.display import display

import stages_DE.stages_library
import importlib
importlib.reload(stages_DE.stages_library)

from networks.functionsDENet import loadPickle,savePickle
from stages_DE.stages_library import PHENOTYPES, PHENOTYPES_X, summary_classification, summary_classification_print_sort, scatter_catgory


## Prepare data

In [2]:
proteus=True
if proteus:
    pathClassification = '/home/khrovatin/timeTrajectoriesNet/data/stages/classification/'
    dataPath= '/home/khrovatin/timeTrajectoriesNet/data/RPKUM/'
else:
    pathClassification = '/home/karin/Documents/timeTrajectories/data/stages/classification/'

In [218]:
genes = pd.read_csv(dataPath + 'mergedGenes_RPKUM.tsv', sep='\t', index_col=0)
conditions = pd.read_csv(dataPath + 'conditions_mergedGenes.tsv', sep='\t', index_col=None)

# Retain only samples with annotations
Y = conditions[(conditions[PHENOTYPES] != 0).any(axis=1)]
X = genes[Y.Measurment].T.values
#Y = conditions.query('Group =="WT"')[(conditions.query('Group =="WT"')[PHENOTYPES] != 0).any(axis=1)]
#X = genes[Y.Measurment].T.values

# Remove targets with too little positive samples
order=['no_agg','disappear', 'stream', 'lag', 'tag', 'tip', 'slug', 'mhat', 'cul', 'FB']
#order=['no_agg', 'stream', 'lag', 'tag',  'slug', 'mhat', 'cul', 'FB']
Y = Y[order].values

# Remove constant features
X=X[:,(X.std(axis=0)!=0)]

In [219]:
# Split in train and test as was done for 5-fold cross validation, but using only 1 fold for now
X_train, Y_train, X_test, Y_test = iterative_train_test_split(X, Y, test_size=0.2)
#Scale X features to [0,1], use X_train scaller to also scale X_test
scaler = pp.MinMaxScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

## Train 

Train unmodified OvR for comparison

In [220]:
classifier = OneVsRestClassifier(estimator=LogisticRegression(n_jobs=20,  solver='saga',penalty='none',
                                            class_weight='balanced'
                                            #,warm_start=True,max_iter=1
                                             ), n_jobs=Y_train.shape[1])
classifier.fit(X_train,Y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto', n_jobs=20,
                                                 penalty='none',
                                                 random_state=None,
                                                 solver='saga', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=10)

Train modified weights model - adjust weights based on weights for other phenotypes. Train sub-models so that in the middle of the training the weights are adjusted after each itteration based on weight of other sub-models/phenotypes. Currently this finds largest region of positive weights (across targets/phenotypes, according to weight sum) for each gene and then downvotes (divides) any positive weiths (of other phenotypes) that are outside of this region.

In [227]:
classifier_ordered = OneVsRestClassifier(estimator=LogisticRegression(n_jobs=20,  solver='saga',penalty='none',
                                            class_weight='balanced'
                                            ,warm_start=True,max_iter=1
                                             ), n_jobs=Y_train.shape[1])

max=100
for i in range(max):
    classifier_ordered.fit(X_train,Y_train)
    if 5 < i < max-5:
        for feature_idx in range(X_train.shape[1]):
            start=0
            end=0
            peaks=[]
            running=False
            curr_peak=0
            positive_coefs=[]
            for target_idx in range(Y_train.shape[1]):
                coef=classifier_ordered.estimators_[target_idx].coef_[0][feature_idx]
                if coef > 0:
                    positive_coefs.append(target_idx)
                    if not running:
                        running=True
                        start=target_idx
                    end=target_idx
                    curr_peak+=coef
                elif coef < 0 and running:
                    running=False
                    peaks.append((start,end,curr_peak))
            if len(peaks)>0:
                best_peak = sorted(peaks, key=lambda tup: tup[2])[-1]
                modify_down=[idx for idx in positive_coefs if idx < best_peak[0] or idx > best_peak[1]]
                for modify_idx in modify_down:
                    coef=classifier_ordered.estimators_[modify_idx].coef_[0][feature_idx]
                    classifier_ordered.estimators_[modify_idx].coef_[0][feature_idx]=coef/2

                

## Evaluation

In [None]:
# Compare weights of individual genes in both models
c=classifier_ordered
plt.hlines(0,0,len(c.estimators_)-1)
# Find a gene with high weight for a target in modified model
for i in range(X.shape[1]):
#for i in [1000]:
    if c.estimators_[2].coef_[0][i]>0.01:
        #Plot weights of both models
        for e in range(len(c.estimators_)):
            plt.scatter(e,c.estimators_[e].coef_[0][i],c='b',alpha=0.5)
            plt.scatter(e,classifier.estimators_[e].coef_[0][i],c='r',alpha=0.5)
        print(i,genes[genes.std(axis=1)!=0].index[i])
        break

In [244]:
print('Unmodified model')
Y_predicted=classifier.predict(X_test)
prfs=pd.DataFrame(precision_recall_fscore_support(Y_test, Y_predicted),index=['precision','recall','F_score','support']).T
prfs['Group']=order
print(prfs.round(2))
print('\nModified weights model')
Y_predicted=classifier_ordered.predict(X_test)
prfs=pd.DataFrame(precision_recall_fscore_support(Y_test, Y_predicted),index=['precision','recall','F_score','support']).T
prfs['Group']=order
print(prfs.round(2))

Unmodified model
   precision  recall  F_score  support      Group
0       0.90    0.87     0.89     31.0     no_agg
1       0.67    0.50     0.57      4.0  disappear
2       0.78    0.64     0.70     11.0     stream
3       0.68    0.76     0.72     17.0        lag
4       0.75    0.69     0.72     13.0        tag
5       0.50    0.20     0.29      5.0        tip
6       0.88    0.64     0.74     11.0       slug
7       0.60    1.00     0.75      3.0       mhat
8       0.75    1.00     0.86      6.0        cul
9       1.00    0.80     0.89      5.0         FB

Modified weights model
   precision  recall  F_score  support      Group
0       0.88    0.90     0.89     31.0     no_agg
1       0.18    1.00     0.31      4.0  disappear
2       0.75    0.55     0.63     11.0     stream
3       0.54    0.88     0.67     17.0        lag
4       0.46    0.85     0.59     13.0        tag
5       0.12    0.60     0.20      5.0        tip
6       0.33    0.73     0.46     11.0       slug
7       0

### Distance of falsely predicted labels to true labels
For each sample that has at least some labels calculate the distance to the closest true label of FP and closest TP of FN. Average this over all FP/FN. It would be desired that: 1.) FP would be close to the real label (low FP distance). 2.) FN would be away from the closest TP (high FN distance) - The only FN would be those that are not likely based on the TP. 

In [251]:
def distance_error(Y_test,Y_predicted):
    wrong_total=0
    n_wrong=0
    missing_total=0
    n_missing=0
    order_arr=np.array(order)
    for row_idx in range(Y_test.shape[0]):
        y_test=Y_test[row_idx,:]
        # Use only samples with at least some ground truth positive labels and some wrongly predicted
        if y_test.sum()>0:
            y_predicted=Y_predicted[row_idx,:]
            if (y_predicted!=y_test).any():
                #print('***********')
                #print(y_test.astype('int'))
                #print(y_predicted)
                # Which phenotypes were predicted/are in fact present
                targets=order_arr[y_test==1]
                predicted_targets=order_arr[y_predicted==1]
                true_x=[PHENOTYPES_X[phenotype] for phenotype in targets]
                predicted_x=[PHENOTYPES_X[phenotype] for phenotype in predicted_targets]
                # Find closest actuall lable to the FP
                for x in predicted_x:
                    if x not in true_x:
                        n_wrong+=1
                        min_diff=np.inf
                        for x_true in true_x:
                            diff=abs(x-x_true)
                            if diff<min_diff:
                                min_diff=diff
                        wrong_total+=min_diff
                # Find closest TP label to the FN
                for x in true_x:
                    if x not in predicted_x and len(predicted_x)>0:
                        n_missing+=1
                        min_diff=np.inf
                        for x_predicted in predicted_x:
                            diff=abs(x-x_predicted)
                            if diff<min_diff:
                                min_diff=diff
                        missing_total+=min_diff

    print('Average distance of missing annotations (FN) to the closest TP one:',round(missing_total/n_missing,2))     
    print('Average distance of wrong annotations (FP) to the closest true one:',round(wrong_total/n_wrong,2)) 

In [252]:
print('Unmodified model:')
Y_predicted=classifier.predict(X_test)
distance_error(Y_test,Y_predicted)
print('\nModified weights model:')
Y_predicted=classifier_ordered.predict(X_test)
distance_error(Y_test,Y_predicted)

Unmodified model:
Average distance of missing annotations (FN) to the closest TP one: 1.85
Average distance of wrong annotations (FP) to the closest true one: 1.62

Modified weights model:
Average distance of missing annotations (FN) to the closest TP one: 2.0
Average distance of wrong annotations (FP) to the closest true one: 1.74


## Conclusion
The algorithm, as currently implemented, performs worse than unmodified model - both based on OvR F score and distance of false predictions to the truth. 