# OvR logistic regression with weights adjusted based on weights of other phenotypes

In [301]:
import os
import sys
from importlib import reload

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.insert(0,module_path)

import pandas as pd
import numpy as np
#from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support,roc_auc_score
from skmultilearn.model_selection import iterative_train_test_split
import sklearn.preprocessing as pp
#alt.renderers.enable('notebook')
import matplotlib.pyplot as plt
from IPython.display import display

import stages_DE.stages_library
import importlib
importlib.reload(stages_DE.stages_library)

from networks.functionsDENet import loadPickle,savePickle
from stages_DE.stages_library import PHENOTYPES, PHENOTYPES_X, summary_classification, summary_classification_print_sort, scatter_catgory
import stages_DE.OvR as OvR
importlib.reload(OvR)


<module 'stages_DE.OvR' from '/home/khrovatin/git/baylor-dicty/stages_DE/OvR.py'>

## Prepare data

In [255]:
proteus=True
if proteus:
    pathClassification = '/home/khrovatin/timeTrajectoriesNet/data/stages/classification/'
    dataPath= '/home/khrovatin/timeTrajectoriesNet/data/RPKUM/'
else:
    pathClassification = '/home/karin/Documents/timeTrajectories/data/stages/classification/'

In [256]:
genes = pd.read_csv(dataPath + 'mergedGenes_RPKUM.tsv', sep='\t', index_col=0)
conditions = pd.read_csv(dataPath + 'conditions_mergedGenes.tsv', sep='\t', index_col=None)

# Retain only samples with annotations
Y = conditions[(conditions[PHENOTYPES] != 0).any(axis=1)]
X = genes[Y.Measurment].T.values
#Y = conditions.query('Group =="WT"')[(conditions.query('Group =="WT"')[PHENOTYPES] != 0).any(axis=1)]
#X = genes[Y.Measurment].T.values

# Remove targets with too little positive samples
order=['no_agg','disappear', 'stream', 'lag', 'tag', 'tip', 'slug', 'mhat', 'cul', 'FB']
#order=['no_agg', 'stream', 'lag', 'tag',  'slug', 'mhat', 'cul', 'FB']
Y = Y[order].values

# Remove constant features
X=X[:,(X.std(axis=0)!=0)]

In [568]:
# Split in train and test as was done for 5-fold cross validation, but using only 1 fold for now
X_train, Y_train, X_test, Y_test = iterative_train_test_split(X, Y, test_size=0.2)

In [575]:
#Scale X features to [0,1], use X_train scaller to also scale X_test
scaler = pp.MinMaxScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

## Train 

Train unmodified OvR for comparison

In [576]:
classifier = OvR.OneVsRestClassifier(estimator=LogisticRegression(n_jobs=20,  solver='saga',penalty='none',
                                            class_weight='balanced'
                                            #,warm_start=True,max_iter=1
                                             ), n_jobs=Y_train.shape[1])
classifier.fit(X_train,Y_train)

(Re)setting estimators


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto', n_jobs=20,
                                                 penalty='none',
                                                 random_state=None,
                                                 solver='saga', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=10, warm_start=False)

Train modified weights model - adjust weights based on weights for other phenotypes. Train sub-models so that in the middle of the training the weights are adjusted after each itteration based on weight of other sub-models/phenotypes. Currently this finds largest region of positive weights (across targets/phenotypes, according to weight sum) for each gene and then downvotes (divides) any positive weiths (of other phenotypes) that are outside of this region.

In [577]:
classifier_ordered = OvR.OneVsRestClassifier(estimator=LogisticRegression(n_jobs=20,  solver='saga',penalty='none',
                                            class_weight='balanced'
                                            ,warm_start=True,max_iter=1
                                             ), n_jobs=Y_train.shape[1],warm_start=True)

max=100
for i in range(max):
    classifier_ordered.fit(X_train,Y_train)
    if 5 < i < max-5:
        for feature_idx in range(X_train.shape[1]):
            start=0
            end=0
            peaks=[]
            running=False
            curr_peak=0
            positive_coefs=[]
            for target_idx in range(Y_train.shape[1]):
                coef=classifier_ordered.estimators_[target_idx].coef_[0][feature_idx]
                if coef > 0:
                    positive_coefs.append(target_idx)
                    if not running:
                        running=True
                        start=target_idx
                    end=target_idx
                    curr_peak+=coef
                elif coef < 0 and running:
                    running=False
                    peaks.append((start,end,curr_peak))
            if len(peaks)>0:
                best_peak = sorted(peaks, key=lambda tup: tup[2])[-1]
                modify_down=[idx for idx in positive_coefs if idx < best_peak[0] or idx > best_peak[1]]
                for modify_idx in modify_down:
                    coef=classifier_ordered.estimators_[modify_idx].coef_[0][feature_idx]
                    classifier_ordered.estimators_[modify_idx].coef_[0][feature_idx]=coef/2

                

(Re)setting estimators


## Evaluation

In [None]:
# Compare weights of individual genes in both models
c=classifier_ordered
plt.hlines(0,0,len(c.estimators_)-1)
# Find a gene with high weight for a target in modified model
for i in range(X.shape[1]):
#for i in [1000]:
    if c.estimators_[2].coef_[0][i]>0.01:
        #Plot weights of both models
        for e in range(len(c.estimators_)):
            plt.scatter(e,c.estimators_[e].coef_[0][i],c='b',alpha=0.5)
            plt.scatter(e,classifier.estimators_[e].coef_[0][i],c='r',alpha=0.5)
        print(i,genes[genes.std(axis=1)!=0].index[i])
        break

In [579]:
print('Unmodified model')
Y_predicted=classifier.predict(X_test)
prfs=pd.DataFrame(precision_recall_fscore_support(Y_test, Y_predicted),index=['precision','recall','F_score','support']).T
prfs['Group']=order
print(prfs.round(2))
print('\nModified weights model')
Y_predicted=classifier_ordered.predict(X_test)
prfs=pd.DataFrame(precision_recall_fscore_support(Y_test, Y_predicted),index=['precision','recall','F_score','support']).T
prfs['Group']=order
print(prfs.round(2))

Unmodified model
   precision  recall  F_score  support      Group
0       0.90    0.84     0.87     31.0     no_agg
1       0.33    0.25     0.29      4.0  disappear
2       0.70    0.64     0.67     11.0     stream
3       0.72    0.76     0.74     17.0        lag
4       0.82    0.69     0.75     13.0        tag
5       0.50    0.20     0.29      5.0        tip
6       0.89    0.73     0.80     11.0       slug
7       0.50    1.00     0.67      3.0       mhat
8       0.75    1.00     0.86      6.0        cul
9       1.00    0.80     0.89      5.0         FB

Modified weights model
   precision  recall  F_score  support      Group
0       0.85    0.90     0.88     31.0     no_agg
1       0.22    1.00     0.36      4.0  disappear
2       0.34    1.00     0.51     11.0     stream
3       0.70    0.41     0.52     17.0        lag
4       0.41    0.92     0.57     13.0        tag
5       0.50    0.20     0.29      5.0        tip
6       0.32    1.00     0.49     11.0       slug
7       0

### Distance of falsely predicted labels to true labels
For each sample that has at least some labels calculate the distance to the closest true label of FP and closest TP of FN. Average this over all FP/FN. It would be desired that: 1.) FP would be close to the real label (low FP distance). 2.) FN would be away from the closest TP (high FN distance) - The only FN would be those that are not likely based on the TP. 

In [264]:
def distance_error(Y_test,Y_predicted):
    wrong_total=0
    n_wrong=0
    missing_total=0
    n_missing=0
    order_arr=np.array(order)
    for row_idx in range(Y_test.shape[0]):
        y_test=Y_test[row_idx,:]
        # Use only samples with at least some ground truth positive labels and some wrongly predicted
        if y_test.sum()>0:
            y_predicted=Y_predicted[row_idx,:]
            if (y_predicted!=y_test).any():
                #print('***********')
                #print(y_test.astype('int'))
                #print(y_predicted)
                # Which phenotypes were predicted/are in fact present
                targets=order_arr[y_test==1]
                predicted_targets=order_arr[y_predicted==1]
                true_x=[PHENOTYPES_X[phenotype] for phenotype in targets]
                predicted_x=[PHENOTYPES_X[phenotype] for phenotype in predicted_targets]
                # Find closest actuall lable to the FP
                for x in predicted_x:
                    if x not in true_x:
                        n_wrong+=1
                        min_diff=np.inf
                        for x_true in true_x:
                            diff=abs(x-x_true)
                            if diff<min_diff:
                                min_diff=diff
                        wrong_total+=min_diff
                # Find closest TP label to the FN
                for x in true_x:
                    if x not in predicted_x and len(predicted_x)>0:
                        n_missing+=1
                        min_diff=np.inf
                        for x_predicted in predicted_x:
                            diff=abs(x-x_predicted)
                            if diff<min_diff:
                                min_diff=diff
                        missing_total+=min_diff

    print('Average distance of missing annotations (FN) to the closest TP one:',round(missing_total/n_missing,2))     
    print('Average distance of wrong annotations (FP) to the closest true one:',round(wrong_total/n_wrong,2)) 

In [580]:
print('Unmodified model:')
Y_predicted=classifier.predict(X_test)
distance_error(Y_test,Y_predicted)
print('\nModified weights model:')
Y_predicted=classifier_ordered.predict(X_test)
distance_error(Y_test,Y_predicted)

Unmodified model:
Average distance of missing annotations (FN) to the closest TP one: 1.89
Average distance of wrong annotations (FP) to the closest true one: 1.55

Modified weights model:
Average distance of missing annotations (FN) to the closest TP one: 1.58
Average distance of wrong annotations (FP) to the closest true one: 1.97


## Conclusion
The algorithm, as currently implemented, performs worse than unmodified model - both based on OvR F score and distance of false predictions to the truth. 

# Constrain weights to positive
Use R glmnet package for logistic regression, as it can constrain weights to be above 0.

In [472]:
from rpy2.robjects.packages import importr
from rpy2 import  robjects
from rpy2.robjects import pandas2ri 
from rpy2.rinterface_lib.embedded import RRuntimeError
from sklearn.utils.class_weight import compute_sample_weight

In [553]:
base = importr('base')
utils = importr('utils')
glmnet=importr('glmnet')

In [583]:
Y_predicted=[]
N_features=[]
for target_idx in range(Y.shape[1]):
    print(target_idx)
    #Sample weights as in sklearn models
    weights=compute_sample_weight('balanced',Y_train[:,target_idx])
    
    #Convert to R objects
    rX_train = robjects.r.matrix(robjects.FloatVector(X_train.T.ravel()), nrow=X_train.shape[0],
                                 ncol=X_train.shape[1])
    rY_train=robjects.FactorVector(pd.Series(Y_train[:,target_idx]).astype('str').values)
    rX_test = robjects.r.matrix(robjects.FloatVector(X_test.T.ravel()), nrow=X_test.shape[0],
                                ncol=X_test.shape[1])
    rweights=robjects.FloatVector(weights.ravel())
    
    #Glmnet fit
    fit=glmnet.glmnet(rX_train,rY_train,family = 'binomial',weights=rweights,alpha=1,
                      **{'lambda':0,
                         #Comment the next line out for unconstrained model
                         'lower.limits':0,
                         'standardize':False,'trace.it':1})

    coefs=pandas2ri.rpy2py(robjects.r.matrix(fit.rx2['beta']))

    N_features.append((coefs>0).sum())
    
    #Predict proba and convert to predict
    Y_p=robjects.r.predict(fit,rX_test,type="response",s=1)

    Y_p=pandas2ri.rpy2py(Y_p)
    
    Y_predicted_target=Y_p.copy()
    Y_predicted_target[Y_predicted_target>0.5]=1
    Y_predicted_target[Y_predicted_target<=0.5]=0
    Y_predicted.append(Y_predicted_target.ravel())
Y_predicted=np.array(Y_predicted).T

0
1
2
3
4
5
6
7
8
9


Evaluation of positive weights model and unconstrained model.

In [582]:
#Print F scores and N features
print('\nPositive weights model')
prfs=pd.DataFrame(precision_recall_fscore_support(Y_test, Y_predicted),index=['precision','recall','F_score','support']).T
prfs['Group']=order
print(prfs.round(2))
print()
print('N used features')
pd.Series(N_features,index=order)


Positive weights model
   precision  recall  F_score  support      Group
0       0.70    0.61     0.66     31.0     no_agg
1       0.00    0.00     0.00      4.0  disappear
2       0.42    0.45     0.43     11.0     stream
3       0.47    0.53     0.50     17.0        lag
4       0.44    0.62     0.52     13.0        tag
5       0.10    0.20     0.13      5.0        tip
6       0.50    0.36     0.42     11.0       slug
7       0.11    0.33     0.17      3.0       mhat
8       0.75    1.00     0.86      6.0        cul
9       0.50    0.80     0.62      5.0         FB

N used features


no_agg       2375
disappear    2048
stream       2205
lag          2242
tag          2078
tip          2375
slug         2038
mhat         2060
cul          2210
FB           2310
dtype: int64

In [584]:
#Evaluation to be used after unconstrained model was fitted
print('\nUnconstrained weights model')
prfs=pd.DataFrame(precision_recall_fscore_support(Y_test, Y_predicted),index=['precision','recall','F_score','support']).T
prfs['Group']=order
print(prfs.round(2))
print()
print('N used features')
pd.Series(N_features,index=order)


Unconstrained weights model
   precision  recall  F_score  support      Group
0       0.73    0.77     0.75     31.0     no_agg
1       0.29    0.50     0.36      4.0  disappear
2       0.67    0.55     0.60     11.0     stream
3       0.44    0.41     0.42     17.0        lag
4       0.53    0.62     0.57     13.0        tag
5       0.00    0.00     0.00      5.0        tip
6       0.33    0.27     0.30     11.0       slug
7       0.43    1.00     0.60      3.0       mhat
8       0.60    0.50     0.55      6.0        cul
9       0.00    0.00     0.00      5.0         FB

N used features


no_agg       6220
disappear    6202
stream       6291
lag          6230
tag          6232
tip          6286
slug         6302
mhat         6182
cul          6254
FB           6280
dtype: int64

The models perform worse than sklearn models (even when weights are unconstrained). They also use less features, even when there is no regularisation.