In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import ast
import math

from sklearn import metrics
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from matplotlib import pyplot as plt
from collections import defaultdict

In [2]:
print("A.Schueller method")
AS_csv = '/home/cvigilv/Repos/lppnet/chembl23_GS3_v2.mphase_gt_0.fp2.fix.txt.co.out'
AS_df = pd.read_csv(AS_csv,sep='\t',header=None, names=['Fold','QL','HT','Tc','HL','QT','TP'])
print(AS_df[AS_df['Tc'] > -99].info())
print(AS_df.nunique())
print(AS_df.describe())

print("\nC.Vigil method")
CV_csv = '/home/cvigilv/Repos/lppnet/lppnet_AS.out'
CV_df = pd.read_csv(CV_csv,sep='\t',header=None, names=['Fold','QL','HT','Tc','Path','Communities','TP', 'Degree', 'Betweenness Centrality'])
CV_df['Tc'] = pd.to_numeric(CV_df['Tc'])
print(CV_df[CV_df['Tc'] != -99].info())
print(CV_df.nunique())
print(CV_df.describe())

def CountCommunities(row):
    return len(set(row['Communities'].split('-')))

def ScoreCommunities(row, alpha):
    if row['Tc'] == -99:
        return -99
    else:
        return math.exp(-(row['N communities']-1)) * alpha + row['Tc'] * (1.0 - alpha)


print("\COM method")
alpha=0.5
COM_csv = '/home/cvigilv/Repos/lppnet/chembl23_GS3_v2.mphase_gt_0.txt.co.CM_Complete.R_0.5.out'
COM_df = pd.read_csv(COM_csv,sep='\t',header=None, names=['Fold','QL','HT','Tc','Path','Communities','TP', 'Degree', 'Betweenness Centrality'])
COM_df['Tc'] = pd.to_numeric(COM_df['Tc'])
COM_df['N communities'] = COM_df.apply(CountCommunities, axis = 1)
COM_df['Combined score %.2f'%(alpha)] = COM_df.apply(ScoreCommunities, axis = 1, args=(alpha,))


print('\nAll entries:')
print('AS method dataframe shape:',AS_df.shape)
print('CV method dataframe shape:',CV_df.shape)

print('\nBad predictions:')
print('AS method dataframe shape:',AS_df[AS_df['Tc'] < -1].shape)
print('CV method dataframe shape:',CV_df[CV_df['Tc'] < -1].shape)

print('\nCorrect predictions:')
print('AS method dataframe shape:',AS_df[AS_df['Tc'] > -1].shape)
print('CV method dataframe shape:',CV_df[CV_df['Tc'] > -1].shape)

A.Schueller method
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1076887 entries, 0 to 1105103
Data columns (total 7 columns):
Fold    1076887 non-null int64
QL      1076887 non-null object
HT      1076887 non-null object
Tc      1076887 non-null float64
HL      1076887 non-null int64
QT      1076887 non-null object
TP      1076887 non-null int64
dtypes: float64(1), int64(3), object(3)
memory usage: 65.7+ MB
None
Fold       10
QL       1232
HT        897
Tc      15263
HL       1230
QT       1230
TP          2
dtype: int64
               Fold            Tc            HL            TP
count  1.105104e+06  1.105104e+06  1.105104e+06  1.105104e+06
mean   4.493506e+00 -2.284694e+00  6.849806e+02  8.723161e-03
std    2.874535e+00  1.565583e+01  3.545904e+02  9.298965e-02
min    0.000000e+00 -9.900000e+01 -1.000000e+00  0.000000e+00
25%    2.000000e+00  1.702130e-01  3.960000e+02  0.000000e+00
50%    4.000000e+00  2.410260e-01  7.280000e+02  0.000000e+00
75%    7.000000e+00  3.085110e-01 

In [3]:
def ROC(y_pred, y_scores, title, filename, color, string):
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_pred, y_scores, pos_label=1)
    auc_roc = sklearn.metrics.auc(fpr,tpr)
    plt.plot(fpr, tpr, color=color, lw=2, label=string+' (Area = %0.2f)' % (auc_roc))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    print(auc_roc)

def PR(y_pred, y_scores, title, filename, color, string):
    average_precision = average_precision_score(y_pred, y_scores)
    precision, recall, _ = precision_recall_curve(y_pred, y_scores)
    plt.step(recall, precision, color=color, where='post', label = string+' (AVG. Precision = %0.2f)' % (average_precision))
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    print(average_precision)
    
def ROC_point(y_pred, y_scores, x, filename, string):
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_pred, y_scores, pos_label=1)
    auc_roc = sklearn.metrics.auc(fpr,tpr)
    plt.plot(x, auc_roc, 'o' ,color='black', ms=5, label=string +' (Area = %0.2f)' % (auc_roc))
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Alpha')
    plt.ylabel('AUC-ROC')
    
    return x, auc_roc
    
def PR_point(y_pred, y_scores, x, filename, string):
    average_precision = average_precision_score(y_pred, y_scores)
    plt.plot(x, average_precision, 'o', color='black', ms=5, label = string +' (AVG. Precision = %0.2f)' % (average_precision))
    plt.xlabel('Alpha')
    plt.ylabel('Average Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.05])
    
    return x, average_precision

In [8]:
plt.figure()
ROC(AS_df[AS_df['Tc'] > -99]['TP'], AS_df[AS_df['Tc'] > -99]['Tc'], 'AS_ROC', 'AS_ROC','orange','A.Schueller')
ROC(CV_df[CV_df['Tc'] > -99]['TP'], CV_df[CV_df['Tc'] > -99]['Tc'], 'CV_ROC', 'CV_ROC','purple','Reimplementación')
ROC(COM_df[COM_df['Tc'] > -99]['TP'], COM_df[COM_df['Tc'] > -99]['Combined score 0.50'], 'COM_ROC', 'COM_ROC','green','Comunidades')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right", prop={'size': 10})
#plt.show()
plt.savefig('ROC.png', dpi=300)
plt.close('all')

plt.figure()
PR(AS_df[AS_df['Tc'] > -99]['TP'], AS_df[AS_df['Tc'] > -99]['Tc'], 'AS_PR', 'AS_PR','orange','A.Schueller')
PR(CV_df[CV_df['Tc'] > -99]['TP'], CV_df[CV_df['Tc'] > -99]['Tc'], 'CV_PR', 'CV_PR','purple','Reimplementación')
PR(COM_df[COM_df['Tc'] > -99]['TP'], COM_df[COM_df['Tc'] > -99]['Combined score 0.50'], 'COM_PR', 'COM_PR','green','Comunidades')
plt.title('Precision-Recall curve')
plt.legend(loc="upper right", prop={'size': 10})
#plt.show()
plt.savefig('PR.png', dpi=300)
plt.close('all')

roc_pr = defaultdict(list)

colour = ["#67001f","#b2182b", "#d6604d", "#f4a582", "#fddbc7", "#d1d1d1", "#d1e5f0", "#92c5de", "#4393c3", "#2166ac", "#053061"]
ROCs = defaultdict(list)
for i in range(0,11):
    alpha = 0.1 * i
    alpha, roc = ROC_point(CV_df[CV_df['Tc'] > -99]['TP'], CV_df[CV_df['Tc'] > -99]['Combined score %.2f'%(alpha)], alpha, 'CV_PR', 'Alpha == {}'.format(i))
    ROCs['Alpha'].append(alpha)
    ROCs['AUC-ROC'].append(roc)
    roc_pr['AUC-ROC'].append(roc)
plt.title('Receiver operating characteristic for different Alpha values')
#plt.legend(loc="lower right", prop={'size': 10})
plt.show()
#plt.savefig('ROC.png', dpi=300)
plt.close('all')

0.8834637313714536
0.8834655209188432
0.9117743194822172
0.17289307087442984
0.17288286528111516
0.27151710345126157


KeyError: 'Combined score 0.00'