In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import ast

from sklearn import metrics
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from matplotlib import pyplot as plt

In [2]:
# --- A. Schueller method: load the tab-separated, header-less result file ---
print("A.Schueller method")
AS_csv = '/home/carlos/Dropbox/chembl23_GS3_v2.mphase_gt_0.fp2.fix.txt.co.out'
AS_df = pd.read_csv(AS_csv,sep='\t',header=None, names=['Fold','QL','HT','Tc','HL','QT','TP'])
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
# Rows with Tc == -99 are excluded from this summary
# (NOTE(review): -99 looks like a "no score" sentinel -- confirm).
AS_df[AS_df['Tc'] > -99].info()
print(AS_df.nunique())
print(AS_df.describe())


# --- C. Vigil method: same layout, but with 'Path' and 'Steps' columns ---
print("\nC.Vigil method")
CV_csv = '/home/carlos/lppnet_AS.out'
CV_df = pd.read_csv(CV_csv,sep='\t',header=None, names=['Fold','QL','HT','Tc','Path','Steps','TP'])
CV_df['Tc'] = pd.to_numeric(CV_df['Tc'])
# Summarise only the rows whose 'Steps' entry is a number. The previous test,
# CV_df['Steps'] != 'Null', compared against a string: when pandas parses the
# column as integers that comparison cannot match anything (and older
# pandas/numpy versions emit a comparison warning). Coercing to numeric handles
# both cases: a literal 'Null' marker becomes NaN and is filtered out, while an
# all-integer column passes through unchanged. Genuinely missing values are
# now filtered out as well.
CV_has_steps = pd.to_numeric(CV_df['Steps'], errors='coerce').notna()
CV_df[CV_has_steps].info()
print(CV_df.nunique())
print(CV_df.describe())

print(CV_df['Steps'].unique().tolist())




A.Schueller method
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1076887 entries, 0 to 1105103
Data columns (total 7 columns):
Fold    1076887 non-null int64
QL      1076887 non-null object
HT      1076887 non-null object
Tc      1076887 non-null float64
HL      1076887 non-null int64
QT      1076887 non-null object
TP      1076887 non-null int64
dtypes: float64(1), int64(3), object(3)
memory usage: 65.7+ MB
None
Fold       10
QL       1232
HT        897
Tc      15263
HL       1230
QT       1230
TP          2
dtype: int64
               Fold            Tc            HL            TP
count  1.105104e+06  1.105104e+06  1.105104e+06  1.105104e+06
mean   4.493506e+00 -2.284694e+00  6.849806e+02  8.723161e-03
std    2.874535e+00  1.565583e+01  3.545904e+02  9.298965e-02
min    0.000000e+00 -9.900000e+01 -1.000000e+00  0.000000e+00
25%    2.000000e+00  1.702130e-01  3.960000e+02  0.000000e+00
50%    4.000000e+00  2.410260e-01  7.280000e+02  0.000000e+00
75%    7.000000e+00  3.085110e-01 

  result = method(y)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1105104 entries, 0 to 1105103
Data columns (total 7 columns):
Fold     1105104 non-null int64
QL       1105104 non-null object
HT       1105104 non-null object
Tc       1105104 non-null float64
Path     1105104 non-null object
Steps    1105104 non-null int64
TP       1105104 non-null bool
dtypes: bool(1), float64(1), int64(2), object(3)
memory usage: 60.1+ MB
None
Fold          10
QL          1232
HT           897
Tc         15259
Path     1105104
Steps          1
TP             2
dtype: int64
               Fold            Tc      Steps
count  1.105104e+06  1.105104e+06  1105104.0
mean   4.493506e+00 -2.317581e+00        2.0
std    2.874535e+00  1.575379e+01        0.0
min    0.000000e+00 -9.900000e+01        2.0
25%    2.000000e+00  1.698110e-01        2.0
50%    4.000000e+00  2.409640e-01        2.0
75%    7.000000e+00  3.087250e-01        2.0
max    9.000000e+00  1.000000e+00        2.0
[2]


In [3]:
def ROC(y_pred, y_scores, title, filename, color, string):
    """Add one ROC curve, plus the chance diagonal, to the current matplotlib axes.

    Parameters
    ----------
    y_pred : array-like
        First argument handed to ``roc_curve``; label 1 is treated as positive.
        NOTE(review): despite the name this appears to carry the ground-truth
        labels rather than predictions -- confirm against the callers.
    y_scores : array-like
        Scores handed to ``roc_curve`` as its second argument.
    title, filename
        Accepted for signature compatibility; not used by this function.
    color
        Matplotlib colour of the curve.
    string : str
        Legend prefix; the area under the curve is appended as
        ``' (Area = x.xx)'``.

    Returns
    -------
    None
        The function only draws; it does not create, show or save a figure.
    """
    false_pos_rate, true_pos_rate, _ = sklearn.metrics.roc_curve(y_pred, y_scores, pos_label=1)
    area_under_curve = sklearn.metrics.auc(false_pos_rate, true_pos_rate)
    curve_label = string + ' (Area = %0.2f)' % (area_under_curve)

    plt.plot(false_pos_rate, true_pos_rate, color=color, lw=2, label=curve_label)
    # Dashed diagonal from (0, 0) to (1, 1) as the chance-level reference;
    # it carries no label, so it does not appear in the legend.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0.0, 1.0])
    # Upper y-limit is 1.05, slightly above the maximum rate of 1.0.
    plt.ylim([0.0, 1.05])


def PR(y_pred, y_scores, title, filename, color, string):
    """Add one precision-recall step curve to the current matplotlib axes.

    Parameters
    ----------
    y_pred : array-like
        First argument handed to ``precision_recall_curve`` and
        ``average_precision_score``.
        NOTE(review): despite the name this appears to carry the ground-truth
        labels rather than predictions -- confirm against the callers.
    y_scores : array-like
        Scores handed to both scikit-learn functions as second argument.
    title, filename
        Accepted for signature compatibility; not used by this function.
    color
        Matplotlib colour of the curve.
    string : str
        Legend prefix; the average precision is appended as
        ``' (AVG. Precision = x.xx)'``.

    Returns
    -------
    None
        The function only draws; it does not create, show or save a figure.
    """
    precision_vals, recall_vals, _thresholds = precision_recall_curve(y_pred, y_scores)
    avg_precision = average_precision_score(y_pred, y_scores)
    curve_label = string + ' (AVG. Precision = %0.2f)' % (avg_precision)

    # Drawn as a step function (where='post') rather than a straight-line plot.
    plt.step(recall_vals, precision_vals, color=color, where='post', label=curve_label)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    


In [4]:
# Only rows with Tc above -99 are plotted. Each filtered frame is built once
# here instead of re-evaluating the same boolean mask for every argument of
# every call (eight passes over the full frames in the previous version).
# NOTE(review): -99 looks like a "no score" sentinel -- confirm.
AS_scored = AS_df[AS_df['Tc'] > -99]
CV_scored = CV_df[CV_df['Tc'] > -99]

# ROC curves of both methods overlaid on one figure; the figure is written to
# disk and then closed rather than shown inline.
plt.figure()
ROC(AS_scored['TP'], AS_scored['Tc'], 'AS_ROC', 'AS_ROC','orange','A.Schueller')
ROC(CV_scored['TP'], CV_scored['Tc'], 'CV_ROC', 'CV_ROC','purple','Redes')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right", prop={'size': 10})
plt.savefig('ROC.png', dpi=300)
plt.close('all')

# Precision-recall curves of both methods, handled the same way.
plt.figure()
PR(AS_scored['TP'], AS_scored['Tc'], 'AS_PR', 'AS_PR','orange','A.Schueller')
PR(CV_scored['TP'], CV_scored['Tc'], 'CV_PR', 'CV_PR','purple','Redes')
plt.title('Precision-Recall curve')
plt.legend(loc="upper right", prop={'size': 10})
plt.savefig('PR.png', dpi=300)
plt.close('all')

In [5]:
# Build a side-by-side table of the Tc values produced by the two methods.
#
# Each frame is cut down to (HT, QL, Tc), sorted by (HT, QL) in descending
# order, given method-specific column names and re-indexed 0..n-1, so that the
# two frames can be joined purely by row position.
# NOTE(review): the positional join only lines up if both files contain exactly
# the same (HT, QL) pairs, each exactly once -- verify before trusting the CSV.
#
# This cell rebinds AS_df and CV_df to the reduced frames; after running it,
# the loading cell must be re-run before this cell (or the plotting cell,
# which needs the 'TP' column) can be executed again.
AS_df = AS_df[['HT','QL','Tc']].sort_values(by=['HT','QL'],ascending=False)
# Only the columns are renamed; the index is replaced on the next line anyway,
# so converting its labels to strings first was redundant.
AS_df = AS_df.rename(columns={"HT": "HT_AS", "QL": "QL_AS", "Tc":"Tc_AS"})
AS_df = AS_df.reset_index(drop=True)

CV_df = CV_df[['HT','QL','Tc']].sort_values(by=['HT','QL'],ascending=False)
CV_df = CV_df.rename(columns={"HT": "HT_CV", "QL": "QL_CV", "Tc":"Tc_CV"})
CV_df = CV_df.reset_index(drop=True)

print(AS_df)
print(CV_df)

# pd.concat() no longer accepts join_axes (deprecated in pandas 0.25, removed
# in 1.0). Concatenating and then reindexing on AS_df's index is the documented
# replacement and yields the same rows.
result = pd.concat([AS_df, CV_df], axis=1).reindex(AS_df.index)
print(result)
result.to_csv('/home/carlos/ComparaciónMetodos.csv')

                 HT_AS         QL_AS     Tc_AS
0           CHEMBL6191     CHEMBL998  0.336100
1           CHEMBL6191     CHEMBL997  0.147368
2           CHEMBL6191    CHEMBL9967  0.316667
3           CHEMBL6191     CHEMBL990  0.273585
4           CHEMBL6191     CHEMBL989  0.224490
5           CHEMBL6191     CHEMBL986  0.065360
6           CHEMBL6191     CHEMBL982  0.271889
7           CHEMBL6191      CHEMBL98  0.303448
8           CHEMBL6191     CHEMBL978  0.169231
9           CHEMBL6191     CHEMBL973  0.298701
10          CHEMBL6191     CHEMBL964  0.086667
11          CHEMBL6191     CHEMBL960  0.287582
12          CHEMBL6191      CHEMBL96  0.107692
13          CHEMBL6191     CHEMBL959  0.101695
14          CHEMBL6191     CHEMBL957  0.340249
15          CHEMBL6191     CHEMBL956  0.189542
16          CHEMBL6191     CHEMBL954  0.247253
17          CHEMBL6191     CHEMBL953  0.295566
18          CHEMBL6191     CHEMBL945  0.271186
19          CHEMBL6191   CHEMBL94454  0.336493
20          C