In [1]:
import os
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from adad.utils import create_dir

In [2]:
# Project root: two directory levels above the current working directory.
PATH_ROOT = Path.cwd().parent.parent
print(PATH_ROOT)

/home/lukec/workspace/applicabilityDomain


## Objective

- Create ROC plots for each dataset and each model.

In [3]:
# Every (full name, display alias) pair is declared together so that the
# name list and the alias list derived below always stay in the same order.
_CLF_PAIRS = [
    ('KNeighborsClassifier', 'KNN'),
    ('NNClassifier', 'NN'),
    ('RandomForestClassifier', 'RF'),
    ('SVC', 'SVM'),
]
_AD_PAIRS = [
    ('DAIndexDelta', 'DM-δ'),
    ('DAIndexGamma', 'DM-γ'),
    ('DAIndexKappa', 'DM-κ'),
    ('PCABoundingBox', 'BB'),
    ('ProbabilityClassifier', 'Prob.'),
    ('Magnet', 'Magnet'),
    ('SklearnFeatureSqueezing', 'FS'),
    ('SklearnRegionBasedClassifier', 'RC'),
]

# Classifier names and their short aliases (same order).
CLASSIFIERS = [full_name for full_name, short_name in _CLF_PAIRS]
CLF_alias = [short_name for full_name, short_name in _CLF_PAIRS]

# AD method names and their short aliases (same order).
AD = [full_name for full_name, short_name in _AD_PAIRS]
AD_alias = [short_name for full_name, short_name in _AD_PAIRS]

# Dataset names.
DATASETS = [
    'Ames',
    'BBBP',
    'Cancer',
    'CYP1A2',
    'hERG',
    'HIV',
    'Liver',
]

In [9]:
def get_data_path(model, ad, dataset, root=None, verbose=True):
    """Return the full path of the ROC results CSV for one model/AD/dataset.

    The path has the form ``<root>/results/<model>_<ad>/<dataset>_roc.csv``.
    The file itself is not opened or checked for existence.

    Parameters
    ----------
    model : str
        Classifier name; first half of the results sub-folder name.
    ad : str
        AD method name; second half of the results sub-folder name.
    dataset : str
        Dataset name; prefix of the CSV file name.
    root : str or os.PathLike, optional
        Directory that contains the ``results`` folder. Defaults to the
        notebook-level ``PATH_ROOT``.
    verbose : bool, optional
        If True (default), print the path before returning it. Pass False
        when calling in a loop to keep the cell output short.

    Returns
    -------
    str
        The joined path.
    """
    if root is None:
        root = PATH_ROOT
    # Build the path exactly once so the printed and returned values can
    # never drift apart.
    path_file = os.path.join(root, 'results', f'{model}_{ad}', f'{dataset}_roc.csv')
    if verbose:
        print(path_file)
    return path_file

# Smoke test: load the ROC table of the first classifier / AD method / dataset
# and preview its first rows.
roc_csv = get_data_path(CLASSIFIERS[0], AD[0], DATASETS[0])
df = pd.read_csv(roc_csv)
df.head()

/home/lukec/workspace/applicabilityDomain/results/KNeighborsClassifier_DAIndexDelta/Ames_roc.csv


Unnamed: 0,cv1_fpr,cv1_tpr,cv2_fpr,cv2_tpr,cv3_fpr,cv3_tpr,cv4_fpr,cv4_tpr,cv5_fpr,cv5_tpr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.00099,0.0,0.0,0.003236,0.0,0.003636,0.000999,0.0,0.0,0.003559
2,0.00099,0.003425,0.002016,0.003236,0.001949,0.003636,0.000999,0.003333,0.0,0.010676
3,0.00495,0.003425,0.002016,0.009709,0.001949,0.010909,0.002997,0.003333,0.00098,0.010676
4,0.00495,0.006849,0.004032,0.009709,0.002924,0.010909,0.002997,0.006667,0.00098,0.014235


In [17]:
def get_mean_tpr(clf, ad, dataset, n_folds=5, n_points=100):
    """Get the mean TPR over the CV folds for 1 dataset, 1 classifier and 1 AD method.

    The ROC curve of every fold is linearly interpolated onto a shared,
    evenly spaced FPR grid ``np.linspace(0, 1, n_points)`` and the
    interpolated curves are averaged point-wise.

    Parameters
    ----------
    clf : str
        Classifier name, forwarded to ``get_data_path``.
    ad : str
        AD method name, forwarded to ``get_data_path``.
    dataset : str
        Dataset name, forwarded to ``get_data_path``.
    n_folds : int, optional
        Number of CV folds stored in the CSV. Columns ``cv1_fpr``/``cv1_tpr``
        up to ``cv<n_folds>_fpr``/``cv<n_folds>_tpr`` are read. Default 5.
    n_points : int, optional
        Number of points of the common FPR grid. Default 100.

    Returns
    -------
    pd.Series
        Mean TPR at each grid point (length ``n_points``). The first value is
        0 and the last value is forced to 1.0.

    Raises
    ------
    ValueError
        If ``n_folds`` < 1 or ``n_points`` < 2.
    """
    if n_folds < 1:
        raise ValueError(f'n_folds must be >= 1, got {n_folds}')
    if n_points < 2:
        raise ValueError(f'n_points must be >= 2, got {n_points}')

    mean_fpr = np.linspace(0, 1, n_points)
    df = pd.read_csv(get_data_path(clf, ad, dataset))
    tpr = []
    # Fold columns are 1-based: cv1_* ... cv<n_folds>_*.
    for i in range(1, n_folds + 1):
        # NaNs are dropped per column; presumably shorter folds are padded
        # with NaN in the CSV - TODO confirm against the code writing it.
        fold_fpr = df[f'cv{i}_fpr'].dropna().to_numpy()
        fold_tpr = df[f'cv{i}_tpr'].dropna().to_numpy()
        interp_tpr = np.interp(mean_fpr, fold_fpr, fold_tpr)
        # Pin every interpolated curve to start at TPR = 0.
        interp_tpr[0] = 0.
        tpr.append(interp_tpr)

    mean_tpr = np.mean(tpr, axis=0)
    # Pin the averaged curve to end at TPR = 1.
    mean_tpr[-1] = 1.0
    return pd.Series(mean_tpr)

# Smoke test: mean TPR curve for the first classifier / AD method / dataset.
first_clf, first_ad, first_dataset = CLASSIFIERS[0], AD[0], DATASETS[0]
res = get_mean_tpr(first_clf, first_ad, first_dataset)
res

/home/lukec/workspace/applicabilityDomain/results/KNeighborsClassifier_DAIndexDelta/Ames_roc.csv


0     0.000000
1     0.019244
2     0.035791
3     0.051066
4     0.075846
        ...   
95    0.980451
96    0.984066
97    0.987680
98    0.991295
99    1.000000
Length: 100, dtype: float64

In [19]:
# Sanity check: print every AD method next to its display alias, with its index.
# AD and AD_alias are paired by position via zip().
# NOTE(review): the previous comment described building a DataFrame with all AD
# methods for 1 dataset and 1 classifier; this cell only prints the pairs.
for i, (ad, ad_lbl) in enumerate(zip(AD, AD_alias)):
    print(i, ad, ad_lbl)

0 DAIndexDelta DM-δ
1 DAIndexGamma DM-γ
2 DAIndexKappa DM-κ
3 PCABoundingBox BB
4 ProbabilityClassifier Prob.
5 Magnet Magnet
6 SklearnFeatureSqueezing FS
7 SklearnRegionBasedClassifier RC
