In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules (e.g. the `adad` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from pathlib import Path

# External
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Local
from adad.distance import DAIndexDelta, DAIndexGamma, DAIndexKappa
from adad.evaluate import (cumulative_accuracy, permutation_auc,
                           predictiveness_curves, roc_ad,
                           sensitivity_specificity)
from adad.utils import create_dir, maccs2binary, open_json, to_json


## Functions

In [3]:
def trainCLF(dataset, clf, name, path=None, scale=False, binary=False, seed=None):
    """Fit `clf` on a random 80/20 split of `dataset`, report accuracy and persist it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The last column is used as the label; all preceding columns are features.
    clf : estimator
        Object exposing `fit`, `predict` and `get_params`. It is fitted in place.
    name : str
        Dataset name, used in the printed report and in the output file names.
    path : str or path-like, optional
        Root output directory. Defaults to the current working directory
        (previously `None` was passed straight to `os.path.join` and raised
        `TypeError`).
    scale : bool, default False
        If True, standardise features with a `StandardScaler` fitted on the
        training split only. NOTE: the scaler itself is not saved.
    binary : bool, default False
        If True, features are first passed through `maccs2binary`.
    seed : int, optional
        Seed for the train/test split. When omitted a random seed is drawn
        (the original behaviour). The seed actually used is printed and stored.

    Returns
    -------
    The classifier parameters (plus `split_seed`) as read back from the JSON
    file via `open_json`.

    Side effects
    ------------
    Writes `<path>/<ClfName>_jsons/<ClfName>_<name>.pickle` and the matching
    `.json` file.
    """
    if path is None:
        path = os.getcwd()

    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    if binary:
        X = maccs2binary(X)

    if seed is None:
        seed = np.random.randint(0, 999999)
    # Plain int keeps the value JSON-serialisable even if a numpy integer is passed.
    seed = int(seed)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    if scale:
        # Fit on the training split only so no test statistics leak into training.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # Train classifier
    clf.fit(X_train, y_train)
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)

    clf_type = type(clf).__name__
    print(f"Accuracy evaluation of {clf_type} for {name} dataset:")
    print("===============================================================")
    print(f"Accuracy of training split: {accuracy_score(y_train, train_pred):.3f}")
    print(f"Accuracy of testing split : {accuracy_score(y_test, test_pred):.3f}")
    print(f"Seed of split: {seed}\n")

    clf_path = os.path.join(path, clf_type + "_jsons")
    create_dir(clf_path)

    # Save classifier
    filename = f"{clf_type}_{name}"
    with open(os.path.join(clf_path, filename + ".pickle"), 'wb') as file:
        pickle.dump(clf, file)

    # Save the hyper-parameters together with the split seed, then read them back
    json_path = os.path.join(clf_path, filename + ".json")
    parameter_json = clf.get_params()
    parameter_json['split_seed'] = seed
    to_json(parameter_json, json_path)

    return open_json(json_path)

In [4]:
def runAD(dataset, train_split, test_split, clf, ad, name, path, scale=False, binary=False):
    """Evaluate an applicability-domain (AD) method over pre-defined CV folds.

    For every column (fold) in `train_split`, the classifier and the AD object
    are fitted on the training rows, the AD measure is computed on the test
    rows, and several evaluation curves/metrics are computed and written to CSV.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a label column named 'y'; every other column is a feature.
    train_split, test_split : pandas.DataFrame
        One column per fold holding row indices into `dataset`. Missing values
        are dropped, so folds may have different lengths. Both frames are
        indexed with the same column names.
    clf : estimator
        Object exposing `fit` and `predict`; refitted in place for each fold.
    ad : object
        Exposes `fit(X)` and `measure(X)`. For the DAIndex* classes its
        `dist_metric` attribute is appended to the output directory name.
    name : str
        Dataset name used in printed messages and output file names.
    path : str or path-like
        Root output directory.
    scale : bool, default False
        If True, standardise features per fold (fitted on the training rows).
    binary : bool, default False
        If True, features are first passed through `maccs2binary`.

    Returns
    -------
    list of pandas.DataFrame
        One evaluation frame per fold, in fold order.

    Side effects
    ------------
    Writes `<path>/<ad_name>/<ClfName>/<name>_<fold>_evaluation.csv` per fold
    and `<name>_scores.csv` with the AD measures of all folds.
    """
    # Get X and y
    y = dataset['y'].to_numpy().astype(int)
    X = dataset.drop(['y'], axis=1).to_numpy()

    # Convert if binary maccs are required
    if binary:
        X = maccs2binary(X)

    # Create directory to save evaluation and scores
    if isinstance(ad, (DAIndexGamma, DAIndexDelta, DAIndexKappa)):
        ad_name = type(ad).__name__ + "_" + ad.dist_metric
    else:
        ad_name = type(ad).__name__

    ad_path = os.path.join(path, ad_name)
    clf_path = os.path.join(ad_path, type(clf).__name__)
    create_dir(ad_path)
    create_dir(clf_path)

    # Per-fold results are collected in lists and combined once at the end
    # instead of growing a DataFrame inside the loop.
    measures = []
    evaluations = []

    # Test ad for all cv
    for col in train_split:
        # Find indexes
        train_idx = train_split[col].dropna(axis=0).to_numpy().astype(int)
        test_idx = test_split[col].dropna(axis=0).to_numpy().astype(int)

        assert len(test_idx) + len(train_idx) == len(y), \
            f"{col}: train + test sizes do not add up to the dataset size"

        # No training sample may appear in the test fold (data leakage). The
        # previous `np.all` check only failed when *every* index overlapped.
        overlap = np.isin(train_idx, test_idx)
        assert not overlap.any(), \
            f"{col}: {int(overlap.sum())} training indices also appear in the test fold"

        X_train, X_test, y_train, y_test = X[train_idx], X[test_idx], y[train_idx], y[test_idx]

        # Scale train and test datasets (statistics come from the training rows only)
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        # Train AD and classifier
        clf.fit(X_train, y_train)
        ad.fit(X_train)

        # Gather scores; they are saved to csv after the loop
        dist_measure = ad.measure(X_test)
        measures.append(pd.DataFrame(np.around(dist_measure, decimals=6), columns=[col]))

        # Start gathering data from evaluation functions
        y_pred = clf.predict(X_test)

        sensitivity, specificity = sensitivity_specificity(y_test, y_pred)
        cumulative_acc, cumulative_rate = cumulative_accuracy(y_test, y_pred, dist_measure)
        fpr, tpr = roc_ad(y_test, y_pred, dist_measure)
        auc_signi, auc_perm = permutation_auc(y_test, y_pred, dist_measure)
        percentile, err_rate = predictiveness_curves(y_test, y_pred, dist_measure, n_quantiles=75)
        auc_roc = auc(fpr, tpr)

        # Compare AUCs
        print(f"permutationAUC vs auc(roc_ad) of {name} using {ad_name} for {col}:")
        print("==================================================================")
        print(f"         {auc_signi:.3f} vs {auc_roc:.3f}\n")

        # Save evaluation: scalars become one-row columns, arrays keep their own
        # length; the column-wise concat pads shorter columns with NaN.
        eval_columns = [
            ('sensitivity', [np.round(sensitivity, 6)]),
            ('specificity', [np.round(specificity, 6)]),
            ('cumulative_acc', np.around(cumulative_acc, 6)),
            ('cumulative_rate', np.around(cumulative_rate, 6)),
            ('roc_ad_fpr', np.around(fpr, 6)),
            ('roc_ad_tpr', np.around(tpr, 6)),
            ('auc(roc_ad)', [np.round(auc_roc, 6)]),
            ('permutation_auc', [np.round(auc_signi, 6)]),
            ('permutation_list', np.around(auc_perm, 6)),
            ('pred_curve_percent', np.around(percentile, 6)),
            ('pred_curve_err', np.around(err_rate, 6)),
        ]
        evaluation = pd.concat(
            [pd.DataFrame(values, columns=[label]) for label, values in eval_columns],
            axis=1,
        )
        evaluation.to_csv(os.path.join(clf_path, f'{name}_{col}_evaluation.csv'), index=False)
        evaluations.append(evaluation)

    # pd.concat raises on an empty list, so fall back to an empty frame when no folds exist.
    measure = pd.concat(measures, axis=1) if measures else pd.DataFrame()
    measure.to_csv(os.path.join(clf_path, f'{name}_scores.csv'), index=False)

    return evaluations

## Set Seed

In [5]:
# Draw a fresh seed for this run and echo it so the run can be traced back later.
SEED = np.random.randint(low=0, high=999999)
print('The seed is {}'.format(SEED))

The seed is 173664


## Set Parameters

In [6]:
# The project root is the parent of the directory the notebook is run from.
PATH_ROOT = Path.cwd().parent
print(PATH_ROOT)

# Collect every CSV file under <root>/data/maccs, sorted by path.
path_maccs = os.path.join(PATH_ROOT, 'data', 'maccs')
path_maccs_files = np.sort([
    os.path.join(path_maccs, fname)
    for fname in os.listdir(path_maccs)
    if fname.endswith('.csv')
])
print(path_maccs_files)

/home/lukec/workspace/applicabilityDomain
['/home/lukec/workspace/applicabilityDomain/data/maccs/Ames_maccs.csv'
 '/home/lukec/workspace/applicabilityDomain/data/maccs/BBBP_maccs.csv'
 '/home/lukec/workspace/applicabilityDomain/data/maccs/CYP1A2_maccs.csv'
 '/home/lukec/workspace/applicabilityDomain/data/maccs/Cancer_maccs.csv'
 '/home/lukec/workspace/applicabilityDomain/data/maccs/HIV_maccs.csv'
 '/home/lukec/workspace/applicabilityDomain/data/maccs/Liver_maccs.csv'
 '/home/lukec/workspace/applicabilityDomain/data/maccs/hERG_maccs.csv']


In [7]:
# Collect the CV index files under <root>/data/cv, split by their file-name suffix
# and sorted by path.
path_cv = os.path.join(PATH_ROOT, 'data', 'cv')
path_cv_train = np.sort([
    os.path.join(path_cv, fname)
    for fname in os.listdir(path_cv)
    if fname.endswith('_cv_train.csv')
])
path_cv_test = np.sort([
    os.path.join(path_cv, fname)
    for fname in os.listdir(path_cv)
    if fname.endswith('_cv_test.csv')
])
print('Train:', path_cv_train)
print('Test:', path_cv_test)

Train: ['/home/lukec/workspace/applicabilityDomain/data/cv/Ames_cv_train.csv'
 '/home/lukec/workspace/applicabilityDomain/data/cv/BBBP_cv_train.csv'
 '/home/lukec/workspace/applicabilityDomain/data/cv/CYP1A2_cv_train.csv'
 '/home/lukec/workspace/applicabilityDomain/data/cv/Cancer_cv_train.csv'
 '/home/lukec/workspace/applicabilityDomain/data/cv/HIV_cv_train.csv'
 '/home/lukec/workspace/applicabilityDomain/data/cv/Liver_cv_train.csv'
 '/home/lukec/workspace/applicabilityDomain/data/cv/hERG_cv_train.csv']
Test: ['/home/lukec/workspace/applicabilityDomain/data/cv/Ames_cv_test.csv'
 '/home/lukec/workspace/applicabilityDomain/data/cv/BBBP_cv_test.csv'
 '/home/lukec/workspace/applicabilityDomain/data/cv/CYP1A2_cv_test.csv'
 '/home/lukec/workspace/applicabilityDomain/data/cv/Cancer_cv_test.csv'
 '/home/lukec/workspace/applicabilityDomain/data/cv/HIV_cv_test.csv'
 '/home/lukec/workspace/applicabilityDomain/data/cv/Liver_cv_test.csv'
 '/home/lukec/workspace/applicabilityDomain/data/cv/hERG_cv_t

In [8]:
# Dataset name = file stem up to (not including) the first underscore.
datanames = [Path(maccs_file).stem.partition('_')[0] for maccs_file in path_maccs_files]
print(datanames)

['Ames', 'BBBP', 'CYP1A2', 'Cancer', 'HIV', 'Liver', 'hERG']


In [9]:
# Sanity check: the three sorted file lists are consumed by position later on,
# so they must have one entry per dataset and be aligned with `datanames`.
assert len(path_maccs_files) == len(path_cv_train) == len(path_cv_test) == len(datanames), (
    f"Mismatched file counts: {len(path_maccs_files)} MACCS, "
    f"{len(path_cv_train)} CV-train, {len(path_cv_test)} CV-test "
    f"for {len(datanames)} datasets"
)

for i in range(len(datanames)):
    dataname = datanames[i]
    n_name = len(dataname)
    for _checked_path in (path_maccs_files[i], path_cv_train[i], path_cv_test[i]):
        _basename = os.path.basename(_checked_path)
        # Report the offending file name; slicing the full path only showed
        # the first characters of the directory prefix.
        assert _basename[:n_name] == dataname, \
            f"Expected a file for dataset '{dataname}' at position {i}, got '{_basename}'"

In [10]:
# All classifiers, scores and evaluations are written below <root>/results.
path_outputs = os.path.join(os.fspath(PATH_ROOT), 'results')

## Prepare Classifiers

In [11]:
# Passed as `n_estimators` to every RandomForestClassifier created in this notebook.
N_ESTIMATORS = 200

## Train Classifiers and Run AD Evaluation

In [12]:
# Non-distinct MACCS: for each dataset, train a random forest on a random split,
# then evaluate DAIndexGamma (Jaccard distance) over the stored CV folds.
for i, dataname in enumerate(datanames):
    path_data, path_idx_train, path_idx_test = (
        path_maccs_files[i],
        path_cv_train[i],
        path_cv_test[i],
    )

    # Fold indices are read with a nullable integer dtype so that missing
    # entries do not turn the columns into floats.
    df = pd.read_csv(path_data)
    idx_train = pd.read_csv(path_idx_train, dtype='Int64')
    idx_test = pd.read_csv(path_idx_test, dtype='Int64')

    rfc = RandomForestClassifier(random_state=SEED, n_estimators=N_ESTIMATORS)
    trainCLF(df, rfc, dataname, path_outputs)

    ad = DAIndexGamma(clf=rfc, dist_metric='jaccard')
    runAD(df, idx_train, idx_test, rfc, ad, dataname, path=path_outputs)

Accuracy evaluation of RandomForestClassifier for Ames dataset:
Accuracy of training split: 0.974
Accuracy of testing split : 0.813
Seed of split: 54945

permutationAUC vs auc(roc_ad) of Ames using DAIndexGamma_jaccard for cv1:
         0.640 vs 0.572

permutationAUC vs auc(roc_ad) of Ames using DAIndexGamma_jaccard for cv2:
         0.655 vs 0.548

permutationAUC vs auc(roc_ad) of Ames using DAIndexGamma_jaccard for cv3:
         0.644 vs 0.578

permutationAUC vs auc(roc_ad) of Ames using DAIndexGamma_jaccard for cv4:
         0.627 vs 0.529

permutationAUC vs auc(roc_ad) of Ames using DAIndexGamma_jaccard for cv5:
         0.649 vs 0.574

Accuracy evaluation of RandomForestClassifier for BBBP dataset:
Accuracy of training split: 0.991
Accuracy of testing split : 0.880
Seed of split: 401106

permutationAUC vs auc(roc_ad) of BBBP using DAIndexGamma_jaccard for cv1:
         0.467 vs 0.539

permutationAUC vs auc(roc_ad) of BBBP using DAIndexGamma_jaccard for cv2:
         0.441 vs 0.603