In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local adad package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path

# External
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import RocCurveDisplay, auc, accuracy_score, auc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle

# Local
from adad.probability import ProbabilityClassifier
from adad.distance import DAIndexGamma, DAIndexKappa, DAIndexDelta
from adad.evaluate import (cumulative_accuracy, permutation_auc,
                           predictiveness_curves, roc_ad,
                           sensitivity_specificity)
from adad.utils import category2code, drop_redundant_col, maccs2binary, to_json, open_json, create_dir

## Functions

In [3]:
def trainCLF(dataset, clf, name, path=None, scale=True, binary=False):
    """Fit ``clf`` on a random 80/20 split of ``dataset``, report accuracy and persist it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The last column is taken as the label; all preceding columns are features.
    clf : estimator
        Object exposing ``fit``, ``predict`` and ``get_params``. It is fitted in place.
    name : str
        Dataset label used in the printed report and in the output file names.
    path : str or path-like, optional
        Root output directory. Defaults to the current working directory.
    scale : bool, default True
        If true, features are standardised with a ``StandardScaler`` fitted on
        the training part of the split only.
    binary : bool, default False
        If true, the features are passed through ``maccs2binary`` first.

    Returns
    -------
    Whatever ``open_json`` returns for the JSON file written by this call
    (presumably the parsed parameter dictionary -- confirm against adad.utils).

    Side effects
    ------------
    Writes ``<ClassifierName>_<name>.pickle`` (the fitted classifier) and
    ``<ClassifierName>_<name>.json`` (its parameters plus ``split_seed``) into
    ``<path>/<ClassifierName>_jsons``.
    """
    out_root = os.getcwd() if path is None else path

    features = dataset.iloc[:, :-1].values
    labels = dataset.iloc[:, -1].values
    if binary:
        features = maccs2binary(features)

    # The split seed is drawn at random on every call; it is printed and stored
    # in the JSON file so the split can be recreated afterwards.
    seed = np.random.randint(0, 999999)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=seed
    )
    if scale:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # Fit, then score both parts of the split
    clf.fit(X_train, y_train)
    model_name = type(clf).__name__
    train_acc = accuracy_score(y_train, clf.predict(X_train))
    test_acc = accuracy_score(y_test, clf.predict(X_test))

    print(f"Accuracy evaluation of {model_name} for {name} dataset:")
    print("===============================================================")
    print(f"Accuracy of training split: {train_acc:.3f}")
    print(f"Accuracy of testing split : {test_acc:.3f}")
    print(f"Seed of split: {seed}\n")

    # Both artefacts share the same directory and file stem
    out_dir = os.path.join(out_root, model_name + "_jsons")
    create_dir(out_dir)
    stem = os.path.join(out_dir, f"{model_name}_{name}")

    with open(stem + ".pickle", 'wb') as handle:
        pickle.dump(clf, handle)

    params = clf.get_params()
    params['split_seed'] = seed
    json_path = stem + ".json"
    to_json(params, json_path)

    return open_json(json_path)

In [4]:
def _evaluation_column(values, label):
    """Round ``values`` to 6 decimals and wrap them in a one-column DataFrame named ``label``."""
    # atleast_1d lets scalar metrics and array-valued curves share one code path.
    return pd.DataFrame(np.around(np.atleast_1d(values), 6), columns=[label])


def runAD(dataset, train_split, test_split, clf, ad, name, path=None, scale=True, binary=False):
    """Evaluate an applicability-domain (AD) measure over pre-computed CV splits.

    For every column of ``train_split`` the classifier and the AD are refitted
    on the rows selected by that column; the column of the same name in
    ``test_split`` selects the held-out rows on which the AD measure and the
    ``adad.evaluate`` helpers are computed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a label column named ``'y'``; all other columns are features.
    train_split, test_split : pandas.DataFrame
        One column per split, holding positional row indices into ``dataset``.
        Missing entries (padding) are dropped before use.
    clf : estimator
        Object exposing ``fit`` and ``predict``. It is refitted in place per split.
    ad : AD object
        Object exposing ``fit`` and ``measure``. It is refitted in place per split.
    name : str
        Dataset label used in the printed report and in the output file names.
    path : str or path-like, optional
        Root output directory. Defaults to the current working directory.
    scale : bool, default True
        If true, features are standardised with a ``StandardScaler`` fitted on
        each training split.
    binary : bool, default False
        If true, the features are passed through ``maccs2binary`` first.

    Returns
    -------
    list of pandas.DataFrame
        One evaluation table per split, in the column order of ``train_split``.

    Raises
    ------
    AssertionError
        If a split's train and test indices do not add up to the dataset size,
        or if they share any index.

    Side effects
    ------------
    Under ``<path>/<ad_name>/<ClassifierName>`` writes
    ``{name}_{col}_evaluation.csv`` for each split and ``{name}_scores.csv``
    with the rounded AD measures (one column per split).
    """
    scaler = StandardScaler()
    if path is None:
        path = os.getcwd()

    # Labels and features
    y = dataset['y'].to_numpy().astype(int)
    X = dataset.drop(['y'], axis=1).to_numpy()
    if binary:
        X = maccs2binary(X)

    # Distance-based ADs are stored per distance metric
    if isinstance(ad, (DAIndexGamma, DAIndexDelta, DAIndexKappa)):
        ad_name = type(ad).__name__ + "_" + ad.dist_metric
    else:
        ad_name = type(ad).__name__

    ad_path = os.path.join(path, ad_name)
    clf_path = os.path.join(ad_path, type(clf).__name__)
    create_dir(ad_path)
    create_dir(clf_path)

    measure_columns = []  # one single-column frame per split, joined once after the loop
    evaluations = []

    for col in train_split:
        train_idx = train_split[col].dropna(axis=0).to_numpy().astype(int)
        test_idx = test_split[col].dropna(axis=0).to_numpy().astype(int)

        # Together the two index sets must partition the dataset: sizes add up
        # and NO index is shared (np.any, so partial leakage is caught as well).
        assert len(test_idx) + len(train_idx) == len(y), \
            f"{col}: train/test sizes do not add up to the dataset size"
        assert not np.any(np.isin(train_idx, test_idx)), \
            f"{col}: train and test indices overlap"

        X_train, X_test, y_train, y_test = X[train_idx], X[test_idx], y[train_idx], y[test_idx]
        if scale:
            # The scaler is refitted on each training split only
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        # Train AD and classifier
        clf.fit(X_train, y_train)
        ad.fit(X_train)

        # AD measure on the held-out rows
        dist_measure = ad.measure(X_test)
        measure_columns.append(
            pd.DataFrame(np.around(dist_measure, decimals=6), columns=[col])
        )

        # Evaluation metrics
        y_pred = clf.predict(X_test)

        sensitivity, specificity = sensitivity_specificity(y_test, y_pred)
        cumulative_acc, cumulative_rate = cumulative_accuracy(y_test, y_pred, dist_measure)
        fpr, tpr = roc_ad(y_test, y_pred, dist_measure)
        auc_signi, auc_perm = permutation_auc(y_test, y_pred, dist_measure)
        percentile, err_rate = predictiveness_curves(y_test, y_pred, dist_measure, n_quantiles=75)
        auc_roc = auc(fpr, tpr)

        # Compare AUCs
        print(f"permutationAUC vs auc(roc_ad) of {name} using {ad_name} for {col}:")
        print("==================================================================")
        print(f"         {auc_signi:.3f} vs {auc_roc:.3f}\n")

        # Columns have different lengths; concat along axis=1 pads the shorter ones.
        evaluation = pd.concat(
            [
                _evaluation_column(sensitivity, 'sensitivity'),
                _evaluation_column(specificity, 'specificity'),
                _evaluation_column(cumulative_acc, 'cumulative_acc'),
                _evaluation_column(cumulative_rate, 'cumulative_rate'),
                _evaluation_column(fpr, 'roc_ad_fpr'),
                _evaluation_column(tpr, 'roc_ad_tpr'),
                _evaluation_column(auc_roc, 'auc(roc_ad)'),
                _evaluation_column(auc_signi, 'permutation_auc'),
                _evaluation_column(auc_perm, 'permutation_list'),
                _evaluation_column(percentile, 'pred_curve_percent'),
                _evaluation_column(err_rate, 'pred_curve_err'),
            ],
            axis=1,
        )
        evaluation.to_csv(os.path.join(clf_path, f'{name}_{col}_evaluation.csv'), index=False)
        evaluations.append(evaluation)

    # pd.concat rejects an empty list, so fall back to an empty frame when there are no splits.
    measure = pd.concat(measure_columns, axis=1) if measure_columns else pd.DataFrame()
    measure.to_csv(os.path.join(clf_path, f'{name}_scores.csv'), index=False)

    return evaluations

## Set Seed

In [5]:
# Draw a fresh seed for this run and echo it, so the value is recorded in the cell output.
SEED = np.random.randint(low=0, high=999999)
print('The seed is {}'.format(SEED))

The seed is 30442


## Set Parameters

In [6]:
# Project root: the parent of the directory the notebook is run from.
PATH_ROOT = Path(os.getcwd()).absolute().parent

# os.listdir() returns entries in arbitrary, platform-dependent order, but the
# driver loop pairs these lists with names_list by position. Sorting makes the
# pairing deterministic. The sort is case-insensitive because names_list is in
# case-insensitive alphabetical order.
# NOTE(review): assumes each file name starts with its dataset name, and that
# for every dataset the CV test file sorts before the CV train file -- confirm
# against the contents of data/maccs and data/cv.
files_path = os.path.join(PATH_ROOT, 'data', 'maccs')
dataset_files = [
    os.path.join(files_path, file)
    for file in sorted(os.listdir(files_path), key=str.lower)
    if 'csv' in file
]

cv_path = os.path.join(PATH_ROOT, 'data', 'cv')
cv_files = [
    os.path.join(cv_path, file)
    for file in sorted(os.listdir(cv_path), key=str.lower)
]
save_path = os.path.join(PATH_ROOT, 'test_results')

# Model hyper-parameters
TREE_ESTIMATORS = 100
C = 100

# Dataset labels, in the order the sorted file lists are expected to follow.
names_list = ['Ames', 'BBBP', 'Cancer', 'CYP1A2', 'FXa', 'hERG', 'HIV', 'Liver']

## Prepare Classifiers

In [7]:
# Random forest classifier with TREE_ESTIMATORS trees, seeded with the SEED printed above.
# This single instance is passed to every trainCLF/runAD call in the loop below.
rfc = RandomForestClassifier(n_estimators=TREE_ESTIMATORS, random_state=SEED)

## Prepare AD

In [8]:
# Applicability-domain object, constructed with the same rfc instance and dist_metric='jaccard'.
ad = DAIndexGamma(clf=rfc, dist_metric='jaccard')

In [9]:
# Non-distinct MACCS: for each dataset, train and save the classifier, then run the AD evaluation.
# cv_files is read in pairs: position 2*i holds the test split, position 2*i + 1 the train split.
for i, name in enumerate(names_list):
    filename = dataset_files[i]
    cv_test = cv_files[2 * i]
    cv_train = cv_files[2 * i + 1]

    dataset = pd.read_csv(filename)
    # Nullable integer dtype: split columns may be padded with missing values
    idx_train = pd.read_csv(cv_train, dtype="Int64")
    idx_test = pd.read_csv(cv_test, dtype="Int64")

    trainCLF(dataset, rfc, name, path=save_path, scale=False, binary=True)

    runAD(dataset, idx_train, idx_test, rfc, ad, name,
          path=save_path, scale=False, binary=True)

Accuracy evaluation of RandomForestClassifier for Ames dataset:
Accuracy of training split: 0.976
Accuracy of testing split : 0.810
Seed of split: 763984

permutationAUC vs auc(roc_ad) of Ames using DAIndexGamma_jaccard for cv1:
         0.632 vs 0.579

permutationAUC vs auc(roc_ad) of Ames using DAIndexGamma_jaccard for cv2:
         0.650 vs 0.577

permutationAUC vs auc(roc_ad) of Ames using DAIndexGamma_jaccard for cv3:
         0.660 vs 0.598

permutationAUC vs auc(roc_ad) of Ames using DAIndexGamma_jaccard for cv4:
         0.636 vs 0.518

permutationAUC vs auc(roc_ad) of Ames using DAIndexGamma_jaccard for cv5:
         0.637 vs 0.564

Accuracy evaluation of RandomForestClassifier for BBBP dataset:
Accuracy of training split: 0.996
Accuracy of testing split : 0.890
Seed of split: 860354

permutationAUC vs auc(roc_ad) of BBBP using DAIndexGamma_jaccard for cv1:
         0.480 vs 0.623

permutationAUC vs auc(roc_ad) of BBBP using DAIndexGamma_jaccard for cv2:
         0.414 vs 0.51