In [None]:
import sys
!python -m pip install ..

In [None]:
import pickle
import pandas as pd
import adad
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, auc, RocCurveDisplay
import os
import adad
from adad.utils import to_json, open_json
from sklearn.ensemble import RandomForestClassifier
from adad.distance import DAIndexGamma
import numpy as np
from adad.evaluate import sensitivity_specificity, cumulative_accuracy, roc_ad, permutation_auc, predictiveness_curves
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

## Functions

In [None]:
def trainCLF(dataset, train_split, clf, name, path=None, scale=True,
             test_size=0.2, random_state=None):
    """Fit ``clf`` on a random hold-out split of ``dataset`` and persist it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Every column except the last is used as a feature; the last column
        is used as the target.
    train_split : any
        Accepted for backward compatibility but never read by this function;
        the split is always drawn with ``train_test_split``.
    clf : estimator
        Object exposing ``fit``, ``predict`` and ``get_params``. It is fitted
        in place.
    name : str
        Label used in the printed report and in the output file names.
    path : str, optional
        Directory for the output files. Defaults to the current working
        directory.
    scale : bool, default True
        When true, a ``StandardScaler`` is fitted on the training portion and
        applied to both portions before training.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; pass a fixed value to make the
        split reproducible.

    Returns
    -------
    The result of ``open_json`` on the parameter file that was just written.

    Side effects
    ------------
    Prints train/test accuracy and writes ``<ClassName>_<name>.pickle`` and
    ``<ClassName>_<name>.json`` into ``path``.

    Notes
    -----
    The scaler fitted here is local to this call: it is neither returned nor
    saved alongside the pickled classifier.
    """
    if path is None:
        path = os.getcwd()

    # Features are all columns but the last; the last column is the label.
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    if scale:
        # Fit on the training portion only, then reuse the same transform
        # for the test portion.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # Train classifier
    clf.fit(X_train, y_train)
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)

    print(f"Accuracy evaluation of {type(clf).__name__} for {name} dataset:")
    print("===============================================================")
    print(f"Accuracy of training split: {accuracy_score(y_train, train_pred):.3f}")
    print(f"Accuracy of testing split : {accuracy_score(y_test, test_pred):.3f}\n")

    # Save classifier
    filename = f"{type(clf).__name__}_{name}"
    with open(os.path.join(path, filename + ".pickle"), 'wb') as file:
        pickle.dump(clf, file)

    # Write the hyper-parameters to JSON, then read them back so the caller
    # receives exactly what ended up on disk.
    json_path = os.path.join(path, filename + ".json")
    to_json(clf.get_params(), json_path)
    return open_json(json_path)

In [None]:
def runAD(dataset, cv_split, clf, ad, name, path=None, scale=True):
    """Evaluate ``ad`` together with ``clf`` on every split listed in ``cv_split``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Every column except the last is used as a feature; the last column
        is used as the target.
    cv_split : pandas.DataFrame
        One column per split. Each column holds the row positions of the
        training samples; NaN entries are dropped. Every row of ``dataset``
        not listed in a column is used as test data for that split. Column
        labels must support ``label + 1`` (they are used to number splits).
    clf : estimator
        Only ``clf.predict`` is called; the classifier is not refitted here.
    ad : object
        Must expose ``fit(X_train)`` and ``measure(X_test)``.
    name : str
        Label used in printed output, figure titles and output file names.
    path : str, optional
        Directory for the output files. Defaults to the current working
        directory.
    scale : bool, default True
        When true, a ``StandardScaler`` is fitted on each split's training
        rows and applied to both the training and test rows.

    Returns
    -------
    pandas.DataFrame
        One row per split with the columns ``sensitivity``, ``specificity``,
        ``cumulative_acc``, ``roc_ad``, ``auc(roc_ad)``, ``permutation_auc``,
        ``permutation_list`` and ``predictiveness_curves``. Empty (but with
        these columns) when ``cv_split`` has no columns.

    Side effects
    ------------
    Prints an AUC comparison per split and writes, into ``path``:
    ``<name>_split<k>_graphs.png`` per split, ``<name>_scores.csv`` and
    ``<name>_evaluation.csv``.
    """
    if path is None:
        path = os.getcwd()

    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values
    # Loop-invariant: every row position in the dataset.
    all_idx = np.arange(0, X.shape[0])

    eval_columns = ['sensitivity', 'specificity', 'cumulative_acc', 'roc_ad',
                    'auc(roc_ad)', 'permutation_auc', 'permutation_list',
                    'predictiveness_curves']
    # Rows/columns are accumulated in plain lists and assembled once after
    # the loop. DataFrame.append no longer exists in pandas >= 2.0, and
    # concatenating inside the loop re-copies the data on every iteration.
    eval_rows = []
    measure_cols = []

    for col in cv_split:
        # Find indexes: the column lists training rows, the rest is test.
        train_idx = cv_split[col].to_numpy()
        train_idx = train_idx[~np.isnan(train_idx)].astype(int)
        test_idx = all_idx[~np.isin(all_idx, train_idx)]

        # Scale train and test datasets
        X_train, X_test, y_train, y_test = X[train_idx], X[test_idx], y[train_idx], y[test_idx]
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        # Train AD
        ad.fit(X_train)

        # Gather scores; they are written to CSV after the loop.
        dist_measure = ad.measure(X_test)
        measure_cols.append(
            pd.DataFrame(np.around(dist_measure, decimals=6), columns=[f"cv_{col+1}"])
        )

        # Start gathering data from evaluation functions
        y_pred = clf.predict(X_test)

        sensitivity, specificity = sensitivity_specificity(y_test, y_pred)
        cumulative_acc, cumulative_rate = cumulative_accuracy(y_test, y_pred, dist_measure)
        fpr, tpr = roc_ad(y_test, y_pred, dist_measure)
        auc_signi, auc_perm = permutation_auc(y_test, y_pred, dist_measure)
        percentile, err_rate = predictiveness_curves(y_test, y_pred, dist_measure)
        auc_roc = auc(fpr, tpr)

        # Compare AUCs
        print(f"permutationAUC vs auc(roc_ad) of {name} for split{col+1}:")
        print(f"         {auc_signi:.3f} vs {auc_roc:.3f}\n")

        # Create graphs to save for each split (2x2 grid, last panel unused).
        fig, axes = plt.subplots(2, 2, figsize=(6, 6))
        axes[0, 0].plot(cumulative_rate, cumulative_acc)
        axes[0, 0].set_title("Cumulative Accuracy")
        roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc_roc)
        roc_display.plot(ax=axes[0, 1])
        axes[0, 1].set_title("AUC ROC")
        axes[1, 0].plot(percentile, err_rate)
        axes[1, 0].set_title("Predictiveness Curves (PC)")
        plt.setp(axes[0, 0], xlabel='Cumulative Rate', ylabel='Cumulative Accuracy (%)')
        plt.setp(axes[1, 0], xlabel='Percentile', ylabel='Error Rate')
        fig.delaxes(axes[1, 1])
        fig.suptitle(f'Graphs of {name} for split{col+1}', fontsize=16)
        fig.tight_layout()
        # The figure is intentionally left open after saving so that notebook
        # backends which render open figures at the end of a cell still show it.
        fig.savefig(os.path.join(path, f"{name}_split{col+1}_graphs.png"), dpi=300)

        # Save evaluation
        eval_rows.append({
            'sensitivity': np.round(sensitivity, 6),
            'specificity': np.round(specificity, 6),
            'cumulative_acc': list((np.around(cumulative_acc, 6), np.around(cumulative_rate, 6))),
            'roc_ad': list((np.around(fpr, 6), np.around(tpr, 6))),
            'auc(roc_ad)': auc_roc,
            'permutation_auc': auc_signi,
            'permutation_list': [auc_perm],
            'predictiveness_curves': list((np.around(percentile, 6), np.around(err_rate, 6))),
        })

    # pd.concat raises on an empty list, so fall back to an empty frame.
    measure = pd.concat(measure_cols, axis=1) if measure_cols else pd.DataFrame()
    evaluation = pd.DataFrame(eval_rows, columns=eval_columns)

    measure.to_csv(os.path.join(path, f'{name}_scores.csv'), index=False)
    evaluation.to_csv(os.path.join(path, f'{name}_evaluation.csv'), index=False)

    return evaluation
        

## Demo

In [None]:
# Resolve the project root as the parent of the current working directory.
path = os.getcwd()
parent_path = os.path.abspath(os.path.join(path, os.pardir))

# os.listdir() returns entries in arbitrary order, so the listing is sorted
# before indexing; otherwise `[1]` can select a different file on another
# machine or filesystem.
files_path = os.path.join(parent_path, 'data', 'maccs')
dataset_files = sorted(os.path.join(files_path, file) for file in os.listdir(files_path))
filename = dataset_files[1]

cv_path = os.path.join(parent_path, 'experiments', 'preprocessing')
cv_files = sorted(os.path.join(cv_path, file) for file in os.listdir(cv_path))
# NOTE(review): this assumes the second entry here belongs to the same
# dataset as `filename` above - confirm against the directory contents.
cv = cv_files[1]

dataset = pd.read_csv(filename)
# NOTE(review): with header=None the columns get integer labels, so the
# 'id' dtype entry presumably matches no column - verify it is intended.
cv_split = pd.read_csv(cv, header=None, dtype={'id': int})

In [None]:
# Seed passed to the classifier below as its random_state.
SEED = 42
# Random forest with 300 estimators, seeded with SEED.
rfc = RandomForestClassifier(n_estimators=300, random_state=SEED)

In [None]:
# Fit `rfc`, write its pickle and parameter JSON to the working directory, and
# keep the parameters that trainCLF reads back from that JSON file.
# cv_split is passed in the `train_split` position, which trainCLF accepts but
# never reads.
# NOTE(review): the name `json` would shadow the stdlib module if it were ever
# imported in this notebook.
json = trainCLF(dataset, cv_split, rfc, 'bbbp')

In [None]:
# Show the classifier parameters returned by trainCLF.
print(json)

In [None]:
# Build a DAIndexGamma instance around the classifier defined above.
ad = DAIndexGamma(clf=rfc)
# Run the per-split evaluation. Despite the variable name, runAD returns the
# evaluation table; the raw scores are only written to '<name>_scores.csv'.
measure = runAD(dataset, cv_split, rfc, ad, 'bbbp')