In [None]:
import sys
!python -m pip install ..

In [1]:
import os
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from adad.distance import DAIndexGamma, DAIndexKappa, DAIndexDelta
from adad.probability import ProbabilityClassifier

## Set Seed

In [2]:
# Random seed handed to pipe(); used there as random_state for the train/test
# split, the random forest and the SVM, and stored in the results dictionary.
SEED = 42

## Parent Path: datasets in `data/maccs` under the parent of the working directory

In [3]:
# Locate the MACCS datasets under the parent of the current working directory.
path = os.getcwd()
parent_path = os.path.abspath(os.path.join(path, os.pardir))

files_path = os.path.join(parent_path, 'data', 'maccs')
# os.listdir returns entries in arbitrary order; sort them so dataset_files[0]
# picks the same file on every machine and on every run.
dataset_files = [os.path.join(files_path, name) for name in sorted(os.listdir(files_path))]
filename = dataset_files[0]

## Current Path: datasets in `data/maccs` under the working directory (alternative to the cell above)

In [None]:
# Locate the MACCS datasets directly under the current working directory.
files_path = os.path.join(os.getcwd(), 'data', 'maccs')
# os.listdir returns entries in arbitrary order; sort them so dataset_files[0]
# picks the same file on every machine and on every run.
dataset_files = [os.path.join(files_path, name) for name in sorted(os.listdir(files_path))]
filename = dataset_files[0]

## List of Applicability Domains

In [4]:
# Applicability-domain classes evaluated by pipe(): each one is instantiated
# with a fitted classifier, fitted on the scaled training features and then
# asked to measure the scaled test features.
AD_LIST = [DAIndexGamma, DAIndexKappa, DAIndexDelta, ProbabilityClassifier]

## Pipeline

In [24]:
def pipe(filename, SEED):
    """Train three classifiers on one dataset and score every applicability domain.

    The CSV at ``filename`` is read, split 80/20 into train and test sets and
    standardised (the scaler is fitted on the training split only). A random
    forest, an SVM and a k-NN classifier are fitted on the scaled training
    data. Each class in the module-level ``AD_LIST`` is then instantiated once
    per fitted classifier, fitted on the scaled training features and asked to
    ``measure`` the scaled test features. All results are collected in a
    dictionary that is pickled into the current working directory.

    Parameters
    ----------
    filename : str
        Path to a CSV file with a label column named ``y``; every other column
        is treated as a feature. The dataset name is the part of the file's
        base name before the first underscore.
    SEED : int
        Random state for the train/test split, the random forest and the SVM.
        It is also stored in the results under the ``"SEED"`` key.

    Returns
    -------
    tuple
        ``(dataset_pickle, dataset_dict)``: the name of the pickle file that
        was written and the dictionary it contains. The dictionary holds
        ``"SEED"``, ``"Classifiers"`` (the fitted models) and one key per
        applicability-domain class name mapping to a list of
        ``[domain_instance, measure_output, classifier_class_name]`` entries.
    """
    # Strip the directory part in a platform-independent way. Splitting on a
    # literal backslash only works for Windows paths; on POSIX it leaves the
    # directories (and any underscore in them) inside the dataset name and the
    # pickle path. Backslashes are normalised first so Windows-style paths are
    # still handled on every platform.
    base_name = os.path.basename(filename.replace("\\", "/"))
    data_name = base_name.split("_")[0]
    dataset_pickle = data_name + "_dictionary.pickle"
    dataset_dict = {"SEED": SEED}

    print(f'Read from: {filename}')

    df = pd.read_csv(filename)
    print(df.head())

    y = df['y'].to_numpy()
    X = df.drop(['y'], axis=1).to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SEED
    )

    print(f"Train shape: {X_train.shape}")
    print(f" Test shape: {X_test.shape}\n")

    # Fit the scaler on the training split only so no test statistics leak in.
    scaler = StandardScaler()
    train_normal = scaler.fit_transform(X_train)
    test_normal = scaler.transform(X_test)

    print(f"Training classifiers for {data_name}...")
    # classifier settings were edited to match the Efficiency paper a bit
    rfc = RandomForestClassifier(n_estimators=300, random_state=SEED)
    svc = SVC(C=100, gamma='auto', random_state=SEED, probability=True)
    knnc = KNeighborsClassifier(n_neighbors=6)

    models = [rfc, svc, knnc]

    for model in models:
        model.fit(train_normal, y_train)
    print(f"All classifiers for {data_name} trained.\n")

    dataset_dict["Classifiers"] = models

    for ad in AD_LIST:
        print(f"Running {ad.__name__} with all classifiers for {data_name}...")
        results = []

        for model in models:
            domain = ad(model)
            domain.fit(train_normal)
            results.append([domain, domain.measure(test_normal), type(model).__name__])

        dataset_dict[ad.__name__] = results

    print(f"\nSaving data for {data_name}...")

    # The pickle is written relative to the current working directory.
    with open(dataset_pickle, 'wb') as output:
        pickle.dump(dataset_dict, output)

    print(f"\nSaved {data_name}!\n")

    return dataset_pickle, dataset_dict

In [25]:
# Run the pipeline on the selected dataset; pipe() returns the name of the
# pickle file it wrote and the results dictionary.
a_pickle, a_dict = pipe(filename, SEED)

Read from: C:\Users\sarah\Desktop\applicabilityDomain\data\maccs\Ames_MACCS.csv
    x1   x2   x3   x4   x5   x6   x7   x8   x9  x10  ...  x158  x159  x160  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   5.0   0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   2.0   1.0   0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   2.0   1.0   0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   2.0   1.0   0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   1.0   2.0   1.0   

   x161  x162  x163  x164  x165  x166    y  
0   2.0   1.0  42.0   6.0  44.0   0.0  0.0  
1   5.0   0.0   0.0   2.0   0.0   0.0  1.0  
2   4.0   0.0   6.0   2.0   6.0   0.0  1.0  
3   4.0   0.0   0.0   2.0   0.0   0.0  1.0  
4   4.0   0.0   6.0   3.0   6.0   0.0  1.0  

[5 rows x 167 columns]
Train shape: (5209, 166)
 Test shape: (1303, 166)

Training classifiers for Ames...
All classifiers for Ames trained.

Running DAIndexGamma with all classifi

In [26]:
# Print the whole in-memory results dictionary returned by pipe().
print(a_dict)

{'SEED': 42, 'Classifiers': [RandomForestClassifier(n_estimators=300, random_state=42), SVC(C=100, gamma='auto', probability=True, random_state=42), KNeighborsClassifier(n_neighbors=6)], 'DAIndexGamma': [[<adad.distance.DAIndexGamma object at 0x00000182BEBAED00>, array([1.96168708, 4.88295974, 9.92201981, ..., 2.19799251, 5.14940854,
       0.99266144]), 'RandomForestClassifier'], [<adad.distance.DAIndexGamma object at 0x00000182BEBC3520>, array([1.96168708, 4.88295974, 9.92201981, ..., 2.19799251, 5.14940854,
       0.99266144]), 'SVC'], [<adad.distance.DAIndexGamma object at 0x00000182BEBC31C0>, array([1.96168708, 4.88295974, 9.92201981, ..., 2.19799251, 5.14940854,
       0.99266144]), 'KNeighborsClassifier']], 'DAIndexKappa': [[<adad.distance.DAIndexKappa object at 0x00000182BEBAE610>, array([ 2.00685761,  5.79573291, 10.09195039, ...,  2.19799251,
        8.83844714,  1.25707866]), 'RandomForestClassifier'], [<adad.distance.DAIndexKappa object at 0x00000182BEBC3B50>, array([ 2.006

In [28]:
# Round-trip check: reload the dictionary from the pickle file written by pipe().
# pickle.load can execute arbitrary code, so only use it on files this notebook
# produced itself.
with open(a_pickle, "rb") as input_name:
    a_dict2 = pickle.load(input_name)
    
print(a_dict2)

{'SEED': 42, 'Classifiers': [RandomForestClassifier(n_estimators=300, random_state=42), SVC(C=100, gamma='auto', probability=True, random_state=42), KNeighborsClassifier(n_neighbors=6)], 'DAIndexGamma': [[<adad.distance.DAIndexGamma object at 0x00000182BDE835E0>, array([1.96168708, 4.88295974, 9.92201981, ..., 2.19799251, 5.14940854,
       0.99266144]), 'RandomForestClassifier'], [<adad.distance.DAIndexGamma object at 0x00000182BDE83940>, array([1.96168708, 4.88295974, 9.92201981, ..., 2.19799251, 5.14940854,
       0.99266144]), 'SVC'], [<adad.distance.DAIndexGamma object at 0x00000182BDE83D00>, array([1.96168708, 4.88295974, 9.92201981, ..., 2.19799251, 5.14940854,
       0.99266144]), 'KNeighborsClassifier']], 'DAIndexKappa': [[<adad.distance.DAIndexKappa object at 0x00000182BDEC12E0>, array([ 2.00685761,  5.79573291, 10.09195039, ...,  2.19799251,
        8.83844714,  1.25707866]), 'RandomForestClassifier'], [<adad.distance.DAIndexKappa object at 0x00000182BDEC1700>, array([ 2.006