In [1]:
import sys
sys.path.append('../')  # must run before the ControlBurn imports below

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pmlb import fetch_data
from pmlb import classification_dataset_names, regression_dataset_names
from sklearn import preprocessing
# NOTE(review): load_boston was removed in scikit-learn 1.2; this import fails on newer versions.
from sklearn.datasets import load_boston
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

from ControlBurn.ControlBurnExperiment import run_experiment
from ControlBurn.ControlBurnExperiment import plot_tradeoff_curve
from ControlBurn.RandomForestBaseline import RandomForestBaseline



# Data
## Load PMLB Data

In [None]:
# Candidate PMLB dataset names. Kept for reference only: nothing in this cell
# reads the list, `dataset` below is what selects the data that gets fetched.
dataset_names = [
    'analcatdata_bankruptcy',
    'analcatdata_boxing2',
    'analcatdata_cyyoung8092',
    'analcatdata_japansolvent',
    'analcatdata_lawsuit',
    'appendicitis',
    'breast_cancer_wisconsin',
    'bupa',
    'diabetes',
    'glass2',
    'haberman',
    'lupus',
    'phoneme',
    'pima',
    'prnn_crabs',
    'prnn_synth',
    'ring',
    'twonorm',
    'wdbc',
    'spectf',
    'chess',
    'dis',
    'horse_colic',
    'hypothyroid',
    'colic',
    'sonar',
    'Hill_Valley_without_noise',
    'crx',
    'clean1',
    'tokyo1',
    'spambase',
    'ionosphere',
    'churn',
    'Hill_Valley_with_noise',
    'analcatdata_cyyoung9302',
    'australian',
    'biomed',
    'buggyCrx',
    'cleve',
    'credit_a',
    'heart_c',
    'heart_h',
]

# Dataset used for this run.
dataset = 'chess'
print(dataset)

data = fetch_data(dataset)

# The label lives in the 'target' column; every other column is a feature.
y = data['target']
X = data.drop(columns='target')
features = X.columns

# Standardise the features, then restore the column names that scale() drops.
X = pd.DataFrame(preprocessing.scale(X), columns=features)

## Duplication Step for Semi-Synthetic Experiment

In [None]:
# Semi-synthetic setup: for the chess dataset, take the 3 features a random
# forest ranks as most important and add 7 noisy copies of each to X.
#
# The comparison is case-insensitive because the dataset is selected as
# 'chess' (lower case); the previous check against 'Chess' never matched,
# so this step was silently skipped.
if dataset.lower() == 'chess':
    rf = RandomForestClassifier().fit(X, y)

    # Build the table from a dict so 'imp' keeps a float dtype
    # (np.column_stack with the column names would coerce it to object).
    importances = pd.DataFrame(
        {'feat': X.columns, 'imp': rf.feature_importances_}
    ).sort_values('imp', ascending=False)

    to_duplicate = importances.head(3)['feat'].values
    for col in to_duplicate:
        for i in range(7):
            name_col = col + 'dup' + str(i)
            # Each copy is the original column plus Gaussian noise (mean 0, sd 0.1).
            X[name_col] = X[col] + np.random.normal(0, .1, len(X))

# Load Real-World Data

In [None]:
from sklearn.datasets import load_boston
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_breast_cancer

def load_adult():
    """Download the UCI Adult data and return standardised features and a binary label.

    Steps performed:
      * read ``adult.data`` (no header row) and assign column names;
      * shuffle the rows;
      * build the label: 1 where ``Income`` is ``>50K``, else 0;
      * drop ``Occupation`` and ``Income``, one-hot encode the remaining
        categorical columns, and reduce ``NativeCountry`` to a 0/1
        "is United-States" indicator;
      * standardise every feature column with ``preprocessing.scale``.

    Returns:
        (X, y): ``X`` is a DataFrame of scaled features, ``y`` is an integer
        Series named ``'target'``. Both carry a fresh 0..n-1 index, so they
        line up by position and by label.

    Notes:
        The label used to be set with ``data['target'].loc[mask] = 1`` against
        ``data['Income'].unique()[1]``. That chained assignment is not
        guaranteed to write through (it is a no-op under pandas
        copy-on-write), and because ``unique()`` was evaluated after the
        shuffle, the class mapped to 1 depended on the random row order.
        The positive class is now fixed explicitly.
    """
    data = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        header=None)
    data.columns = [
            "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
            "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
            "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
        ]
    data = data.sample(frac = 1)

    # Raw values carry a leading space (the file is ", "-separated), so strip
    # before comparing. Reset the index so y matches the RangeIndex X gets below.
    y = (
        (data['Income'].str.strip() == '>50K')
        .astype(int)
        .rename('target')
        .reset_index(drop=True)
    )

    data = data.drop(['Occupation', 'Income'], axis = 1)
    data = pd.get_dummies(data, columns = ['WorkClass','Education','MaritalStatus','Relationship','Race','Gender'])
    # The leading space in ' United-States' is intentional: it matches the raw file values.
    data['NativeCountry'] = (data['NativeCountry'] == ' United-States').astype(int)

    features = data.columns
    X = pd.DataFrame(preprocessing.scale(data), columns = features)
    return X,y

def load_audit():
    """Build the audit-risk classification data from two local CSV files.

    Reads ``../Data/audit_risk.csv`` and ``trial.csv``, outer-merges them on
    their shared columns, cleans the result, shuffles the rows and returns
    standardised features together with the ``Risk`` column as the label.

    Returns:
        (X, y): ``X`` is a DataFrame of scaled feature columns (fresh
        0..n-1 index); ``y`` is the ``Risk`` Series, still carrying the
        shuffled row index of the cleaned frame.
    """
    risk_frame = pd.read_csv("../Data/audit_risk.csv")
    # NOTE(review): this path is relative to the working directory, unlike the
    # "../Data/" path above - confirm where trial.csv actually lives.
    trial_frame = pd.read_csv("trial.csv")
    trial_frame.columns = [
        'Sector_score', 'LOCATION_ID', 'PARA_A', 'Score_A', 'PARA_B',
        'Score_B', 'TOTAL', 'numbers', 'Marks',
        'Money_Value', 'MONEY_Marks', 'District',
        'Loss', 'LOSS_SCORE', 'History', 'History_score', 'Score', 'Risk_trial',
    ]
    # Divide the trial scores by 10 before merging
    # (presumably to match the scale used in audit_risk.csv - TODO confirm).
    for score_col in ('Score_A', 'Score_B'):
        trial_frame[score_col] = trial_frame[score_col] / 10

    join_keys = [
        'History', 'LOCATION_ID', 'Money_Value', 'PARA_A', 'PARA_B',
        'Score', 'Score_A', 'Score_B', 'Sector_score', 'TOTAL', 'numbers',
    ]
    combined = pd.merge(risk_frame, trial_frame, how='outer', on=join_keys)

    combined = combined.drop(columns=['Risk_trial'])
    # Fill missing Money_Value entries with the column median.
    combined['Money_Value'] = combined['Money_Value'].fillna(combined['Money_Value'].median())
    combined = combined.drop(columns=['Detection_Risk', 'Risk_F'])

    # Remove rows with these textual location IDs before the float cast below.
    excluded_locations = ['LOHARU', 'NUH', 'SAFIDON']
    combined = combined[~combined['LOCATION_ID'].isin(excluded_locations)]

    combined = combined.astype(float)
    combined = combined.drop_duplicates(keep='first')
    combined = combined.sample(frac=1)  # shuffle rows

    # 'Risk' is the label; the other dropped columns are excluded from the features.
    y = combined["Risk"]
    feature_frame = combined.drop(
        columns=["Audit_Risk", 'Inherent_Risk', 'Score', 'TOTAL', "Risk"]
    )
    cols = feature_frame.columns
    X = pd.DataFrame(preprocessing.scale(feature_frame), columns=cols)
    return X,y

# Run Experiment

In [None]:
# --- Experiment configuration -------------------------------------------
# Values forwarded to run_experiment / RandomForestBaseline below.
max_depth = 10
problem_type = 'Classification'
loss_type = 'logistic'
optimization_type = 'penalized'

lambd = 0.01
threshold = 10**-3
ntrials = 10
features_to_find = min(len(X.columns), 10)
search_limit = 20
l_start = 10

# --- Accumulators, concatenated across folds ----------------------------
bag_test_acc = []
bag_nonzero = []
base_line_acc = []
baseline_nonzero = []
baseline_se = []

# 4-fold split in row order (KFold's default does not shuffle).
kf = KFold(n_splits=4)
for train_index, test_index in kf.split(X):
    xTrain, xTest = X.iloc[train_index], X.iloc[test_index]
    yTrain, yTest = y.iloc[train_index], y.iloc[test_index]

    arg = [xTrain, yTrain, xTest, yTest, max_depth, problem_type, loss_type,
           lambd, threshold, optimization_type]
    bag_test_acc1, bag_nonzero1, bag_train_acc1 = run_experiment(
        arg, ntrials, features_to_find, search_limit, l_start)
    bag_test_acc = np.append(bag_test_acc, bag_test_acc1)
    bag_nonzero = np.append(bag_nonzero, bag_nonzero1)

    # Evaluate the baseline at the distinct feature counts found in this fold.
    range1 = np.unique(bag_nonzero1)
    base_line_acc1, baseline_nonzero1, baseline_se1 = RandomForestBaseline(
        xTrain, yTrain, xTest, yTest, problem_type, range1)

    base_line_acc = np.append(base_line_acc, base_line_acc1)
    baseline_nonzero = np.append(baseline_nonzero, baseline_nonzero1)
    # Previously baseline_se1 was dropped and baseline_se stayed empty.
    baseline_se = np.append(baseline_se, baseline_se1)


# Plot Results

In [None]:
# `baseline1` was referenced below but never defined, so this cell failed on
# a fresh kernel. It is rebuilt here from the per-fold baseline results:
# mean and standard deviation of the baseline score per feature count.
# NOTE(review): assumes base_line_acc and baseline_nonzero are equal-length,
# element-wise paired arrays - confirm against RandomForestBaseline's return
# values. A length mismatch raises a ValueError here.
baseline1 = (
    pd.DataFrame({'nonzero': baseline_nonzero, 'acc': base_line_acc})
    .groupby('nonzero')
    .agg(['mean', 'std'])
    .reset_index()
)

plot_tradeoff_curve(bag_test_acc,bag_nonzero,'blue',label = 'ControlBurn')
plt.ylabel('ROC-AUC')
plt.xlabel('Number of Non-Zero Features')
plt.xlim(0,10)
# Random-forest baseline: mean score per feature count, with +/- one std error bars.
plt.scatter(baseline1['nonzero'],baseline1['acc']['mean'],label = 'Random Forest',color = 'grey')
plt.errorbar(baseline1['nonzero'],baseline1['acc']['mean'], baseline1['acc']['std'],color = 'grey')
plt.title(dataset)
plt.legend()