In [1]:
import math
import warnings

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, f1_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier

from pathlib import Path

# Absolute, machine-specific data directory. NOTE(review): not referenced by
# the cells visible here, which build their own paths under a different
# directory — confirm whether it is still needed.
DATA_DIR = Path("/home/ancarey/kennedy/karuna_data/original")
# Suppress every warning for the whole kernel session (no category or module
# filter is given), so convergence/metric warnings will not be shown either.
warnings.filterwarnings("ignore")

In [2]:
def _load_split(csv_path):
    """Read one CSV split and separate features from the target.

    Rows containing any missing value are dropped before splitting.

    Returns:
        (X, y): the frame without the ``label`` column, and the ``label`` column.
    """
    frame = pd.read_csv(csv_path).dropna(axis=0)
    return frame.drop(columns=['label']), frame['label']


def _fit_and_score(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score its test predictions.

    Returns:
        (accuracy, f1): ``accuracy_score`` and ``f1_score`` (default arguments)
        of the predictions on ``X_test`` against ``y_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions), f1_score(y_test, predictions)


def main(train_set, test_set, random_state=None):
    """Train four baseline classifiers on one train/test CSV pair and score them.

    Both CSVs must contain a ``label`` column; every other column is used as a
    feature. Rows with missing values are dropped from both splits.

    Args:
        train_set: path (or anything ``pd.read_csv`` accepts) of the training CSV.
        test_set: path (or anything ``pd.read_csv`` accepts) of the test CSV.
        random_state: optional seed forwarded to the gradient boosting, LightGBM
            and random forest models so a run can be reproduced. The default
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        An 8-tuple in this fixed order:
        ``(acc_lr, f1_lr, acc_xgb, f1_xgb, acc_lgbm, f1_lgbm, acc_rf, f1_rf)``.
        The "xgb" slot holds the scores of scikit-learn's
        ``GradientBoostingClassifier`` — the xgboost library is not used here;
        the name is kept only for compatibility with existing callers.

    Note:
        ``f1_score`` is called with its defaults, which assumes a binary target
        whose positive class is ``1`` — TODO confirm against the label encoding
        of the input files.
    """
    X_train, y_train = _load_split(train_set)
    X_test, y_test = _load_split(test_set)

    # Order matters: it defines the layout of the returned tuple.
    models = (
        LogisticRegression(max_iter=500),
        GradientBoostingClassifier(random_state=random_state),
        LGBMClassifier(verbose=-1, random_state=random_state),
        RandomForestClassifier(random_state=random_state),
    )

    scores = []
    for model in models:
        scores.extend(_fit_and_score(model, X_train, y_train, X_test, y_test))
    return tuple(scores)

In [12]:
# Epsilon values and dataset names to sweep over.
eps = [1, 2, 5, 10, 25, 50]
datasets = ['adult', 'bank', 'blood', 'calhousing', 'car','diabetes', 'heart', 'jungle']


def _mean_std(values):
    """Return ``(mean, std)`` of ``values``, each rounded to 3 decimals."""
    arr = np.array(values)
    return round(np.mean(arr), 3), round(np.std(arr), 3)


for e in eps:
    for d in datasets:
        print('Dataset:', d, 'Epsilon:', e)
        lr_accs, xgb_accs, lgbm_accs, rf_accs = [], [], [], []
        lr_f1s, xgb_f1s, lgbm_f1s, rf_f1s = [], [], [], []
        # Destination lists, in the same order as the tuple returned by main():
        # (accuracy, F1) for LR, gradient boosting, LGBM, random forest.
        sinks = (lr_accs, lr_f1s, xgb_accs, xgb_f1s,
                 lgbm_accs, lgbm_f1s, rf_accs, rf_f1s)

        # One train/test file pair per run index 0..4.
        for r in range(5):
            train_set = f'/home/ancarey/kennedy/ldp-files-baselines/{d}_{e}_recon_reduced_binary_train_{r}.csv'
            test_set = f'/home/ancarey/kennedy/ldp-files-baselines/{d}_{e}_reduced_binary_test_{r}.csv'
            for sink, score in zip(sinks, main(train_set, test_set)):
                sink.append(score)

        # Report mean and standard deviation over the runs for each model.
        report_rows = (
            ('LR', lr_accs, lr_f1s),
            ('XGBoost', xgb_accs, xgb_f1s),
            ('LGBM', lgbm_accs, lgbm_f1s),
            ('RF', rf_accs, rf_f1s),
        )
        for label, accs, f1s in report_rows:
            acc_mean, acc_std = _mean_std(accs)
            f1_mean, f1_std = _mean_std(f1s)
            print(label)
            print('Acc:', acc_mean, acc_std, 'F1:', f1_mean, f1_std)

Dataset: adult Epsilon: 1
LR
Acc: 0.353 0.203 F1: 0.305 0.154
XGBoost
Acc: 0.393 0.192 F1: 0.322 0.161
LGBM
Acc: 0.363 0.198 F1: 0.31 0.155
RF
Acc: 0.494 0.011 F1: 0.327 0.005
Dataset: bank Epsilon: 1
LR
Acc: 0.575 0.374 F1: 0.089 0.099
XGBoost
Acc: 0.581 0.368 F1: 0.085 0.104
LGBM
Acc: 0.49 0.019 F1: 0.186 0.011
RF
Acc: 0.496 0.005 F1: 0.186 0.007
Dataset: blood Epsilon: 1
LR
Acc: 0.508 0.218 F1: 0.317 0.159
XGBoost
Acc: 0.541 0.061 F1: 0.331 0.086
LGBM
Acc: 0.508 0.04 F1: 0.356 0.081
RF
Acc: 0.508 0.04 F1: 0.356 0.081
Dataset: calhousing Epsilon: 1
LR
Acc: 0.499 0.01 F1: 0.392 0.321
XGBoost
Acc: 0.472 0.033 F1: 0.427 0.251
LGBM
Acc: 0.479 0.027 F1: 0.482 0.031
RF
Acc: 0.481 0.025 F1: 0.483 0.029
Dataset: car Epsilon: 1
LR
Acc: 0.531 0.179 F1: 0.18 0.173
XGBoost
Acc: 0.615 0.139 F1: 0.371 0.076
LGBM
Acc: 0.516 0.13 F1: 0.369 0.102
RF
Acc: 0.546 0.131 F1: 0.41 0.182
Dataset: diabetes Epsilon: 1
LR
Acc: 0.573 0.111 F1: 0.242 0.221
XGBoost
Acc: 0.562 0.094 F1: 0.268 0.171
LGBM
Acc: 0.568