In [2]:
import os
import math
import torch
import warnings

import numpy as np
import pandas as pd
from diffprivlib import models
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB as skl_gnb
from diffprivlib.models import LogisticRegression, GaussianNB, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, f1_score

# Suppress every warning category for the rest of the session.
# NOTE(review): this presumably also hides warnings raised by the models fitted
# below (e.g. about missing bounds) — confirm that is intended.
warnings.simplefilter("ignore")

# NOTE(review): this only binds a module-level Python variable; it does not
# touch the process environment. If the intent is to restrict which GPU is
# visible, os.environ["CUDA_VISIBLE_DEVICES"] would have to be set — confirm.
CUDA_VISIBLE_DEVICES = 0

In [6]:
def _load_and_encode(csv_path, dataset, is_train):
    """Read one split from CSV, drop incomplete rows, fix labels, one-hot encode."""
    frame = pd.read_csv(csv_path).dropna(axis=0)
    if is_train and dataset == 'adult':
        # Only the training split is filtered, exactly as before.
        # NOTE(review): presumably this category never occurs in the test
        # split — confirm against the data.
        frame = frame.loc[frame['native_country'] != 'Holand-Netherlands']
    # Explicit copy so the label assignments below never write into a view.
    frame = frame.copy()
    if dataset == 'car':
        # Collapse the four original classes into a binary label.
        frame['label'] = frame['label'].map({0: 0, 1: 1, 2: 1, 3: 1})
    if dataset in ('diabetes', 'heart'):
        frame['label'] = frame['label'].astype('bool')
    return pd.get_dummies(frame)


def _fit_and_score(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training data and return (accuracy, F1) on the test data."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions), f1_score(y_test, predictions)


def main(dataset, eps, min, max, n,
         data_root='/home/ancarey/kennedy/karuna_data/original'):
    """Train three classifiers on a Poisson-subsampled training set and score them.

    Parameters
    ----------
    dataset : str
        Dataset name; selects the CSV files under
        ``<data_root>/<dataset>/<dataset>_original[_test].csv`` and the
        dataset-specific label handling.
    eps : float
        Privacy budget passed as ``epsilon`` to the diffprivlib
        LogisticRegression and RandomForestClassifier.
    min, max :
        Feature bounds. Accepted for call compatibility only; they are NOT
        forwarded to any model (the names also shadow the builtins inside
        this function, so the builtins are deliberately not used here).
    n : int
        Expected number of training rows to keep: each row is kept
        independently when a Poisson(n / N) draw is >= 1, N being the number
        of cleaned training rows.
    data_root : str, optional
        Directory that contains one sub-directory per dataset. Defaults to the
        path the notebook was originally written against.

    Returns
    -------
    tuple
        ``(acc_lr, f1_lr, acc_gb, f1_gb, acc_rf, f1_rf)``.

    Raises
    ------
    ValueError
        If the training split is empty after dropping incomplete rows.

    Notes
    -----
    * Sampling uses NumPy's global random state and ``DataFrame.sample``
      without a fixed seed, so repeated calls give different results.
    * The "GB" model is scikit-learn's GaussianNB (imported as ``skl_gnb``),
      which takes no ``epsilon``; its scores therefore do not depend on `eps`.
    """
    train_filename = os.path.join(data_root, dataset, f'{dataset}_original.csv')
    test_filename = os.path.join(data_root, dataset, f'{dataset}_original_test.csv')

    # ---- training split -------------------------------------------------
    df = _load_and_encode(train_filename, dataset, is_train=True)
    if len(df) == 0:
        raise ValueError(f'Training data for {dataset!r} is empty after dropna().')

    # Keep each row independently when its Poisson(n / N) draw is >= 1.
    # One vectorised draw replaces the per-row Python loop.
    keep_mask = np.random.poisson(n / len(df), size=len(df)) >= 1
    df_sampled = df[keep_mask]

    # ---- test split -----------------------------------------------------
    df_test = _load_and_encode(test_filename, dataset, is_train=False)
    large = ['adult', 'bank', 'calhousing', 'jungle']
    # Large datasets are evaluated on 10% of the test rows, the rest on 75%.
    test_fraction = .1 if dataset in large else .75
    df_test_sample = df_test.sample(n=int(len(df_test) * test_fraction))

    y_train = df_sampled['label']
    X_train = df_sampled.drop(columns=['label'])
    y_test = df_test_sample['label']
    # The two splits are one-hot encoded independently, so their dummy columns
    # can differ in membership or order. Align the test features to the
    # training layout: missing dummies become 0, unseen ones are dropped.
    X_test = (
        df_test_sample
        .drop(columns=['label'])
        .reindex(columns=X_train.columns, fill_value=0)
    )

    # LR (differentially private)
    acc_lr, f1_lr = _fit_and_score(
        LogisticRegression(epsilon=eps), X_train, y_train, X_test, y_test)

    # GaussianBayes (scikit-learn, non-private — see Notes)
    acc_gb, f1_gb = _fit_and_score(skl_gnb(), X_train, y_train, X_test, y_test)

    # RF (differentially private)
    acc_rf, f1_rf = _fit_and_score(
        RandomForestClassifier(epsilon=eps), X_train, y_train, X_test, y_test)

    return acc_lr, f1_lr, acc_gb, f1_gb, acc_rf, f1_rf

In [13]:
def get_bounds(name):
    """Return the ``(lower, upper)`` feature bounds configured for a dataset.

    Parameters
    ----------
    name : str
        Dataset name. One of 'adult', 'bank', 'blood', 'calhousing', 'car',
        'diabetes', 'heart', 'jungle'.

    Returns
    -------
    tuple
        For 'adult', two lists of length 102 (per-feature bounds); for every
        other dataset, two scalars applied to all features.

    Raises
    ------
    ValueError
        If `name` is not a known dataset (previously this surfaced as an
        UnboundLocalError at the return statement).
    """
    if name == 'adult':
        # Per-feature bounds: every lower bound is 0; the first four upper
        # bounds are explicit and the remaining 98 are 1.
        # NOTE(review): presumably four numeric columns followed by 98
        # one-hot columns — confirm against the encoded frame.
        lower = [0] * 102
        upper = [100, 50000, 5000, 40] + [1] * 98
        return lower, upper

    scalar_bounds = {
        'bank': (-5000, 5e5),
        'blood': (0, 3e5),
        'calhousing': (-124, 6e5),
        'car': (0, 4),
        'diabetes': (0, 600),
        'heart': (-4, 500),
        'jungle': (0, 8),
    }
    if name not in scalar_bounds:
        known = ['adult'] + list(scalar_bounds)
        raise ValueError(f'Unknown dataset {name!r}; expected one of {known}.')
    return scalar_bounds[name]

In [16]:
# Privacy budgets to sweep; float('inf') is the first setting evaluated.
eps = [float('inf'), 1, 2, 5, 10, 25, 50]
datasets = ['jungle']
# Expected training-set sizes; main() is run once per entry and the scores
# are aggregated (mean, std) over these runs.
rs = [3016, 3617, 300, 3300, 691, 307, 367, 3586]
for d in datasets:
    # Bounds depend only on the dataset, so look them up once per dataset.
    # Deliberately NOT named `min` / `max`: binding those at notebook top level
    # would shadow the builtins for every cell executed afterwards.
    lower, upper = get_bounds(d)
    for e in eps:
        print('Dataset:', d, 'Epsilon:', e)
        lr_accs, gb_accs, rf_accs = [], [], []
        lr_f1s, gb_f1s, rf_f1s = [], [], []
        for r in rs:
            acc_lr, f1_lr, acc_gb, f1_gb, acc_rf, f1_rf = main(d, e, lower, upper, r)
            lr_accs.append(acc_lr)
            lr_f1s.append(f1_lr)
            gb_accs.append(acc_gb)
            gb_f1s.append(f1_gb)
            rf_accs.append(acc_rf)
            rf_f1s.append(f1_rf)

        # One summary per model: mean and standard deviation over the runs.
        for model_name, accs, f1s in (
            ('LR', lr_accs, lr_f1s),
            ('GB', gb_accs, gb_f1s),
            ('RF', rf_accs, rf_f1s),
        ):
            print(model_name)
            print('Acc:', round(np.mean(accs), 3), round(np.std(accs), 3),
                  'F1:', round(np.mean(f1s), 3), round(np.std(f1s), 3))

Dataset: jungle Epsilon: inf
LR
Acc: 0.73 0.015 F1: 0.74 0.02
GB
Acc: 0.744 0.021 F1: 0.749 0.027
RF
Acc: 0.688 0.044 F1: 0.671 0.067
Dataset: jungle Epsilon: 1


KeyboardInterrupt: 