## Comparison
Providing results for comparison with other fair classification approaches.

In [58]:
import os, sys
import numpy as np
sys.path.append('../')  # the code for fair classification is in this directory

import time
import pandas as pd
from itertools import product
from SMOTEBoost import SMOTEBoost 
from loaders.load_adult import load_adult
from loaders.load_bank import load_bank
from loaders.load_compas_data import load_compas
from loaders.load_kdd import load_kdd
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

In [70]:
def display_metrics(metrics, metrics_filename='metrics.txt'):
    print(f'Accuracy: {np.mean(metrics["test_scores"]["test_accuracy"])} \
    std:{np.std(metrics["test_scores"]["test_accuracy"])}')
    print(f'Balanced_accuracy: {np.mean(metrics["test_scores"]["Balanced accuracy"])}')
    print(f'Eq.Odds: {np.mean(metrics["test_scores"]["Eq.Odds"])}')
    print(f'TPR Prot.: {np.mean(metrics["test_scores"]["TPR prot"])} ')
    print(f'TPR Non-Prot.: {np.mean(metrics["test_scores"]["TPR non-prot"])}')
    print(f'TNR Prot.: {np.mean(metrics["test_scores"]["TNR prot"])}')
    print(f'TNR Non-Prot.: {np.mean(metrics["test_scores"]["TNR non-prot"])}')
    
    metrics_out = open(metrics_filename, 'a')
    metrics_out.write(f'Model: SMOTEBoost')
    metrics_out.write(f'Accuracy: {np.mean(metrics["test_scores"]["test_accuracy"])} \
    std:{np.std(metrics["test_scores"]["test_accuracy"])}')
    metrics_out.write(f'Balanced_accuracy: {np.mean(metrics["test_scores"]["Balanced accuracy"])}')
    metrics_out.write(f'Eq.Odds: {np.mean(metrics["test_scores"]["Eq.Odds"])}')
    metrics_out.write(f'TPR Prot.: {np.mean(metrics["test_scores"]["TPR prot"])} ')
    metrics_out.write(f'TPR Non-Prot.: {np.mean(metrics["test_scores"]["TPR non-prot"])}')
    metrics_out.write(f'TNR Prot.: {np.mean(metrics["test_scores"]["TNR prot"])}')
    metrics_out.write(f'TNR Non-Prot.: {np.mean(metrics["test_scores"]["TNR non-prot"])}')
    metrics_out.close()

In [57]:
def GridSearch(X_val, y_val, n_estimators=np.arange(10, 100, 5), n_samples=(100, 200), lr=(0.1, 0.5, 1)):
    opt_n_estimators, opt_n_samples, opt_lr = 0, 0, 0
    params = list(product(n_estimators, n_samples, lr))
    min_balanced_accuracy = 0
    for (n_estimators_it, n_samples_it, lr_it) in params:
        smote_boost = SMOTEBoost(n_estimators=n_estimators_it, n_samples=n_samples_it, learning_rate=lr_it,
                                storePerfomance=False, X_test=X_val, y_test=y_val)
        smote_boost.fit(X_val, y_val)
        balanced_accuracy = balanced_accuracy_score(y_val, smote_boost.predict(X_val))
        if balanced_accuracy > min_balanced_accuracy:
            min_balanced_accuracy = balanced_accuracy
            opt_n_estimators, opt_n_samples, opt_lr = n_estimators_it, n_samples_it, lr_it
    
    print('GridSearch result:')
    print(f'Optimal number of estimators: {opt_n_estimators}')
    print(f'Optimal number of samples: {opt_n_samples}')
    print(f'Optimal learning rate: {opt_lr}')
    return opt_n_estimators, opt_n_samples, opt_lr

In [51]:
def eval_model(loader, sensativeAttribute='', grid_search=False, n_estimators=50, n_samples=100, lr=1):
    if sensativeAttribute == '':
        X, y, sa_index, p_Group, x_control = loader()
    else:
        X, y, sa_index, p_Group, x_control = loader(sensativeAttribute)
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    opt_n_estimators, opt_n_samples, opt_lr = n_estimators, n_samples, lr
    if grid_search:
        X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.7)
        opt_n_estimators, opt_n_samples, opt_lr = GridSearch(X_val, y_val)
    smote_boost = SMOTEBoost(n_estimators=opt_n_estimators, n_samples=opt_n_samples, 
                                        storePerfomance=True, saIndex=sa_index, learning_rate=opt_lr, 
                                        saValue=p_Group, X_test=X_test, y_test=y_test)
    smote_boost.fit(X_train, y_train)
    display_metrics(smote_boost.get_performance_over_iterations()[-1]) 

In [53]:
eval_model(load_bank, dataset_name='bank', n_estimators=200)

Accuracy: 0.884761544227886     std:0.00023929263015588057
Balanced_accuracy: 0.7497830863890187
Eq.Odds: 0.10987743763672964
TPR Prot.: 0.5255847953216374 
TPR Non-Prot.: 0.6072572038420491
TNR Prot.: 0.9493506493506494
TNR Non-Prot.: 0.9211456202343314


In [54]:
eval_model(load_compas, sensativeAttribute="sex", n_estimators=200, n_samples=2)

Counter({'Male': 4247, 'Female': 1031})
Features we will be using for classification are: ['age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25', 'race', 'sex', 'priors_count', 'c_charge_degree', 'target'] 

Accuracy: 0.6547845373891     std:0.02238936033005612
Balanced_accuracy: 0.6715311556948075
Eq.Odds: 0.5720339555082817
TPR Prot.: 0.4717948717948718 
TPR Non-Prot.: 0.770048309178744
TNR Prot.: 0.8284023668639053
TNR Non-Prot.: 0.5546218487394958


In [55]:
eval_model(load_adult, sensativeAttribute="sex", n_estimators=200)

Accuracy: 0.7489817523528561     std:4.446342177790719e-05
Balanced_accuracy: 0.8186313253283344
Eq.Odds: 0.4948184052755933
TPR Prot.: 0.6076555023923444 
TPR Non-Prot.: 0.8611915597848573
TNR Prot.: 0.9621188789651987
TNR Non-Prot.: 0.7208365310821182


In [56]:
eval_model(load_kdd, sensativeAttribute='', n_estimators=50, n_samples=500)

Accuracy: 0.9435991120866267     std:0.0011518067300757774
Balanced_accuracy: 0.694814123653508
Eq.Odds: 0.27619926752427404
TPR Prot.: 0.20573183213920163 
TPR Non-Prot.: 0.4586220953933958
TNR Prot.: 0.99474113223423
TNR Non-Prot.: 0.9714321279641501


## Verifaction of the algorithm
In this part of the notebook we verify SMOTEBoost algorithm by comparing aquired results with graphs presented in the article: SMOTEBoost: Nitesh V Chawla, Aleksandar Lazarevic, Lawrence O Hall, and Kevin W Bowyer. 2003. SMOTEBoost: Improving prediction of the minority class in boosting. In ECML PKDD. Springer, 107–119

In [61]:
def load_ver_dataset(dataset_path):
    phomene_ds = pd.read_csv(dataset_path)
    X = phomene_ds.iloc[:, :-1]
    y = phomene_ds.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return X_train, X_test, y_train, y_test 

In [None]:
X_train, X_test, y_train, y_test = load_ver_dataset('../data/phomene.csv')
smote_boost = SMOTEBoost(n_estimators=200, n_samples=100, storePerfomance=True,
                                    X_test=X_test.loc[minor_class_idxs], y_test=y_test[minor_class_idxs])

In [None]:
X, y, sa_index, p_Group, x_control = load_kdd()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) 
smote_boost = SMOTEBoost.SMOTEBoost(n_estimators=100, n_samples=100, storePerfomance=True, saIndex=sa_index, 
                                    saValue=p_Group, X_test=X_test, y_test=y_test)
smote_boost.fit(X_train, y_train)
display_metrics(smote_boost.get_performance_over_iterations()[-1])

In [64]:
phomene_ds = pd.read_csv('../data/phomene.csv')
X_p = phomene_ds.iloc[:, :5]
y_p = phomene_ds.iloc[:, 5]
phomene_ds.head()
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p, test_size=1586)
minor_class_idxs = list(y_test[y_test == 1].index)

In [66]:
smote_boost = SMOTEBoost.SMOTEBoost(n_samples=100, X_test=X_test.loc[minor_class_idxs], y_test=y_test[minor_class_idxs])

In [67]:
s = time.perf_counter()
smote_boost.fit(X_train.to_numpy(), y_train.to_numpy())
print(time.perf_counter() -  s)

7.837844800000084


In [70]:
performances = np.array(smote_boost.get_performance_over_iterations())
recall = [np.mean(performances[i]['train_scores']['test_recall']) for i in range(len(performances))]
display(y_test == 1)
# precision = performances
# for perfomance

3367     True
1495    False
312     False
3149     True
1090    False
        ...  
3757    False
4561    False
419     False
4004    False
3887     True
Name: Class, Length: 1586, dtype: bool

In [None]:
list(y_test[y_test == 1].index)

In [19]:
cross(precision_score(smote_boost.predict(X_test), y_test))
print(recall_score(smote_boost.predict(X_test), y_test))
print(f1_score(smote_boost.predict(X_test), y_test))
print(accuracy_score(smote_boost.predict(X_test), y_test))

0.8865096359743041
0.5440210249671484
0.6742671009771987
0.7477931904161412


In [32]:
def where(x):
    xp = cp.get_array_module(x)
    return cp.where(xp == 1)

In [None]:
%%timeit
x_gpu = cp.ones((200, 200, 200))
where(x_gpu)

In [None]:
%%timeit
x_cpu = np.ones((200, 200, 200))
where(x_cpu)

In [3]:
smote_boost_cuda = SMOTEBoostCuda.SMOTEBoost(n_samples=100)

NameError: name 'SMOTEBoostCuda' is not defined

In [None]:
s = time.perf_counter()
X_train_cp = cp.asarray(X_train)
y_train_cp = cp.asarray(y_train)
smote_boost_cuda.fit(X_train_cp, y_train_cp)
print(time.perf_counter() -  s)

In [25]:
satimage_ds = pd.read_csv('datasets/satimage.csv')
X_s = satimage_ds.iloc[:, :36]
y_s = satimage_ds.iloc[:, 36].replace(to_replace=[1, 2, 3, 5, 6, 7], value=2)
y_s = y_s.replace(to_replace=4, value=1)
# y_s = y_p.replace(to_replace=[1, 2, 3, 5, 6, 7], value=2)
X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size=0.3)

In [26]:
smote_boost = SMOTEBoost.SMOTEBoost(n_samples=100)

s = time.perf_counter()
smote_boost.fit(X_train.to_numpy(), y_train.to_numpy())
print(time.perf_counter() -  s)

4.932315400000334


In [27]:
print(precision_score(smote_boost.predict(X_test), y_test))
print(recall_score(smote_boost.predict(X_test), y_test))
print(f1_score(smote_boost.predict(X_test), y_test))
print(accuracy_score(smote_boost.predict(X_test), y_test))

0.7621621621621621
0.407514450867052
0.5310734463276836
0.8710512687726567


In [73]:
kddcup_ds = pd.read_csv('datasets/kddcup.csv')
X_p = kddcup_ds.iloc[:, :-1]
y_p = kddcup_ds.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p, test_size=1586)

In [76]:
kddcup_ds.columns

Index(['duration', ' protocol_type', ' service', ' flag', ' src_bytes',
       ' dst_bytes', ' land', ' wrong_fragment', ' urgent', ' hot',
       ' num_failed_logins', ' logged_in', ' num_compromised', ' root_shell',
       ' su_attempted', ' num_root', ' num_file_creations', ' num_shells',
       ' num_access_files', ' num_outbound_cmds', ' is_host_login',
       ' is_guest_login', ' count', ' srv_count', ' serror_rate',
       ' srv_serror_rate', ' rerror_rate', ' srv_rerror_rate',
       ' same_srv_rate', ' diff_srv_rate', ' srv_diff_host_rate',
       ' dst_host_count', ' dst_host_srv_count', ' dst_host_same_srv_rate',
       ' dst_host_diff_srv_rate', ' dst_host_same_src_port_rate',
       ' dst_host_srv_diff_host_rate', ' dst_host_serror_rate',
       ' dst_host_srv_serror_rate', ' dst_host_rerror_rate',
       ' dst_host_srv_rerror_rate', ' label'],
      dtype='object')

In [44]:
kddcup_ds.groupby(by='label').count()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,32,33,34,35,36,37,38,39,40,41
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
back.,2203,2203,2203,2203,2203,2203,2203,2203,2203,2203,...,2203,2203,2203,2203,2203,2203,2203,2203,2203,2203
buffer_overflow.,30,30,30,30,30,30,30,30,30,30,...,30,30,30,30,30,30,30,30,30,30
ftp_write.,8,8,8,8,8,8,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8
guess_passwd.,53,53,53,53,53,53,53,53,53,53,...,53,53,53,53,53,53,53,53,53,53
imap.,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12
ipsweep.,1247,1247,1247,1247,1247,1247,1247,1247,1247,1247,...,1247,1247,1247,1247,1247,1247,1247,1247,1247,1247
land.,21,21,21,21,21,21,21,21,21,21,...,21,21,21,21,21,21,21,21,21,21
loadmodule.,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
multihop.,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
neptune.,107201,107201,107201,107201,107201,107201,107201,107201,107201,107201,...,107201,107201,107201,107201,107201,107201,107201,107201,107201,107201


In [67]:
a = {'a' : [1], 'b': [2]}

In [68]:
a['a'].append(2)

In [69]:
a

{'a': [1, 2], 'b': [2]}

In [82]:
f = open('metrics_kdd.txt', 'r')
lines = f.readlines()


In [89]:
line = [line.split() for line in lines]

In [93]:
for words in line:
    for word in words:
        print(int(word))

ValueError: invalid literal for int() with base 10: 'Model:'