In [1]:
import pandas as pd
import openml
import data_preprocess as dp
import warnings

# Download OpenML dataset 846 and unpack it in dataframe format: the feature
# frame, the target column, one categorical flag per feature, and the feature
# names.
dataset_id = 846
data = openml.datasets.get_dataset(dataset_id)

X, y, categorical_indicator, attribute_names = data.get_data(
    dataset_format="dataframe",
    target=data.default_target_attribute,
)

# Data Preprocessing

In [2]:
from copy import deepcopy
import numpy as np

# Partition the feature names using the categorical flags returned with the dataset.
nominal = [name for is_cat, name in zip(categorical_indicator, attribute_names) if is_cat]
numerical = [name for is_cat, name in zip(categorical_indicator, attribute_names) if not is_cat]

# Work on a copy so the raw feature frame X is left untouched.
encoded_data = deepcopy(X)

for col in nominal:
    # Integer-code every distinct value, starting at 1 in order of first
    # appearance; 0 stays free as the fill value for missing entries below.
    mapping = {c: i + 1 for i, c in enumerate(encoded_data[col].unique())}
    encoded_data[col] = encoded_data[col].replace(mapping)
    try:
        # A categorical column only accepts declared categories, so 0 has to
        # be registered before the fillna(0) at the end of this cell.
        encoded_data[col] = encoded_data[col].cat.add_categories([0])
    except (AttributeError, ValueError):
        # AttributeError: the column is not categorical (no .cat accessor).
        # ValueError: 0 is already one of the categories.
        # In both cases there is nothing to add. Any other error (and
        # KeyboardInterrupt) now propagates instead of being silently dropped.
        pass

# Put the numerical columns first, then the nominal ones, and fill missing values with 0.
encoded_data = encoded_data[numerical + nominal].fillna(0)

In [3]:
# Integer-encode the target: each distinct label is replaced by the position
# of its first appearance in y (0, 1, ...).
distinct_labels = y.unique()
mapping = dict(zip(distinct_labels, range(len(distinct_labels))))

y = y.replace(mapping)

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split

# Two stratified splits with the same seed: first hold out 30% of the rows,
# then cut that hold-out in half to obtain the test and dev sets.
split_seed = 42

train_data, left_out, train_label, y_left_out = train_test_split(
    encoded_data,
    y,
    test_size=0.3,
    stratify=y,
    random_state=split_seed,
)
test_data, dev_data, test_label, dev_label = train_test_split(
    left_out,
    y_left_out,
    test_size=0.5,
    stratify=y_left_out,
    random_state=split_seed,
)


In [5]:
from sklearn.preprocessing import Normalizer, StandardScaler

# Standardise the numerical columns. The scaler is fitted on the training
# split only, and the same fitted scaler is then applied to all three splits.
nn = StandardScaler().fit(train_data[numerical])

for split in (train_data, dev_data, test_data):
    # Assigning through the frame object overwrites the columns of the
    # original split in place.
    split[numerical] = nn.transform(split[numerical])

In [6]:
from imblearn.over_sampling import RandomOverSampler

# Balance the training split by randomly re-sampling the minority class.
# Only the training data is resampled; dev_data and test_data are not touched.
# random_state is fixed so the resampled set, and every metric computed
# downstream of it, is reproducible after a kernel restart, in line with the
# seeded train/dev/test splits.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_over, y_over = oversample.fit_resample(train_data, train_label)

## TabNet

In [7]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from pytorch_tabnet.tab_model import TabNetClassifier

# Train TabNet once per seed (0-4). The dev split is passed as the evaluation
# set with patience=10, and each fitted model is scored on the test split.
# The test ROC-AUC of every run is collected in `aucs`.
aucs = []
for r in range(5):
    clf = TabNetClassifier(seed=r)
    clf.fit(
        X_over.values,
        y_over.values,
        eval_set=[(dev_data.values, dev_label.values)],
        patience=10,
    )

    test_matrix = test_data.values
    preds = clf.predict(test_matrix)

    # Column 1 of predict_proba is used as the score for the label encoded as 1.
    tabnet_pred_prob = clf.predict_proba(test_matrix)[:, 1]
    roc = roc_auc_score(test_label, tabnet_pred_prob)

    true_labels = test_label.tolist()
    pred_labels = preds.tolist()
    prec = precision_score(true_labels, pred_labels, average='macro')
    recall = recall_score(true_labels, pred_labels, average='macro')
    f_score = f1_score(true_labels, pred_labels, average='macro')

    aucs.append(roc)

    print('roc is : {}\nprec {}\nrecall {}\nf-score {}'.format(roc, prec, recall, f_score))

epoch 0  | loss: 0.63984 | val_0_auc: 0.79251 |  0:00:03s
epoch 1  | loss: 0.54791 | val_0_auc: 0.81473 |  0:00:05s
epoch 2  | loss: 0.52802 | val_0_auc: 0.82344 |  0:00:06s
epoch 3  | loss: 0.51133 | val_0_auc: 0.83073 |  0:00:07s
epoch 4  | loss: 0.49654 | val_0_auc: 0.84365 |  0:00:08s
epoch 5  | loss: 0.48282 | val_0_auc: 0.84679 |  0:00:09s
epoch 6  | loss: 0.46772 | val_0_auc: 0.85469 |  0:00:10s
epoch 7  | loss: 0.45423 | val_0_auc: 0.86379 |  0:00:11s
epoch 8  | loss: 0.4379  | val_0_auc: 0.89178 |  0:00:12s
epoch 9  | loss: 0.4079  | val_0_auc: 0.90698 |  0:00:14s
epoch 10 | loss: 0.38636 | val_0_auc: 0.91231 |  0:00:15s
epoch 11 | loss: 0.36134 | val_0_auc: 0.92157 |  0:00:16s
epoch 12 | loss: 0.3539  | val_0_auc: 0.90968 |  0:00:17s
epoch 13 | loss: 0.34717 | val_0_auc: 0.93247 |  0:00:18s
epoch 14 | loss: 0.32291 | val_0_auc: 0.93311 |  0:00:19s
epoch 15 | loss: 0.32151 | val_0_auc: 0.93353 |  0:00:20s
epoch 16 | loss: 0.31649 | val_0_auc: 0.93592 |  0:00:21s
epoch 17 | los

epoch 3  | loss: 0.48736 | val_0_auc: 0.84995 |  0:00:05s
epoch 4  | loss: 0.47227 | val_0_auc: 0.85893 |  0:00:06s
epoch 5  | loss: 0.46204 | val_0_auc: 0.88006 |  0:00:08s
epoch 6  | loss: 0.43237 | val_0_auc: 0.87107 |  0:00:09s
epoch 7  | loss: 0.40461 | val_0_auc: 0.90178 |  0:00:11s
epoch 8  | loss: 0.37902 | val_0_auc: 0.90339 |  0:00:12s
epoch 9  | loss: 0.35    | val_0_auc: 0.92705 |  0:00:14s
epoch 10 | loss: 0.33631 | val_0_auc: 0.92884 |  0:00:15s
epoch 11 | loss: 0.32257 | val_0_auc: 0.91848 |  0:00:17s
epoch 12 | loss: 0.3185  | val_0_auc: 0.93085 |  0:00:18s
epoch 13 | loss: 0.31436 | val_0_auc: 0.92976 |  0:00:20s
epoch 14 | loss: 0.306   | val_0_auc: 0.92915 |  0:00:21s
epoch 15 | loss: 0.3062  | val_0_auc: 0.93528 |  0:00:23s
epoch 16 | loss: 0.29977 | val_0_auc: 0.91744 |  0:00:24s
epoch 17 | loss: 0.29831 | val_0_auc: 0.93978 |  0:00:26s
epoch 18 | loss: 0.29117 | val_0_auc: 0.94194 |  0:00:27s
epoch 19 | loss: 0.28691 | val_0_auc: 0.92405 |  0:00:29s
epoch 20 | los

In [8]:
# Report the per-seed TabNet test ROC-AUCs followed by their mean and standard deviation.
auc_mean = np.mean(aucs)
auc_std = np.std(aucs)

print(aucs)
print(f"{auc_mean} +- {auc_std}")

[0.9421035940803384, 0.9413742071881607, 0.9424875415282392, 0.9397160978556327, 0.9399018423437028]
0.9411166565992148 +- 0.0011275674574728301


## Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

from tqdm import tqdm
 
# Random forest baseline: fit on the oversampled training rows together with
# the dev rows, once per seed (0-4), and score every fit on the test split.
# The test ROC-AUC of every run is collected in `r_aucs`.
comb_data = pd.concat([X_over, dev_data])
comb_labels = pd.concat([y_over, dev_label])

r_aucs = []
for r in range(5):
    rf_clf = RandomForestClassifier(random_state=r)
    rf_clf.fit(comb_data, comb_labels)

    rf_pred = rf_clf.predict(test_data)
    # Accuracy is computed here but is not part of the printed report.
    test_ac = rf_clf.score(test_data, test_label)

    prec = precision_score(test_label, rf_pred, average='macro')
    recall = recall_score(test_label, rf_pred, average='macro')
    f_score = f1_score(test_label, rf_pred, average='macro')

    # Column 1 of predict_proba is used as the score for the label encoded as 1.
    rf_pred_prob = rf_clf.predict_proba(test_data)[:, 1]
    weighted_roc = roc_auc_score(test_label, rf_pred_prob)

    r_aucs.append(weighted_roc)
    print('roc is : {}\nprec {}\nrecall {}\nf-score {}\n'.format(weighted_roc, prec, recall, f_score))


roc is : 0.9089021443672607
prec 0.8466624941482974
recall 0.8086416490486258
f-score 0.823595574486512

roc is : 0.9094008607671399
prec 0.8425906258850184
recall 0.8068974630021142
f-score 0.8210843316099036

roc is : 0.9090557988523106
prec 0.843231172853929
recall 0.8105708245243128
f-score 0.8238117939486223

roc is : 0.9115844910903051
prec 0.8472310833226999
recall 0.8123150105708246
f-score 0.8263234986398827

roc is : 0.9089538659015404
prec 0.8418369932432432
recall 0.8055987617034128
f-score 0.81994159365874



In [11]:
# Report the per-seed random forest test ROC-AUCs followed by their mean and standard deviation.
rf_auc_mean = np.mean(r_aucs)
rf_auc_std = np.std(r_aucs)

print(r_aucs)
print(f"{rf_auc_mean} +- {rf_auc_std}")

[0.9089021443672607, 0.9094008607671399, 0.9090557988523106, 0.9115844910903051, 0.9089538659015404]
0.9095794321957114 +- 0.0010174863925559942


## XGBoost

In [12]:
import xgboost as xgb
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

# XGBoost baseline: one fit on the oversampled training rows together with the
# dev rows, evaluated on the test split.
xg_boost = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False)

comb_data = pd.concat([X_over, dev_data])
comb_labels = pd.concat([y_over, dev_label])

# Convert the feature frames to float arrays once and reuse them for every call.
train_matrix = comb_data.values.astype(float)
test_matrix = test_data.values.astype(float)

xgb_clf = xg_boost.fit(train_matrix, comb_labels)
test_acc = xgb_clf.score(test_matrix, test_label)
test_pred = xgb_clf.predict(test_matrix)

prec = precision_score(test_label, test_pred, average='macro')
recall = recall_score(test_label, test_pred, average='macro')
f_score = f1_score(test_label, test_pred, average='macro')

# Column 1 of predict_proba is used as the score for the label encoded as 1.
pred_score = xgb_clf.predict_proba(test_matrix)[:, 1]
weighted_roc = roc_auc_score(test_label, pred_score)

print(f'XGB test_acc: {test_acc}\nrecall: {recall}\nprecision: {prec}\nf_score: {f_score}\n ROC: {weighted_roc}')

XGB test_acc: 0.8835341365461847
recall: 0.8525747508305648
precision: 0.8701088093743461
f_score: 0.8604895404829562
 ROC: 0.9342615524010873
