In [1]:
import pandas as pd
import openml
import data_preprocess as dp

data_name = 'porto-seguro'

data = openml.datasets.get_dataset(42206)

X, y, categorical_indicator, attribute_names = data.get_data(
    target=data.default_target_attribute, dataset_format="dataframe"
)

# Data Preprocessing

In [2]:
from copy import deepcopy

categorical = [b for a, b in zip(categorical_indicator, attribute_names) if a]
numerical = [b for a, b in zip(categorical_indicator, attribute_names) if not a]

encoded_data = deepcopy(X)

for col in categorical:
    
    mapping = {c: i+1 for i, c in enumerate(encoded_data[col].unique())}
    encoded_data[col] = encoded_data[col].replace(mapping)

encoded_data = encoded_data[numerical + categorical]
encoded_data.fillna(0, inplace=True)
encoded_data

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_14,ps_ind_15,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_11,ps_car_12,ps_car_13,...,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat
0,2,5,0,11,0.7,0.2,0.718070,2.0,0.400000,0.883679,...,1,1,1,1,1,1,1,1,1,1
1,1,7,0,3,0.8,0.4,0.766078,3.0,0.316228,0.618817,...,1,1,1,2,2,1,2,2,1,2
2,5,9,0,12,0.0,0.0,0.000000,1.0,0.316228,0.641586,...,1,1,1,2,3,1,2,2,1,3
3,0,2,0,8,0.9,0.2,0.580948,1.0,0.374166,0.542949,...,1,2,1,1,2,1,2,3,1,4
4,0,0,0,9,0.7,0.6,0.840759,3.0,0.316070,0.565832,...,1,1,1,2,3,1,2,2,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595207,3,10,0,13,0.5,0.3,0.692820,3.0,0.374166,0.684631,...,1,1,1,1,9,1,2,1,1,37
595208,5,3,0,6,0.9,0.7,1.382027,2.0,0.387298,0.972145,...,1,1,1,2,6,3,1,2,1,85
595209,1,10,0,12,0.9,0.2,0.659071,3.0,0.397492,0.596373,...,1,1,1,2,9,1,2,2,1,37
595210,5,3,0,12,0.9,0.4,0.698212,3.0,0.374166,0.764434,...,1,1,1,2,2,1,2,2,1,11


In [3]:
mapping = {b: int(i) for i, b in enumerate(y.unique())}


y = y.replace(mapping)
y

0         0
1         0
2         0
3         0
4         0
         ..
595207    0
595208    0
595209    0
595210    0
595211    0
Name: target, Length: 595212, dtype: int64

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split

train_data, left_out, train_label, y_left_out = train_test_split(encoded_data, y, test_size=0.05, random_state=42)
test_data, dev_data, test_label, dev_label = train_test_split(left_out, y_left_out, test_size=0.5, random_state=42)

print(f'train shape: {train_data.shape}\n test shape: {test_data.shape}\n dev shape: {dev_data.shape}')

train shape: (565451, 37)
 test shape: (14880, 37)
 dev shape: (14881, 37)


In [5]:
from sklearn.preprocessing import Normalizer, StandardScaler

nn = StandardScaler()

nn.fit(train_data[numerical])

train_data[numerical] = nn.transform(train_data[numerical])
dev_data[numerical] = nn.transform(dev_data[numerical])
test_data[numerical] = nn.transform(test_data[numerical])

In [6]:
from imblearn.over_sampling import RandomOverSampler

oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(train_data, train_label)

## TabNet

In [7]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from pytorch_tabnet.tab_model import TabNetClassifier

clf = TabNetClassifier()
clf.fit(
  X_over.values, y_over.values,
  eval_set=[(dev_data.values, dev_label.values)],
  patience = 20
)
preds = clf.predict(test_data.values)

tabnet_pred_prob = [i[1] for i in clf.predict_proba(test_data.values)]

roc = roc_auc_score(
    test_label,
    tabnet_pred_prob,
)

prec = precision_score(test_label.tolist(), preds.tolist(), average='macro')
recall = recall_score(test_label.tolist(), preds.tolist(), average='macro')
f_score = f1_score(test_label.tolist(), preds.tolist(), average='macro')


print('roc is : {}\nprec {}\nrecall {}\nf-score {}' .format(roc, prec, recall, f_score))

epoch 0  | loss: 0.67653 | val_0_auc: 0.6325  |  0:01:29s
epoch 1  | loss: 0.66767 | val_0_auc: 0.63386 |  0:03:45s
epoch 2  | loss: 0.66427 | val_0_auc: 0.63656 |  0:06:02s
epoch 3  | loss: 0.66274 | val_0_auc: 0.64472 |  0:08:16s
epoch 4  | loss: 0.66198 | val_0_auc: 0.64314 |  0:10:29s
epoch 5  | loss: 0.65908 | val_0_auc: 0.63757 |  0:12:47s
epoch 6  | loss: 0.65559 | val_0_auc: 0.62361 |  0:15:09s
epoch 7  | loss: 0.65442 | val_0_auc: 0.63695 |  0:17:33s
epoch 8  | loss: 0.65178 | val_0_auc: 0.63558 |  0:20:11s
epoch 9  | loss: 0.6492  | val_0_auc: 0.63256 |  0:22:40s
epoch 10 | loss: 0.64604 | val_0_auc: 0.6283  |  0:25:02s
epoch 11 | loss: 0.6444  | val_0_auc: 0.61782 |  0:27:20s
epoch 12 | loss: 0.64241 | val_0_auc: 0.61245 |  0:29:33s
epoch 13 | loss: 0.64055 | val_0_auc: 0.61886 |  0:31:54s
epoch 14 | loss: 0.63909 | val_0_auc: 0.61323 |  0:34:08s
epoch 15 | loss: 0.63779 | val_0_auc: 0.60414 |  0:36:23s
epoch 16 | loss: 0.63713 | val_0_auc: 0.61708 |  0:38:35s
epoch 17 | los

## Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

from tqdm import tqdm

comb_data = pd.concat([X_over, dev_data])
comb_labels = pd.concat([y_over, dev_label])

rf_clf = RandomForestClassifier(random_state=0).fit(comb_data, comb_labels)

rf_pred = rf_clf.predict(test_data)
test_ac = rf_clf.score(test_data, test_label)
f_score = f1_score(test_label, rf_pred, average='macro')
prec = precision_score(test_label, rf_pred, average='macro')
recall = recall_score(test_label, rf_pred, average='macro')

print(f'XGB test_acc: {test_ac}\nrecall: {recall}\nprecision: {prec}\nf_score: {f_score}\n')
rf_pred_prob = [i[1] for i in rf_clf.predict_proba(test_data)]

weighted_roc = roc_auc_score(
    test_label,
    rf_pred_prob,
)

print(f'ROC: {weighted_roc}')


XGB test_acc: 0.963239247311828
recall: 0.5
precision: 0.481619623655914
f_score: 0.4906377297778386

ROC: 0.5985163423510593


## XGBoost

In [9]:
import xgboost as xgb
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

xg_boost = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False)

comb_data = pd.concat([X_over, dev_data])
comb_labels = pd.concat([y_over, dev_label])

xgb_clf = xg_boost.fit(comb_data, comb_labels)
test_acc = xgb_clf.score(test_data, test_label)
test_pred = xgb_clf.predict(test_data)
f_score = f1_score(test_label, test_pred, average='macro')
prec = precision_score(test_label, test_pred, average='macro')
recall = recall_score(test_label, test_pred, average='macro')

pred_score = [i[1] for i in xgb_clf.predict_proba(test_data)]

weighted_roc = roc_auc_score(
    test_label,
    pred_score,
)

print(f'XGB test_acc: {test_acc}\nrecall: {recall}\nprecision: {prec}\nf_score: {f_score}\n ROC: {weighted_roc}')

XGB test_acc: 0.7043682795698925
recall: 0.5854228445344993
precision: 0.514617183309494
f_score: 0.46255891883181577
 ROC: 0.6221543437109821
