In [1]:
# Installing TabNet through PyTorch
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-2.0.1-py3-none-any.whl (30 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-2.0.1
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
# importing libraries
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [3]:
# Libraries for pre-processing and evaluation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

## Data loading

In [4]:
train_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
train_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')

In [5]:
test_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')
test_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')

In [6]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

## Feature engineering

# Merging transaction data with identity data

In [7]:
train = pd.merge(train_transaction,train_identity,on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

In [8]:
print('train dataset shape', train.shape)
print('test dataset shape', test.shape)

train dataset shape (590540, 434)
test dataset shape (506691, 433)


In [9]:
del train_identity, train_transaction, test_identity, test_transaction

In [10]:
# solving incosistent naming of columns

for i in range(1,39):
    if i < 10:
          test.rename(columns = {"id-0"+str(i) : "id_0"+str(i)}, inplace=True)
    else:
          test.rename(columns = {"id-"+str(i) : "id_"+str(i)}, inplace = True)

### Droping columns having null values greater than 90%

In [11]:
cols_lotof_nulls = [c for c in train.columns if (train[c].isnull().sum() / train.shape[0])>0.90]
cols_lotof_nulls_test = [c for c in test.columns if (test[c].isnull().sum() / test.shape[0])>0.90]


cols_to_drop = list(set(cols_lotof_nulls+ cols_lotof_nulls_test))
len(cols_to_drop)

train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

In [12]:
# taking categorical colums to give as paratmeter to TabNet classifier
cat_cols = list(train.select_dtypes(['object']).columns)

len(cat_cols)



29

## Encoding categorical variables

In [13]:
# Using Label encoder to convert categorical to numerical
for col in cat_cols:
  if col in train.columns:
    le = LabelEncoder()
    le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
    train[col] = le.transform(list(train[col].astype('str').values))
    test[col] = le.transform(list(test[col].astype('str').values))

In [14]:
# Replacing inf values if any
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

In [15]:
# Removing columns if they dont have atleast 100000 non-Nan values
train = train.dropna(axis=1, thresh = 250000)


In [16]:
# Removing Unnecessary columsn for traing data
X = train.drop(['isFraud', 'TransactionID'], axis=1)
y = train.isFraud

# For saving memory
del train

In [17]:
# Custom Loss Function
def log_loss_score(actual, predicted,  eps=1e-15):
    p1 = actual * np.log(predicted+eps)
    p0 = (1-actual) * np.log(1-predicted+eps)
    loss = p0 + p1

    return -loss.mean()

In [18]:
# Filling Nan values with -1
X = X.fillna(-1)

In [19]:
# Training model with kfold strategy
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit
strategy = "KFOLD"
num_ensembling = 1
device = 'cuda'
EPOCHS = 5
SPLITS = 5
save_name = 'tabnet'
if strategy == "KFOLD":
    oof_preds_all = []
    oof_targets_all = []
    scores_all =  []
    scores_auc_all= []
    for seed in range(num_ensembling):
        print("## SEED : ", seed)
        skf = TimeSeriesSplit(n_splits=SPLITS)
        oof_preds = []
        oof_targets = []
        scores = []
        scores_auc = []
        for j, (train_idx, val_idx) in enumerate(skf.split(X, y)):
            print("FOLDS : ", j)

            X_train = torch.tensor(X.iloc[train_idx].values)
            y_train = torch.tensor(y[train_idx].values)
            X_val, y_val = torch.tensor(X.iloc[val_idx].values), torch.tensor(y[val_idx].values)
            model = TabNetClassifier(n_d=8, n_a=8, n_steps=1, gamma=1.3,
                                     lambda_sparse=0,optimizer_fn=torch.optim.Adam,
                                   optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                                     mask_type='entmax', device_name=device, output_dim=1,
                                     scheduler_params=dict(milestones=[100,150], gamma=0.9), 
                                     scheduler_fn=torch.optim.lr_scheduler.MultiStepLR)
            #'sparsemax'
            
            model.fit(X_train=X_train, y_train=y_train,  eval_set=[(X_train, y_train), (X_val, y_val)],max_epochs=EPOCHS,
                      patience=20, batch_size=1024, virtual_batch_size=128, eval_name=['train', 'valid'],)

            preds = model.predict(X_val)
            score = log_loss_score(y_val, preds)
            name = save_name + f"_fold{j}_{seed}"
            model.save_model(name)
            ## save oof to compute the CV later
            oof_preds.append(preds)
            oof_targets.append(y_val)
            scores.append(score)
            roc_ = roc_auc_score(y_val,preds)
            scores_auc.append(roc_)
            print(f"validation fold {j} : {score}, roc AUC Score: {roc_}")
        oof_preds_all.append(np.concatenate(oof_preds))
        
        oof_targets_all.append(np.concatenate(oof_targets))
        scores_all.append(np.array(scores))
        scores_auc_all.append(np.array(scores_auc))

## SEED :  0
FOLDS :  0
Device used : cuda
epoch 0  | loss: 0.1413  | train_auc: 0.76303 | valid_auc: 0.77069 |  0:00:07s
epoch 1  | loss: 0.10511 | train_auc: 0.80874 | valid_auc: 0.78537 |  0:00:14s
epoch 2  | loss: 0.09985 | train_auc: 0.8248  | valid_auc: 0.75022 |  0:00:20s
epoch 3  | loss: 0.09529 | train_auc: 0.84058 | valid_auc: 0.76451 |  0:00:26s
epoch 4  | loss: 0.09235 | train_auc: 0.84593 | valid_auc: 0.81941 |  0:00:32s
Stop training because you reached max_epochs = 5 with best_epoch = 4 and best_valid_auc = 0.81941
Best weights from best epoch are automatically used!
Successfully saved model at tabnet_fold0_0.zip
validation fold 0 : 1.0225861273764274, roc AUC Score: 0.5745969338437263
FOLDS :  1
Device used : cuda
epoch 0  | loss: 0.13073 | train_auc: 0.80456 | valid_auc: 0.82225 |  0:00:12s
epoch 1  | loss: 0.10394 | train_auc: 0.82462 | valid_auc: 0.82495 |  0:00:23s
epoch 2  | loss: 0.09838 | train_auc: 0.85278 | valid_auc: 0.8306  |  0:00:35s
epoch 3  | loss: 0.0944

In [20]:
# Prediction on test set
X_test = test[X.columns].fillna(-1)
X_test = torch.tensor(X_test.values)
preds = model.predict_proba(X_test)

sub = pd.DataFrame({'TransactionID': test['TransactionID'].values.tolist(),
                    'isFraud': preds[:,1].tolist()
                   })

sub.to_csv('submission.csv', index=False)