In [15]:
import pandas as pd
import torch
import optuna

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.tab_model import TabNetClassifier

In [16]:
N_ROWS=10000

In [17]:
df=pd.read_csv('data/application_train.csv', nrows=N_ROWS)

In [18]:
X=df.loc[:, ~df.columns.isin(['TARGET'])]
y=df['TARGET'].astype(int)

In [19]:
# using the train test split function
X_train, X_test,y_train, y_test = train_test_split(X,y,random_state=104,test_size=0.25,shuffle=True)

In [20]:
def ObjectiveR(trial):
    
    """
    regression
    """
    mask_type = trial.suggest_categorical("mask_type", ["entmax", "sparsemax"])
    n_da = trial.suggest_int("n_da", 56, 64, step=4)
    n_steps = trial.suggest_int("n_steps", 1, 3, step=1)
    gamma = trial.suggest_float("gamma", 1., 1.4, step=0.2)
    n_shared = trial.suggest_int("n_shared", 1, 3)
    lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True)
    tabnet_params = dict(n_d=n_da, n_a=n_da, n_steps=n_steps, gamma=gamma,
                     lambda_sparse=lambda_sparse, optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type=mask_type, n_shared=n_shared,
                     scheduler_params=dict(mode="min",
                                           patience=trial.suggest_int("patienceScheduler",low=3,high=10), # changing sheduler patience to be lower than early stopping patience 
                                           min_lr=1e-5,
                                           factor=0.5,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=0,
                     ) #early stopping
    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    CV_score_array    =[]
    for train_index, test_index in kf.split(X):
        X_train, X_valid = X[train_index], X[test_index]
        y_train, y_valid = y[train_index], y[test_index]
        regressor = TabNetRegressor(**tabnet_params)
        regressor.fit(X_train=X_train, y_train=y_train,
                  eval_set=[(X_valid, y_valid)],
                  patience=trial.suggest_int("patience",low=15,high=30), max_epochs=trial.suggest_int('epochs', 1, 100),
                  eval_metric=['rmse'])
        CV_score_array.append(regressor.best_cost)
    avg = np.mean(CV_score_array)
    return avg

In [21]:
def ObjectiveC(trial):
    
    """
    classification
    """
    mask_type = trial.suggest_categorical("mask_type", ["entmax", "sparsemax"])
    n_da = trial.suggest_int("n_da", 56, 64, step=4)
    n_steps = trial.suggest_int("n_steps", 1, 3, step=1)
    gamma = trial.suggest_float("gamma", 1., 1.4, step=0.2)
    n_shared = trial.suggest_int("n_shared", 1, 3)
    lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True)
    tabnet_params = dict(n_d=n_da, n_a=n_da, n_steps=n_steps, gamma=gamma,
                     lambda_sparse=lambda_sparse, optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type=mask_type, n_shared=n_shared,
                     scheduler_params=dict(mode="min",
                                           patience=trial.suggest_int("patienceScheduler",low=3,high=10), # changing sheduler patience to be lower than early stopping patience 
                                           min_lr=1e-5,
                                           factor=0.5,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=0,
                     ) #early stopping
    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    CV_score_array    =[]
    for train_index, test_index in kf.split(X):
        X_train, X_valid = X[train_index], X[test_index]
        y_train, y_valid = y[train_index], y[test_index]
        classifier = TabNetClassifier(**tabnet_params)
        classifier.fit(X_train=X_train, y_train=y_train,
                  eval_set=[(X_valid, y_valid)],
                  patience=trial.suggest_int("patience",low=15,high=30), max_epochs=trial.suggest_int('epochs', 1, 100),
                  eval_metric=['auc'])
        CV_score_array.append(classifier.best_cost)
    avg = np.mean(CV_score_array)
    return avg

In [22]:
numeric_feature_mask = X.dtypes!=object
numeric_features = X.columns[numeric_feature_mask].tolist()

In [23]:
x_train=X_train[numeric_features].fillna(0).values
x_test=X_test[numeric_features].fillna(0).values

In [24]:
classifier = TabNetClassifier(verbose=10,seed=42)
classifier.fit(X_train=x_train, y_train=y_train, eval_set=[(x_test, y_test)],
               patience=5,max_epochs=40, eval_metric=['auc'], batch_size=2)



epoch 0  | loss: 0.32575 | val_0_auc: 0.47666 |  0:00:46s
epoch 10 | loss: 0.2811  | val_0_auc: 0.54913 |  0:08:05s

Early stopping occurred at epoch 12 with best_epoch = 7 and best_val_0_auc = 0.58946




In [25]:
# don't have enough cpu or memory to run this on my computer
# study = optuna.create_study(direction="minimize", study_name='TabNet optimization')
# study.optimize(Objective, timeout=6*1) #6 mins

In [26]:
# scoring on test dataset
test_df=pd.read_csv('data/application_test.csv')

X_val=test_df[numeric_features].fillna(0).values
submission_df=test_df[['SK_ID_CURR']]
submission_df['TARGET']=classifier.predict(X_val)
submission_df.to_csv('submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df['TARGET']=classifier.predict(X_val)


In [27]:
submission_df['TARGET'].sum()

101