In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
import torch.optim 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
cols = [
    'class_label', 'lepton_pt', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude',
    'missing_energy_phi', 'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag',
    'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag', 'jet_3_pt',
    'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag', 'jet_4_pt', 'jet_4_eta',
    'jet_4_phi', 'jet_4_b-tag', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv',
    'm_bb', 'm_wbb', 'm_wwbb'
]

df = pd.read_csv('data/HIGGS_train.csv', header=None, names=cols)
df.info()

  df = pd.read_csv('data/HIGGS_train.csv', header=None, names=cols)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   class_label               600000 non-null  float64
 1   lepton_pt                 600000 non-null  float64
 2   lepton_eta                600000 non-null  float64
 3   lepton_phi                600000 non-null  float64
 4   missing_energy_magnitude  600000 non-null  float64
 5   missing_energy_phi        600000 non-null  float64
 6   jet_1_pt                  600000 non-null  float64
 7   jet_1_eta                 600000 non-null  float64
 8   jet_1_phi                 600000 non-null  object 
 9   jet_1_b-tag               600000 non-null  float64
 10  jet_2_pt                  600000 non-null  float64
 11  jet_2_eta                 600000 non-null  float64
 12  jet_2_phi                 600000 non-null  float64
 13  jet_2_b-tag               600000 non-null  f

In [3]:
#count the number of missing values in each column
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df.isnull().sum()
#remove rows with missing values
df.dropna(inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 599996 entries, 0 to 599999
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   class_label               599996 non-null  float64
 1   lepton_pt                 599996 non-null  float64
 2   lepton_eta                599996 non-null  float64
 3   lepton_phi                599996 non-null  float64
 4   missing_energy_magnitude  599996 non-null  float64
 5   missing_energy_phi        599996 non-null  float64
 6   jet_1_pt                  599996 non-null  float64
 7   jet_1_eta                 599996 non-null  float64
 8   jet_1_phi                 599996 non-null  float64
 9   jet_1_b-tag               599996 non-null  float64
 10  jet_2_pt                  599996 non-null  float64
 11  jet_2_eta                 599996 non-null  float64
 12  jet_2_phi                 599996 non-null  float64
 13  jet_2_b-tag               599996 non-null  f

In [4]:
X = df.iloc[:, 1:].values 
y = df.iloc[:, 0].values

#split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=313)

X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=313)


In [15]:
logreg = LogisticRegression(random_state=313).fit(X_train, y_train)


In [16]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))

0.7323525816953522
0.7580958911611273


0.7328596269979277
0.7580958911611273


In [10]:
#tabnet pretraining
unsupervised_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax' # "sparsemax"
)

unsupervised_model.fit(
    X_train=X_train,
    eval_set=[X_val],
    pretraining_ratio=0.8,
)





epoch 0  | loss: 1.20806 | val_0_unsup_loss_numpy: 0.8765199780464172|  0:00:42s
epoch 1  | loss: 0.94531 | val_0_unsup_loss_numpy: 0.874809980392456|  0:01:33s
epoch 2  | loss: 0.93886 | val_0_unsup_loss_numpy: 0.8457000255584717|  0:02:03s
epoch 3  | loss: 0.93382 | val_0_unsup_loss_numpy: 0.8651400208473206|  0:02:33s
epoch 4  | loss: 0.9317  | val_0_unsup_loss_numpy: 0.8500400185585022|  0:03:04s
epoch 5  | loss: 0.92804 | val_0_unsup_loss_numpy: 0.8390399813652039|  0:03:33s
epoch 6  | loss: 0.92585 | val_0_unsup_loss_numpy: 0.8735700249671936|  0:04:03s
epoch 7  | loss: 0.92717 | val_0_unsup_loss_numpy: 0.8381699919700623|  0:04:33s
epoch 8  | loss: 0.92484 | val_0_unsup_loss_numpy: 0.8323100209236145|  0:05:00s
epoch 9  | loss: 0.92409 | val_0_unsup_loss_numpy: 0.8367400169372559|  0:05:30s
epoch 10 | loss: 0.92414 | val_0_unsup_loss_numpy: 0.8262500166893005|  0:05:57s
epoch 11 | loss: 0.92439 | val_0_unsup_loss_numpy: 0.8359900116920471|  0:06:26s
epoch 12 | loss: 0.92449 | va



In [11]:
unsupervised_model.save_model('./test_pretrain')


Successfully saved model at ./test_pretrain.zip


'./test_pretrain.zip'

In [12]:

clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, # how to use learning rate scheduler
                      "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax', # This will be overwritten if using pretrain model
    verbose=5
)

clf.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy'],    
    batch_size=1024,
    virtual_batch_size=128,
    from_unsupervised=un
)




NameError: name 'unsupervised_model' is not defined

In [None]:
clf = TabNetClassifier()  
clf.fit(
  X_train, y_train,
  eval_set=[(X_val, y_val)]
)

In [13]:
saving_path_name = "./tabnet_with_pretrain"
saved_filepath = clf.save_model(saving_path_name)


Successfully saved model at ./tabnet_with_pretrain.zip


tabnet acc: 0.7394429969055384
xgboost acc: 0.7328596269979277
logistic regression acc: 0.6398035544641915


In [17]:
unsupervised_M = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=0.02),
    verbose=True,
    n_a=128,
    n_d=128,
    n_steps=20
)

unsupervised_M.fit(
    X_train=X_train,
    eval_set=[X_val],
    pretraining_ratio=0.8,
    batch_size=8192,
    virtual_batch_size=256,
)

unsupervised_M.save_model('./test_pretrain_M')

model_M = TabNetClassifier(n_d=96, n_a=32, lambda_sparse=0.000001,  n_steps=8, gamma=2.0, from_unsupervised=unsupervised_M)
model_M.fit(X_train, y_train, eval_set=[(X_val, y_val)], batch_size=8192, virtual_batch_size=256, )

epoch 0  | loss: 89.09007| val_0_unsup_loss_numpy: 1.776039958000183|  0:12:22s
epoch 1  | loss: 2.88749 | val_0_unsup_loss_numpy: 1.5425200462341309|  0:34:06s
epoch 2  | loss: 2.15739 | val_0_unsup_loss_numpy: 1.0488799810409546|  0:46:17s
epoch 3  | loss: 1.16372 | val_0_unsup_loss_numpy: 0.9975799918174744|  0:58:30s
epoch 4  | loss: 0.97847 | val_0_unsup_loss_numpy: 0.86735999584198|  1:10:34s


In [None]:
#!/bin/bash
#SBATCH --partition=gpu
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu
#SBATCH --mem=12000
#SBATCH --time=0-03:00:00


In [19]:
from sklearn.metrics import roc_auc_score


preds = clf.predict(X_test)

print(f'tabnet train acc: {sum(clf.predict(X_train) == y_train) / len(y_train)}')
print(f'tabnet acc: {sum(preds == y_test) / len(y_test)}')
print(f'xgboost acc: {model.score(X_test, y_test)}')
y_pred = model.predict_proba(X_test)[:,1]
model.predict_proba(X_test)
print(f'xgboost auc: {roc_auc_score(y_test, y_pred)}')

tabnet train acc: 0.7482315349871547
tabnet acc: 0.7409415660174002
xgboost acc: 0.7323525816953522
xgboost auc: 0.8137412268368229
