# This notebook presents one solution, built with [TabNet](https://arxiv.org/pdf/1908.07442.pdf), to the "[IDAO 2021 ML Bootcamp](https://idao.world/bootcamp/)" Kaggle competition on the [Insomnia](https://www.kaggle.com/c/idao-2022-bootcamp-insomnia/submit) dataset.

In [None]:
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

from pytorch_tabnet.tab_model import TabNetRegressor

In [None]:
# Directory that holds the competition CSV files.
# NOTE(review): '/content' looks like the Colab working directory — change this
# single constant to run the notebook somewhere else.
DATA_DIR = '/content'

train = pd.read_csv(f'{DATA_DIR}/TRAIN.csv')
test = pd.read_csv(f'{DATA_DIR}/TEST.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Print the shape of every loaded frame on one line, then preview the training data.
print(" | ".join(
    f"{frame_name}.shape = {frame.shape}"
    for frame_name, frame in (
        ("train", train),
        ("test", test),
        ("sample_submission", sample_submission),
    )
))
train.head(7)

train, test, sample_submission shapes
(70000, 13) (30000, 12) (30000, 2)


Unnamed: 0,id,age,weight,height,sex,stress,doctor,sport,pernicious_1,pernicious_2,ubp,lbp,insomnia
0,0,50.35729,62.0,168,2,1,1,1,0,0,110,80,0
1,1,55.381246,85.0,156,1,3,1,1,0,0,140,90,1
2,2,51.627652,64.0,165,1,3,1,0,0,0,130,70,1
3,3,48.249144,82.0,169,2,1,1,1,0,0,150,100,1
4,4,47.841205,56.0,156,1,1,1,0,0,0,100,60,0
5,8,59.997262,67.0,151,1,2,2,0,0,0,120,80,0
6,9,60.542094,93.0,157,1,3,1,1,0,0,130,80,0


In [None]:
# Remove the columns that are not used as features:
#   *doctor* - correlated with *stress*, but less correlated with the target
#   *id*     - dropped together with it from both frames
unused_columns = ['doctor', 'id']
train = train.drop(unused_columns, axis=1)
test = test.drop(unused_columns, axis=1)
train.head()

Unnamed: 0,age,weight,height,sex,stress,sport,pernicious_1,pernicious_2,ubp,lbp,insomnia
0,50.35729,62.0,168,2,1,1,0,0,110,80,0
1,55.381246,85.0,156,1,3,1,0,0,140,90,1
2,51.627652,64.0,165,1,3,0,0,0,130,70,1
3,48.249144,82.0,169,2,1,1,0,0,150,100,1
4,47.841205,56.0,156,1,1,0,0,0,100,60,0


In [None]:
# fix nulls
# Only the test frame is patched in this cell.
# NOTE(review): train is left untouched here — confirm it contains no nulls.

# pernicious_1 & pernicious_2 correlate with *sex* a little,
# so fill their nulls with regard to it:
# 1 for *sex* == 2 and 0, otherwise
for pernicious_col in ['pernicious_1', 'pernicious_2']:
    missing_and_sex_2 = test[pernicious_col].isna() & (test['sex'] == 2)
    test.loc[missing_and_sex_2, pernicious_col] = 1
    # every null that is still left belongs to a row with *sex* != 2
    test[pernicious_col] = test[pernicious_col].fillna(0)

# fill in the missing *sport* values with the column median.
# Only nulls are replaced: assigning the median to the whole column would
# overwrite every observed value and turn the feature into a constant.
test['sport'] = test['sport'].fillna(test['sport'].median())

In [None]:
# Preview train again; the null-filling cell above only assigns to `test`,
# so this frame is unchanged by it.
train.head()

Unnamed: 0,age,weight,height,sex,stress,sport,pernicious_1,pernicious_2,ubp,lbp,insomnia
0,50.35729,62.0,168,2,1,1,0,0,110,80,0
1,55.381246,85.0,156,1,3,1,0,0,140,90,1
2,51.627652,64.0,165,1,3,0,0,0,130,70,1
3,48.249144,82.0,169,2,1,1,0,0,150,100,1
4,47.841205,56.0,156,1,1,0,0,0,100,60,0


In [None]:
# one-hot-encoding categorical features

# *stress* is expanded into dummy columns; `stress_3` is dropped so that it
# acts as the reference level.
stress_train_ohe = pd.get_dummies(train['stress'], prefix='stress').drop(columns='stress_3')

# Align the test dummies to the train dummy columns instead of encoding the two
# frames independently: a level missing from test becomes an all-zero column,
# and a level seen only in test is discarded, so both frames always expose the
# same features in the same order.
stress_test_ohe = (
    pd.get_dummies(test['stress'], prefix='stress')
    .reindex(columns=stress_train_ohe.columns, fill_value=0)
)

train = train.drop(columns='stress')
test = test.drop(columns='stress')

# join on the index: the dummy frames carry the index of the frame they came from
train = train.join(stress_train_ohe)
test = test.join(stress_test_ohe)

In [None]:
# Preview train after the encoding cell: `stress` has been dropped and the
# dummy columns produced from it have been joined on.
train.head()

Unnamed: 0,age,weight,height,sex,sport,pernicious_1,pernicious_2,ubp,lbp,insomnia,stress_1,stress_2
0,50.35729,62.0,168,2,1,0,0,110,80,0,1,0
1,55.381246,85.0,156,1,1,0,0,140,90,1,0,0
2,51.627652,64.0,165,1,0,0,0,130,70,1,0,0
3,48.249144,82.0,169,2,1,0,0,150,100,1,1,0
4,47.841205,56.0,156,1,0,0,0,100,60,0,1,0


In [None]:
# Utility function to transform a ``dataframe`` into a 2-D ``np.array``

def transform_df_to_np_arr(df_inputs):
  """Convert a feature DataFrame into a NumPy array of feature vectors.

  Parameters
  ----------
  df_inputs : pd.DataFrame
      One row per sample, one column per feature.

  Returns
  -------
  np.ndarray
      Array of shape (n_rows, n_columns); row ``i`` holds the values of the
      ``i``-th row of ``df_inputs`` in column order. The array is a copy.

  Notes
  -----
  The conversion is done positionally in one vectorised call, so it does not
  depend on the index labels being unique and needs no per-row Python loop.
  """
  features_vectors = df_inputs.to_numpy(copy=True)

  # Columns that mix bool with numeric dtypes come back as an ``object`` array;
  # rebuilding from nested lists lets NumPy promote them to a numeric dtype.
  if features_vectors.dtype == object:
    features_vectors = np.array(features_vectors.tolist())

  return features_vectors

0it [00:00, ?it/s]

array([[ 50.35728953,  62.        , 168.        ,   2.        ,
          1.        ,   0.        ,   0.        , 110.        ,
         80.        ,   1.        ,   0.        ],
       [ 55.38124572,  85.        , 156.        ,   1.        ,
          1.        ,   0.        ,   0.        , 140.        ,
         90.        ,   0.        ,   0.        ]])

In [None]:
# step 1 : Build the model inputs by removing the target column (`insomnia`)
df_inputs = train.drop(columns="insomnia")
df_inputs.head(2)

Unnamed: 0,age,weight,height,sex,sport,pernicious_1,pernicious_2,ubp,lbp,stress_1,stress_2
0,50.35729,62.0,168,2,1,0,0,110,80,1,0
1,55.381246,85.0,156,1,1,0,0,140,90,0,0


In [None]:
# step 2 : Transform the input data from `df` to `np.array`
# (one feature vector per row of `df_inputs`; the first two are displayed as a sanity check)
train_features_vectors = transform_df_to_np_arr(df_inputs)
train_features_vectors[:2]

In [None]:
# step 3 : Extract the target column (`insomnia`) as a standalone `np.array`
targets = train['insomnia'].to_numpy(copy=True)

[0 1]
(70000,)


In [None]:
# step 4 : Hold out 30% of the training rows as a validation set
#          (fixed random_state so the split is the same on every run)
X_train, X_val, y_train, y_val = train_test_split(
    train_features_vectors,
    targets,
    random_state=13,
    test_size=0.3,
)

In [None]:
# step 5 : Train the TabNet model.
# note : TabNetRegressor is used instead of TabNetClassifier because the sample
#        submission takes probabilities/scores, while the classifier's `predict`
#        outputs the class.
#        NOTE(review): TabNetClassifier also exposes `predict_proba` — consider
#        whether it is a better fit for this binary target.
#
# TabNetRegressor expects 2D targets of shape (n_samples, n_targets). The
# targets produced by the split are 1D, so they are reshaped into a single
# column here (a no-op if they are already of shape (n_samples, 1)).

clf = TabNetRegressor()
clf.fit(
  X_train, y_train.reshape(-1, 1),
  eval_set=[(X_val, y_val.reshape(-1, 1))],
  max_epochs=10, patience=10,
  batch_size=32
)

Device used : cuda
Loading weights from unsupervised pretraining




epoch 0  | loss: 0.24101 | val_0_mse: 0.25073 |  0:00:49s
epoch 1  | loss: 0.23605 | val_0_mse: 0.27839 |  0:01:39s
epoch 2  | loss: 0.2367  | val_0_mse: 0.23417 |  0:02:29s
epoch 3  | loss: 0.2357  | val_0_mse: 0.46651 |  0:03:19s
epoch 4  | loss: 0.23564 | val_0_mse: 0.37139 |  0:04:10s
epoch 5  | loss: 0.23553 | val_0_mse: 0.23747 |  0:05:00s
epoch 6  | loss: 0.23525 | val_0_mse: 0.23742 |  0:05:50s
epoch 7  | loss: 0.23496 | val_0_mse: 0.25768 |  0:06:40s
epoch 8  | loss: 0.23491 | val_0_mse: 13.04676|  0:07:31s
epoch 9  | loss: 0.23452 | val_0_mse: 8.48785 |  0:08:21s
epoch 10 | loss: 0.23471 | val_0_mse: 15.52276|  0:09:11s
epoch 11 | loss: 0.23482 | val_0_mse: 13.04274|  0:10:01s
epoch 12 | loss: 0.23527 | val_0_mse: 0.48499 |  0:10:52s

Early stopping occurred at epoch 12 with best_epoch = 2 and best_val_0_mse = 0.23417
Best weights from best epoch are automatically used!


In [None]:
# Step 6 : Do prediction on test set
# Select the test columns in exactly the order of the training features
# (`df_inputs`): the model only sees positions, not column names, so a
# reordered or missing column must fail loudly here rather than silently
# feed the wrong feature into the network.
test_features_vector = transform_df_to_np_arr(test[df_inputs.columns])
preds = clf.predict(test_features_vector)
# flatten the predictions into a plain list, one value per test row
preds = preds.reshape(-1).tolist()
print(preds[:5])

[0.533506453037262, 0.3747526705265045, 0.39136767387390137, 0.5600950717926025, 0.4312325716018677]


In [None]:
# Step 7 : Assemble the submission frame: the ids from the sample submission
#          paired with our predictions (row order is taken as-is)
test_submission = sample_submission[['id']].assign(insomnia=preds)
test_submission.head()

Unnamed: 0,id,insomnia
0,5,0.533506
1,6,0.374753
2,7,0.391368
3,10,0.560095
4,11,0.431233


In [None]:
# Step 8 : Check that our test_submission has the same shape (rows, columns) as the sample_submission.
# The comparison is only displayed as the cell output; it does not stop execution when it is False.
sample_submission.shape == test_submission.shape

True

In [None]:
# Step 9 : Save test_submission.csv so it can be submitted to the Kaggle leaderboard
# (written to the current working directory; index=False keeps the row index out of the file)
test_submission.to_csv('test_submission.csv', index=False)