In [1]:
# Record which Python interpreter this notebook was executed with.
!python --version

Python 3.10.12


## Google Colab

In [2]:
import os

from google.colab import drive

# Mount Google Drive into the Colab VM's filesystem.
drive.mount('/content/drive')

# Switch the working directory to the project folder on Drive, so relative
# paths used later in the notebook (e.g. the model save folder) resolve there.
path = "/content/drive/MyDrive/Colab Notebooks/DeepLearning/Final Project"
os.chdir(path)
print(os.getcwd())

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/DeepLearning/Final Project


In [3]:
!pip3 install openml
!pip3 install loguru
!pip3 install -U pytorch_tabular[extra]

Collecting openml
  Downloading openml-0.14.1.tar.gz (131 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.3/131.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.2-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting pycryptodome (from minio->openml)
  Downloading pycryptodome-3.19.1-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: openml, liac-a

In [4]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import random
import numpy as np
import pandas as pd
import os
# %load_ext autoreload
# %autoreload 2

import dataset

# Utility Functions

In [5]:
def make_mixed_classification(n_samples, n_features, n_categories, random_state=42):
    """Build a synthetic classification frame mixing numeric and categorical features.

    The data comes from ``sklearn.datasets.make_classification`` (5 informative
    features). ``n_categories`` distinct columns are then picked at random and
    replaced by their quartile bin code (0-3) so they can act as categorical
    features.

    Args:
        n_samples: Number of rows to generate.
        n_features: Total number of feature columns.
        n_categories: Number of feature columns to turn into categorical codes.
            Must satisfy ``0 <= n_categories <= n_features``.
        random_state: Seed used both for the data generation and for choosing
            which columns become categorical, so the whole frame is reproducible.

    Returns:
        A ``(data, cat_col_names, num_col_names)`` tuple, where ``data`` is a
        DataFrame holding every feature column plus a ``"target"`` column, and
        the two lists hold the categorical / numerical column names in column order.

    Raises:
        ValueError: If ``n_categories`` is outside ``[0, n_features]``.
    """
    if not 0 <= n_categories <= n_features:
        raise ValueError(
            f"n_categories must be between 0 and n_features={n_features}, got {n_categories}"
        )

    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        random_state=random_state,
        n_informative=5,
    )
    n_cols = X.shape[-1]

    # Sample WITHOUT replacement: `random.choices` could return the same index
    # twice, silently producing fewer categorical columns than requested.
    # A dedicated seeded generator keeps the column choice reproducible and
    # leaves the global `random` state untouched.
    rng = random.Random(random_state)
    cat_cols = set(rng.sample(range(n_cols), k=n_categories))

    for col in sorted(cat_cols):
        # Replace the continuous values by their quartile bin index.
        X[:, col] = pd.qcut(X[:, col], q=4).codes.astype(int)

    cat_col_names = [f"cat_col_{i}" for i in range(n_cols) if i in cat_cols]
    num_col_names = [f"num_col_{i}" for i in range(n_cols) if i not in cat_cols]
    col_names = [
        f"cat_col_{i}" if i in cat_cols else f"num_col_{i}" for i in range(n_cols)
    ]

    features = pd.DataFrame(X, columns=col_names)
    data = features.join(pd.Series(y, name="target"))
    return data, cat_col_names, num_col_names

def print_metrics(y_true, y_pred, tag):
    """Print accuracy and F1 score for a set of predictions.

    Args:
        y_true: Ground-truth labels (Series, DataFrame, ndarray or plain sequence).
        y_pred: Predicted labels, in the same order as ``y_true``.
        tag: Label prefixed to each metric in the printed line (e.g. ``"Holdout"``).

    Returns:
        None. The metrics are printed, not returned.
    """
    # `np.asarray(...).ravel()` flattens pandas objects, column vectors and
    # plain lists alike; the previous `.ndim` check failed on plain lists.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    val_acc = accuracy_score(y_true, y_pred)
    # F1 is computed with scikit-learn's default settings.
    val_f1 = f1_score(y_true, y_pred)
    print(f"{tag} Acc: {val_acc} | {tag} F1: {val_f1}")

# Generate Synthetic Data

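First, let's create a synthetic dataset that mixes numerical and categorical features. (The generation cell below is currently commented out; the notebook loads the OpenML `dresses-sales` dataset instead.)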

In [6]:
# data, cat_col_names, num_col_names = make_mixed_classification(n_samples=10000, n_features=20, n_categories=4)
# train, test = train_test_split(data, random_state=42)
# train, val = train_test_split(train, random_state=42)

# print(val)
# print(type(val))

In [7]:
# Per-dataset options handed to the project's `dataset.load_data` helper.
# NOTE(review): there is no entry for 'dresses-sales', the dataset actually
# loaded below -- confirm `load_data` falls back to sensible defaults for it.
dataset_config = {
    'credit-g': {'bin': []},
    'credit-approval': {'bin': []}
}
data, train, val, test, cat_col_names, num_col_names, bin_col_names \
     = dataset.load_data('dresses-sales', dataset_config=dataset_config)

# Cast the column names to built-in `str` before they go into the model config
# (presumably the loader returns non-`str` label objects -- TODO confirm).
cat_col_names = [str(name) for name in cat_col_names]
num_col_names = [str(name) for name in num_col_names]

# Each split is indexed as (features, labels). `.assign` returns a NEW frame,
# so the loader's feature frames are not mutated and the label column does not
# leak into `train[0]` / `val[0]` / `test[0]`.
train_data = train[0].assign(target=train[1])
val_data = val[0].assign(target=val[1])
test_data = test[0].assign(target=test[1])

# print(test_data)
# print(type(test_data))


########################################




openml data index: 23381
load data from dresses-sales
# data: 500, # feat: 12, # cate: 11,  # bin: 0, # numerical: 1, pos rate: 0.42


# Importing the Library

In [8]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, TabTransformerConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

  warn(


## Define the Configs

This is the most crucial step in the process. There are four configs that you need to provide (most of them have intelligent default values), plus an optional fifth; together they drive the rest of the process.

* DataConfig - Define the target column names, categorical and numerical column names, any transformation you need to do, etc.
* ModelConfig - There is a specific config for each of the models. This determines which model we are going to train and also lets you define the hyperparameters of the model
* TrainerConfig - This lets you configure the training process by setting things like batch_size, epochs, early stopping, etc. The vast majority of parameters are directly borrowed from PyTorch Lightning and are passed to the underlying Trainer object during training
* OptimizerConfig - This lets you define and use different Optimizers and Learning Rate Schedulers. Standard PyTorch Optimizers and Learning Rate Schedulers are supported. For custom optimizers, you can use the parameter in the fit method to overwrite this. The custom optimizer should be PyTorch compatible
* ExperimentConfig - This is an optional parameter. If set, this defines the Experiment Tracking. Right now, only two experiment tracking frameworks are supported: Tensorboard and Weights&Biases. W&B experiment tracker has more features like tracking the gradients and logits across epochs.

In [25]:
# Data schema: the label column plus the continuous / categorical feature
# names prepared in the data-loading cell.
data_config = DataConfig(
    target=['target'], #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
# Training loop settings.
trainer_config = TrainerConfig(
    auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
    # NOTE(review): the data-loading log reports only 500 rows in the whole
    # dataset, so 1024 exceeds the training split -- presumably every epoch is
    # a single batch. Confirm this is intended.
    batch_size=1024,
    max_epochs=500,
    accelerator="auto", # can be 'cpu','gpu', 'tpu', or 'ipu'
    # NOTE(review): None turns early stopping off; the recorded run trained
    # until `max_epochs=500` was reached.
    early_stopping=None
)
# Optimizer / LR scheduler left at the library defaults.
optimizer_config = OptimizerConfig()

# NOTE(review): `head_config` is only referenced by the commented-out
# CategoryEmbeddingModelConfig below; the active TabTransformerConfig does not
# use it.
head_config = LinearHeadConfig(
    layers="", # No additional layer in head, just a mapping layer to output_dim
    dropout=0.1,
    initialization="kaiming"
).__dict__ # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)

# model_config = CategoryEmbeddingModelConfig(
#     task="classification",
#     layers="32-16", # Number of nodes in each layer
#     activation="LeakyReLU", # Activation between each layers
#     dropout=0.1,
#     initialization="kaiming",
#     head = "LinearHead", #Linear Head
#     head_config = head_config, # Linear Head Config
#     learning_rate = 1e-3
# )

# Active model: a TabTransformer classifier.
model_config = TabTransformerConfig(
    task="classification",
    input_embed_dim=32,
    num_attn_blocks=6,
    num_heads=8,
)


# Bundle the four configs into the model object that is trained below.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

2023-12-31 22:44:19,742 - {pytorch_tabular.tabular_model:105} - INFO - Experiment Tracking is turned off
INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off


## Training the Model
Now that we have defined the configs and the TabularModel, we just need to call the `fit` method and pass the training dataframe. We can also pass in a validation dataframe (as this notebook does); if it is omitted, TabularModel will separate 20% (also configurable) at random from the data as validation.

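By default, EarlyStopping is enabled and monitors Validation Loss with a patience of 3 epochs. The trainer also saves the best model (based on validation loss) and loads that model at the end of training. `TrainerConfig` has the parameters to tweak this default behaviour. Note that this notebook sets `early_stopping=None` in its `TrainerConfig`, so early stopping is turned off here and training runs for the full `max_epochs`.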

In [26]:
# Train on the training split; the explicit validation split is passed in, so
# no random hold-out has to be carved out of the training data.
tabular_model.fit(train=train_data, validation=val_data)

INFO:lightning_fabric.utilities.seed:Global seed set to 42
2023-12-31 22:44:20,126 - {pytorch_tabular.tabular_model:473} - INFO - Preparing the DataLoaders
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders
2023-12-31 22:44:20,129 - {pytorch_tabular.tabular_datamodule:290} - INFO - Setting up the datamodule for classification task
INFO:pytorch_tabular.tabular_datamodule:Setting up the datamodule for classification task
2023-12-31 22:44:20,180 - {pytorch_tabular.tabular_model:521} - INFO - Preparing the Model: TabTransformerModel
INFO:pytorch_tabular.tabular_model:Preparing the Model: TabTransformerModel
2023-12-31 22:44:20,231 - {pytorch_tabular.tabular_model:268} - INFO - Preparing the Trainer
INFO:pytorch_tabular.tabular_model:Preparing the Trainer
  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
INFO:pytorch_lightning.tuner.lr_finder:LR finder stopped early after 99 steps due to diverging loss.
INFO:pytorch_lightning.tuner.lr_finder:Learning rate set to 0.012022644346174132
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/drive/MyDrive/Colab Notebooks/DeepLearning/Final Project/.lr_find_66a5c9cf-fd71-4652-9228-28571b397d30.ckpt
INFO:pytorch_lightning.utilities.rank_zero:Restored all states from the checkpoint file at /content/drive/MyDrive/Colab Notebooks/DeepLearning/Final Project/.lr_find_66a5c9cf-fd71-4652-9228-28571b397d30.ckpt
2023-12-31 22:45:31,104 - {pytorch_tabular.tabular_model:575} - INFO - Suggested LR: 0.012022644346174132. For plot and detailed analysis, use `find_learning_rate` method.
INFO:pytorch_tabular.tabular_model:Suggested LR: 0.012022644346174132. For plot and detailed analysis, use `find_learning_rate` method.
2023-12-31 22:45:31,108 - {pytorch_tabular.tabular_model:582}

Output()

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=500` reached.


2023-12-31 22:51:58,867 - {pytorch_tabular.tabular_model:584} - INFO - Training the model completed
INFO:pytorch_tabular.tabular_model:Training the model completed
2023-12-31 22:51:58,870 - {pytorch_tabular.tabular_model:1258} - INFO - Loading the best model
INFO:pytorch_tabular.tabular_model:Loading the best model
  rank_zero_deprecation(


<pytorch_lightning.trainer.trainer.Trainer at 0x7f195ce31870>

## Evaluating the Model

### Loss and Metrics on New Data
To evaluate the model on new data on the same metrics/loss that was used during training, we can use the `evaluate` method

In [27]:
# Score the held-out test split with the same loss/metrics used during training.
result = tabular_model.evaluate(test_data)

Output()

## New Predictions as DataFrame
To get the prediction as a dataframe, we can use the `predict` method. This will add predictions to the same dataframe that was passed in. For classification problems, we get both the probabilities and the final prediction taking 0.5 as the threshold

In [28]:
# Predict on the test split; the returned frame carries the input columns plus
# per-class probability columns and a `prediction` column.
pred_df = tabular_model.predict(test_data)
pred_df.head()

Output()

Unnamed: 0,V4,V2,V3,V5,V6,V7,V8,V9,V10,V11,V12,V13,target,0_probability,1_probability,prediction
88,1.0,Sexy,Low,M,Automn,o-neck,full,natural,cotton,chiffon,lace,print,1,0.561941,0.438059,0
314,0.74,Casual,Average,M,Summer,boat-neck,sleevless,natural,linen,chiffon,lace,solid,0,0.561605,0.438395,0
28,0.86,cute,Low,free,Automn,o-neck,sleevless,natural,polyster,chiffon,sashes,striped,0,0.56176,0.43824,0
374,0.82,Sexy,Average,free,Summer,v-neck,sleevless,natural,cotton,chiffon,lace,solid,1,0.561708,0.438292,0
291,0.0,party,Medium,free,Winter,v-neck,short,empire,cotton,chiffon,lace,solid,1,0.560648,0.439352,0


In [29]:
# Compare the hard predictions against the true labels of the test split.
# NOTE(review): the recorded run prints F1 = 0.0, i.e. no positive-class
# prediction was correct, and the preview rows above are all predicted 0.
# Check whether the model collapsed to the negative class.
print_metrics(test_data['target'], pred_df["prediction"], tag="Holdout")

Holdout Acc: 0.58 | Holdout F1: 0.0


## Saving and Loading the Model

In [30]:
# Persist the trained model under the relative folder "examples/basic"
# (resolved against the current working directory).
tabular_model.save_model("examples/basic")



In [31]:
# Reload the model from the folder it was saved to, as a round-trip check.
loaded_model = TabularModel.load_model("examples/basic")

2023-12-31 22:52:00,020 - {pytorch_tabular.tabular_model:129} - INFO - Experiment Tracking is turned off
INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off
2023-12-31 22:52:00,035 - {pytorch_tabular.tabular_model:268} - INFO - Preparing the Trainer
INFO:pytorch_tabular.tabular_model:Preparing the Trainer
  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [32]:
# Re-evaluate on the test split with the reloaded model.
# NOTE(review): this overwrites `result` from the earlier evaluation of the
# in-memory model, so the two scores can no longer be compared afterwards.
result = loaded_model.evaluate(test_data)

Output()