TabNet: Attentive Interpretable Tabular Learning (for starters, see the <a target="_blank" href="https://github.com/GoogleCloudPlatform/ai-platform-samples/blob/main/notebooks/samples/explanations/ai-explanations-tabnet-algorithm.ipynb">Google Notebook</a>)

All of this is purely free Open Source. You can only build and enjoy the ride.<br />
Ever so grateful.

This example workflow adapts to your data.
It takes full advantage of grouped attention for (out-of-the-box one-hot-encoded) categorical features.

<hr />

# setup

In [None]:
# (Re)load IPython's autoreload extension and switch it to mode 2, so edited
# modules are presumably picked up without restarting the kernel
# (see the IPython autoreload documentation for the exact mode semantics).
%reload_ext autoreload
%autoreload 2

In [None]:
import os, json

In [None]:
from dotenv import find_dotenv, load_dotenv
# Show which .env file find_dotenv() locates. This is informational only:
# its result is not passed to load_dotenv below.
print(find_dotenv())
# Load environment variables from the fixed relative path ../.env.
load_dotenv("../.env")

In [None]:
import torch
# Report the accelerator visible to this kernel. Asking for a device name
# when no CUDA device is usable raises, which would abort the cell on
# CPU-only machines, so check availability first.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available; running on CPU.")
# Last expression of the cell: shown as the cell output.
torch.__version__

In [None]:
from IPython import get_ipython

def reset_jupyter_history():
    """
    Wipe the IPython input history and restart cell numbering at zero.

    When an interactive shell is active, this:
      1. disables the shell's current history manager (if it has one);
      2. if the SQLite history file exists, closes the manager's open
         database handle and deletes every row of the ``history`` table
         (plus the matching ``sqlite_sequence`` row);
      3. installs a fresh ``HistoryManager`` configured with the same file;
      4. sets the shell's ``execution_count`` to 0.

    A confirmation is printed on success. When ``get_ipython()`` returns
    ``None`` a notice is printed instead and nothing is modified.

    WARNING: destructive -- the rows of the history database are deleted.

    Returns:
        None
    """

    import os
    import sqlite3
    from contextlib import closing
    from traitlets.config import Config
    from IPython.core.history import HistoryManager

    ip = get_ipython()
    if ip is None:
        print("This function must be run within a Jupyter notebook.")
        return

    # Tolerate a shell whose history_manager attribute is missing or None.
    manager = getattr(ip, 'history_manager', None)
    if manager is not None:
        # Disable history temporarily while the database is rewritten.
        manager.enabled = False

    # Prefer the file the live manager uses; otherwise fall back to the
    # default profile location.
    hist_file = getattr(
        manager,
        'hist_file',
        os.path.expanduser('~/.ipython/profile_default/history.sqlite'),
    )

    if os.path.exists(hist_file):
        # Close the manager's own connection before opening a new one.
        old_db = getattr(manager, 'db', None)
        if old_db is not None:
            old_db.close()

        # closing() guarantees the connection is released even when one of
        # the statements raises (sqlite3's own context manager only handles
        # commit/rollback, it does not close).
        with closing(sqlite3.connect(hist_file)) as conn:
            conn.execute("DELETE FROM history")
            conn.execute("DELETE FROM sqlite_sequence WHERE name='history'")
            conn.commit()

    # Build a replacement manager pointing at the same (now emptied) file.
    c = Config()
    c.HistoryManager.hist_file = hist_file
    ip.history_manager = HistoryManager(shell=ip, config=c)
    ip.history_manager.enabled = True

    # Reset the counter on the kernel's shell when a kernel is attached.
    # A shell without a `kernel` attribute would otherwise raise here after
    # the history had already been wiped, so fall back to the shell itself.
    kernel = getattr(ip, 'kernel', None)
    shell = kernel.shell if kernel is not None else ip
    shell.execution_count = 0

    print("Jupyter history has been reset.")

# Run the reset now, then evaluate the kernel shell's execution counter so
# that it is displayed as this cell's output.
reset_jupyter_history()
get_ipython().kernel.shell.execution_count

<hr />

# Generate data

In [None]:
from retrain_pipelines.dataset import DatasetType, pseudo_random_generate

# How many synthetic samples to draw.
num_samples = 10_000
data = pseudo_random_generate(
    DatasetType.TABULAR_CLASSIFICATION,
    num_samples,
)
# Quick look at the first few rows.
print(data.head())
# Write the dataset as CSV (without the index) under ../data, using the
# resolved real path of that location.
csv_path = os.path.realpath(
    os.path.join('..', 'data', 'synthetic_classif_tab_data_4classes.csv')
)
data.to_csv(csv_path, index=False)

<hr />

# Metaflow Run

Choose which domain shall be considered for the HP tuning grid search&nbsp;:

In [None]:
# Hyperparameter grid for the tuning search, split into a "trainer" and a
# "model" section. Every leaf is a list of candidate values; only
# "optimizer_params" currently offers more than one candidate (two learning
# rates).
pipeline_hp_grid = {
    "trainer": {
        "max_epochs":[100], #sys.maxsize,
        "patience":[10],
        "batch_size":[1024],
        "virtual_batch_size":[256],
    },
    "model": {
        "n_d":[64],
        "n_a":[64],
        "n_steps":[6],
        "gamma":[1.5],
        "n_independent":[2],
        "n_shared":[2],
        "lambda_sparse":[1e-4],
        "momentum":[0.3],
        "clip_value":[2.],
        "optimizer_fn":["torch.optim.Adam"],
        "optimizer_params":[dict(lr=2e-2), dict(lr=0.1)],
        "scheduler_params":[{"gamma": 0.80,
                            "step_size": 20}],
        "scheduler_fn":["torch.optim.lr_scheduler.StepLR"],
        "epsilon":[1e-15]
    }
}
# NOTE: `dedent` is no longer used by this cell; the import is kept in case
# other cells rely on the name being defined.
from textwrap import dedent
# Export the grid as a JSON string through an environment variable.
# Serialising the dict directly yields the same text as the former
# repr-then-swap-quotes approach for this grid, but stays valid JSON when
# values include booleans, None, or strings containing quote characters.
os.environ['pipeline_hp_grid'] = json.dumps(pipeline_hp_grid)
print(os.environ['pipeline_hp_grid'])

## Run flow

In [None]:
# (Re)load the retrain_pipelines.local_launcher_magic IPython extension;
# presumably it provides the %retrain_pipelines_local magic used below.
%reload_ext retrain_pipelines.local_launcher_magic

In [None]:
# Display the command-line help of the flow's `run` command.
%retrain_pipelines_local retraining_pipeline.py run --help

In [None]:
# Launch a run of retraining_pipeline.py on the synthetic CSV, with bucket
# settings for num_feature2 / num_feature4, 2 CV folds and W&B disabled.
# "${pipeline_hp_grid}" is presumably expanded from the environment variable
# of the same name set in the grid cell above -- confirm against the magic.
%retrain_pipelines_local retraining_pipeline.py run \
    --data_file "../data/synthetic_classif_tab_data_4classes.csv" \
    --buckets_param '{"num_feature2": 100, "num_feature4": 50}' \
    --pipeline_hp_grid "${pipeline_hp_grid}" \
    --cv_folds 2 \
    --wandb_run_mode disabled
    #--preprocess_artifacts_path "." \
    #--pipeline_card_artifacts_path "." \

In [None]:
# Resume the flow, presumably from its step named `pipeline_card`.
%retrain_pipelines_local retraining_pipeline.py resume pipeline_card

# Inspectors

After-the-fact inspection of retraining pipeline runs.

In [None]:
# Name of the flow to inspect; passed to the inspector helpers in later cells.
mf_flow_name = "TabNetHpCvWandbFlow"

## local Metaflow SDK

In [None]:
from retrain_pipelines.frameworks import local_metaflow as metaflow

In [None]:
# Read the `model` data item of one specific task (first attempt).
# NOTE(review): the path hard-codes the flow name, run id 973 and task id
# 29591, which look specific to one local datastore -- replace them with
# identifiers from your own run.
metaflow.Task("TabNetHpCvWandbFlow/973/pipeline_card/29591",  attempt=0)['model'].data

## local custom card explorer

Retrieve the full path to the custom html pipeline card in the local datastore&nbsp;:

In [None]:
from retrain_pipelines.inspectors import browse_local_pipeline_card

In [None]:
# Look up the pipeline card for the flow named in `mf_flow_name`.
# The commented-out variant targets one specific run id instead.
browse_local_pipeline_card(mf_flow_name)
#browse_local_pipeline_card(mf_run_id=800, verbose=True)

<hr />

## WandB

Make sure to have the `WANDB_API_KEY` environment variable set adequately.<br />
It can be through a `secret`.

<b>programmatically browse the saved source-code</b>

In [None]:
from retrain_pipelines.inspectors import get_execution_source_code

# Print the name and URL of each source-code artifact of run 860.
# (The original inline hint suggests mf_flow_name can be passed instead.)
artifacts = get_execution_source_code(mf_run_id=860)
for artifact in artifacts:
    print(f" - {artifact.name} {artifact.url}")

<b>The below command will download source-code artifacts for a given run and open a file explorer on the parent dir&nbsp;:</b>

In [None]:
from retrain_pipelines.inspectors import explore_source_code

# Explore the source-code artifacts of run 860; the trailing comment shows
# the alternative of selecting by flow name instead of run id.
explore_source_code(mf_run_id=860) # mf_flow_name=mf_flow_name)

<hr />