<a href="https://colab.research.google.com/github/broistg/ML-Assignment-DNAC1/blob/main/notebooks/BTL1_deep_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0.&nbsp;User manual

This Google Colab file implements a modern machine learning pipeline using deep learning for tabular data (Adult Census Income). To run the entire notebook, click the "Run All" button (library installation and data setup have been automated).

# 1.&nbsp;Prepare the necessary data and libraries

## 1.1.&nbsp;Import Libraries & Environment Setup
In this section, we install and import all the necessary libraries for data processing, visualization, and deep learning (Scikit-learn, PyTorch, TabNet, etc.).


In [None]:
# Verify required libraries
import sys
import subprocess

def pip_install(packages):
    """Install ``packages`` with pip, using the interpreter that runs this notebook.

    Runs ``<sys.executable> -m pip install --quiet <packages...>``;
    ``subprocess.check_call`` raises ``CalledProcessError`` if pip exits non-zero.
    """
    base_command = [sys.executable, "-m", "pip", "install", "--quiet"]
    full_command = base_command + packages
    subprocess.check_call(full_command)

# plotly (all charts) and scipy (skewness check) are imported by this notebook as well,
# so install them together with the other dependencies.
pip_install(["scikit-learn", "pandas", "matplotlib", "seaborn", "torch", "torchvision",
             "pytorch-tabnet", "wget", "plotly", "scipy"])

# Import libraries
import wget
from pathlib import Path
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset
from copy import deepcopy
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F

# Seed NumPy, Python's `random` module and PyTorch with one shared value so that
# repeated runs are reproducible. RND is also passed as `random_state` to the
# train/validation/test split later in the notebook.
RND = 42
np.random.seed(RND)
import random
random.seed(RND)
torch.manual_seed(RND)

<torch._C.Generator at 0x7c80c0e7e050>

## 1.2.&nbsp;Load Dataset
We download the **Adult/Census Income** dataset from the UCI repository and load it into a DataFrame for further analysis.


In [None]:
# Raw file location on the UCI repository; the file has no header row, so the
# column names are supplied here.
dataset_name = 'census-income'
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
col_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'income',
]

# Keep a local copy under ./data so that re-running the cell skips the download.
out = Path(os.getcwd()) / 'data' / (dataset_name + '.csv')
out.parent.mkdir(parents=True, exist_ok=True)
if not out.exists():
    print("Downloading dataset...")
    wget.download(url, out.as_posix())
    print("Done!")
else:
    print("File already exists.")

# Load data (skipinitialspace drops the blank that follows every comma in the raw file)
dataset = pd.read_csv(out, names=col_names, header=None, skipinitialspace=True)
print('Dataset shape:', dataset.shape)

Downloading dataset...
Done!
Dataset shape: (32561, 15)


# 2.&nbsp;Exploratory Data Analysis (EDA)
We analyze the dataset using summary statistics, visualize class imbalance, check for missing values, and inspect numeric/categorical distributions.


In [None]:
# First rows and per-column summary statistics of the raw table
print(dataset.head())
print(dataset.describe(include='all'))

# Class imbalance analysis. No split has been made yet, so these counts describe
# the whole dataset.
target_counts = dataset['income'].value_counts()
target_props = dataset['income'].value_counts(normalize=True)

print('\nValue counts for target (full dataset):')
print(target_counts)

print("\nClass distribution (absolute):\n", target_counts)
print("\nClass distribution (proportion):\n", target_props)

# Visualizing class imbalance. The slice labels are taken from the counts' own index
# so each label always matches its value, whatever order value_counts() returns.
fig = px.pie(values=target_counts.values, names=target_counts.index,
             title='Class Distribution (Income)')
fig.show()

# Check for missing data (UCI uses '?')
print('\nMissing value counts (including "?" occurrences):')
print((dataset == '?').sum())

# Visualization. The whole section is best-effort: any failure is reported and the
# notebook carries on, because later cells recompute everything they need.
try:
  # Target distribution
  px.histogram(dataset, x='income', color='income', title='Income Distribution').show()

  # Numeric features distribution with boxplots
  numeric_features = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']
  for col in numeric_features:
    px.histogram(dataset, x=col, nbins=40, marginal='box', title=f'{col} Distribution').show()

  # Categorical features (10 most frequent categories of each column)
  categorical_features = [c for c in col_names if c not in numeric_features + ['income']]
  for col in categorical_features:
    vc = dataset[col].value_counts().nlargest(10).index
    subset = dataset[dataset[col].isin(vc)]
    px.histogram(subset, x=col, color='income', barmode='group', title=f'{col} vs Income').update_layout(xaxis={'categoryorder':'total descending'}).show()

  # Correlation heatmap
  corr = dataset[numeric_features].corr()
  px.imshow(corr, text_auto=True, title='Correlation Heatmap (Numeric Features)').show()

  # Checking skewness and outliers
  from scipy.stats import skew

  for col in numeric_features:
    sk = skew(dataset[col].dropna())
    print(f"{col}: skew={sk:.2f}, min={dataset[col].min()}, max={dataset[col].max()}")

  # Visualizing outliers with boxplots
  px.box(dataset, y='capital-gain', title='Capital Gain Boxplot').show()
  px.box(dataset, y='capital-loss', title='Capital Loss Boxplot').show()

  # Log-transform for highly skewed variables (adds two columns to `dataset`)
  dataset['capital-gain-log'] = np.log1p(dataset['capital-gain'])
  dataset['capital-loss-log'] = np.log1p(dataset['capital-loss'])

  px.histogram(dataset, x='capital-gain-log', nbins=40, title='Capital Gain (log)').show()
  px.histogram(dataset, x='capital-loss-log', nbins=40, title='Capital Loss (log)').show()

except Exception as e:
  print('Plotting skipped due to:', e)

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             


Missing value counts (including "?" occurrences):
age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64


age: skew=0.56, min=17, max=90
fnlwgt: skew=1.45, min=12285, max=1484705
education-num: skew=-0.31, min=1, max=16
capital-gain: skew=11.95, min=0, max=99999
capital-loss: skew=4.59, min=0, max=4356
hours-per-week: skew=0.23, min=1, max=99


# 3.&nbsp;Data Preprocessing
- Strip whitespace, replace "?" with NaN.  
- Apply log-transform on skewed features.  

## 3.1. Data Cleaning and Transformation

In [None]:
# Cleaning, log-transform, and target -> binary (the train/val/test split is done in 3.2)

# Strip stray whitespace from every text column, then turn the UCI "?" placeholder into NaN
for col in dataset.select_dtypes(include=['object']).columns:
    dataset[col] = dataset[col].str.strip()
dataset.replace('?', np.nan, inplace=True)

# log1p maps 0 to 0 and compresses the long right tail of both capital columns
dataset['capital-gain-log'] = np.log1p(dataset['capital-gain'].fillna(0))
dataset['capital-loss-log'] = np.log1p(dataset['capital-loss'].fillna(0))

numeric_features = ['age', 'fnlwgt', 'education-num', 'capital-gain-log', 'capital-loss-log', 'hours-per-week']

# The raw capital columns are represented by their log versions above. They must be
# excluded explicitly: they are no longer listed in numeric_features, so otherwise they
# would be picked up as categorical columns and embedded value-by-value.
replaced_by_log = ['capital-gain', 'capital-loss']
categorical_features = [c for c in col_names
                        if c not in numeric_features + replaced_by_log + ['income']]

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

# Binary target: 1 for '>50K', 0 for everything else
dataset['income_bin'] = (dataset['income'] == '>50K').astype(int)
target = 'income_bin'

Numeric features: ['age', 'fnlwgt', 'education-num', 'capital-gain-log', 'capital-loss-log', 'hours-per-week']
Categorical features: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'native-country']


## 3.2. Train/Validation/Test Split
Split the dataset into training, validation, and test sets with stratification.

In [None]:
# 70 % of the rows go to train; the remaining 30 % are halved into validation and test.
# Both cuts are stratified on the income label and use the shared seed.
train, temp = train_test_split(dataset, test_size=0.3, stratify=dataset['income'], random_state=RND)
val, test = train_test_split(temp, test_size=0.5, stratify=temp['income'], random_state=RND)

print('Original dataset shape:', dataset.shape)
print('Split -> Train:', train.shape, 'Validation:', val.shape, 'Test:', test.shape)


# Checking class distribution across datasets
def check_distribution(df, name):
    """Print the absolute and relative frequency of each ``income`` class in ``df``.

    ``name`` is the label shown in the heading line.
    """
    income = df['income']
    counts = income.value_counts()
    props = income.value_counts(normalize=True)
    print(f"\n{name} distribution:")
    print(counts)
    print(props)


for split_name, split_df in [("Train", train), ("Validation", val), ("Test", test)]:
    check_distribution(split_df, split_name)

print('Shapes -> full:', dataset.shape, 'train:', train.shape, 'val:', val.shape, 'test:', test.shape)

Original dataset shape: (32561, 18)
Split -> Train: (19536, 18) Validation: (6512, 18) Test: (6513, 18)

Train distribution:
income
<=50K    14832
>50K      4704
Name: count, dtype: int64
income
<=50K    0.759214
>50K     0.240786
Name: proportion, dtype: float64

Validation distribution:
income
<=50K    4944
>50K     1568
Name: count, dtype: int64
income
<=50K    0.759214
>50K     0.240786
Name: proportion, dtype: float64

Test distribution:
income
<=50K    4944
>50K     1569
Name: count, dtype: int64
income
<=50K    0.759097
>50K     0.240903
Name: proportion, dtype: float64
Shapes -> full: (32561, 18) train: (19536, 18) val: (6512, 18) test: (6513, 18)


## 3.3. Encoding Categorical Features
Convert categorical variables into integer indices for embedding layers, and compute cardinality for each feature.


In [None]:
# Learn a category -> integer vocabulary for every categorical column from TRAIN only.
# The index right after the last known category is reserved for values not seen in train.

cat_maps = {}          # col -> (mapping_dict, unk_index)
cat_cardinality = {}   # col -> cardinality (including unseen bucket)

for col in categorical_features:
    # Missing entries become their own 'Missing' category; everything is compared as text
    labels = train[col].fillna('Missing').astype(str).unique().tolist()
    mapping = dict(zip(labels, range(len(labels))))
    unk_idx = len(mapping)
    cat_maps[col] = (mapping, unk_idx)
    cat_cardinality[col] = unk_idx + 1

print("Cardinalities (including UNK):")
for col_name, n_codes in cat_cardinality.items():
    print(f"  {col_name}: {n_codes}")

def map_col_to_index(df_col, mapping, unk_idx):
    """Translate a column of category values into integer codes.

    Missing entries are replaced by the string ``'Missing'`` and every value is
    converted to ``str`` before the lookup; values absent from ``mapping`` get
    ``unk_idx``. Returns the codes as a NumPy array.
    """
    labels = df_col.fillna('Missing').astype(str)

    def lookup(label):
        return mapping.get(label, unk_idx)

    return labels.map(lookup).to_numpy()

def _encode_categoricals(frame):
    """Encode every categorical column of ``frame`` with the train vocabulary.

    Returns an int64 matrix with one column per entry of ``categorical_features``.
    """
    encoded_cols = [map_col_to_index(frame[c], *cat_maps[c]) for c in categorical_features]
    return np.column_stack(encoded_cols).astype(np.int64)


X_train_cat = _encode_categoricals(train)
X_val_cat = _encode_categoricals(val)
X_test_cat = _encode_categoricals(test)

print("Categorical encoded shapes:", X_train_cat.shape, X_val_cat.shape, X_test_cat.shape)

Cardinalities (including UNK):
  workclass: 10
  education: 17
  marital-status: 8
  occupation: 16
  relationship: 7
  race: 6
  sex: 3
  capital-gain: 114
  capital-loss: 84
  native-country: 43
Categorical encoded shapes: (19536, 10) (6512, 10) (6513, 10)


## 3.4. Scaling Numeric Features
Standardize numeric features (fit on train, transform on val/test). Prepare target labels for training.

In [None]:
# Scale numeric features (fit on train numeric only)
from sklearn.preprocessing import StandardScaler

# Fit the scaler on TRAIN only, then reuse its statistics for validation and test
scaler = StandardScaler()
X_train_num = scaler.fit_transform(train[numeric_features].values)
X_val_num, X_test_num = (
    scaler.transform(split[numeric_features].values) for split in (val, test)
)

# Targets as float32, the dtype the loss function is fed with
y_train_dl, y_val_dl, y_test_dl = (
    split[target].values.astype(np.float32) for split in (train, val, test)
)

print("Numeric shapes:", X_train_num.shape, X_val_num.shape, X_test_num.shape)
print("Target shapes:", y_train_dl.shape, y_val_dl.shape, y_test_dl.shape)


Numeric shapes: (19536, 6) (6512, 6) (6513, 6)
Target shapes: (19536,) (6512,) (6513,)


## 3.5. Data Split Verification
Visualize and confirm that the class distribution is preserved across train, validation, and test sets.


In [None]:
# One row per sample: which split it belongs to and its binary label
split_frames = {'Train': train, 'Validation': val, 'Test': test}
income_counts = pd.DataFrame({
    'Split': [name for name, frame in split_frames.items() for _ in range(len(frame))],
    'Income_bin': pd.concat([frame['income_bin'] for frame in split_frames.values()]).values,
})

# Share of each class inside every split
income_proportions = (
    income_counts.groupby('Split')['Income_bin']
    .value_counts(normalize=True)
    .reset_index(name='Proportion')
)

# One bar series per class, grouped by split
bars = []
for label, code in (('<=50K', 0), ('>50K', 1)):
    rows = income_proportions[income_proportions['Income_bin'] == code]
    bars.append(go.Bar(name=label, x=rows['Split'], y=rows['Proportion']))

fig = go.Figure(data=bars)
fig.update_layout(barmode='group', title='Income_bin Distribution Across Train, Validation, Test',
                  xaxis_title='Split', yaxis_title='Proportion')
fig.show()


# 4.&nbsp;Deep Learning Pipeline

In [None]:
class TabularDataset(Dataset):
    """In-memory dataset that yields ``(numeric, categorical, target)`` per sample.

    All three inputs are converted to tensors once at construction time:
    numeric features and targets as float32, categorical codes as int64.
    """

    def __init__(self, X_num, X_cat, y):
        float_kind, index_kind = torch.float32, torch.long
        self.X_num = torch.tensor(X_num, dtype=float_kind)
        self.X_cat = torch.tensor(X_cat, dtype=index_kind)
        self.y = torch.tensor(y, dtype=float_kind)

    def __len__(self):
        # The number of targets defines the dataset size
        return len(self.y)

    def __getitem__(self, idx):
        fields = (self.X_num, self.X_cat, self.y)
        return tuple(field[idx] for field in fields)

In [None]:
class FocalLoss(nn.Module):
    """Binary focal loss: ``alpha * (1 - pt) ** gamma * BCE`` with ``pt = exp(-BCE)``.

    Args:
        alpha: constant factor applied to every element.
        gamma: exponent of the ``(1 - pt)`` modulating term.
        logits: when True ``inputs`` are raw logits, otherwise probabilities.
        reduce: when True return the mean over all elements, otherwise the
            element-wise loss tensor.
    """

    def __init__(self, alpha=1, gamma=2, logits=True, reduce=True):
        super().__init__()
        self.alpha, self.gamma = alpha, gamma
        self.logits, self.reduce = logits, reduce

    def forward(self, inputs, targets):
        # Element-wise binary cross-entropy, computed from logits or from probabilities
        bce_fn = F.binary_cross_entropy_with_logits if self.logits else F.binary_cross_entropy
        per_element_bce = bce_fn(inputs, targets, reduction='none')

        # exp(-BCE) is the probability the model assigns to the true class
        prob_true = torch.exp(-per_element_bce)
        focal = self.alpha * (1 - prob_true) ** self.gamma * per_element_bce

        return torch.mean(focal) if self.reduce else focal

class EarlyStopping:
    """Track the best score seen so far and keep a CPU copy of the matching weights.

    With ``mode="max"`` a higher score is better, with ``mode="min"`` a lower one;
    a score equal to the current best counts as no improvement. Each call without
    improvement increments ``counter``; once it reaches ``patience`` the
    ``early_stop`` flag is set (it is never cleared again). An improvement resets
    ``counter`` and refreshes ``best_score`` / ``best_state``.
    """

    def __init__(self, patience=5, mode="max"):
        self.patience, self.mode = patience, mode
        self.best_score = None
        self.best_state = None
        self.counter = 0
        self.early_stop = False

    def _stalled(self, score):
        """Return whether ``score`` fails to beat the current best under ``mode``."""
        if self.mode == "max":
            return score <= self.best_score
        if self.mode == "min":
            return score >= self.best_score
        return False

    def _remember(self, score, model):
        """Record ``score`` as the best and snapshot the model weights on the CPU."""
        self.best_score = score
        self.best_state = {name: tensor.cpu().clone() for name, tensor in model.state_dict().items()}

    def __call__(self, score, model):
        # The first score ever seen is the best by definition
        if self.best_score is None:
            self._remember(score, model)
            return

        if self._stalled(score):
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
            return

        self.counter = 0
        self._remember(score, model)

## 4.1. Wide and Deep Neural Network
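Define a **Wide & Deep Neural Network**: each categorical feature is passed through an embedding layer and the embeddings are concatenated with the scaled numeric features. This joint vector feeds both a single linear layer (the "wide" branch) and an MLP (the "deep" branch), and the two output logits are summed.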


### 4.1.1. Model Architectures

In [None]:
class WideAndDeepPro(nn.Module):
    """Wide & Deep binary classifier over numeric features and categorical codes.

    Every categorical column gets its own ``emb_dim``-wide embedding. The embeddings
    are concatenated with the numeric features and the result is fed to two branches,
    a single linear layer ("wide") and an MLP ("deep"), whose logits are added.

    Args:
        num_features: number of numeric input columns.
        cat_cardinalities: number of embedding rows for each categorical column.
        emb_dim: width of every embedding.
        hidden_dims: widths of the hidden layers of the deep branch.
        dropout: dropout probability after each hidden layer.
    """

    def __init__(self, num_features, cat_cardinalities, emb_dim=32, hidden_dims=[512, 256, 128], dropout=0.1):
        super().__init__()

        # Embeddings for the categorical columns
        self.embeddings = nn.ModuleList(
            [nn.Embedding(card, emb_dim) for card in cat_cardinalities]
        )
        joint_dim = num_features + emb_dim * len(cat_cardinalities)

        # Wide branch: one linear layer on the concatenated input
        self.wide = nn.Linear(joint_dim, 1)

        # Deep branch: Linear -> BatchNorm -> ReLU -> Dropout per hidden width,
        # followed by a single-logit output layer
        widths = [joint_dim, *hidden_dims]
        blocks = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            blocks += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        blocks.append(nn.Linear(widths[-1], 1))
        self.deep = nn.Sequential(*blocks)

    def forward(self, x_num, x_cat):
        # Column i of x_cat indexes embedding table i
        embedded = torch.cat(
            [table(x_cat[:, col]) for col, table in enumerate(self.embeddings)], dim=1
        )
        joint = torch.cat([x_num, embedded], dim=1)

        # Sum of both branch logits, returned with shape (batch,)
        combined = self.wide(joint) + self.deep(joint)
        return combined.squeeze(1)


In [None]:
def evaluate_probs(model, loader):
    """Run ``model`` over ``loader`` in eval mode and collect its predictions.

    Returns ``(probabilities, labels)`` as NumPy arrays, where the probabilities
    are the sigmoid of the model's logits. Batches are moved to the module-level
    ``device`` before the forward pass.
    """
    model.eval()
    collected_probs = []
    collected_labels = []
    with torch.no_grad():
        for batch in loader:
            num_batch, cat_batch, target_batch = (t.to(device) for t in batch)
            batch_probs = torch.sigmoid(model(num_batch, cat_batch))
            collected_probs.extend(batch_probs.cpu().numpy())
            collected_labels.extend(target_batch.cpu().numpy())
    return np.array(collected_probs), np.array(collected_labels)

### 4.1.2. Training and Evaluation

In [None]:
def wd_train_and_test(config):
    """Train the Wide & Deep model and tune its decision threshold on validation.

    Uses module-level objects: ``train_dl`` / ``val_dl`` / ``test_dl``,
    ``y_val_dl`` / ``y_test_dl``, ``X_train_num``, ``cat_cardinality``,
    ``categorical_features`` and ``device``.

    Args:
        config: dict with the keys 'epoch', 'dropout', 'lr', 'weigh_decay',
            'verbose' and 'early_stopping'. The optional key 'compare'
            (default False) suppresses the early-stopping message and the
            validation/test classification reports.

    Returns:
        ``(model, best_thresh)``: the trained model and the probability threshold
        that gave the highest validation F1 during training.
    """
    n_epochs = config['epoch']
    # 'compare' is only set by model_comparison(); treat a missing key as False
    compare_mode = config.get('compare', False)
    cat_card_list = [cat_cardinality[c] for c in categorical_features]

    model = WideAndDeepPro(num_features=X_train_num.shape[1],
                        cat_cardinalities=cat_card_list,
                        emb_dim=8,
                        hidden_dims=[128,64],
                        dropout=config['dropout']).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], weight_decay=config['weigh_decay'])
    criterion = FocalLoss(alpha=1.0, gamma=2.0)
    early_stopping = EarlyStopping(patience=5, mode="max")

    best_f1 = 0
    best_thresh = 0.5
    thresholds_grid = np.linspace(0.3, 0.7, 41)

    for epoch in range(1, n_epochs+1):
        model.train()
        total_loss = 0.0
        for xb_num, xb_cat, yb in train_dl:
            xb_num, xb_cat, yb = xb_num.to(device), xb_cat.to(device), yb.to(device)
            optimizer.zero_grad()
            logits = model(xb_num, xb_cat)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
            # The criterion returns a batch mean; weight it by the batch size so that
            # dividing by the dataset size below yields a per-sample average.
            total_loss += loss.item() * xb_num.size(0)

        # One validation pass per epoch serves both the threshold search and the metrics
        val_probs, val_labels = evaluate_probs(model, val_dl)
        f1s = [f1_score(val_labels, (val_probs > t).astype(int), zero_division=0)
               for t in thresholds_grid]
        best_idx = int(np.argmax(f1s))
        if f1s[best_idx] > best_f1:
            best_f1 = f1s[best_idx]
            best_thresh = thresholds_grid[best_idx]

        val_preds = (val_probs > best_thresh).astype(int)

        avg_loss = total_loss / len(train_dl.dataset)
        val_acc = accuracy_score(val_labels, val_preds)
        # F1 at the best threshold found so far: this is the value that is printed
        # and the score early stopping monitors.
        val_f1 = f1_score(val_labels, val_preds, zero_division=0)

        if config['verbose']:
            print(f"Epoch {epoch}/{n_epochs}: \t Average loss = {avg_loss:.6f} \t Accuracy = {val_acc:.6f} \t F1-score = {val_f1:.6f}")

        early_stopping(val_f1, model)
        if early_stopping.early_stop and config['early_stopping']:
            if not compare_mode:
                print(f"Early stopping at epoch {epoch}")
            model.load_state_dict(early_stopping.best_state)
            break

    # NOTE(review): when the loop ends without an early stop, the last-epoch weights are
    # returned together with a threshold that may come from an earlier epoch.

    if not compare_mode:
        probs, _ = evaluate_probs(model, val_dl)
        val_preds = (probs > best_thresh).astype(int)
        print("\n[Validation] Wide and Deep Neural Network")
        print(classification_report(y_val_dl, val_preds, target_names=["<=50K",">50K"], digits = 5))

        probs, _ = evaluate_probs(model, test_dl)
        test_preds = (probs > best_thresh).astype(int)
        print("\n[Test] Wide and Deep Neural Network")
        print(classification_report(y_test_dl, test_preds, target_names=["<=50K",">50K"], digits = 5))

    return model, best_thresh

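## 4.2. TabNet

> **Note:** the model defined in this section is `TabularMLP`, an MLP over the numeric features concatenated with categorical embeddings. The notebook labels it "TabNet" in its reports, but it does not use the `pytorch-tabnet` implementation.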

### 4.2.1. Model Architectures

In [None]:
class TabularMLP(nn.Module):
    """MLP binary classifier over numeric features plus categorical embeddings.

    Each categorical column is embedded with width ``min(emb_dim_cap, (card + 1) // 2)``.
    The embeddings are concatenated after the numeric features and passed through
    Linear -> ReLU -> BatchNorm -> Dropout stages, then a single-logit output layer.

    Args:
        num_numeric: number of numeric input columns.
        cat_cardinalities: number of embedding rows for each categorical column.
        hidden_dims: widths of the hidden layers.
        emb_dim_cap: upper bound on the width of any embedding.
        dropout: dropout probability after each hidden layer.
    """

    def __init__(self, num_numeric, cat_cardinalities, hidden_dims=[256,128], emb_dim_cap=50, dropout=0.2):
        super().__init__()

        # One embedding table per categorical column; width grows with the
        # cardinality up to emb_dim_cap
        tables = []
        for card in cat_cardinalities:
            tables.append(nn.Embedding(card, min(emb_dim_cap, (card + 1) // 2)))
        self.emb_layers = nn.ModuleList(tables)

        width = num_numeric + sum(table.embedding_dim for table in tables)
        stack = []
        for hidden in hidden_dims:
            stack.extend([
                nn.Linear(width, hidden),
                nn.ReLU(),
                nn.BatchNorm1d(hidden),
                nn.Dropout(dropout),
            ])
            width = hidden
        stack.append(nn.Linear(width, 1))  # output logit
        self.mlp = nn.Sequential(*stack)

    def forward(self, x_num, x_cat):
        # Numeric features first, then the embedding of each categorical column in order
        pieces = [x_num]
        for col, table in enumerate(self.emb_layers):
            pieces.append(table(x_cat[:, col]))
        joint = torch.cat(pieces, dim=1)
        return self.mlp(joint).squeeze(1)


In [None]:
def evaluate(model, loader):
    """Score ``model`` on ``loader`` using a fixed 0.5 probability threshold.

    Batches are moved to the module-level ``device``. Returns the tuple
    ``(accuracy, precision, recall, f1, labels, preds)``, where ``labels`` and
    ``preds`` are the collected integer targets and predictions. Precision, recall
    and F1 are reported as 0 when they would otherwise be undefined.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            num_batch, cat_batch, target_batch = (t.to(device) for t in batch)
            is_positive = torch.sigmoid(model(num_batch, cat_batch)) > 0.5
            all_preds.extend(is_positive.long().cpu().numpy())
            all_labels.extend(target_batch.long().cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1 = (
        metric(all_labels, all_preds, zero_division=0)
        for metric in (precision_score, recall_score, f1_score)
    )

    return accuracy, precision, recall, f1, all_labels, all_preds

### 4.2.2. Training and Evaluation

In [None]:
def tabnet_train_and_test(config):
    """Train the ``TabularMLP`` model (reported as "TabNet") and optionally print reports.

    Uses module-level objects: ``train_dl`` / ``val_dl`` / ``test_dl``,
    ``X_train_num``, ``cat_cardinality``, ``categorical_features`` and ``device``.

    Args:
        config: dict with the keys 'epoch', 'dropout', 'lr', 'weigh_decay',
            'verbose' and 'early_stopping'. The optional key 'compare'
            (default False) suppresses the early-stopping message and the
            validation/test classification reports.

    Returns:
        The trained model.
    """
    n_epochs = config['epoch']
    # 'compare' is only set by model_comparison(); treat a missing key as False
    compare_mode = config.get('compare', False)
    criterion = FocalLoss(alpha=1, gamma=2)

    cat_card_list = [cat_cardinality[c] for c in categorical_features]
    model = TabularMLP(num_numeric = X_train_num.shape[1], cat_cardinalities = cat_card_list, dropout=config['dropout']).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'], weight_decay=config['weigh_decay'])
    # Halve the learning rate when validation F1 has not improved for 2 epochs
    scheduler = ReduceLROnPlateau(
        optimizer, mode="max", factor=0.5, patience=2
    )
    early_stopping = EarlyStopping(patience=5, mode="max")

    for epoch in range(1, n_epochs+1):
        model.train()
        total_loss = 0.0
        for xb_num, xb_cat, yb in train_dl:
            xb_num, xb_cat, yb = xb_num.to(device), xb_cat.to(device), yb.to(device)
            optimizer.zero_grad()
            logits = model(xb_num, xb_cat)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
            # Batch mean weighted by batch size -> per-sample average after the division below
            total_loss += loss.item() * xb_num.size(0)

        avg_loss = total_loss / len(train_dl.dataset)
        val_acc, val_p, val_r, val_f1, _, _ = evaluate(model, val_dl)

        scheduler.step(val_f1)

        if config['verbose']:
            print(f"Epoch {epoch}/{n_epochs}: \t Average loss = {avg_loss:.6f} \t Accuracy = {val_acc:.6f} \t F1-score = {val_f1:.6f}")

        early_stopping(val_f1, model)
        if early_stopping.early_stop and config['early_stopping']:
            if not compare_mode:
                print(f"Early stopping at epoch {epoch}")
            model.load_state_dict(early_stopping.best_state)
            break

    if not compare_mode:
        val_acc, val_p, val_r, val_f1, labels, preds = evaluate(model, val_dl)
        print("\n[Validation] TabNet")
        print(classification_report(labels, preds, target_names=['<=50K','>50K'], digits=5))

        test_acc, test_p, test_r, test_f1, labels, preds = evaluate(model, test_dl)
        print("\n[Test] TabNet")
        print(classification_report(labels, preds, target_names=['<=50K','>50K'], digits=5))

    return model

## 4.3. Model comparison

In [None]:
def model_comparison():
  """Train both models with fixed settings and show their test metrics side by side.

  Each model is trained with its own hard-coded configuration (report printing
  disabled via ``"compare": True``). The Wide & Deep model is scored at its tuned
  threshold, the second model through ``evaluate`` (fixed 0.5 threshold). The
  results are rendered as a plotly table; nothing is returned.
  """
  wd_config = {"model": "wide_and_deep", "lr": 1e-3, "weigh_decay": 1e-5, "batch_size": 256,
               "epoch": 30, "dropout": 0.3, "early_stopping": True, "verbose": False, "compare": True}

  tabnet_config = {"model": "tabnet", "lr": 1e-3, "weigh_decay": 1e-5, "batch_size": 256,
                   "epoch": 30, "dropout": 0.2, "early_stopping": True, "verbose": False, "compare": True}

  wd_model, best_thresh = wd_train_and_test(wd_config)
  tabnet_model = tabnet_train_and_test(tabnet_config)

  results = []

  # Wide & Deep: apply the threshold tuned on validation to the test probabilities
  probs, _ = evaluate_probs(wd_model, test_dl)
  test_preds = (probs > best_thresh).astype(int)
  test_acc = accuracy_score(y_test_dl, test_preds)
  test_p = precision_score(y_test_dl, test_preds, zero_division=0)
  test_r = recall_score(y_test_dl, test_preds, zero_division=0)
  # The F1 column must hold the F1 score, not a second copy of precision
  test_f1 = f1_score(y_test_dl, test_preds, zero_division=0)

  results.append({"Model": "Wide and Deep Neural Network",
                  "Accuracy": round(test_acc, 5),
                  "Precision": round(test_p, 5),
                  "Recall": round(test_r, 5),
                  "F1-Score": round(test_f1, 5)
                })

  test_acc, test_p, test_r, test_f1, labels, preds = evaluate(tabnet_model, test_dl)

  results.append({"Model": "TabNet",
                  "Accuracy": round(test_acc, 5),
                  "Precision": round(test_p, 5),
                  "Recall": round(test_r, 5),
                  "F1-Score": round(test_f1, 5)
                })

  results = pd.DataFrame(results)

  fig = go.Figure(data=[go.Table(
      header=dict(values=list(results.columns),
                  fill_color='darkslateblue', font_color='white', align='center'),
      cells=dict(values=[results[col] for col in results.columns],
                fill_color='lavender',
                font_color='black', align='center', height=25)
  )])

  fig.update_layout(title_text="Model Comparison Results", title_x=0.5)
  fig.show()

# 5.&nbsp;Choose model configuration

**Model options**
*   **wide_and_deep:** Wide & Deep Neural Network
*   **tabnet:** TabNet
*   **compare:** Compare two models on the test set

In [None]:
# Settings for a single-model run ("wide_and_deep" or "tabnet"). With name == "compare",
# model_comparison() trains both models with its own built-in settings; only
# batch_size from this dict is used there, through the data loaders built below.
config = {
    "name": "compare",
    "lr": 1e-3,
    "weigh_decay": 1e-5,
    "batch_size": 256,
    "epoch": 20,
    "dropout": 0.2,
    "early_stopping": False,
    "verbose": True,
    # Read by both training functions; False means the validation/test reports are printed
    "compare": False,
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_ds = TabularDataset(X_train_num, X_train_cat, y_train_dl)
val_ds   = TabularDataset(X_val_num,   X_val_cat,   y_val_dl)
test_ds  = TabularDataset(X_test_num,  X_test_cat,  y_test_dl)

# Only the training loader shuffles; validation and test keep the original row order
train_dl = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True, drop_last=False)
val_dl   = DataLoader(val_ds,   batch_size=config['batch_size'])
test_dl  = DataLoader(test_ds,  batch_size=config['batch_size'])

if config['name'] == "wide_and_deep":
    wd_train_and_test(config)
elif config['name'] == "tabnet":
    tabnet_train_and_test(config)
elif config['name'] == "compare":
    model_comparison()
else:
    # Fail loudly instead of silently training nothing
    raise ValueError(
        f"Unknown model option {config['name']!r}; expected 'wide_and_deep', 'tabnet' or 'compare'"
    )