## Task 2: Model Hyperparameter Search



### 1. Preparing the data for training

In [None]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import requests
from io import StringIO

In [None]:
# Define the URL for the Adult dataset on UCI Machine Learning Repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Define column names for the dataset (read_csv is called with header=None,
# so the names must be supplied here)
column_names = [
    "age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
    "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
    "hours_per_week", "native_country", "income"
]

# Fetch the dataset from the URL.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# prevents an HTTP error page from being silently parsed as CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Fields are separated by a comma followed by optional whitespace. A regex
# separator requires the python engine; the raw string avoids the invalid
# "\s" escape-sequence warning.
data = pd.read_csv(StringIO(response.text), header=None,
                   names=column_names, sep=r',\s*', engine='python')

# Define categorical and numerical columns
categorical_cols = ["workclass", "education", "marital_status",
                    "occupation", "relationship", "race", "sex", "native_country"]
numerical_cols = ["age", "fnlwgt", "education_num",
                  "capital_gain", "capital_loss", "hours_per_week"]

# Encode every categorical column as integer codes. Keep the fitted encoder
# (to allow decoding later) and the number of distinct categories per column.
label_encoders = {}
categories = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
    categories[col] = data[col].nunique()

# Category counts in the same order as categorical_cols
categories_list = list(categories.values())

# Encode the target variable 'income' as 1 and 0
data['income'] = data['income'].map({'>50K': 1, '<=50K': 0})
# .map() turns any label that is not in the mapping into NaN; fail fast
# instead of training on NaN targets.
if data['income'].isna().any():
    raise ValueError(
        "Unexpected value(s) in the 'income' column; expected '>50K' or '<=50K'."
    )

# Separate features (X) and target (y).
# Column order is: all categorical columns first, then all numerical columns.
X = data[categorical_cols + numerical_cols]
y = data["income"]

# Split the dataset into training, validation and testing sets.
# 20% is held out for testing; 12.5% of the remaining 80% (= 10% of the full
# data) becomes the validation set, giving a 70/10/20 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=42)

# Work on explicit copies so the column assignments below cannot raise
# SettingWithCopyWarning or write through to `data`.
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

# Standardize numerical features. The scaler is fitted on the training split
# only, so no statistics leak from the validation/test splits.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [None]:
# Replace each pandas split with its underlying NumPy array.
# Note: the names are rebound in place, so this cell cannot be re-run on its
# own (the arrays have no `.to_numpy()`); re-run the preprocessing cell first.
X_train, y_train = X_train.to_numpy(), y_train.to_numpy()
X_val, y_val = X_val.to_numpy(), y_val.to_numpy()
X_test, y_test = X_test.to_numpy(), y_test.to_numpy()

In [None]:
# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

### 2. Model training and evaluation function

In [5]:
import wandb
import os
# Set the notebook name for W&B before logging in
# (presumably used by W&B to label runs started from this notebook).
os.environ["WANDB_NOTEBOOK_NAME"] = "model_hyperparameter_search.ipynb"
# Authenticate this kernel with Weights & Biases.
wandb.login()

from torch import optim, nn
from tqdm import tqdm

# Project-local helpers: sweep_fun uses run_pytorch.build_optimizer,
# run_pytorch.train and run_pytorch.test.
from models import run_pytorch

[34m[1mwandb[0m: Currently logged in as: [33mzhipeng-he[0m. Use [1m`wandb login --relogin`[0m to force relogin


Import the model classes (PyTorch-based models, plus the JAX logistic regression):

In [6]:
from models.pytorch.mlp import MLP
from models.pytorch.tab_transformer import TabTransformer
from models.pytorch.ft_transformer import FTTransformer
from models.jax.logistic_regression import LogisticRegression

In [7]:
def model_config(config):
    """Instantiate the model selected by ``config.model``.

    Besides ``config``, the constructor arguments are taken from the
    notebook-level ``categories_list``, ``categorical_cols`` and
    ``numerical_cols`` defined in the data-preparation cell.

    Args:
        config: Object with attribute access (e.g. ``wandb.config``).
            Always read: ``model``.
            For ``"MLP"``: ``input_dim``, ``output_dim``, ``dropout``,
            ``embedding_dim``.
            For ``"TabTransformer"`` / ``"FTTransformer"``: ``embedding_dim``,
            ``output_dim``, ``depth``, ``heads``, ``dropout``.

    Returns:
        A freshly constructed ``MLP``, ``TabTransformer`` or ``FTTransformer``.

    Raises:
        ValueError: If ``config.model`` is not one of the supported names
            (previously this case silently returned ``None``).
    """
    if config.model == "MLP":
        mlp_kwargs = {
            "input_dim": config.input_dim,
            "output_dim": config.output_dim,
            # NOTE(review): num_hidden_layers is 4 but only two hidden_layer_dims
            # are given — confirm which one MLP actually honours.
            "num_hidden_layers": 4,
            "hidden_layer_dims": [64, 32],
            "dropout": config.dropout,
            "categories": categories_list,
            "embedding_dim": config.embedding_dim,
            "num_categorical_feature": len(categorical_cols),
        }
        return MLP(**mlp_kwargs)

    if config.model == "TabTransformer":
        tab_transformer_kwargs = {
            "categories": categories_list,
            "num_continuous": len(numerical_cols),
            "dim": config.embedding_dim,  # can sweep
            "dim_out": config.output_dim,
            "depth": config.depth,  # can sweep
            "heads": config.heads,  # can sweep
            # the same dropout rate is used for attention and feed-forward
            "attn_dropout": config.dropout,
            "ff_dropout": config.dropout,
            "mlp_hidden_mults": (4, 2),
            "mlp_act": nn.ReLU(),
            "continuous_mean_std": None,
        }
        return TabTransformer(**tab_transformer_kwargs)

    if config.model == "FTTransformer":
        ft_transformer_kwargs = {
            "categories": categories_list,
            "num_continuous": len(numerical_cols),
            "dim": config.embedding_dim,
            "dim_out": config.output_dim,
            "depth": config.depth,
            "heads": config.heads,
            # the same dropout rate is used for attention and feed-forward
            "attn_dropout": config.dropout,
            "ff_dropout": config.dropout,
        }
        return FTTransformer(**ft_transformer_kwargs)

    # TODO: add a "TabNet" branch once that model is available.
    raise ValueError(
        f"Unsupported model {config.model!r}; expected one of "
        "'MLP', 'TabTransformer', 'FTTransformer'."
    )

In [8]:
def sweep_fun():
    """Run a single sweep trial inside its own W&B run.

    Takes no arguments: the hyperparameters come from ``wandb.config`` and the
    data from the notebook-level ``X_*_tensor`` / ``y_*_tensor`` variables.
    The trial builds the model via ``model_config``, trains it with
    ``run_pytorch.train`` on the train/validation splits, and finally
    evaluates it with ``run_pytorch.test`` on the test split.

    Note: ``BCEWithLogitsLoss`` requires the label tensors to have the same
    shape as the model output.
    """
    with wandb.init(project="TabAttackBench-ModelSweep"):
        cfg = wandb.config
        net = model_config(cfg)

        # Loss and optimizer for this trial
        loss_fn = nn.BCEWithLogitsLoss()
        opt = run_pytorch.build_optimizer(net, cfg.optimizer, cfg.learning_rate)

        # Fit on the training split, monitoring the validation split
        train_split = (X_train_tensor, y_train_tensor)
        val_split = (X_val_tensor, y_val_tensor)
        run_pytorch.train(net, train_split, val_split, loss_fn, opt, cfg)

        # Final evaluation on the held-out test split
        test_split = (X_test_tensor, y_test_tensor)
        run_pytorch.test(net, test_split, cfg, wandb_run=wandb.run)

### 3. Model Configuration

In [9]:
import pprint

In [10]:
# Hyperparameter values shared by every model's sweep: name -> candidate values.
base_grid = {
    'dataset': ['adult'],
    'input_dim': [X_train.shape[1]],
    'output_dim': [1],
    'epochs': [10, 20, 30],
    'optimizer': ['adam'],
    'learning_rate': [0.001, 0.01, 0.1],
    'batch_size': [128, 256, 512],
    'dropout': [0.0, 0.1, 0.2, 0.3, 0.4],
}

# Base W&B sweep definition: a grid search in which every parameter is
# wrapped as {'values': [...]}.
sweep_config = {
    'method': 'grid',
    'parameters': {
        name: {'values': choices} for name, choices in base_grid.items()
    },
}

#### 3.1. MLP

In [11]:
# Build the MLP sweep as an independent dict derived from the shared base.
# A plain `mlp_sweep_config = sweep_config` would only alias the base dict, so
# every model-specific key added by any cell would leak into all other sweep
# configs (e.g. `depth`/`heads` ending up in the MLP grid on a re-run).
mlp_sweep_config = {
    **sweep_config,
    'parameters': {
        **sweep_config['parameters'],
        'model': {'values': ['MLP']},
        'embedding_dim': {'values': [4, 8]},
    },
}

In [12]:
# Register the MLP sweep with W&B, then show the returned sweep ID and the
# configuration that was passed in.
mlp_sweep_id = wandb.sweep(sweep=mlp_sweep_config, project='TabAttackBench-ModelSweep')
print(f'{mlp_sweep_id=}')
pprint.pprint(mlp_sweep_config)

Create sweep with ID: r8s45onz
Sweep URL: https://wandb.ai/zhipeng-he/TabAttackBench-ModelSweep/sweeps/r8s45onz
mlp_sweep_id='r8s45onz'
{'method': 'grid',
 'parameters': {'batch_size': {'values': [128, 256, 512]},
                'dataset': {'values': ['adult']},
                'dropout': {'values': [0.0, 0.1, 0.2, 0.3, 0.4]},
                'embedding_dim': {'values': [4, 8]},
                'epochs': {'values': [10, 20, 30]},
                'input_dim': {'values': [14]},
                'learning_rate': {'values': [0.001, 0.01, 0.1]},
                'model': {'values': ['MLP']},
                'optimizer': {'values': ['adam']},
                'output_dim': {'values': [1]}}}


#### 3.2. TabTransformer

In [13]:
# Build the TabTransformer sweep as an independent dict derived from the
# shared base. A plain `tabtrans_sweep_config = sweep_config` would only alias
# the base dict, so the keys added here (`depth`, `heads`, ...) would leak into
# every other sweep config that refers to the same object.
tabtrans_sweep_config = {
    **sweep_config,
    'parameters': {
        **sweep_config['parameters'],
        'model': {'values': ['TabTransformer']},
        'embedding_dim': {'values': [4, 8]},
        'depth': {'values': [6]},
        'heads': {'values': [4, 8]},
    },
}

In [14]:
# Register the TabTransformer sweep with W&B, then show the returned sweep ID
# and the configuration that was passed in.
tabtrans_sweep_id = wandb.sweep(sweep=tabtrans_sweep_config, project='TabAttackBench-ModelSweep')
print(f'{tabtrans_sweep_id=}')
pprint.pprint(tabtrans_sweep_config)

Create sweep with ID: kz5dylr7
Sweep URL: https://wandb.ai/zhipeng-he/TabAttackBench-ModelSweep/sweeps/kz5dylr7
tabtrans_sweep_id='kz5dylr7'
{'method': 'grid',
 'parameters': {'batch_size': {'values': [128, 256, 512]},
                'dataset': {'values': ['adult']},
                'depth': {'values': [6]},
                'dropout': {'values': [0.0, 0.1, 0.2, 0.3, 0.4]},
                'embedding_dim': {'values': [4, 8]},
                'epochs': {'values': [10, 20, 30]},
                'heads': {'values': [4, 8]},
                'input_dim': {'values': [14]},
                'learning_rate': {'values': [0.001, 0.01, 0.1]},
                'model': {'values': ['TabTransformer']},
                'optimizer': {'values': ['adam']},
                'output_dim': {'values': [1]}}}


#### 3.3. FTTransformer

In [15]:
# Build the FTTransformer sweep as an independent dict derived from the shared
# base. A plain `fttrans_sweep_config = sweep_config` would only alias the base
# dict, so the keys set here would overwrite those of every other sweep config
# that refers to the same object.
fttrans_sweep_config = {
    **sweep_config,
    'parameters': {
        **sweep_config['parameters'],
        'model': {'values': ['FTTransformer']},
        'embedding_dim': {'values': [4, 8]},
        'depth': {'values': [6]},
        'heads': {'values': [4, 8]},
    },
}

In [16]:
# Register the FTTransformer sweep with W&B, then show the returned sweep ID
# and the configuration that was passed in.
fttrans_sweep_id = wandb.sweep(sweep=fttrans_sweep_config, project='TabAttackBench-ModelSweep')
print(f'{fttrans_sweep_id=}')
pprint.pprint(fttrans_sweep_config)

Create sweep with ID: hbo2yvwo
Sweep URL: https://wandb.ai/zhipeng-he/TabAttackBench-ModelSweep/sweeps/hbo2yvwo
fttrans_sweep_id='hbo2yvwo'
{'method': 'grid',
 'parameters': {'batch_size': {'values': [128, 256, 512]},
                'dataset': {'values': ['adult']},
                'depth': {'values': [6]},
                'dropout': {'values': [0.0, 0.1, 0.2, 0.3, 0.4]},
                'embedding_dim': {'values': [4, 8]},
                'epochs': {'values': [10, 20, 30]},
                'heads': {'values': [4, 8]},
                'input_dim': {'values': [14]},
                'learning_rate': {'values': [0.001, 0.01, 0.1]},
                'model': {'values': ['FTTransformer']},
                'optimizer': {'values': ['adam']},
                'output_dim': {'values': [1]}}}


#### 3.4. TabNet

#### 3.5. Logistic Regression

### 4. Model Sweeping

In [17]:
# Start a W&B agent in this kernel for the MLP sweep; it calls sweep_fun once
# per configuration handed out by the sweep.
wandb.agent(mlp_sweep_id, function=sweep_fun)

[34m[1mwandb[0m: Agent Starting Run: 3pktcnli with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	dataset: adult
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 4
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	input_dim: 14
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	model: MLP
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	output_dim: 1


self.category_embeddings.weight.shape=torch.Size([102, 4])


  0%|          | 0/10 [00:00<?, ?it/s]


Run 3pktcnli errored: ValueError('Target size (torch.Size([128])) must be the same as input size (torch.Size([128, 1]))')
[34m[1mwandb[0m: [32m[41mERROR[0m Run 3pktcnli errored: ValueError('Target size (torch.Size([128])) must be the same as input size (torch.Size([128, 1]))')
[34m[1mwandb[0m: Agent Starting Run: t75rizem with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	dataset: adult
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 4
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	input_dim: 14
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	model: MLP
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	output_dim: 1


self.category_embeddings.weight.shape=torch.Size([102, 4])


  0%|          | 0/10 [00:00<?, ?it/s]


Run t75rizem errored: ValueError('Target size (torch.Size([128])) must be the same as input size (torch.Size([128, 1]))')
[34m[1mwandb[0m: [32m[41mERROR[0m Run t75rizem errored: ValueError('Target size (torch.Size([128])) must be the same as input size (torch.Size([128, 1]))')
[34m[1mwandb[0m: Agent Starting Run: j6q8azd0 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	dataset: adult
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 4
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	input_dim: 14
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	model: MLP
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	output_dim: 1


self.category_embeddings.weight.shape=torch.Size([102, 4])


  0%|          | 0/10 [00:00<?, ?it/s]


wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)


Run j6q8azd0 errored: ValueError('Target size (torch.Size([128])) must be the same as input size (torch.Size([128, 1]))')
[34m[1mwandb[0m: [32m[41mERROR[0m Run j6q8azd0 errored: ValueError('Target size (torch.Size([128])) must be the same as input size (torch.Size([128, 1]))')
Detected 3 failed runs in the first 60 seconds, killing sweep.
[34m[1mwandb[0m: [32m[41mERROR[0m Detected 3 failed runs in the first 60 seconds, killing sweep.
[34m[1mwandb[0m: To disable this check set WANDB_AGENT_DISABLE_FLAPPING=true


In [18]:
# Start a W&B agent in this kernel for the TabTransformer sweep; it calls
# sweep_fun once per configuration handed out by the sweep.
# NOTE(review): the saved output shows a 404 "could not find sweep" for this
# ID — re-create the sweep (re-run the registration cell) before running this.
wandb.agent(tabtrans_sweep_id, function=sweep_fun)

404 response executing GraphQL.
{"errors":[{"message":"could not find sweep zhipeng-he/TabAttackBench-ModelSweep/kz5dylr7 during createAgent","path":["createAgent"]}],"data":{"createAgent":null}}
[34m[1mwandb[0m: [32m[41mERROR[0m Error while calling W&B API: could not find sweep zhipeng-he/TabAttackBench-ModelSweep/kz5dylr7 during createAgent (<Response [404]>)


UsageError: could not find sweep zhipeng-he/TabAttackBench-ModelSweep/kz5dylr7 during createAgent

In [None]:
# Start a W&B agent in this kernel for the FTTransformer sweep; it calls
# sweep_fun once per configuration handed out by the sweep.
wandb.agent(fttrans_sweep_id, function=sweep_fun)