### Lab 3.1: Batching and Regularization

In this lab you will learn how to set up a dataset to be processed in batches, rather than processing the entire dataset in each training iteration, and explore neural network regularization.

In [1]:
import numpy as np
import torch

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# Enable GPU acceleration
# if torch.cuda.is_available():
    # torch.set_default_device('cuda')
    # device = torch.device('cuda') 
# else:
device = torch.device('cpu')
    
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
  
# metadata 
print(adult.metadata) 
  
# variable information 
print(adult.variables)

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [3]:
y = y['income'].map({'<=50K':0,'<=50K.':0,'>50K':1,'>50K.':1})

Here I remove the missing values from the features and labels.

In [4]:
bad = X.isna().any(axis=1)
X = X[~bad]
y = y[~bad]

Selecting only the numeric variables:

In [5]:
X = X[['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']]

In [6]:
y = y.values
X = X.values.astype('float64')

To make the learning algorithm work more smoothly, we we will subtract the mean of each feature.

Here `np.mean` calculates a mean, and `axis=0` tells NumPy to calculate the mean over the rows (calculate the mean of each column).

In [7]:
# Min-max normalixatoin
X -= np.min(X, axis=0)  
X /= np.max(X, axis=0)

Now we will convert our `X` and `y` arrays to torch Tensors.

In [8]:
X = torch.tensor(X, device=device).float()
y = torch.tensor(y, device=device).long()

### Exercises

1. Divide the data into train and test splits.
2. Create a neural network for this dataset.
3. Use `TensorDataset` and `DataLoader` to batch the dataset during training.  
4. Use `weight_decay` parameter to `optim.SGD` to introduce L2 regularization during training. Evaluate the effect of regularization on test set accuracy.

In [9]:
from torch.utils.data import random_split, TensorDataset, DataLoader

def accuracy(model, X, y):
    model.eval()  # Set the model to evaluation mode

    with torch.no_grad():  # Disable gradient calculation
        z = torch.argmax(model(X), dim=-1)
        return (z == y).float().mean().item()
        
    return None

def train(model, dataloader, weight_decay=0, lr=1e-2, epochs=500):
    opt = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = torch.nn.CrossEntropyLoss()
    
    for epoch in range(epochs):
        features_train, labels_train = next(iter(dataloader_train)) 
        opt.zero_grad() # zero out the gradients
    
        z = model(features_train) # compute z values
        loss = loss_fn(z, labels_train) # compute loss
    
        loss.backward() # compute gradients
        opt.step() # apply gradients

In [10]:
# 1. Divide the data into train and test splits. (Written with help of Qwen2.5-Coder-14B)
train_size = int(0.8 * len(X))
test_size = len(X) - train_size

subset_train, subset_test = random_split(dataset=TensorDataset(X, y), lengths=[train_size, test_size])

# Extract the actual data from the subsets
X_train, y_train = subset_train.dataset.tensors
X_test, y_test = subset_test.dataset.tensors

In [11]:
# 2. Create a neural network for this dataset.
model = torch.nn.Sequential(
    torch.nn.Linear(6, 100),
    torch.nn.SiLU(), 
    torch.nn.Linear(100, 2),
).to(device)

In [12]:
# 3. Use TensorDataset and DataLoader to batch the dataset during training. 
batch_size = 32

dataloader_train = DataLoader(
    TensorDataset(X_train, y_train), 
    batch_size=batch_size, 
    shuffle=True,
)

train(model, dataloader_train)

print(f'training accuracy: {accuracy(model, X_train, y_train)}')
print(f'testing accuracy: {accuracy(model, X_test, y_test)}')

training accuracy: 0.7576489448547363
testing accuracy: 0.7576489448547363


In [14]:
# Now testing with regularization
model2 = torch.nn.Sequential(
    torch.nn.Linear(6, 100),
    torch.nn.SiLU(), 
    torch.nn.Linear(100, 2),
).to(device)

train(model2, dataloader_train, weight_decay=0.5)

print(f'training accuracy: {accuracy(model2, X_train, y_train)}')
print(f'testing accuracy: {accuracy(model2, X_test, y_test)}')

training accuracy: 0.7576489448547363
testing accuracy: 0.7576489448547363


weight decay, while lowering the magnitude of the weights, does not seem to have any effect on this model's accuracy