<link rel="stylesheet" href="berkeley.css">

<h1 class="cal cal-h1">Lecture 14 – CS 189, Fall 2025</h1>



In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly import figure_factory as ff
from plotly.subplots import make_subplots
colors = px.colors.qualitative.Plotly
px.defaults.width = 800
# from ipywidgets import HBox
import numpy as np

<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">Basics of Neural Networks</h2>

Neurons pass information from one to another using action potentials. They connect with one another at synapses, which are junctions between one neuron's axon and another's dendrite. Information flows from:

1.  The dendrites,
2.  To the cell body,
3.  Through the axons,
4.  To a synapse connecting the axon to the dendrite of the next neuron.

<img src="https://www.cs.toronto.edu/~lczhang/360/lec/w02/imgs/neuron.png" width="400">




An Artificial Neuron is a mathematical function with the following elements:


1.   Input
2.   Weighted summation of inputs
3.   Activation function (the processing unit)
4.   Output

<img src="https://www.cs.toronto.edu/~lczhang/360/lec/w02/imgs/neuron_model.jpeg" width="500">


The mathematical equation for an artificial neuron is as follows:

\begin{align}
        \hat{y} = f(\vec{\mathbf{w}} \cdot \vec{\mathbf{x}}) &= f\left(\sum_{i=0}^D w_i x_i\right) \\
        &= f(w_0 + w_1 x_1 + \dots + w_D x_D),
\end{align}

where $x_0 = 1$, so that $w_0$ acts as the bias term.

Assuming that function $f$ is the logistic or sigmoid function, the output of the neuron has a probability value ($0 \leq p \leq 1$). This probability value can then be used for a binary classification task where $p < 0.5$ is an indication of class $0$, and $p \geq 0.5$ assigns data to class 1. Re-writing the equation above with a sigmoid activation function would give us the following:

\begin{align}
        \hat{y} = \sigma(\vec{\mathbf{w}} \cdot \vec{\mathbf{x}}) &= \sigma\left(\sum_{i=0}^D w_i x_i\right) \\
        &= \sigma(w_0 + w_1 x_1 + \dots + w_D x_D).
\end{align}


The code below contains an implementation of AND, OR, and XOR gates. You will be able to generate data for each of the functions and add the desired noise level to the data. Familiarize yourself with the code and answer the following questions.

<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">The Logic Gate Problem</h2>



We'll start by generating synthetic data for a logic gate (e.g., AND, OR, XOR) with Gaussian noise. This data will be used for training and testing the logistic regression model.
The function `generate_data_with_noise` allows customization of the number of samples, the logic gate, and the noise level.
            

In [None]:
# Function to generate a dataset with multiple samples per gate location
def generate_data_with_noise(num_samples = 500, gate = "AND", noise_level = 0.05):
    """
    Generate multiple samples per logic gate configuration with added noise.

    Parameters
    ----------
    num_samples : int or float
        Requested dataset size. The value is truncated to an integer (slider
        widgets may deliver floats such as ``500.0``) and split evenly over
        the four gate inputs, so the returned size is
        ``4 * (int(num_samples) // 4)``.
    gate : str
        One of ``'AND'``, ``'OR'`` or ``'XOR'``.
    noise_level : float
        Standard deviation of the Gaussian noise added to the inputs.

    Returns
    -------
    X : np.ndarray of shape (n, 2)
        Noisy gate inputs, in shuffled order.
    t : np.ndarray of shape (n,)
        Noise-free gate outputs (0 or 1) aligned with the rows of ``X``.

    Raises
    ------
    ValueError
        If ``gate`` is not one of the supported names.
    """
    # Truth-table outputs for the inputs (0,0), (0,1), (1,0), (1,1)
    targets_by_gate = {
        'AND': [0, 0, 0, 1],
        'OR': [0, 1, 1, 1],
        'XOR': [0, 1, 1, 0],
    }
    if gate not in targets_by_gate:
        raise ValueError("Gate must be 'AND', 'OR', or 'XOR'.")
    base_X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    base_t = np.array(targets_by_gate[gate])

    # np.repeat only accepts integer repeat counts, so coerce before dividing
    repeats = int(num_samples) // len(base_X)

    # Repeat each base configuration to create multiple samples
    X = np.repeat(base_X, repeats, axis=0)
    t = np.repeat(base_t, repeats, axis=0)

    # Add Gaussian noise to the inputs
    X = X + np.random.normal(0, noise_level, X.shape)

    # Shuffle the dataset to avoid ordered samples
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X = X[indices]
    t = t[indices]

    return X, t

In this lecture we will use interactive visualizations.  These require a python environment so if you are viewing this notebook through the static HTML version you won't be able to use the interactive features.

In [None]:
from ipywidgets import interact, FloatSlider, IntSlider, Dropdown, Checkbox
import plotly.express as px
import plotly.graph_objects as go


X, t = generate_data_with_noise(500, 'AND', 0.05)
# Make an interactive plot: trace 0 holds the class-0 points (red) and
# trace 1 the class-1 points (blue).
data_fig = go.FigureWidget()
data_fig.add_trace(go.Scatter(x=X[t == 0, 0], y=X[t == 0, 1], mode='markers',
                              marker=dict(color='red', opacity=0.7, size=7,
                                          line=dict(width=1, color='DarkSlateGrey')),
                              name='0'))
data_fig.add_trace(go.Scatter(x=X[t == 1, 0], y=X[t == 1, 1], mode='markers',
                              marker=dict(color='blue', opacity=0.7, size=7,
                                          line=dict(width=1, color='DarkSlateGrey')),
                              name='1'))
data_fig.update_layout(width=800, height=500,
                       xaxis_range=[-1, 2], yaxis_range=[-1, 2])
# The following code defines a set of interactive widgets (sliders)
# and binds them to an update function that will be run whenever
# a slider is changed.
# The sample count uses an IntSlider: it is a count, and the data generator
# passes it on to np.repeat, which only accepts integer repeat counts.
@interact(num_samples=IntSlider(min=100, max=1000, step=100, value=500, description='Samples'),
          gate=Dropdown(options=['AND', 'OR', 'XOR'], value='AND', description='Gate'),
          noise_level=FloatSlider(min=0.0, max=1.0, step=0.01, value=0.05, description='Noise Level'))
def update_data_plot(num_samples, gate, noise_level):
    """Regenerate the dataset for the selected settings and redraw both traces."""
    X, t = generate_data_with_noise(num_samples, gate, noise_level)
    # Apply all trace and title changes as a single widget update
    with data_fig.batch_update():
        data_fig.data[0].x = X[t == 0, 0]
        data_fig.data[0].y = X[t == 0, 1]
        data_fig.data[1].x = X[t == 1, 0]
        data_fig.data[1].y = X[t == 1, 1]
        data_fig.update_layout(title=f"Dataset for {gate} Gate with Noise Level {noise_level}")
display(data_fig)

<link rel="stylesheet" href="berkeley.css">

<h3 class="cal cal-h3">Solving with Logistic Regression</h3>

This section demonstrates how to perform logistic regression using the scikit-learn library. The dataset is divided into training and testing subsets using an 80%-20% ratio with the train_test_split function. A logistic regression model is instantiated and trained on the training dataset using the `.fit()` method. The model's performance is evaluated on both the training and testing data using the `.score()` method, which computes accuracy.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Logistic Regression Using Scikit-learn
def perform_logistic_regression(X, t):
    """Fit a logistic regression classifier on an 80/20 train/test split.

    Returns ``(model, train_score, test_score)``. Both scores come from
    ``model.score`` — which the lecture text describes as accuracy — so
    higher is better, despite the ``*_error`` names callers bind them to.
    """
    # Hold out 20% of the data; the fixed seed keeps the split reproducible
    features_fit, features_held, labels_fit, labels_held = train_test_split(
        X, t, test_size=0.2, random_state=140)

    # Train a logistic regression model on the training portion only
    classifier = LogisticRegression()
    classifier.fit(features_fit, labels_fit)

    # Score the fitted model on both portions
    score_fit = classifier.score(features_fit, labels_fit)
    score_held = classifier.score(features_held, labels_held)

    return classifier, score_fit, score_held

We can plot a decision boundary or even a decision surface by plotting predictions on a regular grid of points.  This is accomplished using the meshgrid function from numpy.  We can then use the model to predict the class of each point on the grid and plot the results.  This is a useful way to visualize the decision boundary of a classifier.

In [None]:
def plot_decision_boundary(model, x1_range, x2_range, num_points=100, probs=True):
    """Build a contour trace of the model's predictions over a regular grid.

    ``x1_range`` and ``x2_range`` are ``(low, high)`` pairs; each axis is
    sampled at ``num_points`` evenly spaced values. With ``probs=True`` the
    surface shows ``predict_proba`` for class 1, otherwise the labels
    returned by ``predict``. Returns a ``go.Contour`` trace (red to blue).
    """
    # Evenly spaced coordinates along each axis
    x1_axis = np.linspace(x1_range[0], x1_range[1], num_points)
    x2_axis = np.linspace(x2_range[0], x2_range[1], num_points)
    x1, x2 = np.meshgrid(x1_axis, x2_axis)

    # One row per grid point: (x1, x2)
    grid = np.c_[x1.ravel(), x2.ravel()]

    # Evaluate the model on every grid point
    surface = model.predict_proba(grid)[:, 1] if probs else model.predict(grid)
    surface = surface.reshape(x1.shape)

    return go.Contour(x=x1[0], y=x2[:, 0], z=surface,
                      colorscale=[[0, 'red'], [1, 'blue']],
                      opacity = 0.5, showscale=False)

Again we create an interactive visualization plot:            

In [None]:
from ipywidgets import IntSlider  # integer-valued slider for the sample count

# Start from the same scatter traces and layout as the data figure
pred_fig = go.FigureWidget(data=data_fig.data, layout=data_fig.layout)

# Fit on the current data and add the decision surface as the third trace
# (index 2), which the callback below overwrites on every change.
model, train_error, test_error = perform_logistic_regression(X, t)
boundary = plot_decision_boundary(model, [-1, 2], [-1, 2], probs=False)
pred_fig.add_trace(boundary)

@interact(num_samples=IntSlider(min=100, max=1000, step=100, value=500, description='Samples'),
          gate=Dropdown(options=['AND', 'OR', 'XOR'], value='AND', description='Gate'),
          noise_level=FloatSlider(min=0.0, max=1.0, step=0.01, value=0.05, description='Noise Level'),
          show_probs=Checkbox(value=False, description='Show Probabilities'))
def update_pred_fig(num_samples, gate, noise_level, show_probs):
    """Regenerate the data, refit logistic regression, and redraw the figure.

    The train/test numbers shown in the title are the scores returned by
    ``perform_logistic_regression``.
    """
    # Fixed seed so that changing one widget does not also reshuffle the noise
    np.random.seed(42)
    X, y = generate_data_with_noise(num_samples, gate, noise_level)
    model, train_score, test_score = perform_logistic_regression(X, y)
    boundary = plot_decision_boundary(model, [-1, 2], [-1, 2], probs=show_probs)
    title = f"Predictions for {gate} Gate with Noise Level {noise_level} (Train: {train_score:.2f}, Test: {test_score:.2f})"
    # Apply the scatter, surface and title changes as a single widget update
    with pred_fig.batch_update():
        pred_fig.data[0].x = X[y == 0, 0]
        pred_fig.data[0].y = X[y == 0, 1]
        pred_fig.data[1].x = X[y == 1, 0]
        pred_fig.data[1].y = X[y == 1, 1]
        pred_fig.data[2].z = boundary.z
        pred_fig.update_layout(title=title)
display(pred_fig)

Let's focus on the XOR data and develop a neural network to classify it.  

In [None]:
# Noisy XOR dataset used for the rest of the lecture
X, t = generate_data_with_noise(500, 'XOR', 0.15)

# Keep a tabular copy: the two input coordinates and the target label
data = pd.DataFrame({'x1': X[:, 0], 'x2': X[:, 1], 't': t})

# One scatter trace per class: class 0 in red first, then class 1 in blue
data_fig = go.Figure()
for label, color in ((0, 'red'), (1, 'blue')):
    in_class = t == label
    data_fig.add_trace(go.Scatter(
        x=X[in_class, 0], y=X[in_class, 1], mode='markers',
        marker=dict(color=color, opacity=0.7, size=7,
                    line=dict(width=1, color='DarkSlateGrey')),
        name=str(label)))
data_fig.update_layout(width=600, height=500,
                       xaxis_range=[-1, 2], yaxis_range=[-1, 2])

<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">Using PyTorch</h2>

PyTorch provides a flexible and efficient platform for building and training neural networks. PyTorch is known for its intuitive, Python-centric approach to defining neural networks.

There are a few core concepts that are important to understand when using PyTorch:

1. **Tensors**: Tensors are the fundamental data structure in PyTorch, similar to NumPy arrays but with additional capabilities for GPU acceleration. They can be used to represent inputs, outputs, and parameters of neural networks.
1. **DataSets and DataLoaders**: PyTorch provides utilities to handle data loading and batching through `torch.utils.data.Dataset` and `torch.utils.data.DataLoader`.
1. **nn.Module**: This is the base class for all neural network modules in PyTorch. You can define your own models by subclassing `nn.Module` and defining the layers and forward pass.
1. **Optimizers**: PyTorch provides various optimization algorithms (like SGD, Adam, etc.) in the `torch.optim` module to update the model parameters based on the computed gradients.
1. **Autograd**: PyTorch's automatic differentiation library that tracks operations on tensors to compute gradients for backpropagation. (Focus of next lecture)


<link rel="stylesheet" href="berkeley.css">

<h3 class="cal cal-h3">Importing Torch</h3>

When we import PyTorch (called torch for historical reasons), we typically import the main torch library and the `torch.nn` module, which contains classes and functions for building neural networks. We often alias `torch.nn` as `nn` for convenience.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

One of the new features in torch is the `torch._dynamo` module, which is used to optimize the execution of PyTorch models. However, there are some known issues with `torch._dynamo` on certain hardware, such as Apple Silicon (M1/M2 chips). To avoid potential bugs or performance issues when running on Apple Silicon, we can disable `torch._dynamo` using the following command.

In [None]:
# NOTE(review): `torch._dynamo.disable` is documented as a decorator / context
# manager. Called with no arguments and with its return value discarded, this
# line may not switch Dynamo off globally — TODO confirm the intended effect
# (a global switch would presumably go through the Dynamo config instead).
torch._dynamo.disable() # fixing a bug with apple silicon support

<link rel="stylesheet" href="berkeley.css">

<h3 class="cal cal-h3">Tensors</h3>

Tensors are the fundamental data structure in PyTorch, similar to NumPy arrays but with additional capabilities for GPU acceleration. They can be used to represent inputs, outputs, and parameters of neural networks.

We can create tensors in various ways, such as from lists or NumPy arrays, or by using built-in functions to create tensors filled with zeros, ones, or random values. Here are some examples of creating tensors:


In [None]:
# Creating a tensor from a list
torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)

We can set the data type of a tensor using the `dtype` argument when creating it. Common data types include `torch.float32`, `torch.int64`, etc.  Because we will work with hardware accelerators, it is important to be explicit about the data type of tensors.

In [None]:
torch.get_default_dtype()

Most accelerators will operate in float32 precision, so we will use `torch.float32` for our input data and model parameters.

In [None]:
torch.zeros((2, 3)).dtype

Tensors generally inherit the data type of the input data, but it is good practice to be explicit about the data type when creating tensors, especially when working with hardware accelerators like GPUs or TPUs.

In [None]:
# Without an explicit dtype the tensor takes its type from the pandas column
t = torch.tensor(data['t'])
print(t.dtype)
# Cast explicitly to float32 and confirm the change
t = t.to(torch.float32)
print(t.dtype)

In [None]:
torch.mean(t, axis=0)

In [None]:
t = torch.tensor(data['t'], dtype=torch.float32)
print(t.dtype)
t

We can create data using other constructors, such as `torch.zeros`, `torch.ones`, and `torch.rand`:


In [None]:
# Creating a tensor filled with zeros
torch.zeros((2, 3))

In [None]:
# Creating a tensor filled with ones
torch.ones((3, 2))

In [None]:
# Creating a tensor with random values
torch.rand((2, 2))


In [None]:
generator = torch.Generator().manual_seed(42)
torch.rand((2, 2), generator=generator)

We can manipulate tensors using various operations, such as addition, multiplication, and matrix multiplication. Here are some examples of tensor operations:


In [None]:
x = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
t = torch.tensor([[5, 6], [7, 8]], dtype=torch.float32)

x @ t

In [None]:
torch.mean(x, axis=1)

We can slice and index tensors similarly to NumPy arrays. Here are some examples of slicing and indexing tensors:

In [None]:
x[0]

In [None]:
x[:,1]

Moving between numpy arrays and torch tensors is easy. Here are some examples:

In [None]:
x.numpy()

In [None]:
x[1,1]

In [None]:
x[1,1].item()

In [None]:
torch.tensor(np.random.randn(2,3))

In [None]:
x[1,:]

<link rel="stylesheet" href="berkeley.css">

<h4 class="cal cal-h4">Working with Hardware Accelerators</h4>

Pytorch allows us to easily work with hardware accelerators like GPUs. This is particularly useful for training large neural networks, as it can significantly speed up the computations.

In [None]:
if torch.accelerator.is_available():
    device = torch.accelerator.current_accelerator()
else:
    device = "cpu"
print("Congratulations! You have access to a", device, "device.")

We can create tensors directly on a device by passing the `device` argument, or move existing tensors with the `.to(device)` method. Here we create a random tensor (and its random number generator) on the device selected above:

In [None]:
generator = torch.Generator(device=device).manual_seed(42)
x = torch.randn(3,2, device=device, generator=generator)
x

We can also move back to the CPU if needed using the same `.to("cpu")` method.

In [None]:
x.to("cpu")

In [None]:
x.cpu()

You cannot perform numpy operations on tensors that are on a device. You must first move them back to the CPU using the `.cpu()` method.

In [None]:
x.cpu().numpy()

Here is a simple function to convert tensors to numpy arrays safely:

In [None]:
def to_numpy(tensor):
    """Convert a PyTorch tensor to a NumPy array.

    The tensor is first detached from the autograd graph (``.numpy()``
    refuses tensors that require grad) and moved to the CPU, so this works
    for parameters and for tensors living on an accelerator.
    """
    return tensor.detach().cpu().numpy()

You can also set the default device for all tensors using `torch.set_default_device(device)`. This way, any new tensor created will automatically be on the specified device. This is particularly useful when you want to ensure that all tensors are on the same device without having to specify the device each time you create a tensor.

In [None]:
torch.set_default_device(device)

Let's convert our synthetic xor data to a tensor and move it to the appropriate device:

In [None]:
x = torch.tensor(data[['x1','x2']].values, dtype=torch.float32, device=device)
t = torch.tensor(data['t'].values, dtype=torch.float32, device=device)

<link rel="stylesheet" href="berkeley.css">

<h3 class="cal cal-h3">Datasets</h3>

PyTorch uses Datasets to represent the collections of training data.  PyTorch provides a convenient way to create datasets using the `torch.utils.data.TensorDataset` class. This class allows us to combine our input features and labels into a single dataset object, which can then be used for training and evaluation.

In [None]:
from torch.utils.data import TensorDataset
dataset = TensorDataset(
    torch.tensor(data[['x1','x2']].values, dtype=torch.float32),
    torch.tensor(data['t'].values, dtype=torch.float32))
dataset

We can access the dataset entries by indexing or slicing; each access returns a tuple of (inputs, targets):

In [None]:
x, t = dataset[1:5]
x, t

In [None]:
len(dataset)

There are useful utilities to manipulate datasets, such as constructing the train-validation split.

In [None]:
from torch.utils.data import random_split

# Seeded generator, created on `device`, so the split is reproducible
generator = torch.Generator(device=device).manual_seed(189)

# The lengths are given as fractions: 80% training, 20% validation
train_data, validation_data = random_split(
    dataset, [.8, .2],
    generator=generator)

# Number of examples in each part
len(train_data), len(validation_data)

Here we make a dataset builder function:

In [None]:
def make_datasets_from_numpy(x, t, split_ratio=0.8, seed=189):
    """Wrap NumPy features/labels in a TensorDataset and split it randomly.

    Features become float32 tensors and labels int64 (``torch.long``)
    tensors, both created on the notebook's global ``device``. A fraction
    ``split_ratio`` of the examples goes to the first returned dataset and
    the remainder to the second; ``seed`` makes the split reproducible.
    """
    from torch.utils.data import random_split, TensorDataset

    features = torch.tensor(x, dtype=torch.float32, device=device)
    labels = torch.tensor(t, dtype=torch.long, device=device)
    full_dataset = TensorDataset(features, labels)

    # The generator lives on the same device as the tensors
    split_rng = torch.Generator(device=device).manual_seed(seed)
    fractions = [split_ratio, 1 - split_ratio]
    first_part, second_part = random_split(
        full_dataset, fractions, generator=split_rng)
    return first_part, second_part

In [None]:
training_data, validation_data = make_datasets_from_numpy(
    data[['x1','x2']].values, data['t'].values)

In [None]:
(x, t) = training_data[:]
x, t

<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">Defining the Logistic Regression Model (Neural Network)</h2>

Now that we have our data in a suitable format, we can define our logistic regression model using PyTorch's `nn.Module` class. This class provides a convenient way to define the layers and forward pass of our model.

To build a new model (or model component), we create a new class that inherits from `nn.Module`. In the `__init__` method, we define the layers of our model. For logistic regression, we typically have a single linear layer that maps the input features to the output. We also need to implement a `forward` method that defines how the input data flows through the model to produce the output.

In [None]:
class LogisticRegressionModel(nn.Module):
    """Logistic regression with explicitly declared weight and bias.

    ``forward`` returns a two-column tensor: column 0 is the probability of
    class 0 and column 1 the probability of class 1.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Both parameters start at zero; nn.Parameter registers them with the module
        self.w = nn.Parameter(torch.zeros(input_dim, 1))
        self.b = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        # Affine score followed by the sigmoid gives P(class 1)
        score = torch.matmul(x, self.w) + self.b
        prob_one = torch.sigmoid(score)
        prob_zero = 1 - prob_one
        return torch.cat((prob_zero, prob_one), dim=1)

When defining the `__init__` method of our model, we create `nn.Parameter`s for the weights and bias of the linear layer (here initialized to zeros). Wrapping a tensor in `nn.Parameter` automatically registers it with the model, so it is returned by `model.parameters()` and updated by the optimizer during training.

In [None]:
nn.Parameter(torch.zeros(2, 1))

We can then create a new instance of our model and print its architecture to verify that it has been defined correctly.

In [None]:
model = LogisticRegressionModel(input_dim=2)
model

In [None]:
print(model.w)
print(model.b)

In [None]:
list(model.parameters())

We make predictions by either invoking forward or simply applying the model instance to the input data.

In [None]:
model.forward(x=x[:5])

In [None]:
model(x[:5])

Moving a model moves its parameters to the specified device.

In [None]:
model.cpu().w

Linear models are so common that PyTorch provides a built-in `nn.Linear` layer that encapsulates the weight and bias parameters, as well as the forward pass. We could use this built-in layer instead of manually defining the weights and biases.

In [None]:
class LogisticRegressionModel(nn.Module):
    """Logistic regression built on a single ``nn.Linear`` layer.

    ``forward`` returns a two-column tensor: column 0 is the probability of
    class 0 and column 1 the probability of class 1.
    """

    def __init__(self, input_dim):
        super().__init__()
        # One output unit: the score (logit) for class 1
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        prob_one = torch.sigmoid(self.linear(x))
        prob_zero = 1 - prob_one
        return torch.cat((prob_zero, prob_one), dim=1)

In [None]:
model = LogisticRegressionModel(input_dim=2)
print(model)
print(model.linear.weight)
print(model.linear.bias)

In [None]:
model(x[:5])

Notice that the model weights are not initialized to zero. This is important for breaking symmetry during training and ensuring that the model can learn effectively. We will discuss weight initialization strategies in more detail in a future lecture.

<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">Defining the Loss Function</h2>

The Loss function quantifies how well the model's predictions match the true labels. For binary classification tasks like logistic regression, we typically use the binary cross-entropy loss function. PyTorch provides a built-in loss function for this purpose called `nn.BCELoss`.


In [None]:
loss_fn = nn.BCELoss()

The binary cross-entropy loss function computes the loss between the predicted probabilities of class 1 and the true binary labels.

In [None]:
loss_fn(model(x[0:1])[:,1], t[0:1].float())

The more general cross entropy loss works for multi-class classification problems, where there are more than two classes. It is implemented in PyTorch as `nn.CrossEntropyLoss`.

**Important:** This loss function combines the softmax activation and the negative log-likelihood loss into a single function, making it suitable for multi-class classification tasks.  This behavior of combining steps is actually common with neural network components in PyTorch.  This enables more efficient computation and numerical stability.

In [None]:
loss_fn = nn.CrossEntropyLoss()

We can redefine our linear model to return raw logits instead of probabilities, which is what `nn.CrossEntropyLoss` expects as input.

In [None]:
class MultiClassLinearModel(nn.Module):
    """Linear classifier that returns one raw score (logit) per class.

    No softmax is applied here; the lecture pairs this model with
    ``nn.CrossEntropyLoss``, which takes logits as input.
    """

    def __init__(self, input_dim, output_dim=2):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        logits = self.linear(x)
        return logits

In [None]:
model = MultiClassLinearModel(input_dim=2, output_dim=2)
model

In [None]:
model(x[:5])

In [None]:
loss_fn(model(x[:5]), t[:5])

This should be the same as computing the softmax and then applying the negative log likelihood loss separately.

In [None]:
F.softmax(model(x[:5]), dim=1)

In [None]:
nll = nn.NLLLoss()
nll(F.log_softmax(model(x[:5]), dim=1), t[:5])

<link rel="stylesheet" href="berkeley.css">

<h3 class="cal cal-h3">Automatic Differentiation</h3>

Automatic differentiation (AD) is a set of techniques to evaluate the derivative of a function specified by a computer program. PyTorch provides a powerful automatic differentiation library called Autograd, which allows us to compute gradients of tensors with respect to some scalar value (usually the loss) automatically.


In [None]:
# Clear any gradients left over from earlier cells
model.zero_grad()

# Forward pass and loss on the current batch of training data
pred = model(x)
loss = loss_fn(pred, t)

# No backward pass has run for this loss yet, so no gradient has been computed
print(model.linear.weight.grad)

# Backpropagate: this populates `.grad` on the model parameters
loss.backward()
print(model.linear.weight.grad)

<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">Optimizing the Loss Function</h2>



We can now use minibatch stochastic gradient descent (SGD) to minimize the loss function. PyTorch provides various optimization algorithms in the `torch.optim` module. These algorithms implement the logic of a single step.  Here we will use the Adam optimizer we introduced in the last lecture.
In PyTorch, you have to create your own iterative training loop.  Before the training loop, you typically set up a **DataLoader** to handle batching and shuffling of the dataset, and you create an **optimizer** to update the model parameters.


Then, within the training loop, you perform the following steps for each batch of data:
1. **Zero the gradients**: Before computing the gradients for the current batch, we need to zero out the gradients from the previous batch. This is done using the `optimizer.zero_grad()` method.
1. **Forward pass**: We pass the input data through the model to obtain the predictions.
1. **Compute the loss**: We compute the loss between the model's predictions and the true labels using the loss function.
1. **Backward pass**: We compute the gradients of the loss with respect to the model parameters using the `loss.backward()` method. This populates the `.grad` attributes of the model parameters.
1. **Update the parameters**: We update the model parameters using the optimizer's `step()` method, which applies the optimization algorithm to adjust the parameters based on the computed gradients.




In [None]:
def minibatch_gd(model, loss_fn,
                 training_data,
                 batch_size, nepochs, learning_rate,
                 visualizer=None):
    """Train ``model`` in place with minibatch updates from the Adam optimizer.

    Parameters:
        model: module whose parameters are optimized.
        loss_fn: callable taking ``(predictions, targets)`` and returning a scalar loss.
        training_data: dataset yielding ``(input, target)`` pairs.
        batch_size: number of examples per minibatch.
        nepochs: number of full passes over ``training_data``.
        learning_rate: step size passed to the optimizer.
        visualizer: optional callable invoked as ``visualizer(epoch, model, loss_fn)``
            after every epoch, with gradient tracking disabled.

    Returns nothing; the model is updated as a side effect.
    """
    from torch.utils.data import DataLoader

    # Seeded generator (on the notebook's global `device`) that drives the shuffling
    shuffle_rng = torch.Generator(device=device)
    shuffle_rng.manual_seed(189)
    batches = DataLoader(training_data,
                         batch_size=batch_size,
                         shuffle=True,  # reshuffled at the start of every epoch
                         generator=shuffle_rng)

    # The optimizer is the update rule. Adam is used here; plain SGD would be
    # torch.optim.SGD(model.parameters(), learning_rate).
    optimizer = torch.optim.Adam(model.parameters(), learning_rate)

    for epoch in range(nepochs):
        for inputs, targets in batches:
            # Gradients accumulate across backward calls, so clear them first
            optimizer.zero_grad()
            # Forward pass and loss for this minibatch
            batch_loss = loss_fn(model(inputs), targets)
            # Backpropagation fills in the parameters' .grad
            batch_loss.backward()
            # Apply the optimizer's update rule
            optimizer.step()

        # Per-epoch reporting hook (skipped when no visualizer is provided)
        if visualizer is None:
            continue
        with torch.no_grad():
            visualizer(epoch, model, loss_fn)


You can ignore the following visualization code.  This was added to track and update the loss and decision surface during training.

In [None]:
class LossVisualizer:
    """Per-epoch callback that tracks validation loss and redraws the decision surface.

    An instance is called as ``visualizer(epoch, model, loss_fn)``. Each call
    evaluates the loss on the stored validation set, appends it to the loss
    curve in ``loss_fig``, and replaces the ``z`` values of the last trace of
    ``pred_fig`` with the model's current predictions on a fixed grid.
    """

    def __init__(self, validation_data, pred_fig, loss_fig,
                 num_points=50, plot_probs=True, sleep_time=0):
        """
        Parameters:
            validation_data: dataset whose full slice ``[:]`` yields ``(inputs, targets)``.
            pred_fig: figure whose LAST trace is the decision-surface contour.
            loss_fig: figure whose FIRST trace is the loss curve.
            num_points: grid resolution along each axis.
            plot_probs: for two-class models, plot the class-1 probability
                (True) or the thresholded 0/1 prediction (False).
            sleep_time: seconds to pause after each update (0 disables the pause).
        """
        self.x_val, self.t_val = validation_data[:]
        self.pred_fig = pred_fig
        self.loss_fig = loss_fig
        self.epochs = []
        self.losses = []
        self.plot_probs = plot_probs
        self.sleep_time = sleep_time
        # Plot range: the span of the validation inputs, widened by 50% on each side
        x1_min, x1_max = to_numpy(self.x_val[:,0]).min(), to_numpy(self.x_val[:,0]).max()
        x2_min, x2_max = to_numpy(self.x_val[:,1]).min(), to_numpy(self.x_val[:,1]).max()
        margin_x1 = 0.5 * (x1_max - x1_min)
        margin_x2 = 0.5 * (x2_max - x2_min)
        x1_min -= margin_x1
        x1_max += margin_x1
        x2_min -= margin_x2
        x2_max += margin_x2
        # Setup the grid of test points for decision boundary plotting
        x1, x2 = torch.meshgrid(torch.linspace(x1_min, x1_max, num_points),
                                torch.linspace(x2_min, x2_max, num_points), indexing='ij')
        # One row per grid point: (x1, x2)
        self.grid = torch.cat([x1.reshape(-1, 1), x2.reshape(-1, 1)], dim=1)
        self.x1 = to_numpy(x1)
        self.x2 = to_numpy(x2)

    def plot_decision_boundary(self, model):
        """Return a ``go.Contour`` trace of the model's predictions on the grid.

        The model output is passed through a softmax over dim 1, i.e. it is
        treated as one logit per class.
        """
        with torch.no_grad():
            preds = F.softmax(model(self.grid), dim=1)
        num_classes = preds.shape[1]
        # The grid uses 'ij' indexing (first axis = x1); the contour wants rows
        # to follow y, hence the transpose after reshaping.
        if num_classes > 2:  # support for multiclass
            preds = torch.argmax(preds, dim=1).reshape(self.x1.shape).T
            preds = to_numpy(preds)
            return go.Contour(x=self.x1[:, 0], y=self.x2[0], z=preds,
                            #   contours=dict(start=0, end=num_classes, size=1),
                              colorscale=px.colors.qualitative.Plotly[:num_classes],
                              contours=dict(start=-0.5, end=num_classes-0.5, size=1, coloring='fill'),
                              opacity=0.5, showscale=False)
        else:  # Binary classification case (red/blue)
            if self.plot_probs:
                preds = preds[:, 1].reshape(self.x1.shape).T
            else:
                # Threshold at 0.5; torch tensors have no `.astype`, so use `.float()`
                preds = (preds[:, 1] > 0.5).float().reshape(self.x1.shape).T
            preds = to_numpy(preds)
            return go.Contour(x=self.x1[:, 0], y=self.x2[0], z=preds,
                              colorscale=[[0, 'red'], [1, 'blue']],
                              #colorscale='Matter_r',
                              opacity = 0.5, showscale=False)

    def reset(self):
        """Forget the recorded history and clear the loss curve."""
        self.epochs = []
        self.losses = []
        with self.loss_fig.batch_update():
            self.loss_fig.data[0].x = []
            self.loss_fig.data[0].y = []

    def __call__(self, epoch, model, loss_fn):
        """Record the validation loss for ``epoch`` and refresh both figures."""
        with torch.no_grad():
            loss = loss_fn(model(self.x_val), self.t_val).item()
        self.epochs.append(epoch)
        self.losses.append(loss)
        # Visualization Code
        boundary = self.plot_decision_boundary(model)
        # plotly batch update
        with self.pred_fig.batch_update():
            self.pred_fig.data[-1].z = boundary.z
        with self.loss_fig.batch_update():
            self.loss_fig.data[0].x = self.epochs
            self.loss_fig.data[0].y = self.losses
        # Optional pause so the animation can be followed
        if(self.sleep_time > 0):
            import time
            time.sleep(self.sleep_time)


In [None]:
from ipywidgets import HBox
# Prediction figure: the XOR scatter traces plus (added below) the decision surface
pred_fig = go.FigureWidget(data=data_fig.data, layout=data_fig.layout)
# Loss figure: a single, initially empty curve that the visualizer fills in.
# The curve shows the loss on the validation set, so it is labelled accordingly.
loss_fig = go.FigureWidget()
loss_fig.add_trace(go.Scatter(x=[], y=[], mode='lines', name='Validation Loss'))
loss_fig.update_layout(title='Validation Loss', xaxis_title='Epochs', yaxis_title='Validation Loss (CE)')


model = MultiClassLinearModel(input_dim=2)
visualizer = LossVisualizer(
    validation_data=validation_data,
    pred_fig=pred_fig, loss_fig=loss_fig,
    num_points=50, plot_probs=True, sleep_time=0)

# The decision surface must be the LAST trace: the visualizer updates data[-1]
pred_fig.add_trace(visualizer.plot_decision_boundary(model))
display(HBox([pred_fig,loss_fig]))


In [None]:
# Start from a freshly initialized model and an empty loss curve so the cell
# can be re-run; the figures displayed above are updated after every epoch.
model = MultiClassLinearModel(input_dim=2)
visualizer.reset()
minibatch_gd(
    model=model, loss_fn=loss_fn,
    training_data=training_data,
    batch_size=32, nepochs=50, learning_rate=0.01,
    visualizer=visualizer)


<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">Building a Neural Network</h2>



Now that we have a working logistic regression model, we can extend it to a simple neural network by adding hidden layers and non-linear activation functions. This will allow the model to learn more complex decision boundaries.

In [None]:
class MLPModel(nn.Module):
    """Fully connected feed-forward network (multi-layer perceptron).

    Consecutive entries of ``dims`` give the input and output width of each
    ``nn.Linear`` layer, so ``dims=[2, 4, 4, 2]`` builds three linear layers.
    The activation is applied after every layer except the last, so
    ``forward`` returns the raw output of the final linear layer.

    Args:
        dims: Layer widths, ``[input_dim, hidden_1, ..., output_dim]``.
        activation: Name of the hidden-layer non-linearity, ``"relu"`` or
            ``"tanh"``.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    # Supported hidden-layer activations, keyed by the name passed to __init__.
    _ACTIVATIONS = {
        "relu": nn.functional.relu,
        "tanh": torch.tanh,
    }

    def __init__(self, dims, activation="relu"):
        super().__init__()
        # Keep the layers in a ModuleList so PyTorch registers their parameters.
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(dims[:-1], dims[1:])]
        )
        # Reject unknown names instead of silently falling back to tanh.
        try:
            self.act = self._ACTIVATIONS[activation]
        except KeyError:
            raise ValueError(
                f"Unknown activation {activation!r}; "
                f"expected one of {sorted(self._ACTIVATIONS)}"
            ) from None

    def forward(self, x):
        """Apply every layer in order, with the activation between layers only."""
        last_index = len(self.layers) - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            if index < last_index:  # no activation on the final layer
                x = self.act(x)
        return x

In [None]:
# Build a small MLP: 2 inputs -> two hidden layers of width 4 -> 2 outputs,
# with tanh applied after each hidden layer.
model = MLPModel(dims=[2, 4, 4, 2], activation="tanh")
# A bare expression on the last line of a cell makes the notebook display it
# (here, the module's layer summary).
model

In [None]:
# Sanity check: run the first five rows of `x` (defined in an earlier cell)
# through the untrained model and display the raw outputs.
# assumes x is already a float tensor with 2 features per row — TODO confirm
model(x[:5])

The following code creates an interactive visualization to track the decision boundary and loss during training.  Unfortunately, this visualization will not work in the static HTML version of this notebook and doesn't appear to work in Google Colab.  You will need to run this code in a local Jupyter notebook environment.

In [None]:
from ipywidgets import HBox

# Left panel: the data scatter plot, wrapped in a FigureWidget so its traces
# can be updated in place while training runs.
pred_fig = go.FigureWidget(data=data_fig.data, layout=data_fig.layout)

# Right panel: the loss curve, empty until the visualizer is called.
# The visualizer evaluates the loss on `validation_data`, so the trace name
# matches the figure title (it was previously mislabeled 'Train Loss').
loss_fig = go.FigureWidget()
loss_fig.add_trace(go.Scatter(x=[], y=[], mode='lines', name='Validation Loss'))
loss_fig.update_layout(title='Validation Loss', xaxis_title='Epochs',
                       yaxis_title='Validation Loss (CE)')
visualizer = LossVisualizer(
    validation_data=validation_data,
    pred_fig=pred_fig, loss_fig=loss_fig,
    num_points=50, plot_probs=True, sleep_time=0)
# `model` is the MLP built in an earlier cell. The boundary trace must be
# added last because the visualizer overwrites `pred_fig.data[-1].z`.
pred_fig.add_trace(visualizer.plot_decision_boundary(model))
display(HBox([pred_fig, loss_fig]))


In [None]:
# One hidden layer of width 10 with tanh; a fresh model and a cleared loss
# history keep repeated runs of this cell independent.
model = MLPModel(dims=[2, 10, 2], activation="tanh")
visualizer.reset()

# Mini-batch gradient descent; the visualizer is handed to the training loop
# (presumably so it is invoked as training progresses — see minibatch_gd).
minibatch_gd(
    model=model,
    loss_fn=loss_fn,
    training_data=training_data,
    batch_size=32,
    nepochs=20,
    learning_rate=0.01,
    visualizer=visualizer,
)


<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">More Complex Classification Dataset</h2>



Now that we have a working multi-layer neural network, we can test it on a more complex dataset. The `make_moons` function generates a two-dimensional dataset of two interleaving crescent (half-moon) shapes, one per class, which is a common benchmark for testing classification algorithms. We can add some noise to the data to make the classification task more challenging.


In [None]:
from sklearn.datasets import make_circles, make_moons

# Noisy two-class "moons" data; random_state fixes the sample so the figure
# is reproducible.
# Alternative dataset (concentric circles) — if you swap it in, update the
# plot title below to match:
# x, t = make_circles(n_samples=1000, noise=0.2, factor=0.5, random_state=42)
x, t = make_moons(n_samples=1000, noise=0.2, random_state=42)

# One scatter trace per class so each gets its own colour and legend entry.
data_fig = go.Figure()
data_fig.add_trace(go.Scatter(x=x[t == 0, 0], y=x[t == 0, 1],
                              mode='markers', marker=dict(color='red'),
                              name='0'))
data_fig.add_trace(go.Scatter(x=x[t == 1, 0], y=x[t == 1, 1],
                              mode='markers', marker=dict(color='blue'),
                              name='1'))
# The title now names the generator actually used above (was 'make_circles').
data_fig.update_layout(width=500, height=500,
                       xaxis_range=[-2, 3], yaxis_range=[-1.5, 2],
                       xaxis_title='Feature 1',
                       yaxis_title='Feature 2',
                       title='make_moons Dataset')
# Print from `x`, the array generated in this cell (was `X`, which is not
# defined here).
print("The first 5 training datapoints:", x[:5])
print("The labels for the first 5 datapoints:", t[:5])
data_fig.show()

Convert the data into PyTorch tensor format and split it into training and validation sets.

In [None]:
# Split the data generated above into training and validation sets, using the
# helper's default split settings (helper defined earlier in the notebook).
training_data, validation_data = make_datasets_from_numpy(x, t)

In [None]:
# HBox is imported here as well so the cell runs on its own.
from ipywidgets import interact, IntSlider, FloatSlider, Dropdown, Checkbox, HBox

# Untrained network used only to draw the initial decision boundary;
# `update_model` below builds its own model on every widget change.
model = MLPModel(dims=[2, 4, 4, 2], activation="relu")
pred_fig = go.FigureWidget(data=data_fig.data, layout=data_fig.layout)

# The curve is evaluated on `validation_data` with the same cross-entropy
# `loss_fn` as the earlier cells, so the trace name and axis label say so.
loss_fig = go.FigureWidget()
loss_fig.add_trace(go.Scatter(x=[], y=[], mode='lines', name='Validation Loss'))
loss_fig.update_layout(title='Validation Loss', xaxis_title='Epochs', yaxis_title='Validation Loss (CE)')

visualizer = LossVisualizer(
    validation_data=validation_data,
    pred_fig=pred_fig, loss_fig=loss_fig,
    num_points=50, plot_probs=True, sleep_time=0)
# The boundary trace must be added last: the visualizer overwrites
# `pred_fig.data[-1].z` every time it is called.
pred_fig.add_trace(visualizer.plot_decision_boundary(model))

display(HBox([pred_fig, loss_fig]))

@interact(
    n_layers=IntSlider(min=1, max=5, step=1, value=2,
                       description="Layers", continuous_update=False),
    neurons_per_layer=IntSlider(min=4, max=64, step=4, value=8,
                                description="Neurons/Layer", continuous_update=False),
    activation_fn=Dropdown(options=['relu', 'tanh'], value='tanh',
                           description="Activation", continuous_update=False),
    learning_rate=FloatSlider(min=0.001, max=0.1, step=0.001, value=0.01,
                              description="Learning Rate", continuous_update=False),
    batch_size=IntSlider(min=1, max=128, step=8, value=32,
                         description="Batch Size", continuous_update=False),
    epochs=IntSlider(min=10, max=200, step=10, value=10,
                     description="Epochs", continuous_update=False),
)
def update_model(n_layers, neurons_per_layer, activation_fn, learning_rate, batch_size, epochs):
    """Train a fresh MLP using the hyperparameters selected in the widgets.

    Builds a network with ``n_layers`` hidden layers of ``neurons_per_layer``
    units each (2 inputs, 2 outputs), clears the visualizer's history, and
    runs mini-batch gradient descent on the current ``training_data``.
    """
    # 2 input features -> equally sized hidden layers -> 2 output logits
    hidden_dims = [neurons_per_layer for _ in range(n_layers)]
    net = MLPModel(dims=[2, *hidden_dims, 2], activation=activation_fn)

    # Drop the previous run's loss curve before training starts.
    visualizer.reset()
    minibatch_gd(
        model=net,
        loss_fn=loss_fn,
        training_data=training_data,
        batch_size=batch_size,
        nepochs=epochs,
        learning_rate=learning_rate,
        visualizer=visualizer,
    )


<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">Multi-Class Classification Dataset</h2>



We can try one more extension to the multi-class setting.  The `make_blobs` function generates isotropic Gaussian blobs for clustering. We can use this function to create a dataset with multiple classes and test our neural network's ability to classify them.

In [None]:
from sklearn.datasets import make_blobs
centers = 5 # number of classes
x, t = make_blobs(n_samples=1000, centers=centers, cluster_std=1.0, random_state=189)

# Labels are stored as strings so plotly express treats them as discrete
# categories rather than a continuous colour scale.
data_blobs = pd.DataFrame({'x1': x[:, 0], 'x2': x[:, 1], 't': t.astype(str)})
data_blobs.sort_values('t', inplace=True)

# `labels` is keyed by column name, so the legend title is renamed through
# 't' (the previous 'color' key matched no column and had no effect).
data_fig = px.scatter(data_blobs, x='x1', y='x2', color='t',
                      title='make_blobs Dataset',
                      category_orders={'t': [str(i) for i in range(centers)]},
                      labels={'x1': 'Feature 1', 'x2': 'Feature 2', 't': 'Class'},
                      width=500, height=500)
# Print from `x`, the array generated in this cell (was `X`, which is not
# defined here).
print("The first 5 training datapoints:", x[:5])
print("The labels for the first 5 datapoints:", t[:5])
data_fig.show()

# Fixed seed so the train/validation split is reproducible.
training_data, validation_data = make_datasets_from_numpy(x, t, split_ratio=0.8, seed=189)

In [None]:
# HBox is imported here as well so the cell runs on its own.
from ipywidgets import interact, IntSlider, FloatSlider, Dropdown, Checkbox, HBox

# Untrained network (one output per class) used only to draw the initial
# decision boundary; `update_model` below builds its own model on every
# widget change.
model = MLPModel(dims=[2, 4, 4, centers], activation="relu")
pred_fig = go.FigureWidget(data=data_fig.data, layout=data_fig.layout)
# hide the color scale for the contour plot
pred_fig.update_layout(coloraxis_showscale=False)

# The curve is evaluated on `validation_data` with the same cross-entropy
# `loss_fn` as the earlier cells, so the trace name and axis label say so.
loss_fig = go.FigureWidget()
loss_fig.add_trace(go.Scatter(x=[], y=[], mode='lines', name='Validation Loss'))
loss_fig.update_layout(title='Validation Loss', xaxis_title='Epochs', yaxis_title='Validation Loss (CE)')

visualizer = LossVisualizer(
    validation_data=validation_data,
    pred_fig=pred_fig, loss_fig=loss_fig,
    num_points=50, plot_probs=True, sleep_time=0)
# The boundary trace must be added last: the visualizer overwrites
# `pred_fig.data[-1].z` every time it is called.
pred_fig.add_trace(visualizer.plot_decision_boundary(model))

display(HBox([pred_fig, loss_fig]))

@interact(
    n_layers=IntSlider(min=1, max=5, step=1, value=2,
                       description="Layers", continuous_update=False),
    neurons_per_layer=IntSlider(min=4, max=64, step=4, value=8,
                                description="Neurons/Layer", continuous_update=False),
    activation_fn=Dropdown(options=['relu', 'tanh'], value='tanh',
                           description="Activation", continuous_update=False),
    learning_rate=FloatSlider(min=0.001, max=0.1, step=0.001, value=0.01,
                              description="Learning Rate", continuous_update=False),
    batch_size=IntSlider(min=1, max=128, step=8, value=32,
                         description="Batch Size", continuous_update=False),
    epochs=IntSlider(min=10, max=200, step=10, value=10,
                     description="Epochs", continuous_update=False),
)
def update_model(n_layers, neurons_per_layer, activation_fn, learning_rate, batch_size, epochs):
    """Train a fresh multi-class MLP using the widget-selected hyperparameters.

    Builds a network with ``n_layers`` hidden layers of ``neurons_per_layer``
    units each (2 inputs, ``centers`` outputs), clears the visualizer's
    history, and runs mini-batch gradient descent on the current
    ``training_data``.
    """
    # 2 input features -> equally sized hidden layers -> one logit per class
    hidden_dims = [neurons_per_layer for _ in range(n_layers)]
    net = MLPModel(dims=[2, *hidden_dims, centers], activation=activation_fn)

    # Drop the previous run's loss curve before training starts.
    visualizer.reset()
    minibatch_gd(
        model=net,
        loss_fn=loss_fn,
        training_data=training_data,
        batch_size=batch_size,
        nepochs=epochs,
        learning_rate=learning_rate,
        visualizer=visualizer,
    )
