# Lesson 1 Homework — Exploring Layers and Activations

Goals:
- Recreate the TensorFlow Playground experiments locally
- Observe how depth, width, activation, and learning rate affect results
- Summarize your findings concisely

## Setup
If packages are missing, uncomment the pip installs.

In [None]:
# !pip install -q torch matplotlib
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

def make_circles(n_samples=1500, factor=0.5, noise=0.15, random_state=1):
    """Generate a two-class dataset of two noisy concentric rings.

    Args:
        n_samples: Total number of points. The outer ring gets
            ``n_samples // 2`` of them and the inner ring gets the rest.
        factor: Radius of the inner ring (the outer ring has radius 1).
        noise: Standard deviation of the Gaussian jitter added to every
            coordinate.
        random_state: Seed passed to ``np.random.default_rng``.

    Returns:
        ``(X, y)`` where ``X`` is a float32 array of shape ``(n_samples, 2)``
        holding the outer-ring points followed by the inner-ring points, and
        ``y`` is a float32 vector with 0 for outer and 1 for inner points.
    """
    gen = np.random.default_rng(random_state)
    outer_count = n_samples // 2
    inner_count = n_samples - outer_count

    def ring(angles):
        # Points on the unit circle at the given angles, one row per point.
        return np.column_stack((np.cos(angles), np.sin(angles)))

    # The order of the draws below (angles first, then noise, outer before
    # inner) determines the output for a given seed, so keep it fixed.
    outer_angles = gen.uniform(0, 2 * np.pi, outer_count)
    inner_angles = gen.uniform(0, 2 * np.pi, inner_count)
    outer_pts = ring(outer_angles)
    inner_pts = factor * ring(inner_angles)
    outer_pts += noise * gen.standard_normal(outer_pts.shape)
    inner_pts += noise * gen.standard_normal(inner_pts.shape)

    points = np.concatenate((outer_pts, inner_pts), axis=0).astype('float32')
    labels = np.concatenate(
        (np.zeros(outer_count), np.ones(inner_count))
    ).astype('float32')
    return points, labels

# Build the two-ring dataset once; the task cells below all train on X and y.
X, y = make_circles(
    n_samples=1500,
    factor=0.5,
    noise=0.15,
    random_state=1,
)

# Scatter the points coloured by class as a quick sanity check.
plt.figure(figsize=(4.5, 4))
plt.scatter(*X.T, s=10, c=y, cmap='RdBu', edgecolor='k')
plt.title('Dataset (make_circles)')
plt.show()


## Helper: build and train model
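Adjust layers, units, activation, and learning rate via `build_model`, and epochs via `train_and_plot`. The optimizer is fixed to Adam inside `train_and_plot`; edit that function if you want to try a different one.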

In [None]:
def build_model(units=(8, 8), activation='relu', lr=1e-3):
    """Build a small fully connected binary classifier for 2-D inputs.

    The network is a stack of ``Linear -> activation`` pairs, one pair per
    entry in ``units``, followed by ``Linear(..., 1)`` and a ``Sigmoid`` so
    the single output lies in (0, 1).

    Args:
        units: Hidden-layer widths, in order. A single ``int`` is accepted as
            shorthand for one hidden layer. An empty sequence yields a model
            with no hidden layers.
        activation: ``'relu'``, ``'tanh'`` or ``'sigmoid'``. Any other value
            falls back to ReLU and emits a ``UserWarning`` so that a typo
            does not silently produce a mislabelled experiment.
        lr: Learning rate. It is not used here; it is stored on the returned
            model as ``model.lr`` for the training helper to pick up.

    Returns:
        An ``nn.Sequential`` carrying an extra ``lr`` attribute.
    """
    import warnings

    activation_classes = {
        'relu': nn.ReLU,
        'tanh': nn.Tanh,
        'sigmoid': nn.Sigmoid,
    }
    # Guard the lookup so non-string (possibly unhashable) values take the
    # same fallback path as unknown names instead of raising.
    act_cls = (
        activation_classes.get(activation)
        if isinstance(activation, str)
        else None
    )
    if act_cls is None:
        warnings.warn(
            f"Unknown activation {activation!r}; falling back to 'relu'. "
            f"Supported values: {sorted(activation_classes)}.",
            stacklevel=2,
        )
        act_cls = nn.ReLU

    if isinstance(units, int):
        units = (units,)

    layers = []
    in_features = 2  # the dataset has two input coordinates
    for width in units:
        layers.append(nn.Linear(in_features, width))
        layers.append(act_cls())
        in_features = width
    layers.append(nn.Linear(in_features, 1))
    layers.append(nn.Sigmoid())

    model = nn.Sequential(*layers)
    model.lr = lr
    return model

def _split_train_val(X, y, train_fraction=0.8, seed=None):
    """Shuffle the rows of X and y together and split them into train/validation parts."""
    order = np.arange(len(X))
    if seed is None:
        # No explicit seed: draw from NumPy's global RNG, so a caller-side
        # np.random.seed(...) still controls the split.
        np.random.shuffle(order)
    else:
        np.random.default_rng(seed).shuffle(order)
    cut = int(train_fraction * len(X))
    train_idx, val_idx = order[:cut], order[cut:]
    return X[train_idx], y[train_idx], X[val_idx], y[val_idx]


def _plot_history(history, title=''):
    """Plot train/validation loss (left panel) and accuracy (right panel) per epoch."""
    fig, ax = plt.subplots(1, 2, figsize=(9, 3.8))
    panels = (
        ('Loss', 'loss', 'val_loss'),
        ('Accuracy', 'accuracy', 'val_accuracy'),
    )
    for axis, (panel_title, train_key, val_key) in zip(ax, panels):
        axis.plot(history[train_key], label='train')
        axis.plot(history[val_key], label='val')
        axis.set_title(panel_title)
        axis.legend()
        axis.grid(True, alpha=0.3)
    fig.suptitle(title)
    plt.show()


def train_and_plot(model, X, y, epochs=20, title='', *, seed=None, plot=True):
    """Train ``model`` with Adam + binary cross-entropy and plot the curves.

    The data is shuffled and split 80/20 into train and validation sets.
    Each epoch is ONE full-batch gradient step on the training set (there is
    no mini-batching), so ``epochs=20`` means 20 parameter updates in total.

    Args:
        model: A module mapping ``(N, 2)`` float32 inputs to ``(N, 1)``
            outputs in [0, 1] (``nn.BCELoss`` requires probabilities). If it
            has an ``lr`` attribute, that is used as the learning rate;
            otherwise 1e-3.
        X: Feature array; converted to float32.
        y: Binary labels, one per row of ``X``; converted to float32 and
            reshaped to a column.
        epochs: Number of full-batch training steps.
        title: Figure title.
        seed: Optional seed for the train/validation shuffle. ``None`` uses
            NumPy's global RNG state.
        plot: If ``False``, skip the matplotlib figure and only return the
            history.

    Returns:
        A dict with per-epoch lists under the keys ``'loss'``,
        ``'val_loss'``, ``'accuracy'`` and ``'val_accuracy'``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, or there are fewer
            than two samples (one of the splits would be empty).
    """
    # torch.from_numpy keeps the NumPy dtype, so float64 features or integer
    # labels would fail against float32 weights / BCELoss. Normalise up front.
    X = np.asarray(X, dtype=np.float32)
    y = np.asarray(y, dtype=np.float32).reshape(-1, 1)
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
        )
    if len(X) < 2:
        raise ValueError("need at least 2 samples to form a train/validation split")

    X_train, y_train, X_val, y_val = _split_train_val(X, y, seed=seed)
    X_train_t, y_train_t = torch.from_numpy(X_train), torch.from_numpy(y_train)
    X_val_t, y_val_t = torch.from_numpy(X_val), torch.from_numpy(y_val)

    optimizer = torch.optim.Adam(model.parameters(), lr=getattr(model, 'lr', 1e-3))
    loss_fn = nn.BCELoss()

    history = {'loss': [], 'val_loss': [], 'accuracy': [], 'val_accuracy': []}

    for _ in range(epochs):
        model.train()
        optimizer.zero_grad()
        preds = model(X_train_t)
        loss = loss_fn(preds, y_train_t)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            # Train metrics reuse the forward pass taken BEFORE this epoch's
            # update; validation metrics are measured after it.
            train_hits = (preds > 0.5).float().eq(y_train_t)
            train_acc_epoch = train_hits.float().mean().item()

            val_probs = model(X_val_t)
            val_loss_epoch = loss_fn(val_probs, y_val_t).item()
            val_hits = (val_probs > 0.5).float().eq(y_val_t)
            val_acc_epoch = val_hits.float().mean().item()

        history['loss'].append(loss.item())
        history['val_loss'].append(val_loss_epoch)
        history['accuracy'].append(train_acc_epoch)
        history['val_accuracy'].append(val_acc_epoch)

    if plot:
        _plot_history(history, title)

    return history


## Task 1 — Layers and Units
Try 1 hidden layer vs 2 hidden layers; vary hidden units (e.g., 4, 8, 16).
- Record your validation accuracy and observations.

In [None]:
# Re-seed both RNGs before each run: train_and_plot shuffles with NumPy's
# global RNG and nn.Linear initialises weights from torch's global RNG, so
# without this the two models would be trained on different random splits and
# the depth comparison would be confounded by split luck.
np.random.seed(0)
torch.manual_seed(0)
m1 = build_model(units=(8,), activation='relu', lr=1e-3)
h1 = train_and_plot(m1, X, y, epochs=20, title='1 layer, 8 units, ReLU')

np.random.seed(0)
torch.manual_seed(0)
m2 = build_model(units=(8,8), activation='relu', lr=1e-3)
h2 = train_and_plot(m2, X, y, epochs=20, title='2 layers, 8-8 units, ReLU')

# The task asks you to record validation accuracy, so print the final values.
print(f"1 layer, 8 units:    final val accuracy = {h1['val_accuracy'][-1]:.3f}")
print(f"2 layers, 8-8 units: final val accuracy = {h2['val_accuracy'][-1]:.3f}")


## Task 2 — Activation Choices
Compare ReLU vs Tanh vs Sigmoid for hidden layers.
- Which converges faster? Any saturation issues?

In [None]:
for act in ['relu','tanh','sigmoid']:
    # Reset both RNGs on every iteration so each activation starts from the
    # same initial weights and is evaluated on the same train/validation
    # split; otherwise differences between curves may just be RNG noise.
    np.random.seed(0)
    torch.manual_seed(0)
    m = build_model(units=(16,16), activation=act, lr=1e-3)
    _ = train_and_plot(m, X, y, epochs=25, title=f'2 layers, 16-16, activation={act}')


## Task 3 — Learning Rate Sensitivity
Try a range of learning rates (e.g., 1e-4, 1e-3, 1e-2, 1e-1).
- What happens for very large values?

In [None]:
for lr in [1e-4, 1e-3, 1e-2, 1e-1]:
    # Same initial weights and same train/validation split for every learning
    # rate, so the curves differ only because of the learning rate itself.
    np.random.seed(0)
    torch.manual_seed(0)
    m = build_model(units=(8,8), activation='relu', lr=lr)
    _ = train_and_plot(m, X, y, epochs=20, title=f'LR={lr}')


## Optional — Decision Regions
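Visualize learned decision boundaries for your favorite model. The cell below only defines the helper; call it on a trained model to see the plot, e.g. `plot_regions(m2, X, y)`.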

In [None]:
def plot_regions(model, X, y):
    """Plot the model's output over a 2-D grid with the data points on top.

    The grid spans the data range padded by 0.5 on every side, at 220 x 220
    resolution. The model is switched to eval mode and evaluated without
    gradient tracking; its output is drawn as filled contours.

    Args:
        model: Module mapping ``(N, 2)`` float32 inputs to one value per row.
        X: Array of 2-D points; column 0 is the horizontal axis, column 1 the
            vertical axis.
        y: Per-point values used to colour the scatter overlay.
    """
    pad, resolution = 0.5, 220
    horiz = np.linspace(X[:, 0].min() - pad, X[:, 0].max() + pad, resolution)
    vert = np.linspace(X[:, 1].min() - pad, X[:, 1].max() + pad, resolution)
    xx, yy = np.meshgrid(horiz, vert)

    # One row per grid node, in the same order as xx.ravel() / yy.ravel().
    nodes = np.column_stack((xx.ravel(), yy.ravel())).astype('float32')

    model.eval()
    with torch.no_grad():
        outputs = model(torch.from_numpy(nodes))
    probs = outputs.detach().numpy().reshape(xx.shape)

    plt.figure(figsize=(5, 4))
    plt.contourf(xx, yy, probs, levels=20, cmap='RdBu', alpha=0.6)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap='RdBu', edgecolor='k', s=10)
    plt.title('Decision regions')
    plt.show()


## Short Reflection
Answer in your own words (2–4 sentences each):
1. How does adding a second hidden layer change what the model can represent?
2. When did you observe over- or under-fitting? What signaled it?
3. Which activation worked best here, and why do you think that is?

## To-Do (stubs only — do not implement)
These are optional extensions meant for practice. Leave them as TODOs.

In [None]:
# TODO: Implement a manual NumPy forward pass for a 2-layer network
# (ReLU hidden, sigmoid output) on the same dataset.
# Compare predictions qualitatively to the PyTorch model.

In [None]:
# TODO: Write a function plot_regions_threshold(model, X, y, threshold=0.5)
# that visualizes decision regions for different probability thresholds.
# Try thresholds: 0.3, 0.5, 0.7 and note changes.

In [None]:
# TODO: Swap loss to mean-squared error (MSE) while keeping sigmoid output.
# Train briefly and record any differences in convergence/accuracy vs BCE.

In [None]:
# TODO: Add L2 weight decay (the optimizer's `weight_decay` argument) or an `nn.Dropout` layer
# and observe effects on training/validation curves.
# Keep other hyperparameters the same for a fair comparison.