# Tutorial 10: Model Training and Optimization

## Module 4: Model Development

---

## Learning Objectives

By the end of this tutorial, you will be able to:

1. **Choose appropriate loss functions** for different ML tasks
2. **Implement regularization techniques** to prevent overfitting
3. **Apply optimization algorithms** effectively
4. **Use training strategies** like learning rate scheduling
5. **Decide between training from scratch vs fine-tuning**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from typing import List, Tuple, Callable

from sklearn.datasets import make_classification, make_regression, load_digits
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet, SGDClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, log_loss

# Seed NumPy's global random generator so random draws in later cells repeat.
np.random.seed(42)
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Apply the 'seaborn-v0_8-whitegrid' matplotlib style to all figures.
plt.style.use('seaborn-v0_8-whitegrid')
print("Libraries imported successfully!")

In [None]:
# Optional: PyTorch for advanced examples
# The import is attempted once; TORCH_AVAILABLE records whether it succeeded
# and a message reports the outcome either way.
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset
    TORCH_AVAILABLE = True
    print("PyTorch available")
except ImportError:
    # Only a missing package is tolerated; any other error still propagates.
    TORCH_AVAILABLE = False
    print("PyTorch not available - using sklearn alternatives")

## 1. Loss Functions

Loss functions measure how well a model's predictions match the actual values.

### Loss Function Selection Guide

| Task | Loss Function | When to Use |
|------|---------------|-------------|
| Binary Classification | Binary Cross-Entropy | Standard binary classification |
| Multi-class Classification | Categorical Cross-Entropy | Mutually exclusive classes |
| Regression | MSE | Standard regression, sensitive to outliers |
| Regression | MAE | Robust to outliers |
| Regression | Huber | Balance between MSE and MAE |
| Ranking | Pairwise Loss | Learning to rank |

### 1.1 Classification Loss Functions

In [None]:
class LossFunctions:
    """Common loss function implementations.

    Every method accepts NumPy arrays or plain array-likes (lists, tuples).
    Inputs are converted to float arrays before any arithmetic, and the mean
    loss over all samples is returned.
    """

    @staticmethod
    def _as_float_arrays(y_true, y_pred):
        """Return both inputs as float ndarrays so list inputs work as well."""
        return np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)

    @staticmethod
    def binary_cross_entropy(y_true, y_pred, epsilon=1e-15):
        """Binary cross-entropy loss.

        Args:
            y_true: Labels in {0, 1}.
            y_pred: Predicted probabilities of the positive class.
            epsilon: Predictions are clipped to [epsilon, 1 - epsilon] so that
                log(0) is never evaluated.

        Returns:
            Mean of -(y*log(p) + (1-y)*log(1-p)) over all samples.
        """
        y_true, y_pred = LossFunctions._as_float_arrays(y_true, y_pred)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    @staticmethod
    def categorical_cross_entropy(y_true, y_pred, epsilon=1e-15):
        """Categorical cross-entropy loss.

        Args:
            y_true: 2-D array of per-class targets (one row per sample).
            y_pred: 2-D array of predicted class probabilities, same shape.
            epsilon: Clipping bound that keeps the logarithm finite.

        Returns:
            Mean over samples of -sum_k y_k * log(p_k).
        """
        y_true, y_pred = LossFunctions._as_float_arrays(y_true, y_pred)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        # axis=1 sums over classes; the outer mean averages over samples.
        return -np.mean(np.sum(y_true * np.log(y_pred), axis=1))

    @staticmethod
    def mse(y_true, y_pred):
        """Mean Squared Error: mean of (y_true - y_pred)**2."""
        y_true, y_pred = LossFunctions._as_float_arrays(y_true, y_pred)
        return np.mean((y_true - y_pred) ** 2)

    @staticmethod
    def mae(y_true, y_pred):
        """Mean Absolute Error: mean of |y_true - y_pred|."""
        y_true, y_pred = LossFunctions._as_float_arrays(y_true, y_pred)
        return np.mean(np.abs(y_true - y_pred))

    @staticmethod
    def huber_loss(y_true, y_pred, delta=1.0):
        """Huber loss - combines MSE and MAE.

        Errors with |error| <= delta are penalised quadratically
        (0.5 * error**2); larger errors are penalised linearly
        (delta * (|error| - 0.5 * delta)).

        Returns:
            Mean of the per-sample penalties.
        """
        y_true, y_pred = LossFunctions._as_float_arrays(y_true, y_pred)
        error = y_true - y_pred
        abs_error = np.abs(error)
        squared_loss = 0.5 * error ** 2
        linear_loss = delta * (abs_error - 0.5 * delta)
        return np.mean(np.where(abs_error <= delta, squared_loss, linear_loss))

    @staticmethod
    def hinge_loss(y_true, y_pred):
        """Hinge loss for SVM.

        Args:
            y_true: Labels encoded as -1 or 1.
            y_pred: Raw decision scores.

        Returns:
            Mean of max(0, 1 - y_true * y_pred).
        """
        y_true, y_pred = LossFunctions._as_float_arrays(y_true, y_pred)
        return np.mean(np.maximum(0, 1 - y_true * y_pred))

print("LossFunctions class defined!")

In [None]:
# Score two sets of predicted probabilities against the same binary labels:
# one that sits close to the labels and one that sits far from them.
y_true = np.array([1, 0, 1, 1, 0])
y_pred_good = np.array([0.9, 0.1, 0.8, 0.95, 0.2])
y_pred_bad = np.array([0.4, 0.6, 0.3, 0.5, 0.7])

print("Binary Cross-Entropy Loss:")
bce_good = LossFunctions.binary_cross_entropy(y_true, y_pred_good)
print(f"  Good predictions: {bce_good:.4f}")
bce_bad = LossFunctions.binary_cross_entropy(y_true, y_pred_bad)
print(f"  Bad predictions: {bce_bad:.4f}")

In [None]:
# Plot the per-sample BCE penalty against the predicted probability,
# one panel per possible true label.
probs = np.linspace(0.01, 0.99, 100)
loss_for_1 = -np.log(probs)      # penalty when the true label is 1
loss_for_0 = -np.log(1 - probs)  # penalty when the true label is 0

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = [
    (axes[0], loss_for_1, 'b-', 'BCE Loss (True Label = 1)'),
    (axes[1], loss_for_0, 'r-', 'BCE Loss (True Label = 0)'),
]
for panel, curve, line_style, title in panels:
    panel.plot(probs, curve, line_style, linewidth=2)
    panel.set_xlabel('Predicted Probability')
    panel.set_ylabel('Loss')
    panel.set_title(title)
    # Dashed guide at probability 0.5.
    panel.axvline(x=0.5, color='gray', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

### 1.2 Regression Loss Functions

In [None]:
# Evaluate MSE, MAE and Huber on the same predictions; the final target
# (100.0) is far from its prediction (10.0) and acts as the outlier.
y_true_reg = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 100.0])
y_pred_reg = np.array([1.1, 2.2, 2.8, 4.1, 5.2, 10.0])

print("Regression Loss Comparison (with outlier):")
mse_value = LossFunctions.mse(y_true_reg, y_pred_reg)
print(f"  MSE: {mse_value:.4f}")
mae_value = LossFunctions.mae(y_true_reg, y_pred_reg)
print(f"  MAE: {mae_value:.4f}")
huber_tight = LossFunctions.huber_loss(y_true_reg, y_pred_reg, delta=1.0)
print(f"  Huber (delta=1): {huber_tight:.4f}")
huber_wide = LossFunctions.huber_loss(y_true_reg, y_pred_reg, delta=10.0)
print(f"  Huber (delta=10): {huber_wide:.4f}")

In [None]:
# Draw the three per-sample penalties (squared, absolute, Huber) over a
# symmetric range of errors so their growth rates can be compared.
errors = np.linspace(-5, 5, 100)
delta = 1.0

# Huber: quadratic inside [-delta, delta], linear outside.
abs_errors = np.abs(errors)
huber = np.where(abs_errors <= delta, 0.5 * errors**2, delta * (abs_errors - 0.5 * delta))

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(errors, errors**2, 'b-', label='MSE (Squared)', linewidth=2)
ax.plot(errors, abs_errors, 'r-', label='MAE (Absolute)', linewidth=2)
ax.plot(errors, huber, 'g-', label=f'Huber (delta={delta})', linewidth=2)

ax.set_xlabel('Error (y_true - y_pred)')
ax.set_ylabel('Loss')
ax.set_title('Comparison of Regression Loss Functions')
ax.legend()
# Cap the y-axis so the squared curve does not flatten the other two.
ax.set_ylim([0, 10])
ax.axvline(x=0, color='gray', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

### 1.3 Ranking Loss (Pairwise)

In [None]:
def pairwise_ranking_loss(scores, relevance):
    """
    Average hinge-style penalty over all item pairs with different relevance.

    For each such pair the penalty is max(0, 1 + s_low - s_high), where
    s_high is the score of the more relevant item and s_low the score of the
    less relevant one. Pairs with equal relevance are ignored. Returns 0
    when no pair qualifies.
    """
    total = 0
    n_pairs = 0
    count = len(scores)

    for first in range(count):
        for second in range(first + 1, count):
            if relevance[first] == relevance[second]:
                continue  # ties carry no ordering information

            # Identify which of the two items should be ranked higher.
            if relevance[first] > relevance[second]:
                better, worse = first, second
            else:
                better, worse = second, first

            # Negative when the pair is ordered correctly.
            diff = scores[worse] - scores[better]
            total += max(0, 1 + diff)
            n_pairs += 1

    if n_pairs > 0:
        return total / n_pairs
    return 0

# Example: four search results with graded relevance (higher is better),
# scored once in the right order and once in reverse.
relevance = np.array([3, 2, 1, 0])
good_scores = np.array([0.9, 0.7, 0.4, 0.1])  # follows the relevance order
bad_scores = np.array([0.2, 0.5, 0.8, 0.9])   # reverses the relevance order

print("Pairwise Ranking Loss:")
good_ranking_loss = pairwise_ranking_loss(good_scores, relevance)
print(f"  Good ranking: {good_ranking_loss:.4f}")
bad_ranking_loss = pairwise_ranking_loss(bad_scores, relevance)
print(f"  Bad ranking: {bad_ranking_loss:.4f}")

## 2. Regularization Techniques

Regularization prevents overfitting by adding constraints to the model.

| Technique | Description | Effect |
|-----------|-------------|--------|
| **L1 (Lasso)** | Adds the sum of absolute weight values to the loss | Sparse weights, feature selection |
| **L2 (Ridge)** | Adds the sum of squared weights to the loss | Small weights, prevents extreme values |
| **ElasticNet** | Combines L1 and L2 | Balance between sparsity and smoothness |
| **Dropout** | Randomly zero activations | Prevents co-adaptation |
| **Early Stopping** | Stop training before overfitting | Optimal complexity |

### 2.1 L1, L2, and ElasticNet Regularization

In [None]:
# Synthetic regression problem: 20 features, of which 5 are informative.
X, y = make_regression(n_samples=200, n_features=20, n_informative=5, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise with statistics taken from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One unregularised baseline plus the three penalised linear models.
from sklearn.linear_model import LinearRegression

models = {
    'No Regularization': LinearRegression(),
    'L2 (Ridge)': Ridge(alpha=1.0),
    'L1 (Lasso)': Lasso(alpha=0.1),
    'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5)
}

# Fit each model and record its R2 scores and how many coefficients
# exceed 0.01 in magnitude.
results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    row = {
        'Model': name,
        'Train R2': model.score(X_train_scaled, y_train),
        'Test R2': model.score(X_test_scaled, y_test),
        'Non-zero Coefs': np.sum(np.abs(model.coef_) > 0.01)
    }
    results.append(row)

print("Regularization Comparison:")
summary_table = pd.DataFrame(results)
print(summary_table.to_string(index=False))

In [None]:
# Visualize coefficient magnitudes
# A coefficient counts as "non-zero" only above this magnitude. The same mask
# drives the bar colours and the count in each title, so the two always agree.
ZERO_THRESHOLD = 0.01

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (name, model) in zip(axes.flat, models.items()):
    # Every entry of `models` was fitted when the comparison table was built,
    # so the coefficients are read directly instead of refitting.
    coefs = model.coef_
    is_nonzero = np.abs(coefs) > ZERO_THRESHOLD

    colors = ['steelblue' if keep else 'lightgray' for keep in is_nonzero]
    ax.bar(range(len(coefs)), np.abs(coefs), color=colors)
    ax.set_xlabel('Feature Index')
    ax.set_ylabel('|Coefficient|')
    ax.set_title(f'{name} - {np.sum(is_nonzero)} non-zero')

plt.tight_layout()
plt.show()

In [None]:
# Sweep the penalty strength over six orders of magnitude and record
# train/test R2 for Ridge and Lasso at every value.
alphas = np.logspace(-4, 2, 50)
ridge_train, ridge_test = [], []
lasso_train, lasso_test = [], []

for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_scaled, y_train)
    ridge_train.append(ridge.score(X_train_scaled, y_train))
    ridge_test.append(ridge.score(X_test_scaled, y_test))

    # Lasso gets a larger iteration budget than its default.
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X_train_scaled, y_train)
    lasso_train.append(lasso.score(X_train_scaled, y_train))
    lasso_test.append(lasso.score(X_test_scaled, y_test))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Both panels share the same layout; only the curves and the title differ.
sweep_panels = [
    (axes[0], ridge_train, ridge_test, 'Ridge Regularization'),
    (axes[1], lasso_train, lasso_test, 'Lasso Regularization'),
]
for panel, train_curve, test_curve, panel_title in sweep_panels:
    panel.semilogx(alphas, train_curve, 'b-', label='Train', linewidth=2)
    panel.semilogx(alphas, test_curve, 'r-', label='Test', linewidth=2)
    panel.set_xlabel('Alpha (Regularization Strength)')
    panel.set_ylabel('R² Score')
    panel.set_title(panel_title)
    panel.legend()

plt.tight_layout()
plt.show()

### 2.2 Dropout

In [None]:
def apply_dropout(X, dropout_rate=0.5):
    """Apply dropout to activations.

    Each element of ``X`` is kept with probability ``1 - dropout_rate`` and
    zeroed otherwise. Kept elements are divided by ``1 - dropout_rate`` so the
    expected value of every element is unchanged.

    Args:
        X: Array of activations (anything exposing ``.shape`` that supports
            element-wise arithmetic with an ndarray).
        dropout_rate: Probability of dropping an element, in [0, 1].

    Returns:
        An array shaped like ``X`` with the dropout mask applied.

    Raises:
        ValueError: If ``dropout_rate`` is outside [0, 1].
    """
    if not 0 <= dropout_rate <= 1:
        raise ValueError(f"dropout_rate must be in [0, 1], got {dropout_rate}")

    keep_prob = 1 - dropout_rate
    if keep_prob == 0:
        # Everything is dropped; return zeros directly because the general
        # formula would compute 0 / 0 and fill the result with NaN.
        return X * 0.0

    mask = np.random.binomial(1, keep_prob, X.shape)
    # Scale by 1/(1-p) to maintain expected value
    return X * mask / keep_prob

# Show the first row of a random activation matrix before and after a
# 50% dropout mask is applied.
activations = np.random.randn(5, 10)
print("Original activations:")
print(activations[0])

print("\nAfter dropout (50%):")
dropped_activations = apply_dropout(activations, 0.5)
print(dropped_activations[0])

In [None]:
# Train the same MLP twice, once with the L2 penalty switched off (alpha=0.0)
# and once with alpha=0.01, then compare train/test accuracy.
X_clf, y_clf = make_classification(n_samples=500, n_features=20, n_informative=10, random_state=42)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42
)

# Standardise using training statistics only.
scaler_clf = StandardScaler()
X_train_clf_s = scaler_clf.fit_transform(X_train_clf)
X_test_clf_s = scaler_clf.transform(X_test_clf)

# Settings shared by both networks; only alpha differs.
shared_mlp_kwargs = dict(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

mlp_no_reg = MLPClassifier(alpha=0.0, **shared_mlp_kwargs)
mlp_no_reg.fit(X_train_clf_s, y_train_clf)

mlp_with_reg = MLPClassifier(alpha=0.01, **shared_mlp_kwargs)
mlp_with_reg.fit(X_train_clf_s, y_train_clf)

print("MLP Regularization Comparison:")
no_reg_train = mlp_no_reg.score(X_train_clf_s, y_train_clf)
no_reg_test = mlp_no_reg.score(X_test_clf_s, y_test_clf)
print(f"No regularization - Train: {no_reg_train:.4f}, Test: {no_reg_test:.4f}")
with_reg_train = mlp_with_reg.score(X_train_clf_s, y_train_clf)
with_reg_test = mlp_with_reg.score(X_test_clf_s, y_test_clf)
print(f"With L2 (alpha=0.01) - Train: {with_reg_train:.4f}, Test: {with_reg_test:.4f}")

### 2.3 Early Stopping

In [None]:
class EarlyStopping:
    """Signal when the validation loss has stopped improving.

    An epoch counts as an improvement when its loss is at most
    ``best_loss - min_delta``. After ``patience`` consecutive epochs without
    improvement, ``should_stop`` becomes True and stays True.
    """

    def __init__(self, patience=5, min_delta=0.001):
        self.patience = patience      # non-improving epochs tolerated in a row
        self.min_delta = min_delta    # minimum decrease that counts as progress
        self.counter = 0              # current run of non-improving epochs
        self.best_loss = None         # lowest accepted loss so far
        self.should_stop = False

    def __call__(self, val_loss):
        """Record one epoch's validation loss and return the stop flag."""
        # The first observation only establishes the baseline.
        if self.best_loss is None:
            self.best_loss = val_loss
            return self.should_stop

        no_progress = val_loss > self.best_loss - self.min_delta
        if no_progress:
            self.counter += 1
            if self.counter >= self.patience:
                self.should_stop = True
        else:
            # Real improvement: adopt the new best and restart the count.
            self.best_loss = val_loss
            self.counter = 0

        return self.should_stop

# Build synthetic loss curves: both decay hyperbolically with a little
# Gaussian noise, and the validation curve drifts upward after epoch 30.
np.random.seed(42)
train_losses = []
val_losses = []

for epoch in range(100):
    # Draw the training noise first, then the validation noise.
    train_loss = 1.0 / (1 + 0.1 * epoch) + np.random.randn() * 0.02
    val_loss = 1.0 / (1 + 0.08 * epoch) + np.random.randn() * 0.02
    if epoch > 30:
        val_loss += 0.01 * (epoch - 30)  # Overfitting
    train_losses.append(train_loss)
    val_losses.append(val_loss)

# Replay the validation curve through the stopper; the first epoch at which
# it fires is the stop epoch. If it never fires, use the total epoch count.
early_stop = EarlyStopping(patience=5, min_delta=0.001)
stop_epoch = next(
    (epoch for epoch, val_loss in enumerate(val_losses) if early_stop(val_loss)),
    len(val_losses),
)

print(f"Early stopping triggered at epoch {stop_epoch}")

In [None]:
# Visualize early stopping
fig, ax = plt.subplots(figsize=(10, 6))

# Take the epoch count from the data rather than a hard-coded 100.
n_epochs = len(val_losses)

ax.plot(train_losses, 'b-', label='Training Loss', linewidth=2)
ax.plot(val_losses, 'r-', label='Validation Loss', linewidth=2)
ax.axvline(x=stop_epoch, color='green', linestyle='--', label=f'Early Stop (epoch {stop_epoch})', linewidth=2)
# Shade everything after the stop epoch.
ax.fill_between(range(stop_epoch, n_epochs), 0, max(val_losses), alpha=0.2, color='red', label='Overfitting region')

ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Early Stopping Demonstration')
# Build the legend last so that it also lists the shaded region.
ax.legend()

plt.tight_layout()
plt.show()

## 3. Optimization Algorithms

| Optimizer | Description | Best For |
|-----------|-------------|----------|
| **SGD** | Classic gradient descent | Simple, well-understood |
| **Momentum** | SGD with momentum | Faster convergence |
| **Adam** | Adaptive learning rates | Most problems, default choice |
| **RMSprop** | Root mean square propagation | RNNs |
| **AdaGrad** | Adaptive gradients | Sparse features |

In [None]:
class Optimizer:
    """Common interface: hold a learning rate and map (params, grads) to new params."""

    def __init__(self, lr=0.01):
        self.lr = lr

    def step(self, params, grads):
        """Return updated parameters; concrete optimizers override this."""
        raise NotImplementedError


class SGD(Optimizer):
    """Plain gradient descent: move against the gradient, scaled by ``lr``."""

    def step(self, params, grads):
        update = self.lr * grads
        return params - update


class MomentumSGD(Optimizer):
    """Gradient descent with a velocity term that accumulates past updates."""

    def __init__(self, lr=0.01, momentum=0.9):
        super().__init__(lr)
        self.momentum = momentum
        self.velocity = None  # created on the first step, shaped like params

    def step(self, params, grads):
        # The velocity starts at zero the first time step() is called.
        previous = np.zeros_like(params) if self.velocity is None else self.velocity
        self.velocity = self.momentum * previous - self.lr * grads
        return params + self.velocity


class Adam(Optimizer):
    """Adam: bias-corrected running averages of the gradient and its square."""

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        super().__init__(lr)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None  # running average of gradients
        self.v = None  # running average of squared gradients
        self.t = 0     # number of steps taken so far

    def step(self, params, grads):
        # Both moment estimates start at zero on the first call.
        first_call = self.m is None
        m_prev = np.zeros_like(params) if first_call else self.m
        v_prev = np.zeros_like(params) if first_call else self.v

        self.t += 1
        self.m = self.beta1 * m_prev + (1 - self.beta1) * grads
        self.v = self.beta2 * v_prev + (1 - self.beta2) * (grads ** 2)

        # Divide out the bias caused by the zero initialisation.
        m_correction = 1 - self.beta1 ** self.t
        v_correction = 1 - self.beta2 ** self.t
        m_hat = self.m / m_correction
        v_hat = self.v / v_correction

        denominator = np.sqrt(v_hat) + self.epsilon
        return params - self.lr * m_hat / denominator


print("Optimizers defined!")

print("Optimizers defined!")

In [None]:
# Compare optimizers on a simple quadratic function
def quadratic_loss(params):
    """Evaluate f(x, y) = x^2 + 10*y^2 with x = params[0] and y = params[1]."""
    x_term = params[0]**2
    y_term = 10 * params[1]**2
    return x_term + y_term

def quadratic_grad(params):
    """Return [2*x, 20*y], the gradient of x^2 + 10*y^2 at (params[0], params[1])."""
    df_dx = 2 * params[0]
    df_dy = 20 * params[1]
    return np.array([df_dx, df_dy])

# Run every optimizer from the same starting point on the quadratic and
# keep the full sequence of visited points.
optimizers = {
    'SGD': SGD(lr=0.05),
    'Momentum': MomentumSGD(lr=0.05, momentum=0.9),
    'Adam': Adam(lr=0.2)
}

n_steps = 50
start = np.array([5.0, 2.0])
paths = {}

for name, opt in optimizers.items():
    params = start.copy()
    # Store copies so later updates cannot alter recorded points.
    trajectory = [params.copy()]

    for _ in range(n_steps):
        params = opt.step(params, quadratic_grad(params))
        trajectory.append(params.copy())

    # One row per visited point: shape (n_steps + 1, 2).
    paths[name] = np.array(trajectory)

# Overlay each optimizer's trajectory on the contour lines of the loss.
fig, ax = plt.subplots(figsize=(10, 8))

# Evaluate the loss surface on a regular grid for the contour plot.
x = np.linspace(-6, 6, 100)
y = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x, y)
Z = X**2 + 10 * Y**2
ax.contour(X, Y, Z, levels=30, alpha=0.5)

colors = {'SGD': 'blue', 'Momentum': 'red', 'Adam': 'green'}
for name, path in paths.items():
    xs, ys = path[:, 0], path[:, 1]
    ax.plot(xs, ys, 'o-', color=colors[name], label=name, markersize=3, linewidth=1.5)

# Mark the minimiser at the origin.
ax.plot(0, 0, 'k*', markersize=15, label='Optimum')

ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Optimizer Comparison on f(x,y) = x² + 10y²')
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Plot the loss at every visited point, per optimizer, on a log scale.
fig, ax = plt.subplots(figsize=(10, 6))

for name, path in paths.items():
    losses = []
    for point in path:
        losses.append(quadratic_loss(point))
    ax.plot(losses, label=name, linewidth=2)

ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
ax.set_title('Convergence Comparison')
ax.legend()
ax.set_yscale('log')

plt.tight_layout()
plt.show()

## 4. Training Strategies

### 4.1 Learning Rate Scheduling

In [None]:
class LRScheduler:
    """Learning rate schedulers.

    Each static method maps an epoch index to the learning rate to use at
    that epoch, starting from ``initial_lr``.
    """

    @staticmethod
    def step_decay(initial_lr, epoch, drop_rate=0.5, epochs_drop=10):
        """Step decay: reduce LR by factor every N epochs.

        The rate is multiplied by ``drop_rate`` once for every completed
        block of ``epochs_drop`` epochs.
        """
        return initial_lr * (drop_rate ** (epoch // epochs_drop))

    @staticmethod
    def exponential_decay(initial_lr, epoch, decay_rate=0.95):
        """Exponential decay: multiply the rate by ``decay_rate`` every epoch."""
        return initial_lr * (decay_rate ** epoch)

    @staticmethod
    def cosine_annealing(initial_lr, epoch, total_epochs):
        """Cosine annealing.

        Follows half a cosine period from ``initial_lr`` at epoch 0 down to 0
        at ``epoch == total_epochs``.
        """
        return initial_lr * (1 + np.cos(np.pi * epoch / total_epochs)) / 2

    @staticmethod
    def warmup_linear(initial_lr, epoch, warmup_epochs=5, total_epochs=100):
        """Linear warmup followed by linear decay.

        The rate rises linearly from 0 at epoch 0 to ``initial_lr`` at
        ``warmup_epochs``, then falls linearly to 0 at ``total_epochs``.
        For epochs beyond ``total_epochs`` the rate stays at 0 instead of
        becoming negative.
        """
        if epoch < warmup_epochs:
            return initial_lr * epoch / warmup_epochs

        decayed = initial_lr * (1 - (epoch - warmup_epochs) / (total_epochs - warmup_epochs))
        # Past total_epochs the linear formula goes below zero; a negative
        # learning rate would push parameters along the gradient, so clamp.
        return max(0.0, decayed)

# Evaluate every schedule over 100 epochs and plot them on one axis.
epochs = np.arange(100)
initial_lr = 0.1

# Label -> function of the epoch; each uses the scheduler's default
# settings except cosine annealing, which is told the 100-epoch horizon.
schedule_fns = {
    'Step Decay': lambda e: LRScheduler.step_decay(initial_lr, e),
    'Exponential': lambda e: LRScheduler.exponential_decay(initial_lr, e),
    'Cosine Annealing': lambda e: LRScheduler.cosine_annealing(initial_lr, e, 100),
    'Warmup + Linear': lambda e: LRScheduler.warmup_linear(initial_lr, e)
}
schedulers = {label: [fn(e) for e in epochs] for label, fn in schedule_fns.items()}

fig, ax = plt.subplots(figsize=(12, 6))

for name, lrs in schedulers.items():
    ax.plot(epochs, lrs, label=name, linewidth=2)

ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.set_title('Learning Rate Schedulers')
ax.legend()

plt.tight_layout()
plt.show()

### 4.2 Batch Size Selection

In [None]:
# Print a qualitative table of batch size trade-offs. The labels come from
# fixed thresholds on the batch size; nothing is measured here.
batch_sizes = [8, 32, 128, 512]

print("Batch Size Trade-offs:")
print("=" * 60)
print(f"{'Batch Size':<12} {'Gradient Noise':<18} {'Memory':<12} {'Speed':<12}")
print("-" * 60)
for bs in batch_sizes:
    # Gradient noise and speed share the same cut-offs (32 and 128).
    if bs < 32:
        noise, speed = 'High', 'Slow'
    elif bs < 128:
        noise, speed = 'Medium', 'Medium'
    else:
        noise, speed = 'Low', 'Fast'

    # Memory uses its own cut-offs (64 and 256).
    if bs < 64:
        memory = 'Low'
    elif bs < 256:
        memory = 'Medium'
    else:
        memory = 'High'

    print(f"{bs:<12} {noise:<18} {memory:<12} {speed:<12}")

In [None]:
# Measure how the number of training epochs (max_iter) affects SGDClassifier
# accuracy and fit time. Note: this varies the epoch budget, not the batch size.
X, y = make_classification(n_samples=5000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise using training statistics only.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

import time

results = []
for n_iter in [10, 50, 100, 200]:
    # tol=None turns off the tolerance-based stopping rule, so the epoch
    # budget given by max_iter is what actually limits training.
    clf = SGDClassifier(max_iter=n_iter, tol=None, random_state=42)

    # perf_counter is monotonic and high-resolution, which makes it the right
    # clock for timing an interval (time.time can jump with the wall clock).
    fit_started = time.perf_counter()
    clf.fit(X_train_s, y_train)
    train_time = time.perf_counter() - fit_started

    train_acc = clf.score(X_train_s, y_train)
    test_acc = clf.score(X_test_s, y_test)

    results.append({
        'Iterations': n_iter,
        'Train Acc': train_acc,
        'Test Acc': test_acc,
        'Time (s)': train_time
    })

print("\nTraining Iterations Effect:")
print(pd.DataFrame(results).to_string(index=False))

### 4.3 Gradient Clipping

In [None]:
def clip_gradients(grads, max_norm=1.0):
    """Rescale a list of gradient arrays so their joint L2 norm is about max_norm.

    The norm is taken over all arrays together. When it exceeds ``max_norm``
    every array is multiplied by ``max_norm / (norm + 1e-6)``.

    Returns:
        (gradients, total_norm, was_clipped): the possibly rescaled list (the
        input object itself when no clipping happened), the norm measured
        before clipping, and whether clipping was applied.
    """
    squared_sums = (np.sum(g**2) for g in grads)
    total_norm = np.sqrt(sum(squared_sums))
    # The small constant keeps the division defined for an all-zero gradient.
    clip_coef = max_norm / (total_norm + 1e-6)

    if not clip_coef < 1:
        return grads, total_norm, False

    scaled = [g * clip_coef for g in grads]
    return scaled, total_norm, True

# Clip two gradient arrays to a joint norm of 5 and report the norm
# before and after.
gradients = [np.array([3.0, 4.0]), np.array([5.0, 12.0])]

clipped, norm, was_clipped = clip_gradients(gradients, max_norm=5.0)

print("Gradient Clipping Demo:")
print(f"Original gradient norm: {norm:.4f}")
print(f"Was clipped: {was_clipped}")
if was_clipped:
    # Recompute the joint norm from the rescaled arrays.
    squared_total = sum(np.sum(g**2) for g in clipped)
    new_norm = np.sqrt(squared_total)
    print(f"Clipped gradient norm: {new_norm:.4f}")

## 5. Training from Scratch vs Fine-tuning

| Approach | When to Use | Advantages |
|----------|-------------|------------|
| **From Scratch** | Unique domain, lots of data | Full control, no bias inherited from a pre-trained model |
| **Fine-tuning** | Limited data, similar domain | Faster, better performance |

In [None]:
# Build the two datasets for the transfer-learning simulation. They share
# the feature layout and differ in size and random seed.
task_layout = dict(n_features=20, n_informative=15)

# Source task: larger dataset
X_source, y_source = make_classification(n_samples=5000, random_state=42, **task_layout)

# Target task: smaller dataset
X_target, y_target = make_classification(n_samples=200, random_state=43, **task_layout)

X_train_target, X_test_target, y_train_target, y_test_target = train_test_split(
    X_target, y_target, test_size=0.3, random_state=42
)

# The scaler is fitted on the source data and reused for both target splits.
scaler = StandardScaler()
X_source_s = scaler.fit_transform(X_source)
X_train_target_s = scaler.transform(X_train_target)
X_test_target_s = scaler.transform(X_test_target)

print(f"Source dataset: {len(X_source)} samples")
print(f"Target dataset: {len(X_target)} samples")

In [None]:
# Compare training on the target data alone with pre-training on the source
# data and then fine-tuning on the target data.
import copy

# Number of additional passes over the target data during fine-tuning.
FINE_TUNE_EPOCHS = 100

# Approach 1: Train from scratch on target only
model_scratch = MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=500, random_state=42)
model_scratch.fit(X_train_target_s, y_train_target)
scratch_score = model_scratch.score(X_test_target_s, y_test_target)

# Approach 2: Pre-train on source, fine-tune on target
model_pretrain = MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=500, random_state=42)
model_pretrain.fit(X_source_s, y_source)  # Pre-train on source
pretrain_score_before = model_pretrain.score(X_test_target_s, y_test_target)

# Fine-tune a deep copy of the pre-trained network. Assigning only coefs_ and
# intercepts_ to a fresh estimator does not carry over the rest of the fitted
# state and shares the weight arrays with model_pretrain; a deep copy carries
# everything and leaves the pre-trained model untouched.
model_finetune = copy.deepcopy(model_pretrain)
for _ in range(FINE_TUNE_EPOCHS):
    # Each partial_fit call continues from the current weights and performs
    # one more iteration over the given data.
    model_finetune.partial_fit(X_train_target_s, y_train_target)
finetune_score = model_finetune.score(X_test_target_s, y_test_target)

print("Transfer Learning Comparison:")
print(f"  From Scratch (target only): {scratch_score:.4f}")
print(f"  Pre-trained (before fine-tune): {pretrain_score_before:.4f}")
print(f"  Fine-tuned: {finetune_score:.4f}")

In [None]:
# Analyze effect of target dataset size
import copy

target_sizes = [50, 100, 200, 500, 1000]
scratch_scores = []
finetune_scores = []

# Number of additional passes over the target data during fine-tuning.
FINE_TUNE_EPOCHS = 100

# X_target only holds 200 rows, so slicing it cannot produce the 500- and
# 1000-row settings. Draw a target pool large enough for the biggest size,
# using the same generator settings and seed as the target task.
X_pool, y_pool = make_classification(
    n_samples=max(target_sizes), n_features=20, n_informative=15, random_state=43
)

for size in target_sizes:
    X_t, y_t = X_pool[:size], y_pool[:size]
    X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_t, y_t, test_size=0.3, random_state=42)

    X_train_t_s = scaler.transform(X_train_t)
    X_test_t_s = scaler.transform(X_test_t)

    # From scratch
    m1 = MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=500, random_state=42)
    m1.fit(X_train_t_s, y_train_t)
    scratch_scores.append(m1.score(X_test_t_s, y_test_t))

    # Pre-trained: continue training a deep copy of the source model. A fresh
    # estimator fitted without warm_start re-initialises its weights, which
    # would make this a second from-scratch run rather than fine-tuning.
    m2 = copy.deepcopy(model_pretrain)
    for _ in range(FINE_TUNE_EPOCHS):
        m2.partial_fit(X_train_t_s, y_train_t)
    finetune_scores.append(m2.score(X_test_t_s, y_test_t))

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(target_sizes, scratch_scores, 'b-o', label='From Scratch', linewidth=2, markersize=8)
ax.plot(target_sizes, finetune_scores, 'r-o', label='Fine-tuned', linewidth=2, markersize=8)
ax.set_xlabel('Target Dataset Size')
ax.set_ylabel('Test Accuracy')
ax.set_title('Transfer Learning Benefit vs Dataset Size')
ax.legend()
# Shade the gap between the two curves.
ax.fill_between(target_sizes, scratch_scores, finetune_scores, alpha=0.2, color='green')
plt.tight_layout()
plt.show()

## 6. Hands-on Exercise

In [None]:
# Exercise: load the digits dataset and carve it into train / validation /
# test splits, stratified by label at both stages.
print("Exercise: Build a Complete Training Pipeline")
print("=" * 60)

digits = load_digits()
X, y = digits.data, digits.target

# The same split settings are applied twice: first to hold out the test set,
# then to hold out a validation set from what remains.
split_settings = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **split_settings)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, **split_settings)

# Fit the scaler on the training split only, then apply it everywhere.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

In [None]:
# Step 1: fit a logistic regression as the reference point and report its
# accuracy on the test split.
baseline = LogisticRegression(max_iter=1000, random_state=42)
baseline.fit(X_train_s, y_train)
baseline_test_acc = baseline.score(X_test_s, y_test)
print(f"Step 1 - Baseline (Logistic Regression): {baseline_test_acc:.4f}")

In [None]:
# Step 2: try several L2 strengths for the MLP and keep the one with the
# highest validation accuracy (the first one wins on ties).
reg_strengths = [0.001, 0.01, 0.1, 1.0]
best_alpha = 0
best_score = 0

for alpha in reg_strengths:
    model = MLPClassifier(hidden_layer_sizes=(100, 50), alpha=alpha, max_iter=200, random_state=42)
    model.fit(X_train_s, y_train)
    val_score = model.score(X_val_s, y_val)
    if val_score > best_score:
        best_alpha, best_score = alpha, val_score
    print(f"  Alpha={alpha}: Val Acc={val_score:.4f}")

print(f"\nStep 2 - Best regularization: alpha={best_alpha}")

In [None]:
# Step 3: retrain with the selected alpha and the estimator's built-in
# early stopping enabled.
final_model_settings = dict(
    hidden_layer_sizes=(100, 50),
    alpha=best_alpha,
    max_iter=500,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
    random_state=42,
)
final_model = MLPClassifier(**final_model_settings)

final_model.fit(X_train_s, y_train)
print(f"Step 3 - Final model stopped at {final_model.n_iter_} iterations")
final_val_acc = final_model.score(X_val_s, y_val)
print(f"  Validation Accuracy: {final_val_acc:.4f}")

In [None]:
# Step 4: score the final model on the test split and report the gap to
# the logistic-regression baseline, multiplied by 100.
test_acc = final_model.score(X_test_s, y_test)
print(f"\nStep 4 - Final Test Accuracy: {test_acc:.4f}")
baseline_acc = baseline.score(X_test_s, y_test)
print(f"Improvement over baseline: {(test_acc - baseline_acc)*100:.2f}%")

In [None]:
# Plot the loss values recorded by the final model during fitting.
training_curve = final_model.loss_curve_

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(training_curve, 'b-', linewidth=2)
ax.set_title('Training Loss Curve')
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
plt.tight_layout()
plt.show()

## 7. Summary

### Key Takeaways

1. **Loss Functions**: Choose based on task type and data characteristics
2. **Regularization**: Prevents overfitting - use L1 for sparsity, L2 for smoothness
3. **Optimizers**: Adam is a good default, but SGD with momentum also works well for large-scale training
4. **Learning Rate**: Use scheduling for better convergence
5. **Transfer Learning**: Highly effective when target data is limited

### Quick Reference

| Scenario | Recommendation |
|----------|----------------|
| Classification | Cross-entropy loss, Adam optimizer |
| Regression with outliers | Huber loss or MAE |
| Overfitting | Increase regularization, early stopping |
| Slow convergence | Tune the learning rate (raise it if it is too small, or use a schedule), use momentum or Adam |
| Limited data | Transfer learning with fine-tuning |