In [8]:
"""scout_opt_cifar10.py

Quick‑and‑dirty *large(-ish)* demo of the scout–guided optimizer on CIFAR‑10.

• Dataset  : CIFAR‑10 (50 k train / 10 k test, 32×32 colour)
• Model    : lightweight CNN (~0.6 M params)
• Baseline : Adam
• ScoutOpt : wraps Adam, periodically probes 2 directions
             (current grad + one random orthogonal) with
             a 1‑step look‑ahead on the same mini‑batch.

Run from a Jupyter cell (accepts the stray -f arg) or from CLI:
    python scout_opt_cifar10.py --gpu  
-------------------------------------------------------"""

import argparse, time, pathlib, random, sys
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# ------------------------------------------------------------------ #
#  1. A small-ish CNN                                                #
# ------------------------------------------------------------------ #
class SmallCNN(nn.Module):
    def __init__(self, n_classes: int = 10):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, 3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, 3, padding=1)
        self.pool  = nn.MaxPool2d(2, 2)
        self.fc1   = nn.Linear(256 * 4 * 4, 256)
        self.fc2   = nn.Linear(256, n_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # 32→16
        x = self.pool(F.relu(self.conv2(x)))  # 16→8
        x = self.pool(F.relu(self.conv3(x)))  # 8 →4
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# ------------------------------------------------------------------ #
#  2. Scout‑guided Optimizer                                         #
# ------------------------------------------------------------------ #
import torch
import torch.nn as nn
from typing import List

class ModifiedScoutOptimizer(torch.optim.Optimizer):
    """
    Extended Scout Optimizer with multi-step probes along ±orthogonal directions.
    Now supports walking 'scout_steps' forward in each direction before deciding.
    """

    def __init__(
        self,
        params,
        base_opt_cls,
        *,
        lr=3e-4,
        scout_every=100,
        scout_radius=5e-4,
        scout_steps=100,
        **base_kwargs
    ):
        param_list = list(params)
        self.base = base_opt_cls(param_list, lr=lr, **base_kwargs)
        super().__init__(param_list, defaults=dict(lr=lr))
        self.s_every = scout_every
        self.radius = scout_radius
        self.steps = scout_steps
        self._step_id = 0
        self._params = param_list

    @staticmethod
    @torch.no_grad()
    def _flatten(tensors: List[torch.Tensor]):
        return torch.cat([t.reshape(-1) for t in tensors])

    @torch.no_grad()
    def _apply_delta(self, vec_flat: torch.Tensor, coef: float):
        offset = 0
        for p in self._params:
            n = p.numel()
            p.add_(vec_flat[offset:offset+n].view_as(p), alpha=coef)
            offset += n

    @staticmethod
    @torch.no_grad()
    def _rand_orthogonal(base: torch.Tensor):
        r = torch.randn_like(base)
        proj = (r @ base) / (base @ base + 1e-12)
        ortho = r - proj * base
        return ortho / (ortho.norm() + 1e-12)

    @torch.no_grad()
    def _track_direction(self, direction: torch.Tensor, closure, max_steps: int):
        best_loss = float('inf')
        best_step = 0

        for i in range(1, max_steps + 1):
            self._apply_delta(direction, self.radius)
            loss = closure()
            if loss < best_loss:
                best_loss = loss
                best_step = i

        self._apply_delta(direction, -self.radius * best_step)
        return best_loss, direction, best_step

    def step(self, closure=None):
        if closure is None:
            raise RuntimeError("ModifiedScoutOptimizer needs a closure that returns loss and does backward()")

        loss = closure()
        self._step_id += 1

        if self._step_id % self.s_every == 0:
            grad_flat = self._flatten([p.grad for p in self._params])
            grad_norm = grad_flat.norm() + 1e-12
            grad_dir = grad_flat / grad_norm
            ortho_dir = self._rand_orthogonal(grad_dir)

            directions = [
                grad_dir,
                ortho_dir,
                -ortho_dir
            ]

            best_loss = loss.item()
            best_dir = None
            best_scale = 0

            for d in directions:
                tracked_loss, d_dir, d_steps = self._track_direction(d, closure, self.steps)
                if tracked_loss < best_loss:
                    best_loss = tracked_loss
                    best_dir = d_dir
                    best_scale = d_steps

            if best_dir is not None:
                new_grad = best_dir * grad_norm
                offset = 0
                for p in self._params:
                    n = p.numel()
                    p.grad.copy_(new_grad[offset:offset+n].view_as(p))
                    offset += n

        self.base.step()
        return loss


# ------------------------------------------------------------------ #
#  3. Training / evaluation loops                                    #
# ------------------------------------------------------------------ #
@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    tot_loss, correct, total = 0.0, 0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        out = model(x)
        tot_loss += criterion(out, y).item() * x.size(0)
        pred = out.argmax(1)
        correct += (pred == y).sum().item()
        total += y.size(0)
    return tot_loss / total, correct / total


def train_one_epoch(model, loader, criterion, opt, device, log_every=100):
    model.train()
    running = 0.0
    for i, (x, y) in enumerate(loader):
        x, y = x.to(device), y.to(device)
    def closure():
        opt.base.zero_grad()
        # Clone the inputs to keep computation graph fresh for each probe
        x_cloned = x.detach().clone().requires_grad_(True)
        y_cloned = y.detach().clone()
        out = model(x_cloned)
        loss = criterion(out, y_cloned)
        loss.backward(retain_graph=True)
        return loss


        loss = opt.step(closure)
        running += loss.item()
        if (i + 1) % log_every == 0:
            print(f"  batch {i+1}/{len(loader)}  loss {running/log_every:.3f}")
            running = 0.0
# ------------------------------------------------------------------ #
#  4. Baseline Adam Check                                                     #
# ------------------------------------------------------------------ #


def run_baseline_adam(epochs=6, batch=128, lr=3e-4, results=None):
    print("\nBaseline: Adam")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])
    root = str(pathlib.Path.home() / ".torchdata")
    train_set = datasets.CIFAR10(root=root, train=True,  download=True, transform=transform)
    test_set  = datasets.CIFAR10(root=root, train=False, download=True, transform=transform)
    train_loader = DataLoader(train_set, batch_size=batch, shuffle=True,  num_workers=2, pin_memory=True)
    test_loader  = DataLoader(test_set,  batch_size=256,   shuffle=False, num_workers=2, pin_memory=True)

    model = SmallCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=1e-4)

    for ep in range(1, epochs+1):
        print(f"\nEpoch {ep}/{epochs}")
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()

        val_loss, val_acc = evaluate(model, test_loader, criterion, device)
        print(f"  val_loss={val_loss:.3f}  val_acc={val_acc:.3f}")
        if results is not None:
            results['adam_loss'].append(val_loss)
            results['adam_acc'].append(val_acc)

# ------------------------------------------------------------------ #
#  5. Main                                                           #
# ------------------------------------------------------------------ #


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch',  type=int, default=128)
    parser.add_argument('--lr',     type=float, default=3e-4)
    parser.add_argument('--scout_every',  type=int, default=100)
    parser.add_argument('--scout_radius', type=float, default=5e-4)
    parser.add_argument('--gpu', action='store_true')
    args, _ = parser.parse_known_args()

    device = torch.device('cuda' if args.gpu and torch.cuda.is_available() else 'cpu')
    print('Device:', device)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])

    root = str(pathlib.Path.home() / ".torchdata")
    train_set = datasets.CIFAR10(root=root, train=True,  download=True, transform=transform)
    test_set  = datasets.CIFAR10(root=root, train=False, download=True, transform=transform)

    train_loader = DataLoader(train_set, batch_size=args.batch, shuffle=True,  num_workers=2, pin_memory=True)
    test_loader  = DataLoader(test_set,  batch_size=256,       shuffle=False, num_workers=2, pin_memory=True)

    model = SmallCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    opt = ModifiedScoutOptimizer(
        model.parameters(), optim.Adam,
        lr=args.lr,
        scout_every=args.scout_every,
        scout_radius=args.scout_radius,
        scout_steps=50,
        betas=(0.9, 0.999),
        weight_decay=1e-4
    )


    results = {'scout_loss': [], 'scout_acc': [], 'adam_loss': [], 'adam_acc': []}

    for ep in range(1, args.epochs+1):
        print(f"\nEpoch {ep}/{args.epochs}")
        start = time.time()
        train_one_epoch(model, train_loader, criterion, opt, device)
        val_loss, val_acc = evaluate(model, test_loader, criterion, device)
        print(f"  val_loss={val_loss:.3f}  val_acc={val_acc:.3f}  ({time.time() - start:.1f}s)")
        results['scout_loss'].append(val_loss)
        results['scout_acc'].append(val_acc)

    torch.save(model.state_dict(), "scout_cnn.pth")
    print("Model saved to scout_cnn.pth")

    run_baseline_adam(epochs=args.epochs, batch=args.batch, lr=args.lr, results=results)

    print("\nFinal comparison table:")
    for i in range(args.epochs):
        print(f"Epoch {i+1:>2}:  Scout acc = {results['scout_acc'][i]*100:.2f}%  |  Adam acc = {results['adam_acc'][i]*100:.2f}%")

    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plt.plot(results['scout_loss'], label='Scout Loss')
    plt.plot(results['adam_loss'],  label='Adam Loss')
    plt.legend()
    plt.title("Validation Loss")

    plt.subplot(1, 2, 2)
    plt.plot(results['scout_acc'], label='Scout Acc')
    plt.plot(results['adam_acc'],  label='Adam Acc')
    plt.legend()
    plt.title("Validation Accuracy")
    plt.tight_layout()
    plt.show()

if __name__ == '__main__':
    if sys.argv[0].endswith("ipykernel_launcher.py"):
        sys.argv = ['scout_opt_cifar10.py']
    main()


Device: cpu
Files already downloaded and verified
Files already downloaded and verified

Epoch 1/50
  val_loss=2.305  val_acc=0.109  (28.8s)

Epoch 2/50
  val_loss=2.305  val_acc=0.109  (29.1s)

Epoch 3/50


KeyboardInterrupt: 

In [9]:
"""scout_opt_cifar10.py

Quick‑and‑dirty *large(-ish)* demo of the scout–guided optimizer on CIFAR‑10.

• Dataset  : CIFAR‑10 (50 k train / 10 k test, 32×32 colour)
• Model    : lightweight CNN (~0.6 M params)
• Baseline : Adam
• ScoutOpt : wraps Adam, periodically probes 2 directions
             (current grad + one random orthogonal) with
             a 1‑step look‑ahead on the same mini‑batch.

Run from a Jupyter cell (accepts the stray -f arg) or from CLI:
    python scout_opt_cifar10.py --gpu  
-------------------------------------------------------"""

import argparse, time, pathlib, random, sys
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# ------------------------------------------------------------------ #
#  1. A small-ish CNN                                                #
# ------------------------------------------------------------------ #
class SmallCNN(nn.Module):
    def __init__(self, n_classes: int = 10):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, 3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, 3, padding=1)
        self.pool  = nn.MaxPool2d(2, 2)
        self.fc1   = nn.Linear(256 * 4 * 4, 256)
        self.fc2   = nn.Linear(256, n_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # 32→16
        x = self.pool(F.relu(self.conv2(x)))  # 16→8
        x = self.pool(F.relu(self.conv3(x)))  # 8 →4
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# ------------------------------------------------------------------ #
#  2. Scout‑guided Optimizer                                         #
# ------------------------------------------------------------------ #
import torch
import torch.nn as nn
from typing import List

class ModifiedScoutOptimizer(torch.optim.Optimizer):
    """
    Extended Scout Optimizer with multi-step probes along ±orthogonal directions.
    Now supports walking 'scout_steps' forward in each direction before deciding.
    """

    def __init__(
        self,
        params,
        base_opt_cls,
        *,
        lr=3e-4,
        scout_every=100,
        scout_radius=5e-4,
        scout_steps=100,
        **base_kwargs
    ):
        param_list = list(params)
        self.base = base_opt_cls(param_list, lr=lr, **base_kwargs)
        super().__init__(param_list, defaults=dict(lr=lr))
        self.s_every = scout_every
        self.radius = scout_radius
        self.steps = scout_steps
        self._step_id = 0
        self._params = param_list

    @staticmethod
    @torch.no_grad()
    def _flatten(tensors: List[torch.Tensor]):
        return torch.cat([t.reshape(-1) for t in tensors])

    @torch.no_grad()
    def _apply_delta(self, vec_flat: torch.Tensor, coef: float):
        offset = 0
        for p in self._params:
            n = p.numel()
            p.add_(vec_flat[offset:offset+n].view_as(p), alpha=coef)
            offset += n

    @staticmethod
    @torch.no_grad()
    def _rand_orthogonal(base: torch.Tensor):
        r = torch.randn_like(base)
        proj = (r @ base) / (base @ base + 1e-12)
        ortho = r - proj * base
        return ortho / (ortho.norm() + 1e-12)

    @torch.no_grad()
    def _track_direction(self, direction: torch.Tensor, closure, max_steps: int):
        best_loss = float('inf')
        best_step = 0

        for i in range(1, max_steps + 1):
            self._apply_delta(direction, self.radius)
            loss = closure()
            if loss < best_loss:
                best_loss = loss
                best_step = i

        self._apply_delta(direction, -self.radius * best_step)
        return best_loss, direction, best_step

    def step(self, closure=None):
        if closure is None:
            raise RuntimeError("ModifiedScoutOptimizer needs a closure that returns loss and does backward()")

        loss = closure()
        self._step_id += 1

        if self._step_id % self.s_every == 0:
            grad_flat = self._flatten([p.grad for p in self._params])
            grad_norm = grad_flat.norm() + 1e-12
            grad_dir = grad_flat / grad_norm
            ortho_dir = self._rand_orthogonal(grad_dir)

            directions = [
                grad_dir,
                ortho_dir,
                -ortho_dir
            ]

            best_loss = loss.item()
            best_dir = None
            best_scale = 0

            for d in directions:
                tracked_loss, d_dir, d_steps = self._track_direction(d, closure, self.steps)
                if tracked_loss < best_loss:
                    best_loss = tracked_loss
                    best_dir = d_dir
                    best_scale = d_steps

            if best_dir is not None:
                new_grad = best_dir * grad_norm
                offset = 0
                for p in self._params:
                    n = p.numel()
                    p.grad.copy_(new_grad[offset:offset+n].view_as(p))
                    offset += n

        self.base.step()
        return loss


# ------------------------------------------------------------------ #
#  3. Training / evaluation loops                                    #
# ------------------------------------------------------------------ #
@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    tot_loss, correct, total = 0.0, 0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        out = model(x)
        tot_loss += criterion(out, y).item() * x.size(0)
        pred = out.argmax(1)
        correct += (pred == y).sum().item()
        total += y.size(0)
    return tot_loss / total, correct / total


def train_one_epoch(model, loader, criterion, opt, device, log_every=100):
    model.train()
    running = 0.0
    for i, (x, y) in enumerate(loader):
        x, y = x.to(device), y.to(device)

        def closure():
            opt.base.zero_grad()
            out = model(x)
            loss = criterion(out, y)
            loss.backward()
            return loss

        loss = opt.step(closure)
        running += loss.item()
        if (i + 1) % log_every == 0:
            print(f"  batch {i+1}/{len(loader)}  loss {running/log_every:.3f}")
            running = 0.0

# ------------------------------------------------------------------ #
#  4. Baseline Adam Check                                            #
# ------------------------------------------------------------------ #

def run_baseline_adam(epochs=6, batch=128, lr=3e-4, results=None):
    print("\nBaseline: Adam")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])
    root = str(pathlib.Path.home() / ".torchdata")
    train_set = datasets.CIFAR10(root=root, train=True,  download=True, transform=transform)
    test_set  = datasets.CIFAR10(root=root, train=False, download=True, transform=transform)
    train_loader = DataLoader(train_set, batch_size=batch, shuffle=True,  num_workers=2, pin_memory=True)
    test_loader  = DataLoader(test_set,  batch_size=256,   shuffle=False, num_workers=2, pin_memory=True)

    model = SmallCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=1e-4)

    for ep in range(1, epochs+1):
        print(f"\nEpoch {ep}/{epochs}")
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()

        val_loss, val_acc = evaluate(model, test_loader, criterion, device)
        print(f"  val_loss={val_loss:.3f}  val_acc={val_acc:.3f}")
        if results is not None:
            results['adam_loss'].append(val_loss)
            results['adam_acc'].append(val_acc)

# ------------------------------------------------------------------ #
#  5. Main                                                           #
# ------------------------------------------------------------------ #

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch',  type=int, default=128)
    parser.add_argument('--lr',     type=float, default=3e-4)
    parser.add_argument('--scout_every',  type=int, default=100)
    parser.add_argument('--scout_radius', type=float, default=5e-4)
    parser.add_argument('--gpu', action='store_true')
    args, _ = parser.parse_known_args()

    args.gpu = True  # Force GPU usage in Jupyter
    device = torch.device('cuda' if args.gpu and torch.cuda.is_available() else 'cpu')
    print('Using CUDA:', torch.cuda.is_available(), 'Device:', device)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])

    root = str(pathlib.Path.home() / ".torchdata")
    train_set = datasets.CIFAR10(root=root, train=True,  download=True, transform=transform)
    test_set  = datasets.CIFAR10(root=root, train=False, download=True, transform=transform)

    train_loader = DataLoader(train_set, batch_size=args.batch, shuffle=True,  num_workers=2, pin_memory=True)
    test_loader  = DataLoader(test_set,  batch_size=256,       shuffle=False, num_workers=2, pin_memory=True)

    model = SmallCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    opt = ModifiedScoutOptimizer(
        model.parameters(), optim.Adam,
        lr=args.lr,
        scout_every=args.scout_every,
        scout_radius=args.scout_radius,
        scout_steps=50,
        betas=(0.9, 0.999),
        weight_decay=1e-4
    )

    results = {'scout_loss': [], 'scout_acc': [], 'adam_loss': [], 'adam_acc': []}

    for ep in range(1, args.epochs+1):
        print(f"\nEpoch {ep}/{args.epochs}")
        start = time.time()
        train_one_epoch(model, train_loader, criterion, opt, device)
        val_loss, val_acc = evaluate(model, test_loader, criterion, device)
        print(f"  val_loss={val_loss:.3f}  val_acc={val_acc:.3f}  ({time.time() - start:.1f}s)")
        results['scout_loss'].append(val_loss)
        results['scout_acc'].append(val_acc)

    torch.save(model.state_dict(), "scout_cnn.pth")
    print("Model saved to scout_cnn.pth")

    run_baseline_adam(epochs=args.epochs, batch=args.batch, lr=args.lr, results=results)

    print("\nFinal comparison table:")
    for i in range(args.epochs):
        print(f"Epoch {i+1:>2}:  Scout acc = {results['scout_acc'][i]*100:.2f}%  |  Adam acc = {results['adam_acc'][i]*100:.2f}%")

    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plt.plot(results['scout_loss'], label='Scout Loss')
    plt.plot(results['adam_loss'],  label='Adam Loss')
    plt.legend()
    plt.title("Validation Loss")

    plt.subplot(1, 2, 2)
    plt.plot(results['scout_acc'], label='Scout Acc')
    plt.plot(results['adam_acc'],  label='Adam Acc')
    plt.legend()
    plt.title("Validation Accuracy")
    plt.tight_layout()
    plt.show()

if __name__ == '__main__':
    if 'ipykernel' in sys.argv[0]:
        sys.argv = ['scout_opt_cifar10.py', '--gpu']
    main()

Using CUDA: True Device: cuda
Files already downloaded and verified
Files already downloaded and verified

Epoch 1/50


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [10]:
"""scout_opt_cifar10.py

Quick‑and‑dirty *large(-ish)* demo of the scout–guided optimizer on CIFAR‑10.

• Dataset  : CIFAR‑10 (50 k train / 10 k test, 32×32 colour)
• Model    : lightweight CNN (~0.6 M params)
• Baseline : Adam
• ScoutOpt : wraps Adam, periodically probes 2 directions
             (current grad + one random orthogonal) with
             a 1‑step look‑ahead on the same mini‑batch.

Run from a Jupyter cell (accepts the stray -f arg) or from CLI:
    python scout_opt_cifar10.py --gpu  
-------------------------------------------------------"""

import argparse, time, pathlib, random, sys
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# ------------------------------------------------------------------ #
#  1. A small-ish CNN                                                #
# ------------------------------------------------------------------ #
class SmallCNN(nn.Module):
    def __init__(self, n_classes: int = 10):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, 3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, 3, padding=1)
        self.pool  = nn.MaxPool2d(2, 2)
        self.fc1   = nn.Linear(256 * 4 * 4, 256)
        self.fc2   = nn.Linear(256, n_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # 32→16
        x = self.pool(F.relu(self.conv2(x)))  # 16→8
        x = self.pool(F.relu(self.conv3(x)))  # 8 →4
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# ------------------------------------------------------------------ #
#  2. Scout‑guided Optimizer                                         #
# ------------------------------------------------------------------ #
import torch
import torch.nn as nn
from typing import List

class ModifiedScoutOptimizer(torch.optim.Optimizer):
    """
    Extended Scout Optimizer with multi-step probes along ±orthogonal directions.
    Now supports walking 'scout_steps' forward in each direction before deciding.
    """

    def __init__(
        self,
        params,
        base_opt_cls,
        *,
        lr=3e-4,
        scout_every=100,
        scout_radius=5e-4,
        scout_steps=100,
        **base_kwargs
    ):
        param_list = list(params)
        self.base = base_opt_cls(param_list, lr=lr, **base_kwargs)
        super().__init__(param_list, defaults=dict(lr=lr))
        self.s_every = scout_every
        self.radius = scout_radius
        self.steps = scout_steps
        self._step_id = 0
        self._params = param_list

    @staticmethod
    @torch.no_grad()
    def _flatten(tensors: List[torch.Tensor]):
        return torch.cat([t.reshape(-1) for t in tensors])

    @torch.no_grad()
    def _apply_delta(self, vec_flat: torch.Tensor, coef: float):
        offset = 0
        for p in self._params:
            n = p.numel()
            p.add_(vec_flat[offset:offset+n].view_as(p), alpha=coef)
            offset += n

    @staticmethod
    @torch.no_grad()
    def _rand_orthogonal(base: torch.Tensor):
        r = torch.randn_like(base)
        proj = (r @ base) / (base @ base + 1e-12)
        ortho = r - proj * base
        return ortho / (ortho.norm() + 1e-12)

    @torch.no_grad()
    def _track_direction(self, direction: torch.Tensor, closure, max_steps: int):
        best_loss = float('inf')
        best_step = 0

        for i in range(1, max_steps + 1):
            self._apply_delta(direction, self.radius)
            loss = closure()
            if loss < best_loss:
                best_loss = loss
                best_step = i

        self._apply_delta(direction, -self.radius * best_step)
        return best_loss, direction, best_step

    def step(self, closure=None):
        if closure is None:
            raise RuntimeError("ModifiedScoutOptimizer needs a closure that returns loss and does backward()")

        loss = closure()
        self._step_id += 1

        if self._step_id % self.s_every == 0:
            grad_flat = self._flatten([p.grad for p in self._params])
            grad_norm = grad_flat.norm() + 1e-12
            grad_dir = grad_flat / grad_norm
            ortho_dir = self._rand_orthogonal(grad_dir)

            directions = [
                grad_dir,
                ortho_dir,
                -ortho_dir
            ]

            best_loss = loss.item()
            best_dir = None
            best_scale = 0

            for d in directions:
                tracked_loss, d_dir, d_steps = self._track_direction(d, closure, self.steps)
                if tracked_loss < best_loss:
                    best_loss = tracked_loss
                    best_dir = d_dir
                    best_scale = d_steps

            if best_dir is not None:
                new_grad = best_dir * grad_norm
                offset = 0
                for p in self._params:
                    n = p.numel()
                    p.grad.copy_(new_grad[offset:offset+n].view_as(p))
                    offset += n

        self.base.step()
        return loss


# ------------------------------------------------------------------ #
#  3. Training / evaluation loops                                    #
# ------------------------------------------------------------------ #
@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    tot_loss, correct, total = 0.0, 0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        out = model(x)
        tot_loss += criterion(out, y).item() * x.size(0)
        pred = out.argmax(1)
        correct += (pred == y).sum().item()
        total += y.size(0)
    return tot_loss / total, correct / total


def train_one_epoch(model, loader, criterion, opt, device, log_every=100):
    model.train()
    running = 0.0
    for i, (x, y) in enumerate(loader):
        x, y = x.to(device), y.to(device)

        # Move x and y to new scope for each closure evaluation
        def closure():
            opt.base.zero_grad()
            out = model(x)
            loss = criterion(out, y)
            if not loss.requires_grad:
                raise RuntimeError("Loss does not require grad. Check computation graph.")
            loss.backward()
            return loss

        loss = opt.step(closure)

        running += loss.item()
        if (i + 1) % log_every == 0:
            print(f"  batch {i+1}/{len(loader)}  loss {running/log_every:.3f}")
            running = 0.0

        # Debug: ensure gradient exists
        for name, param in model.named_parameters():
            if param.grad is None:
                print(f"WARNING: No gradient for {name}")
            else:
                print(f"Gradient norm for {name}: {param.grad.norm().item():.4e}")

def run_baseline_adam(epochs=6, batch=128, lr=3e-4, results=None):
    print("\nBaseline: Adam")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])
    root = str(pathlib.Path.home() / ".torchdata")
    train_set = datasets.CIFAR10(root=root, train=True,  download=True, transform=transform)
    test_set  = datasets.CIFAR10(root=root, train=False, download=True, transform=transform)
    train_loader = DataLoader(train_set, batch_size=batch, shuffle=True,  num_workers=2, pin_memory=True)
    test_loader  = DataLoader(test_set,  batch_size=256,   shuffle=False, num_workers=2, pin_memory=True)

    model = SmallCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=1e-4)

    for ep in range(1, epochs+1):
        print(f"\nEpoch {ep}/{epochs}")
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()

        val_loss, val_acc = evaluate(model, test_loader, criterion, device)
        print(f"  val_loss={val_loss:.3f}  val_acc={val_acc:.3f}")
        if results is not None:
            results['adam_loss'].append(val_loss)
            results['adam_acc'].append(val_acc)

# ------------------------------------------------------------------ #
#  5. Main                                                           #
# ------------------------------------------------------------------ #

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch',  type=int, default=128)
    parser.add_argument('--lr',     type=float, default=3e-4)
    parser.add_argument('--scout_every',  type=int, default=100)
    parser.add_argument('--scout_radius', type=float, default=5e-4)
    parser.add_argument('--gpu', action='store_true')
    args, _ = parser.parse_known_args()

    args.gpu = True  # Force GPU usage in Jupyter
    device = torch.device('cuda' if args.gpu and torch.cuda.is_available() else 'cpu')
    print('Using CUDA:', torch.cuda.is_available(), 'Device:', device)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])

    root = str(pathlib.Path.home() / ".torchdata")
    train_set = datasets.CIFAR10(root=root, train=True,  download=True, transform=transform)
    test_set  = datasets.CIFAR10(root=root, train=False, download=True, transform=transform)

    train_loader = DataLoader(train_set, batch_size=args.batch, shuffle=True,  num_workers=2, pin_memory=True)
    test_loader  = DataLoader(test_set,  batch_size=256,       shuffle=False, num_workers=2, pin_memory=True)

    model = SmallCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    opt = ModifiedScoutOptimizer(
        model.parameters(), optim.Adam,
        lr=args.lr,
        scout_every=args.scout_every,
        scout_radius=args.scout_radius,
        scout_steps=50,
        betas=(0.9, 0.999),
        weight_decay=1e-4
    )

    results = {'scout_loss': [], 'scout_acc': [], 'adam_loss': [], 'adam_acc': []}

    for ep in range(1, args.epochs+1):
        print(f"\nEpoch {ep}/{args.epochs}")
        start = time.time()
        train_one_epoch(model, train_loader, criterion, opt, device)
        val_loss, val_acc = evaluate(model, test_loader, criterion, device)
        print(f"  val_loss={val_loss:.3f}  val_acc={val_acc:.3f}  ({time.time() - start:.1f}s)")
        results['scout_loss'].append(val_loss)
        results['scout_acc'].append(val_acc)

    torch.save(model.state_dict(), "scout_cnn.pth")
    print("Model saved to scout_cnn.pth")

    run_baseline_adam(epochs=args.epochs, batch=args.batch, lr=args.lr, results=results)

    print("\nFinal comparison table:")
    for i in range(args.epochs):
        print(f"Epoch {i+1:>2}:  Scout acc = {results['scout_acc'][i]*100:.2f}%  |  Adam acc = {results['adam_acc'][i]*100:.2f}%")

    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plt.plot(results['scout_loss'], label='Scout Loss')
    plt.plot(results['adam_loss'],  label='Adam Loss')
    plt.legend()
    plt.title("Validation Loss")

    plt.subplot(1, 2, 2)
    plt.plot(results['scout_acc'], label='Scout Acc')
    plt.plot(results['adam_acc'],  label='Adam Acc')
    plt.legend()
    plt.title("Validation Accuracy")
    plt.tight_layout()
    plt.show()

if __name__ == '__main__':
    if 'ipykernel' in sys.argv[0]:
        sys.argv = ['scout_opt_cifar10.py', '--gpu']
    main()


Using CUDA: True Device: cuda
Files already downloaded and verified
Files already downloaded and verified

Epoch 1/50


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x000001F774670310>
Traceback (most recent call last):
  File "C:\Users\Machine-Learning\anaconda3\envs\pytorch_env\lib\site-packages\torch\utils\data\dataloader.py", line 1477, in __del__
    self._shutdown_workers()
  File "C:\Users\Machine-Learning\anaconda3\envs\pytorch_env\lib\site-packages\torch\utils\data\dataloader.py", line 1435, in _shutdown_workers
    if self._persistent_workers or self._workers_status[worker_id]:
AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status'


Gradient norm for conv1.weight: 2.7949e-02
Gradient norm for conv1.bias: 4.2175e-03
Gradient norm for conv2.weight: 1.2244e-01
Gradient norm for conv2.bias: 8.3029e-03
Gradient norm for conv3.weight: 1.9695e-01
Gradient norm for conv3.bias: 1.7949e-02
Gradient norm for fc1.weight: 4.4855e-01
Gradient norm for fc1.bias: 3.7970e-02
Gradient norm for fc2.weight: 1.1058e-01
Gradient norm for fc2.bias: 1.0069e-01
Gradient norm for conv1.weight: 2.6102e-02
Gradient norm for conv1.bias: 4.6061e-03
Gradient norm for conv2.weight: 1.3854e-01
Gradient norm for conv2.bias: 1.0564e-02
Gradient norm for conv3.weight: 2.2419e-01
Gradient norm for conv3.bias: 2.3586e-02
Gradient norm for fc1.weight: 5.0366e-01
Gradient norm for fc1.bias: 4.7662e-02
Gradient norm for fc2.weight: 1.9627e-01
Gradient norm for fc2.bias: 1.1011e-01
Gradient norm for conv1.weight: 3.5828e-02
Gradient norm for conv1.bias: 4.3844e-03
Gradient norm for conv2.weight: 1.5692e-01
Gradient norm for conv2.bias: 9.5890e-03
Gradient

RuntimeError: Loss does not require grad. Check computation graph.

In [11]:
"""scout_opt_cifar10.py

Quick‑and‑dirty *large(-ish)* demo of the scout–guided optimizer on CIFAR‑10.

• Dataset  : CIFAR‑10 (50 k train / 10 k test, 32×32 colour)
• Model    : lightweight CNN (~0.6 M params)
• Baseline : Adam
• ScoutOpt : wraps Adam, periodically probes 2 directions
             (current grad + one random orthogonal) with
             a 1‑step look‑ahead on the same mini‑batch.

Run from a Jupyter cell (accepts the stray -f arg) or from CLI:
    python scout_opt_cifar10.py --gpu  
-------------------------------------------------------"""

import argparse, time, pathlib, random, sys
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# ------------------------------------------------------------------ #
#  1. A small-ish CNN                                                #
# ------------------------------------------------------------------ #
class SmallCNN(nn.Module):
    def __init__(self, n_classes: int = 10):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, 3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, 3, padding=1)
        self.pool  = nn.MaxPool2d(2, 2)
        self.fc1   = nn.Linear(256 * 4 * 4, 256)
        self.fc2   = nn.Linear(256, n_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # 32→16
        x = self.pool(F.relu(self.conv2(x)))  # 16→8
        x = self.pool(F.relu(self.conv3(x)))  # 8 →4
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# ------------------------------------------------------------------ #
#  2. Scout‑guided Optimizer                                         #
# ------------------------------------------------------------------ #
import torch
import torch.nn as nn
from typing import List

class ModifiedScoutOptimizer(torch.optim.Optimizer):
    """
    Extended Scout Optimizer with multi-step probes along ±orthogonal directions.
    Now supports walking 'scout_steps' forward in each direction before deciding.
    """

    def __init__(
        self,
        params,
        base_opt_cls,
        *,
        lr=3e-4,
        scout_every=100,
        scout_radius=5e-4,
        scout_steps=100,
        **base_kwargs
    ):
        param_list = list(params)
        self.base = base_opt_cls(param_list, lr=lr, **base_kwargs)
        super().__init__(param_list, defaults=dict(lr=lr))
        self.s_every = scout_every
        self.radius = scout_radius
        self.steps = scout_steps
        self._step_id = 0
        self._params = param_list

    @staticmethod
    @torch.no_grad()
    def _flatten(tensors: List[torch.Tensor]):
        return torch.cat([t.reshape(-1) for t in tensors])

    @torch.no_grad()
    def _apply_delta(self, vec_flat: torch.Tensor, coef: float):
        offset = 0
        for p in self._params:
            n = p.numel()
            p.add_(vec_flat[offset:offset+n].view_as(p), alpha=coef)
            offset += n

    @staticmethod
    @torch.no_grad()
    def _rand_orthogonal(base: torch.Tensor):
        r = torch.randn_like(base)
        proj = (r @ base) / (base @ base + 1e-12)
        ortho = r - proj * base
        return ortho / (ortho.norm() + 1e-12)

    @torch.no_grad()
    def _track_direction(self, direction: torch.Tensor, closure, max_steps: int):
        best_loss = float('inf')
        best_step = 0

        for i in range(1, max_steps + 1):
            self._apply_delta(direction, self.radius)
            loss = closure()
            if loss < best_loss:
                best_loss = loss
                best_step = i

        self._apply_delta(direction, -self.radius * best_step)
        return best_loss, direction, best_step

    def step(self, closure=None):
        if closure is None:
            raise RuntimeError("ModifiedScoutOptimizer needs a closure that returns loss and does backward()")

        loss = closure()
        self._step_id += 1

        if self._step_id % self.s_every == 0:
            grad_flat = self._flatten([p.grad for p in self._params])
            grad_norm = grad_flat.norm() + 1e-12
            grad_dir = grad_flat / grad_norm
            ortho_dir = self._rand_orthogonal(grad_dir)

            directions = [
                grad_dir,
                ortho_dir,
                -ortho_dir
            ]

            best_loss = loss.item()
            best_dir = None
            best_scale = 0

            for d in directions:
                tracked_loss, d_dir, d_steps = self._track_direction(d, closure, self.steps)
                if tracked_loss < best_loss:
                    best_loss = tracked_loss
                    best_dir = d_dir
                    best_scale = d_steps

            if best_dir is not None:
                new_grad = best_dir * grad_norm
                offset = 0
                for p in self._params:
                    n = p.numel()
                    p.grad.copy_(new_grad[offset:offset+n].view_as(p))
                    offset += n

        self.base.step()
        return loss


# ------------------------------------------------------------------ #
#  3. Training / evaluation loops                                    #
# ------------------------------------------------------------------ #
@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    tot_loss, correct, total = 0.0, 0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        out = model(x)
        tot_loss += criterion(out, y).item() * x.size(0)
        pred = out.argmax(1)
        correct += (pred == y).sum().item()
        total += y.size(0)
    return tot_loss / total, correct / total


def train_one_epoch(model, loader, criterion, opt, device, log_every=100):
    model.train()
    running = 0.0
    for i, (x, y) in enumerate(loader):
        x, y = x.to(device), y.to(device)

        # Move x and y to new scope for each closure evaluation
        def closure():
            opt.base.zero_grad()
            with torch.enable_grad():
                out = model(x)
                loss = criterion(out, y)
            if not loss.requires_grad:
                raise RuntimeError("Loss does not require grad. Check computation graph.")
            loss.backward()
            return loss

        loss = opt.step(closure)

        running += loss.item()
        if (i + 1) % log_every == 0:
            print(f"  batch {i+1}/{len(loader)}  loss {running/log_every:.3f}")
            running = 0.0

        # Debug: ensure gradient exists
        for name, param in model.named_parameters():
            if param.grad is None:
                print(f"WARNING: No gradient for {name}")
            else:
                print(f"Gradient norm for {name}: {param.grad.norm().item():.4e}")

def run_baseline_adam(epochs=6, batch=128, lr=3e-4, results=None):
    print("\nBaseline: Adam")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])
    root = str(pathlib.Path.home() / ".torchdata")
    train_set = datasets.CIFAR10(root=root, train=True,  download=True, transform=transform)
    test_set  = datasets.CIFAR10(root=root, train=False, download=True, transform=transform)
    train_loader = DataLoader(train_set, batch_size=batch, shuffle=True,  num_workers=2, pin_memory=True)
    test_loader  = DataLoader(test_set,  batch_size=256,   shuffle=False, num_workers=2, pin_memory=True)

    model = SmallCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=1e-4)

    for ep in range(1, epochs+1):
        print(f"\nEpoch {ep}/{epochs}")
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()

        val_loss, val_acc = evaluate(model, test_loader, criterion, device)
        print(f"  val_loss={val_loss:.3f}  val_acc={val_acc:.3f}")
        if results is not None:
            results['adam_loss'].append(val_loss)
            results['adam_acc'].append(val_acc)

# ------------------------------------------------------------------ #
#  5. Main                                                           #
# ------------------------------------------------------------------ #

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch',  type=int, default=128)
    parser.add_argument('--lr',     type=float, default=3e-4)
    parser.add_argument('--scout_every',  type=int, default=100)
    parser.add_argument('--scout_radius', type=float, default=5e-4)
    parser.add_argument('--gpu', action='store_true')
    args, _ = parser.parse_known_args()

    args.gpu = True  # Force GPU usage in Jupyter
    device = torch.device('cuda' if args.gpu and torch.cuda.is_available() else 'cpu')
    print('Using CUDA:', torch.cuda.is_available(), 'Device:', device)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])

    root = str(pathlib.Path.home() / ".torchdata")
    train_set = datasets.CIFAR10(root=root, train=True,  download=True, transform=transform)
    test_set  = datasets.CIFAR10(root=root, train=False, download=True, transform=transform)

    train_loader = DataLoader(train_set, batch_size=args.batch, shuffle=True,  num_workers=2, pin_memory=True)
    test_loader  = DataLoader(test_set,  batch_size=256,       shuffle=False, num_workers=2, pin_memory=True)

    model = SmallCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    opt = ModifiedScoutOptimizer(
        model.parameters(), optim.Adam,
        lr=args.lr,
        scout_every=args.scout_every,
        scout_radius=args.scout_radius,
        scout_steps=50,
        betas=(0.9, 0.999),
        weight_decay=1e-4
    )

    results = {'scout_loss': [], 'scout_acc': [], 'adam_loss': [], 'adam_acc': []}

    for ep in range(1, args.epochs+1):
        print(f"\nEpoch {ep}/{args.epochs}")
        start = time.time()
        train_one_epoch(model, train_loader, criterion, opt, device)
        val_loss, val_acc = evaluate(model, test_loader, criterion, device)
        print(f"  val_loss={val_loss:.3f}  val_acc={val_acc:.3f}  ({time.time() - start:.1f}s)")
        results['scout_loss'].append(val_loss)
        results['scout_acc'].append(val_acc)

    torch.save(model.state_dict(), "scout_cnn.pth")
    print("Model saved to scout_cnn.pth")

    run_baseline_adam(epochs=args.epochs, batch=args.batch, lr=args.lr, results=results)

    print("\nFinal comparison table:")
    for i in range(args.epochs):
        print(f"Epoch {i+1:>2}:  Scout acc = {results['scout_acc'][i]*100:.2f}%  |  Adam acc = {results['adam_acc'][i]*100:.2f}%")

    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plt.plot(results['scout_loss'], label='Scout Loss')
    plt.plot(results['adam_loss'],  label='Adam Loss')
    plt.legend()
    plt.title("Validation Loss")

    plt.subplot(1, 2, 2)
    plt.plot(results['scout_acc'], label='Scout Acc')
    plt.plot(results['adam_acc'],  label='Adam Acc')
    plt.legend()
    plt.title("Validation Accuracy")
    plt.tight_layout()
    plt.show()

if __name__ == '__main__':
    if 'ipykernel' in sys.argv[0]:
        sys.argv = ['scout_opt_cifar10.py', '--gpu']
    main()


Using CUDA: True Device: cuda
Files already downloaded and verified
Files already downloaded and verified

Epoch 1/50
Gradient norm for conv1.weight: 2.4639e-02
Gradient norm for conv1.bias: 3.3465e-03
Gradient norm for conv2.weight: 1.1100e-01
Gradient norm for conv2.bias: 6.4690e-03
Gradient norm for conv3.weight: 1.5646e-01
Gradient norm for conv3.bias: 1.5207e-02
Gradient norm for fc1.weight: 3.0689e-01
Gradient norm for fc1.bias: 2.8272e-02
Gradient norm for fc2.weight: 7.3349e-02
Gradient norm for fc2.bias: 7.0145e-02
Gradient norm for conv1.weight: 2.2018e-02
Gradient norm for conv1.bias: 4.7214e-03
Gradient norm for conv2.weight: 1.3160e-01
Gradient norm for conv2.bias: 1.0567e-02
Gradient norm for conv3.weight: 1.8447e-01
Gradient norm for conv3.bias: 2.2075e-02
Gradient norm for fc1.weight: 3.9826e-01
Gradient norm for fc1.bias: 4.4352e-02
Gradient norm for fc2.weight: 1.4893e-01
Gradient norm for fc2.bias: 1.0585e-01
Gradient norm for conv1.weight: 2.7568e-02
Gradient norm f

KeyboardInterrupt: 

In [None]:
"""scout_opt_cifar10.py

Scout-guided optimiser demo on CIFAR-10 (lightweight CNN).

Key changes in this revision
----------------------------
1. **Removed runaway debug spam** – gradient norms are now *optional* (`--debug`) and printed only once per `log_every`.
2. **Windows/Jupyter stability** – default `num_workers=0` to dodge the lingering `_workers_status` bug; override with `--workers` if you like.
3. **Minor hygiene** – clarified comments, kept CUDA auto-detection, no logic changes to the optimiser itself.
"""

import argparse, time, pathlib, sys
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# ------------------------------------------------------------------ #
#  1. Model                                                          #
# ------------------------------------------------------------------ #
class SmallCNN(nn.Module):
    def __init__(self, n_classes: int = 10):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, 3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, 3, padding=1)
        self.pool  = nn.MaxPool2d(2, 2)
        self.fc1   = nn.Linear(256 * 4 * 4, 256)
        self.fc2   = nn.Linear(256, n_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # 32→16
        x = self.pool(F.relu(self.conv2(x)))  # 16→8
        x = self.pool(F.relu(self.conv3(x)))  # 8 →4
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# ------------------------------------------------------------------ #
#  2. Scout-guided Optimizer                                         #
# ------------------------------------------------------------------ #
class ModifiedScoutOptimizer(torch.optim.Optimizer):
    """Adam wrapped with periodic gradient-direction scouting."""

    def __init__(
        self,
        params,
        base_opt_cls,
        *,
        lr=3e-4,
        scout_every=100,
        scout_radius=5e-4,
        scout_steps=100,
        **base_kwargs,
    ):
        params = list(params)
        self.base = base_opt_cls(params, lr=lr, **base_kwargs)
        super().__init__(params, defaults=dict(lr=lr))
        self.s_every = scout_every
        self.radius  = scout_radius
        self.steps   = scout_steps
        self._step_id = 0
        self._params  = params

    # ---- helper --------------------------------------------------- #
    @staticmethod
    @torch.no_grad()
    def _flatten(ts: List[torch.Tensor]):
        return torch.cat([t.reshape(-1) for t in ts])

    @torch.no_grad()
    def _apply_delta(self, vec_flat: torch.Tensor, coef: float):
        off = 0
        for p in self._params:
            n = p.numel()
            p.add_(vec_flat[off : off + n].view_as(p), alpha=coef)
            off += n

    @staticmethod
    @torch.no_grad()
    def _rand_orthogonal(base: torch.Tensor):
        r = torch.randn_like(base)
        proj = (r @ base) / (base @ base + 1e-12)
        ortho = r - proj * base
        return ortho / (ortho.norm() + 1e-12)

    @torch.no_grad()
    def _track(self, direction: torch.Tensor, closure):
        """Walk up to self.steps small steps in *direction*, keep best loss."""
        best_loss, best_step = float("inf"), 0
        for step in range(1, self.steps + 1):
            self._apply_delta(direction, self.radius)
            loss = closure()
            if loss < best_loss:
                best_loss, best_step = loss, step
        # rewind to original params
        self._apply_delta(direction, -self.radius * best_step)
        return best_loss, direction

    # ---- public API ---------------------------------------------- #
    def step(self, closure=None):
        if closure is None:
            raise RuntimeError("Scout optimiser needs a closure that computes loss + backward().")

        loss = closure()  # grad on current params
        self._step_id += 1

        if self._step_id % self.s_every == 0:
            g_flat   = self._flatten([p.grad for p in self._params])
            g_norm   = g_flat.norm() + 1e-12
            g_dir    = g_flat / g_norm
            ortho    = self._rand_orthogonal(g_dir)

            cand_dirs = (g_dir, ortho, -ortho)
            best_loss, best_dir = loss.item(), None
            for d in cand_dirs:
                cand_loss, cand_dir = self._track(d, closure)
                if cand_loss < best_loss:
                    best_loss, best_dir = cand_loss, cand_dir
            if best_dir is not None:
                # replace gradient with the better direction, keep magnitude
                new_grad = best_dir * g_norm
                off = 0
                for p in self._params:
                    n = p.numel()
                    p.grad.copy_(new_grad[off : off + n].view_as(p))
                    off += n

        self.base.step()
        return loss

# ------------------------------------------------------------------ #
#  3. Training utilities                                             #
# ------------------------------------------------------------------ #
@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    tot_loss, correct, total = 0.0, 0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        out  = model(x)
        tot_loss += criterion(out, y).item() * x.size(0)
        correct  += (out.argmax(1) == y).sum().item()
        total    += y.size(0)
    return tot_loss / total, correct / total


def train_one_epoch(model, loader, criterion, opt, device, *, log_every=100, debug=False):
    model.train()
    running = 0.0
    for i, (x, y) in enumerate(loader):
        x, y = x.to(device), y.to(device)

        def closure():
            opt.base.zero_grad()
            with torch.enable_grad():
                out  = model(x)
                loss = criterion(out, y)
            loss.backward()
            return loss

        loss = opt.step(closure)
        running += loss.item()

        if (i + 1) % log_every == 0:
            print(f"  batch {i+1}/{len(loader)}  loss {running/log_every:.3f}")
            running = 0.0
            if debug:
                for n, p in model.named_parameters():
                    print(f"    grad {n:20s}: {p.grad.norm():.3e}")

# ------------------------------------------------------------------ #
#  4. Baseline Adam (optional)                                       #
# ------------------------------------------------------------------ #

def run_baseline_adam(epochs, batch, lr, results):
    device   = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    root = pathlib.Path.home() / ".torchdata"
    train_loader = DataLoader(
        datasets.CIFAR10(root, train=True, download=True, transform=transform),
        batch_size=batch, shuffle=True, num_workers=0, pin_memory=True,
    )
    test_loader = DataLoader(
        datasets.CIFAR10(root, train=False, download=True, transform=transform),
        batch_size=256, shuffle=False, num_workers=0, pin_memory=True,
    )
    model     = SmallCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    opt       = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)

    for ep in range(1, epochs + 1):
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            opt.zero_grad()
            criterion(model(x), y).backward()
            opt.step()
        v_loss, v_acc = evaluate(model, test_loader, criterion, device)
        results["adam_loss"].append(v_loss)
        results["adam_acc"].append(v_acc)
        print(f"Adam  epoch {ep:2d}:  val_acc={v_acc*100:.1f}%  val_loss={v_loss:.3f}")

# ------------------------------------------------------------------ #
#  5. Entry point                                                    #
# ------------------------------------------------------------------ #

def main():
    p = argparse.ArgumentParser()
    p.add_argument("--epochs",     type=int,   default=50)
    p.add_argument("--batch",      type=int,   default=128)
    p.add_argument("--lr",         type=float, default=3e-4)
    p.add_argument("--scout_every",type=int,   default=100)
    p.add_argument("--scout_radius",type=float,default=5e-4)
    p.add_argument("--workers",    type=int,   default=0)
    p.add_argument("--debug",      action="store_true")
    p.add_argument("--gpu",        action="store_true")
    args, _ = p.parse_known_args()

    device = torch.device("cuda" if args.gpu and torch.cuda.is_available() else "cpu")
    print("Device:", device)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    root = pathlib.Path.home() / ".torchdata"
    train_loader = DataLoader(
        datasets.CIFAR10(root, train=True, download=True, transform=transform),
        batch_size=args.batch, shuffle=True, num_workers=args.workers, pin_memory=True,
    )
    test_loader = DataLoader(
        datasets.CIFAR10(root, train=False, download=True, transform=transform),
        batch_size=256, shuffle=False, num_workers=args.workers, pin_memory=True,
    )

    model     = SmallCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    scout_opt = ModifiedScoutOptimizer(
        model.parameters(), optim.Adam, lr=args.lr,
        scout_every=args.scout_every, scout_radius=args.scout_radius, scout_steps=50,
        betas=(0.9, 0.999), weight_decay=1e-4,
    )

    results = {k: [] for k in ("scout_loss","scout_acc","adam_loss","adam_acc")}

    for ep in range(1, args.epochs + 1):
        print(f"\nEpoch {ep}/{args.epochs}")
        start = time.time()
        train_one_epoch(model, train_loader, criterion, scout_opt, device, debug=args.debug)
        v_loss, v_acc = evaluate(model, test_loader, criterion, device)
        results["scout_loss"].append(v_loss)
        results["scout_acc"].append(v_acc)
        print(f"  Scout val_acc={v_acc*100:.1f}%  val_loss={v_loss:.3f}  ({time.time()-start:.1f}s)")

    torch.save(model.state_dict(), "scout_cnn.pth")
    print("Model saved → scout_cnn.pth")

    run_baseline_adam(args.epochs, args.batch, args.lr, results)

    # quick comparison
    for ep in range(args.epochs):
        print(f"Epoch {ep+1:2d}: Scout {results['scout_acc'][ep]*100:.1f}%  |  Adam {results['adam_acc'][ep]*100:.1f}%")

if __name__ == "__main__":
    if "ipykernel" in sys.argv[0]:
        sys.argv = ["scout_opt_cifar10.py", "--gpu"]
    main()

Device: cpu
Files already downloaded and verified
Files already downloaded and verified

Epoch 1/50
