In [1]:
# Autoimport on changes
%load_ext autoreload
%autoreload 2

# Import commonly used libraries
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# graphics
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# type annotation
import jaxtyping
from jaxtyping import Float32, Int64, jaxtyped
from typeguard import typechecked as typechecker

# more itertools
import more_itertools as mi

# itertools
import itertools
import collections

# tensor manipulation
from einops import rearrange, reduce, repeat

# automatically apply jaxtyping
%load_ext jaxtyping
%jaxtyping.typechecker typeguard.typechecked

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

import dataclasses
import pprint

import wandb

import tqdm

# Weights & Biases project that every run and sweep in this notebook is logged under.
WANDB_PROJECT_NAME = "simple_regression"


# Define a simple dataset
class SimpleDataset(Dataset):
    """Synthetic regression data with a noisy linear target over 10 features."""

    def __init__(self, num_samples: int = 1000):
        """
        Draw random inputs and derive their noisy linear targets.

        :param num_samples: Number of samples to generate
        """
        features = torch.randn(num_samples, 10)  # standard-normal inputs

        # Clean signal: first five columns count positively, last five negatively
        positive_part = features[:, :5].sum(dim=1)
        negative_part = features[:, 5:].sum(dim=1)

        self.x = features
        self.y = positive_part - negative_part

        # Small Gaussian noise (std 0.1) on top of the clean signal
        noise = torch.randn(num_samples) * 0.1
        self.y += noise

    def __len__(self) -> int:
        """Return the number of samples."""
        return self.x.shape[0]

    def __getitem__(
        self, idx: int
    ) -> tuple[Float32[torch.Tensor, "features"], Float32[torch.Tensor, ""]]:
        """Return the (input, target) pair stored at position ``idx``."""
        sample = self.x[idx]
        target = self.y[idx]
        return sample, target


class QuadraticDataset(Dataset):
    """Synthetic regression data whose target is a noisy sum of squared features."""

    def __init__(self, num_samples: int = 1000):
        """
        Draw random inputs and derive their noisy quadratic targets.

        :param num_samples: Number of samples to generate
        """
        features = torch.randn(num_samples, 5)  # standard-normal inputs, 5 columns

        self.x = features
        # Clean signal: squared magnitude of each row
        self.y = features.pow(2).sum(dim=1)

        # Small Gaussian noise (std 0.1) on top of the clean signal
        noise = torch.randn(num_samples) * 0.1
        self.y += noise

    def __len__(self) -> int:
        """Return the number of samples."""
        return self.x.shape[0]

    def __getitem__(
        self, idx: int
    ) -> tuple[Float32[torch.Tensor, "features"], Float32[torch.Tensor, ""]]:
        """Return the (input, target) pair stored at position ``idx``."""
        sample = self.x[idx]
        target = self.y[idx]
        return sample, target


class ExponentialDataset(Dataset):
    """Synthetic regression data whose target is a standardized sum of exponentials."""

    def __init__(self, num_samples: int = 1000):
        """
        Draw random inputs and derive their noisy exponential targets.

        :param num_samples: Number of samples to generate
        """
        features = torch.randn(num_samples, 5)  # standard-normal inputs, 5 columns
        self.x = features

        # Clean signal: sum over columns of exp(0.5 * feature)
        raw_signal = (features * 0.5).exp().sum(dim=1)

        # Standardize to zero mean / unit std across the dataset
        centered = raw_signal - raw_signal.mean()
        self.y = centered / raw_signal.std()

        # Noise is added after standardization (std 0.1)
        noise = torch.randn(num_samples) * 0.1
        self.y += noise

    def __len__(self) -> int:
        """Return the number of samples."""
        return self.x.shape[0]

    def __getitem__(
        self, idx: int
    ) -> tuple[Float32[torch.Tensor, "features"], Float32[torch.Tensor, ""]]:
        """Return the (input, target) pair stored at position ``idx``."""
        sample = self.x[idx]
        target = self.y[idx]
        return sample, target


class SinusoidalDataset(Dataset):
    """Synthetic regression data whose target is a noisy sum of sines of the features."""

    def __init__(self, num_samples: int = 1000):
        """
        Draw random inputs and derive their noisy sinusoidal targets.

        :param num_samples: Number of samples to generate
        """
        features = torch.randn(num_samples, 5)  # standard-normal inputs, 5 columns

        self.x = features
        # Clean signal: sin of every feature, summed per row
        self.y = features.sin().sum(dim=1)

        # Small Gaussian noise (std 0.1) on top of the clean signal
        noise = torch.randn(num_samples) * 0.1
        self.y += noise

    def __len__(self) -> int:
        """Return the number of samples."""
        return self.x.shape[0]

    def __getitem__(
        self, idx: int
    ) -> tuple[Float32[torch.Tensor, "features"], Float32[torch.Tensor, ""]]:
        """Return the (input, target) pair stored at position ``idx``."""
        sample = self.x[idx]
        target = self.y[idx]
        return sample, target


class InteractionDataset(Dataset):
    """Synthetic regression data driven by pairwise feature products plus one linear term."""

    def __init__(self, num_samples: int = 1000):
        """
        Draw random inputs and derive their noisy interaction targets.

        :param num_samples: Number of samples to generate
        """
        features = torch.randn(num_samples, 5)  # standard-normal inputs, 5 columns
        self.x = features

        # Split into per-column views so the formula reads directly
        f0, f1, f2, f3, f4 = features.unbind(dim=1)

        # Clean signal: two pairwise products plus the last feature on its own
        self.y = f0 * f1 + f2 * f3 + f4

        # Small Gaussian noise (std 0.1) on top of the clean signal
        noise = torch.randn(num_samples) * 0.1
        self.y += noise

    def __len__(self) -> int:
        """Return the number of samples."""
        return self.x.shape[0]

    def __getitem__(
        self, idx: int
    ) -> tuple[Float32[torch.Tensor, "features"], Float32[torch.Tensor, ""]]:
        """Return the (input, target) pair stored at position ``idx``."""
        sample = self.x[idx]
        target = self.y[idx]
        return sample, target


class ComplexDataset(Dataset):
    """Synthetic regression data mixing several non-linear patterns over 10 features."""

    def __init__(self, num_samples: int = 1000):
        """
        Draw random inputs and build a target from five feature groups.

        :param num_samples: Number of samples to generate
        """
        features = torch.randn(num_samples, 10)  # standard-normal inputs, 10 columns
        self.x = features

        # One term per pair of columns, listed in the order they are accumulated
        signal_terms = (
            # columns 0-1: quadratic
            torch.sum(features[:, :2] ** 2, dim=1),
            # columns 2-3: exponential
            torch.sum(torch.exp(features[:, 2:4] * 0.5), dim=1),
            # columns 4-5: sinusoidal
            torch.sum(torch.sin(features[:, 4:6]), dim=1),
            # columns 6-7: multiplicative interaction
            features[:, 6] * features[:, 7],
            # columns 8-9: linear combination
            2 * features[:, 8] - 3 * features[:, 9],
        )

        combined = torch.zeros(num_samples)
        for term in signal_terms:
            combined = combined + term

        # Squash with tanh, then standardize to zero mean / unit std
        squashed = torch.tanh(combined)
        self.y = (squashed - squashed.mean()) / squashed.std()

        # Noise is added after standardization (std 0.05)
        noise = torch.randn(num_samples) * 0.05
        self.y += noise

    def __len__(self) -> int:
        """Return the number of samples."""
        return self.x.shape[0]

    def __getitem__(
        self, idx: int
    ) -> tuple[Float32[torch.Tensor, "features"], Float32[torch.Tensor, ""]]:
        """Return the (input, target) pair stored at position ``idx``."""
        sample = self.x[idx]
        target = self.y[idx]
        return sample, target


class MultiClassDataset(Dataset):
    """Synthetic multi-class data: labels are sampled from a softmax over non-linear logits."""

    def __init__(self, num_samples: int = 1000, num_classes: int = 10):
        """
        Initialize a multi-class dataset for classification tasks.

        :param num_samples: Number of samples to generate
        :param num_classes: Number of classes for classification; may exceed the
            number of input features, in which case feature indices wrap around
        """
        self.num_classes = num_classes
        self.x = torch.randn(num_samples, 10)  # 10 input features

        # Feature indices wrap around the real feature count, so any number of
        # classes is supported (previously class indices >= 10 went out of range).
        num_features = self.x.shape[1]

        # One logit column per class
        logits = torch.zeros(num_samples, num_classes)

        # Create complex decision boundaries
        for i in range(num_classes):
            # Each class uses a different (rotated) combination of features:
            # - The sinusoidal term adds periodic non-linearity.
            # - The quadratic term adds parabolic curvature.
            # - The exponential term adds rapid growth for positive values.
            # - The interaction term captures pairwise feature interactions.
            cols = [self.x[:, (i + offset) % num_features] for offset in range(6)]

            logits[:, i] = (
                torch.sin(cols[0] + cols[1])  # Non-linear combination
                + cols[2] ** 2  # Quadratic term
                + torch.exp(cols[3] * 0.1)  # Exponential term
                + cols[4] * cols[5]  # Interaction term
            )

        # Convert logits to probabilities
        self.probs = F.softmax(logits, dim=1)

        # Sample one class per row. Squeeze only the sample dimension so that a
        # single-row dataset still yields a 1-D label tensor that can be indexed.
        self.y = torch.multinomial(self.probs, 1).squeeze(1)

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(
        self, idx: int
    ) -> tuple[Float32[torch.Tensor, "features"], Int64[torch.Tensor, ""]]:
        """Return the (input features, sampled class index) pair at position ``idx``."""
        return self.x[idx], self.y[idx]

    def get_class_probabilities(self, idx: int) -> Float32[torch.Tensor, "num_classes"]:
        """
        Get the probability distribution over classes for a specific sample.

        :param idx: Index of the sample
        :return: Probability distribution over classes
        """
        return self.probs[idx]


@dataclasses.dataclass
class SimpleNetConfig:
    """Architecture hyperparameters consumed by ``SimpleNet``."""

    # Width of the input projection and of every hidden layer
    d_model: int
    # Number of hidden (Linear + ReLU) blocks between input and output layers
    n_layers: int

    # Size of each input feature vector
    d_input: int = dataclasses.field(default=10)
    # Size of the output vector (one logit per class)
    d_output: int = dataclasses.field(default=10)


# Define a simple neural network


class SimpleNet(nn.Module):
    """Fully connected ReLU network that maps feature vectors to class logits."""

    def __init__(self, cfg: SimpleNetConfig) -> None:
        """
        Build the input projection, the stack of hidden blocks and the output head.

        :param cfg: Architecture settings (layer width, depth, input/output sizes)
        """
        super().__init__()

        self.cfg = cfg
        width = cfg.d_model

        # Input projection: d_input -> d_model (its ReLU is applied in forward)
        self.input_layer = nn.Linear(cfg.d_input, width)
        self.activation_fn = nn.ReLU()

        # n_layers hidden blocks, each a Linear followed by its own ReLU
        hidden_blocks = []
        for _ in range(cfg.n_layers):
            hidden_blocks.append(nn.Sequential(nn.Linear(width, width), nn.ReLU()))
        self.hidden_layers = nn.Sequential(*hidden_blocks)

        # Output head: d_model -> d_output, left unactivated
        self.output_layer = nn.Linear(width, cfg.d_output)

    def forward(
        self,
        x: Float32[torch.Tensor, "batch features"],
    ) -> Float32[torch.Tensor, "batch num_classes"]:
        """
        Run a batch of inputs through the network.

        :param x: Input tensor
        :return: Logits for each class
        """
        projected = self.input_layer(x)
        hidden = self.activation_fn(projected)

        # Each hidden block applies its own ReLU internally
        hidden = self.hidden_layers(hidden)

        # Raw logits: no activation after the output layer
        return self.output_layer(hidden)


@dataclasses.dataclass
class TrainingConfig:
    """Everything ``train`` needs: model architecture, data size and optimizer settings."""

    # --- model ---
    # Architecture of the network to train
    model_cfg: SimpleNetConfig

    # --- data ---
    # Number of synthetic samples to generate
    num_samples: int = dataclasses.field(default=10000)

    # --- training ---
    # Number of full passes over the dataset
    num_epochs: int = dataclasses.field(default=1000)
    # Evaluate every `test_interval` epochs (the final epoch is always evaluated)
    test_interval: int = dataclasses.field(default=100)

    # --- optimizer ---
    learning_rate: float = dataclasses.field(default=0.001)
    batch_size: int = dataclasses.field(default=32)


def _select_device() -> torch.device:
    """Return the preferred compute device: CUDA, then Apple MPS, then CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")


def _evaluate(
    model: nn.Module,
    dataloader: DataLoader,
    criterion: nn.Module,
    device: torch.device,
) -> tuple[float, float]:
    """Return (mean per-batch loss, accuracy in percent) of ``model`` over ``dataloader``."""
    was_training = model.training
    model.eval()

    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            logits = model(batch_x)
            total_loss += criterion(logits, batch_y).item()

            # Predicted class is the index of the largest logit
            predicted = logits.argmax(dim=1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    # Put the model back into whichever mode the caller had it in
    model.train(was_training)

    return total_loss / len(dataloader), 100 * correct / total


def train(cfg: TrainingConfig) -> None:
    """
    Train a ``SimpleNet`` classifier on a freshly generated ``MultiClassDataset``.

    Progress is printed and metrics are logged to Weights & Biases. The active
    W&B run is reused when one exists (e.g. inside a sweep); otherwise a run is
    started here. The run is finished before returning.

    :param cfg: Model, data and optimizer settings for this run
    """
    # Determine the device to use
    device = _select_device()
    print(f"Using device: {device}")

    # Create dataset and dataloader
    dataset = MultiClassDataset(num_samples=cfg.num_samples)
    dataloader = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=True)

    # Initialize the model, loss function, and optimizer
    model = SimpleNet(cfg=cfg.model_cfg).to(device)
    # CrossEntropyLoss takes raw logits and integer class indices
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=cfg.learning_rate)

    # TODO(bschoen): Plot optimizer state, e.g. the `optimizer.param_groups` keys:
    #   params, lr, betas, eps, weight_decay, amsgrad, foreach, maximize,
    #   capturable, differentiable, fused

    # `wandb.watch` / `wandb.log` need an active run; start one if the caller
    # has not already done so.
    if wandb.run is None:
        wandb.init(project=WANDB_PROJECT_NAME, config=dataclasses.asdict(cfg))

    # watch model and criterion
    wandb.watch(
        [model],
        criterion=criterion,
        log_freq=100,
        log="all",
    )

    # Print a train-loss line roughly ten times per run. Clamped to at least 1
    # so that runs with fewer than 10 epochs do not divide by zero.
    progress_interval = max(1, cfg.num_epochs // 10)

    # Training loop
    for epoch in range(cfg.num_epochs):
        model.train()
        epoch_loss = 0.0

        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # Clear gradients left over from the previous step
            optimizer.zero_grad()

            # Forward pass
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # Calculate average loss for the epoch
        avg_loss = epoch_loss / len(dataloader)

        # Evaluate every `test_interval` epochs, and always after the last epoch
        is_last_epoch = epoch == cfg.num_epochs - 1
        if ((epoch + 1) % cfg.test_interval == 0) or is_last_epoch:

            # NOTE(review): this evaluates on the same dataloader used for
            # training, so "test" loss/accuracy are measured on training data.
            avg_test_loss, accuracy = _evaluate(model, dataloader, criterion, device)

            # print progress
            print(
                f"Epoch [{epoch+1}/{cfg.num_epochs}], "
                f"Train Loss: {avg_loss:.4f}, Test Loss: {avg_test_loss:.4f}, "
                f"Accuracy: {accuracy:.2f}%"
            )

            # Log metrics to wandb
            wandb.log(
                {
                    "epoch": epoch,
                    "train_loss": avg_loss,
                    "test_loss": avg_test_loss,
                    "accuracy": accuracy,
                }
            )

        # Otherwise still print the train loss about every 1/10 of the run
        elif (epoch + 1) % progress_interval == 0:
            print(f"Epoch [{epoch+1}/{cfg.num_epochs}] Train Loss: {avg_loss:.6f}")

    # Finish the wandb run
    wandb.finish()

    print("Training completed!")

In [8]:
# Single training run with hand-picked hyperparameters, logged to W&B.
# TODO(bschoen): Use wandb to plot adam
cfg = TrainingConfig(
    model_cfg=SimpleNetConfig(d_model=64, n_layers=4),  # 4 hidden blocks of width 64
    num_samples=100000,
    num_epochs=10,  # number of passes over the dataset
    test_interval=1,
    learning_rate=1e-2,  # 10x the TrainingConfig default of 0.001
    batch_size=64,  # 2x the TrainingConfig default of 32
)

# Start the W&B run here so the full config is recorded with it; `train`
# finishes the run when training completes.
wandb.init(project=WANDB_PROJECT_NAME, config=dataclasses.asdict(cfg))

train(cfg)

Using device: mps
Epoch [1/10], Train Loss: 1.9145, Test Loss: 1.7449, Accuracy: 43.32%
Epoch [2/10], Train Loss: 1.6732, Test Loss: 1.5893, Accuracy: 49.17%
Epoch [3/10], Train Loss: 1.5650, Test Loss: 1.5042, Accuracy: 50.89%
Epoch [4/10], Train Loss: 1.5209, Test Loss: 1.5057, Accuracy: 50.58%
Epoch [5/10], Train Loss: 1.5042, Test Loss: 1.4846, Accuracy: 51.12%
Epoch [6/10], Train Loss: 1.4943, Test Loss: 1.4571, Accuracy: 52.06%
Epoch [7/10], Train Loss: 1.4873, Test Loss: 1.4399, Accuracy: 52.59%
Epoch [8/10], Train Loss: 1.4789, Test Loss: 1.4516, Accuracy: 52.24%
Epoch [9/10], Train Loss: 1.4734, Test Loss: 1.4794, Accuracy: 51.50%
Epoch [10/10], Train Loss: 1.4692, Test Loss: 1.4398, Accuracy: 52.32%


VBox(children=(Label(value='0.001 MB of 0.048 MB uploaded\r'), FloatProgress(value=0.024206732189635816, max=1…

0,1
accuracy,▁▅▇▆▇███▇█
epoch,▁▂▃▃▄▅▆▆▇█
test_loss,█▄▂▃▂▁▁▁▂▁
train_loss,█▄▃▂▂▁▁▁▁▁

0,1
accuracy,52.319
epoch,9.0
test_loss,1.43982
train_loss,1.46924


Training completed!


# Sweep Configuration For Hyperparameter Search

In [None]:
# W&B sweep definition for the hyperparameter search: Bayesian optimization
# that minimizes the `test_loss` metric logged by `train`.
sweep_configuration = {
    "name": "sweepdemo",
    "method": "bayes",
    "metric": {"goal": "minimize", "name": "test_loss"},
    "parameters": {
        # hidden width: one of a fixed set of powers of two
        "d_model": {"values": [8, 16, 32, 64, 128]},
        # number of hidden blocks, anywhere from 1 to 16
        "n_layers": {"min": 1, "max": 16},
        "learning_rate": {
            # a flat distribution between 0 and 0.1
            # NOTE(review): min of 0 allows draws arbitrarily close to a zero
            # learning rate; a log-uniform range may be a better fit — confirm
            "distribution": "uniform",
            "min": 0,
            "max": 0.1,
        },
        # log-uniform between 32 and 256, quantized with q=8
        "batch_size": {
            "distribution": "q_log_uniform_values",
            "max": 256,
            "min": 32,
            "q": 8,
        },
        # fixed for every run in the sweep
        "num_epochs": {"value": 100},
        # larger than num_epochs, so `train` only evaluates after the final epoch
        "test_interval": {"value": 1000},
        # "optimizer": {"values": ["adam", "sgd", "adamw", "rmsprop"]},
    },
}

In [None]:
def train_wandb() -> None:
    """
    Sweep entry point: start a W&B run, build a ``TrainingConfig`` from it, and train.

    Model size, epochs, evaluation interval, learning rate and batch size are read
    from ``wandb.config``; the dataset size is fixed at 1000 samples.
    """
    wandb.init(project=WANDB_PROJECT_NAME)

    # TODO(bschoen): What's the usual way to init config from here so don't copy explicitly
    # - usually will have this as completely flatten so can directly grab from
    #
    # `wandb.config` holds the parameters chosen for the current sweep run
    sweep_params = wandb.config

    model_cfg = SimpleNetConfig(
        d_model=sweep_params["d_model"],
        n_layers=sweep_params["n_layers"],
    )

    training_cfg = TrainingConfig(
        model_cfg=model_cfg,
        num_samples=1000,
        num_epochs=sweep_params["num_epochs"],
        test_interval=sweep_params["test_interval"],
        learning_rate=sweep_params["learning_rate"],
        batch_size=sweep_params["batch_size"],
    )

    train(training_cfg)

In [None]:
# Show the documentation for `wandb.agent`. Uses plain `help()` rather than the
# IPython-only `?` suffix so the cell is also valid Python outside IPython.
help(wandb.agent)

## Intuition on Hyperparam Runs

* Bad hyperparameters are usually clearly worse by the end of the **first 1-2 epochs.**

* You can manually abort runs that don't look promising, or stop them automatically with an early-termination algorithm.
  * **Hyperband** is a popular algorithm for this.

* Starting with a smaller model than the full-size one (e.g. a smaller ResNet rather than the full ResNet34) is a good idea.
  * Good hyperparameters on the small model tend to transfer over to the larger model because the architecture and the data are the same.
  * The main difference is that the **larger model** may require **more regularization** to prevent overfitting.

In [None]:
# Register the hyperparameter sweep with W&B and get its id
sweep_id = wandb.sweep(
    sweep=sweep_configuration,
    project=WANDB_PROJECT_NAME,
)

# Maximum number of runs this agent will execute
num_trials = 100

# TODO(bschoen): Use Hyperband to stop unpromising runs early
# - paper: https://www.jmlr.org/papers/volume18/16-558/16-558.pdf
# - popular to use
#
# note: Larger batch size increases GPU memory usage and doubling batch size often allows doubling learning rate, up to a point where this relationship breaks down.
wandb.agent(
    sweep_id=sweep_id,
    function=train_wandb,
    count=num_trials,
)

wandb.finish()

# Scaling Laws

In [10]:
# Results of earlier runs of this sweep:
# MSE - pretty clean scaling on model size: https://wandb.ai/bronsonschoen-personal-use/simple_regression/sweeps/c0ivdnhj?nw=nwuserbronsonschoen
#
# CrossEntropy
# - https://wandb.ai/bronsonschoen-personal-use/simple_regression/sweeps/m0p58ake?nw=nwuserbronsonschoen
#
# Sweep definition for the scaling-law experiment: only `d_model` varies, every
# other parameter is held fixed.
# NOTE: this reassigns `sweep_configuration`, replacing the hyperparameter-search
# definition from the earlier cell.
sweep_configuration = {
    "name": "scaling_laws",
    # important that this is random, so it shows everything
    # (presumably: so every d_model value keeps being sampled rather than the
    # search concentrating on the best-performing ones)
    "method": "random",
    "metric": {"goal": "minimize", "name": "test_loss"},
    "parameters": {
        # hidden width, swept over powers of two from 1 to 4096
        "d_model": {
            "values": [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
        },
        # "d_model": {"min": 1, "max": 4096},
        "n_layers": {"value": 4},
        "batch_size": {"value": 64},
        # "num_samples": { "values": [1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10]},
        "num_samples": {"value": 100000},
        "num_epochs": {"value": 1},
        # far larger than any epoch count, so `train` only evaluates after the final epoch
        "test_interval": {"value": 1e20},
        # "optimizer": {"values": ["adam", "sgd", "adamw", "rmsprop"]},
    },
}

In [11]:
import math


def train_wandb_scaling() -> None:
    """
    Sweep entry point for the scaling-law experiment.

    Starts a W&B run, reads the sweep parameters from ``wandb.config``, derives
    the learning rate from the model width, and trains.
    """
    wandb.init(project=WANDB_PROJECT_NAME)

    # `wandb.config` holds the parameters chosen for the current sweep run
    sweep_params = wandb.config
    width = sweep_params["d_model"]
    dataset_size = sweep_params["num_samples"]

    # learning rate scales at `1 / sqrt(d_model)`
    base_learning_rate = 1e-2
    inverse_sqrt_width = 1 / math.sqrt(width)
    scaled_learning_rate = base_learning_rate * inverse_sqrt_width

    # TODO(bschoen): What's the usual way to init config from here so don't copy explicitly
    # - usually will have this as completely flatten so can directly grab from
    model_cfg = SimpleNetConfig(
        d_model=width,
        n_layers=sweep_params["n_layers"],
    )

    training_cfg = TrainingConfig(
        model_cfg=model_cfg,
        num_samples=dataset_size,
        num_epochs=sweep_params["num_epochs"],
        test_interval=sweep_params["test_interval"],
        learning_rate=scaled_learning_rate,
        batch_size=sweep_params["batch_size"],
    )

    train(training_cfg)

In [12]:
# Register the scaling-law sweep with W&B and get its id
sweep_id = wandb.sweep(
    sweep=sweep_configuration,
    project=WANDB_PROJECT_NAME,
)

# Maximum number of runs this agent will execute
num_trials = 100

# TODO(bschoen): Use Hyperband to stop unpromising runs early
# - paper: https://www.jmlr.org/papers/volume18/16-558/16-558.pdf
# - popular to use
#
# note: Larger batch size increases GPU memory usage and doubling batch size often allows doubling learning rate, up to a point where this relationship breaks down.
wandb.agent(
    sweep_id=sweep_id,
    function=train_wandb_scaling,
    count=num_trials,
)

wandb.finish()

Create sweep with ID: m0p58ake
Sweep URL: https://wandb.ai/bronsonschoen-personal-use/simple_regression/sweeps/m0p58ake


[34m[1mwandb[0m: Agent Starting Run: tyc382v8 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 64
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 1.6528, Test Loss: 1.4495, Accuracy: 51.53%


VBox(children=(Label(value='0.001 MB of 0.048 MB uploaded\r'), FloatProgress(value=0.024220566788853416, max=1…

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,51.532
epoch,0.0
test_loss,1.44953
train_loss,1.65282


Training completed!


[34m[1mwandb[0m: Agent Starting Run: 4sdzqwbq with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 16
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 2.0726, Test Loss: 1.9536, Accuracy: 31.72%


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,31.719
epoch,0.0
test_loss,1.95359
train_loss,2.0726


Training completed!


[34m[1mwandb[0m: Agent Starting Run: casz1zht with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 16
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 2.0522, Test Loss: 1.8908, Accuracy: 34.17%


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,34.168
epoch,0.0
test_loss,1.89084
train_loss,2.05221


Training completed!


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: m8kw0hkk with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 32
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 1.8502, Test Loss: 1.6162, Accuracy: 46.76%


VBox(children=(Label(value='0.001 MB of 0.048 MB uploaded\r'), FloatProgress(value=0.024446035724231215, max=1…

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,46.763
epoch,0.0
test_loss,1.61621
train_loss,1.85024


Training completed!


[34m[1mwandb[0m: Agent Starting Run: fnad28g9 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 128
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011169237500063092, max=1.0…

Using device: mps
Epoch [1/1], Train Loss: 1.5403, Test Loss: 1.3884, Accuracy: 53.42%


VBox(children=(Label(value='0.001 MB of 0.049 MB uploaded\r'), FloatProgress(value=0.023994533385396328, max=1…

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,53.418
epoch,0.0
test_loss,1.38841
train_loss,1.54028


Training completed!


[34m[1mwandb[0m: Agent Starting Run: 81kghk8n with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 128
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 1.5525, Test Loss: 1.4399, Accuracy: 51.83%


VBox(children=(Label(value='0.048 MB of 0.049 MB uploaded\r'), FloatProgress(value=0.9863392770145182, max=1.0…

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,51.825
epoch,0.0
test_loss,1.43991
train_loss,1.55253


Training completed!


[34m[1mwandb[0m: Agent Starting Run: htl4w523 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 8
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 2.1625, Test Loss: 2.0967, Accuracy: 23.77%


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.21399965174995647, max=1.…

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,23.771
epoch,0.0
test_loss,2.09671
train_loss,2.16246


Training completed!


[34m[1mwandb[0m: Agent Starting Run: ee5cteho with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 1024
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 1.4957, Test Loss: 1.3817, Accuracy: 53.48%


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,53.483
epoch,0.0
test_loss,1.38168
train_loss,1.49574


Training completed!


[34m[1mwandb[0m: Agent Starting Run: pvd5nwbe with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 512
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 1.5005, Test Loss: 1.3861, Accuracy: 53.44%


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,53.438
epoch,0.0
test_loss,1.3861
train_loss,1.50054


Training completed!


[34m[1mwandb[0m: Agent Starting Run: 3fd8cv43 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 4096
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 1.5091, Test Loss: 1.4128, Accuracy: 52.73%


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,52.734
epoch,0.0
test_loss,1.41276
train_loss,1.50914


Training completed!


[34m[1mwandb[0m: Agent Starting Run: uvonzaay with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 2
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 2.3052, Test Loss: 2.3029, Accuracy: 10.14%


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,10.144
epoch,0.0
test_loss,2.30285
train_loss,2.30521


Training completed!


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: egeosc1j with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 4096
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 1.5060, Test Loss: 1.3887, Accuracy: 53.79%


VBox(children=(Label(value='0.001 MB of 0.052 MB uploaded\r'), FloatProgress(value=0.022695375978726548, max=1…

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,53.793
epoch,0.0
test_loss,1.38865
train_loss,1.50601


Training completed!


[34m[1mwandb[0m: Agent Starting Run: 6c0ljo2d with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 2
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 2.2714, Test Loss: 2.2379, Accuracy: 14.81%


VBox(children=(Label(value='0.001 MB of 0.048 MB uploaded\r'), FloatProgress(value=0.024667817430050983, max=1…

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,14.81
epoch,0.0
test_loss,2.23786
train_loss,2.27144


Training completed!


[34m[1mwandb[0m: Agent Starting Run: qg8aacq5 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 2048
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 1.5091, Test Loss: 1.3976, Accuracy: 53.33%


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,53.327
epoch,0.0
test_loss,1.39763
train_loss,1.50912


Training completed!


[34m[1mwandb[0m: Agent Starting Run: nr2v8z0k with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 16
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 2.0593, Test Loss: 1.9257, Accuracy: 32.18%


VBox(children=(Label(value='0.001 MB of 0.048 MB uploaded\r'), FloatProgress(value=0.0244513857111593, max=1.0…

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,32.178
epoch,0.0
test_loss,1.92568
train_loss,2.05926


Training completed!


[34m[1mwandb[0m: Agent Starting Run: j70pm4p7 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 128
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps
Epoch [1/1], Train Loss: 1.5571, Test Loss: 1.4040, Accuracy: 52.95%


VBox(children=(Label(value='0.001 MB of 0.049 MB uploaded\r'), FloatProgress(value=0.023924469534747906, max=1…

0,1
accuracy,▁
epoch,▁
test_loss,▁
train_loss,▁

0,1
accuracy,52.949
epoch,0.0
test_loss,1.40405
train_loss,1.5571


Training completed!


[34m[1mwandb[0m: Agent Starting Run: 07htcqeu with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	d_model: 4096
[34m[1mwandb[0m: 	n_layers: 4
[34m[1mwandb[0m: 	num_epochs: 1
[34m[1mwandb[0m: 	num_samples: 100000
[34m[1mwandb[0m: 	test_interval: 100000000000000000000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using device: mps


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [None]:
# Evaluate the model (optional)
#
# NOTE(review): this cell needs a trained `model` in the notebook namespace, but
# `train()` keeps its model in a local variable and returns None — expose the
# model (e.g. return it from `train`) before running this cell.
model.eval()

# Run on whichever device the model's parameters live on
device = next(model.parameters()).device

with torch.no_grad():

    # Generate some test data; `.to()` returns a new tensor, so keep the result
    test_x = torch.randn(100, 10).to(device)
    test_y = model(test_x)

    # `.numpy()` only works on CPU tensors, so move the predictions back first
    print("Sample predictions:", test_y[:5].cpu().numpy())