
# Zero Initialization - Bias Initialization

In [None]:
import math
import os
import random

import joblib
import numpy as np
import optuna
import torch
from torch.optim import Adam, RMSprop, SGD
from torch.utils.tensorboard import SummaryWriter

from config import ExperimentConfig
from data import get_mnist_loader, get_cifar10_loader
from models import FNN, CNN, CIFARCNN, ResNet50
from train import run_experiments, Trainer

# Scale multipliers for weight initialisation; ELU_SCALE_NORMAL is passed as
# ``scale_factor`` to ``model.initialize`` further down.
# NOTE(review): 1.615 / 1.574 look like variance gains derived for ELU under
# normal / uniform initialisation — confirm where they come from.
ELU_SCALE_NORMAL = math.sqrt(1.615)
ELU_SCALE_UNIFORM = math.sqrt(1.574)
# NOTE(review): sqrt(2) matches the usual He/Kaiming gain for ReLU — presumably the intent.
RELU_SCALE = math.sqrt(2)
# Seed handed to torch, random and numpy by the seeded cells below.
R_SEED = 1777

## 1 MNIST - FNN

### 1.1 Normal Distribution

In [None]:
# MNIST loaders (batch size 32) shared by the cells of this section.
(train_loader, val_loader), test_loader = (
    get_mnist_loader(train=True, batch_size=32),
    get_mnist_loader(train=False, batch_size=32),
)

#### 1.1.1 PyTorch Default Initialization

In [None]:
# Baseline: 784 -> 500 -> 10 FNN, no explicit initialize() call.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)

config = ExperimentConfig(
    model_name="MNIST_FNN",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

#### 1.1.2 Standard Normal Distribution

In [None]:
# 784 -> 500 -> 10 FNN initialised with mode="normal" and otherwise default arguments.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="normal")

config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

#### 1.1.3 Negative Mean Shift

In [None]:
# 784 -> 500 -> 10 FNN, mode="normal" with the mean shifted to -1.0.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="normal", mean=-1.0)

config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL_MEAN_NEG",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

#### 1.1.4 Positive Mean Shift

In [None]:
# 784 -> 500 -> 10 FNN, mode="normal" with the mean shifted to +1.0.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="normal", mean=1.0)

config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL_MEAN_POS",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

#### 1.1.5 Negative Mean Shift High

In [None]:
# Seed torch, random and numpy before building the model for this run.
torch.manual_seed(R_SEED)
random.seed(R_SEED)
np.random.seed(R_SEED)

# 784 -> 500 -> 10 FNN, mode="normal" with the mean shifted to -5.0.
model = FNN(
    [
        {"in_features": 28 * 28, "out_features": 500},
        {"in_features": 500, "out_features": 10},
    ]
)
model.initialize(mode="normal", mean=-5.0)

# The run name carries the same "MEAN_" infix as every other mean-shift run
# (e.g. MNIST_FNN_NORMAL_MEAN_POS_HIGH, MNIST_CNN_NORMAL_MEAN_NEG_HIGH) so the
# result sets line up when compared.
config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL_MEAN_NEG_HIGH",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

#### 1.1.6 Positive Mean Shift High

In [None]:
# 784 -> 500 -> 10 FNN, mode="normal" with the mean shifted to +5.0.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="normal", mean=5.0)

config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL_MEAN_POS_HIGH",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

#### 1.1.7 Negative Mean Shift Very High

In [None]:
# 784 -> 500 -> 10 FNN, mode="normal" with the mean shifted to -50.0.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="normal", mean=-50.0)

config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL_MEAN_NEG_OVER",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

#### 1.1.8 Positive Mean Shift Very High

In [None]:
# 784 -> 500 -> 10 FNN, mode="normal" with the mean shifted to +50.0.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="normal", mean=50.0)

config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL_MEAN_POS_OVER",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# 784 -> 500 -> 10 FNN, mode="normal" with softmax_init enabled.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="normal", softmax_init=True)

config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL_LAST_LAYER",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# 784 -> 500 -> 10 FNN, mode="normal" scaled by ELU_SCALE_NORMAL.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="normal", scale_factor=ELU_SCALE_NORMAL)

config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL_ELU_SCALE",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# 784 -> 500 -> 10 FNN, mode="normal" with scale_factor 0.1.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="normal", scale_factor=0.1)

config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL_0.1_STD",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# 784 -> 500 -> 10 FNN, mode="normal" with scale_factor 0.5.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="normal", scale_factor=0.5)

config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL_0.5_STD",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# 784 -> 500 -> 10 FNN, mode="normal" with scale_factor 2.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="normal", scale_factor=2)

config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL_2.0_STD",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# 784 -> 500 -> 10 FNN initialised with mode="normal_in_features".
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="normal_in_features")

config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL_IN_FEATURES",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# 784 -> 500 -> 10 FNN, mode="normal_in_features" with softmax_init enabled.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="normal_in_features", softmax_init=True)

config = ExperimentConfig(
    model_name="MNIST_FNN_NORMAL_IN_FEATURES_LAST_LAYER",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

#### 1.1.9 Finding The Point Of Break

In [None]:
# Rebuild the MNIST loaders with batch size 32 for the sweeps below.
(train_loader, val_loader), test_loader = (
    get_mnist_loader(train=True, batch_size=32),
    get_mnist_loader(train=False, batch_size=32),
)

In [None]:
# Sweep scale_factor over 10, 20, ..., 300 for the 784 -> 500 -> 10 FNN.
factor_range = range(10, 301, 10)

for factor in factor_range:
    # Re-seed every RNG so each run starts from the same random state.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    model = FNN(
        [
            {"in_features": 28 * 28, "out_features": 500},
            {"in_features": 500, "out_features": 10},
        ]
    )
    model.initialize(mode="normal", scale_factor=factor)

    writer = SummaryWriter(f"./results/MNIST_FNN_NORMAL_1_{factor}")
    try:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=30, train_summary=False)
    finally:
        # One writer per run: close it so pending events are flushed and the
        # event file is released before the next run opens its own.
        writer.close()

In [None]:
# Sweep scale_factor downwards: 1/10, 1/100, ..., 1/1_000_000 (784 -> 500 -> 10 FNN).
factors = [10, 100, 1000, 10_000, 100_000, 1_000_000]

for divisor in factors:
    # Re-seed every RNG so each run starts from the same random state.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Separate name for the reciprocal instead of overwriting the loop variable.
    factor = 1 / divisor

    model = FNN(
        [
            {"in_features": 28 * 28, "out_features": 500},
            {"in_features": 500, "out_features": 10},
        ]
    )
    model.initialize(mode="normal", scale_factor=factor)

    writer = SummaryWriter(f"./results/MNIST_FNN_NORMAL_1_DOWN_{factor}")
    try:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=30, train_summary=False)
    finally:
        # Flush and release this run's event file before the next run starts.
        writer.close()

In [None]:
# Sweep scale_factor over 10, 20, ..., 300 for the deeper 784 -> 400 -> 300 -> 150 -> 10 FNN.
factor_range = range(10, 301, 10)

for factor in factor_range:
    # Re-seed every RNG so each run starts from the same random state.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    model = FNN(
        [
            {"in_features": 28 * 28, "out_features": 400},
            {"in_features": 400, "out_features": 300},
            {"in_features": 300, "out_features": 150},
            {"in_features": 150, "out_features": 10},
        ]
    )
    model.initialize(mode="normal", scale_factor=factor)

    writer = SummaryWriter(f"./results/MNIST_FNN_NORMAL_2_{factor}")
    try:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=30, train_summary=False)
    finally:
        # Flush and release this run's event file before the next run starts.
        writer.close()

In [None]:
# Sweep scale_factor downwards (1/10 ... 1/1_000_000) for the 784 -> 400 -> 300 -> 150 -> 10 FNN.
factors = [10, 100, 1000, 10_000, 100_000, 1_000_000]

for divisor in factors:
    # Re-seed every RNG so each run starts from the same random state.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Separate name for the reciprocal instead of overwriting the loop variable.
    factor = 1 / divisor

    model = FNN(
        [
            {"in_features": 28 * 28, "out_features": 400},
            {"in_features": 400, "out_features": 300},
            {"in_features": 300, "out_features": 150},
            {"in_features": 150, "out_features": 10},
        ]
    )
    model.initialize(mode="normal", scale_factor=factor)

    writer = SummaryWriter(f"./results/MNIST_FNN_NORMAL_2_DOWN_{factor}")
    try:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=30, train_summary=False)
    finally:
        # Flush and release this run's event file before the next run starts.
        writer.close()

In [None]:
# Switch to batch size 8 for the remaining sweeps of this section.
(train_loader, val_loader), test_loader = (
    get_mnist_loader(train=True, batch_size=8),
    get_mnist_loader(train=False, batch_size=8),
)

In [None]:
# Sweep scale_factor over 10, 20, ..., 300 for the 784 -> 500 -> 10 FNN
# using whichever loaders are currently bound to train_loader / test_loader.
factor_range = range(10, 301, 10)

for factor in factor_range:
    # Re-seed every RNG so each run starts from the same random state.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    model = FNN(
        [
            {"in_features": 28 * 28, "out_features": 500},
            {"in_features": 500, "out_features": 10},
        ]
    )
    model.initialize(mode="normal", scale_factor=factor)

    writer = SummaryWriter(f"./results/MNIST_FNN_NORMAL_3_{factor}")
    try:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=30, train_summary=False)
    finally:
        # Flush and release this run's event file before the next run starts.
        writer.close()

In [None]:
# Sweep scale_factor over 120, 130, ..., 220 for the 784 -> 400 -> 300 -> 150 -> 10 FNN.
factor_range = range(120, 221, 10)

for factor in factor_range:
    # Re-seed every RNG so each run starts from the same random state.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    model = FNN(
        [
            {"in_features": 28 * 28, "out_features": 400},
            {"in_features": 400, "out_features": 300},
            {"in_features": 300, "out_features": 150},
            {"in_features": 150, "out_features": 10},
        ]
    )
    model.initialize(mode="normal", scale_factor=factor)

    writer = SummaryWriter(f"./results/MNIST_FNN_NORMAL_4_{factor}")
    try:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=30, train_summary=False)
    finally:
        # Flush and release this run's event file before the next run starts.
        writer.close()

### 1.2 Uniform Distribution

In [None]:
# MNIST loaders (batch size 32) for the uniform-distribution experiments.
(train_loader, val_loader), test_loader = (
    get_mnist_loader(train=True, batch_size=32),
    get_mnist_loader(train=False, batch_size=32),
)

#### 1.2.1 Zero To One Pre-Activations

In [None]:
# 784 -> 500 -> 10 FNN, mode="uniform" with bounds a=0, b=1.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="uniform", a=0, b=1)

config = ExperimentConfig(
    model_name="MNIST_FNN_UNIFORM_0-1",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

#### 1.2.2 Unit Variance And Zero Mean Pre-Activations

In [None]:
# 784 -> 500 -> 10 FNN, mode="uniform" on the symmetric interval [-sqrt(3), sqrt(3)].
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="uniform", a=-math.sqrt(3), b=math.sqrt(3))

config = ExperimentConfig(
    model_name="MNIST_FNN_UNIFORM_PRE_ACT",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# 784 -> 500 -> 10 FNN, mode="deterministic" with bounds a=-1.73, b=1.73.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="deterministic", a=-1.73, b=1.73)

config = ExperimentConfig(
    model_name="MNIST_FNN_UNIFORM_PRE_ACT_DET",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

#### 1.2.3 Unit Variance And Zero Mean Activations

In [None]:
# 784 -> 500 -> 10 FNN, mode="uniform" with bounds a=-3.437, b=2.222.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="uniform", a=-3.437, b=2.222)

config = ExperimentConfig(
    model_name="MNIST_FNN_UNIFORM_UNIT_VAR_ZERO_MEAN",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# 784 -> 500 -> 10 FNN, mode="deterministic" with bounds a=-3.437, b=2.222.
fnn_layers = [
    {"in_features": 28 * 28, "out_features": 500},
    {"in_features": 500, "out_features": 10},
]
model = FNN(fnn_layers)
model.initialize(mode="deterministic", a=-3.437, b=2.222)

config = ExperimentConfig(
    model_name="MNIST_FNN_UNIFORM_UNIT_VAR_ZERO_MEAN_DET",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

#### 1.2.5 Comparison

### 1.3 Compare Our Approach To Others

In [None]:
def objective(trial):
    """Train the FNN without an explicit initialize() call for one Optuna trial.

    Args:
        trial: Optuna trial used to sample learning rate, optimizer and batch
            size; it is also forwarded to ``Trainer.train``.

    Returns:
        Whatever ``Trainer.train`` returns; the study maximises this value.
    """
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna's categorical choices must be None/bool/int/float/str, so the
    # optimizer is sampled by name and resolved to its class afterwards.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": trial.suggest_categorical("optimizer", list(optimizers)),
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): trials are scored on test_loader while val_loader is unused;
    # selecting hyperparameters on the test split leaks it — confirm this is intended.
    train_loader, val_loader = get_mnist_loader(train=True, batch_size=params["batch_size"])
    test_loader = get_mnist_loader(train=False, batch_size=params["batch_size"])

    model = FNN(
        [
            {"in_features": 28 * 28, "out_features": 500},
            {"in_features": 500, "out_features": 10},
        ]
    )

    # Each trial logs to its own sub-directory so runs do not get mixed together.
    writer = SummaryWriter(f"./results/MNIST_FNN_TRIAL/trial_{trial.number}")
    try:
        trainer = Trainer(
            model=model,
            lr=params["learning_rate"],
            optimizer=optimizers[params["optimizer"]],
            writer=writer,
        )
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)
    finally:
        # Also runs when the trial ends with an exception.
        writer.close()

    return accuracy


# Create the output directory up front so a missing folder cannot discard a finished study.
study_path = os.path.join("results", "optuna", "mnist_fnn.pkl")
os.makedirs(os.path.dirname(study_path), exist_ok=True)

# Seed the sampler as well, so the sequence of suggested parameters is repeatable.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=R_SEED))
study.optimize(objective, n_trials=28)

joblib.dump(study, study_path)

In [None]:
# Reload the pickled study. The former second argument 'r' was not a file mode:
# it landed in joblib.load's mmap_mode parameter, which is not needed here.
study = joblib.load(os.path.join("results", "optuna", "mnist_fnn.pkl"))

# Report the best trial's sampled parameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

optuna.visualization.plot_parallel_coordinate(study)

In [None]:
def objective(trial):
    """Train the FNN initialised with mode "normal" for one Optuna trial.

    Args:
        trial: Optuna trial used to sample learning rate, optimizer and batch
            size; it is also forwarded to ``Trainer.train``.

    Returns:
        Whatever ``Trainer.train`` returns; the study maximises this value.
    """
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna's categorical choices must be None/bool/int/float/str, so the
    # optimizer is sampled by name and resolved to its class afterwards.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": trial.suggest_categorical("optimizer", list(optimizers)),
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): trials are scored on test_loader while val_loader is unused;
    # selecting hyperparameters on the test split leaks it — confirm this is intended.
    train_loader, val_loader = get_mnist_loader(train=True, batch_size=params["batch_size"])
    test_loader = get_mnist_loader(train=False, batch_size=params["batch_size"])

    model = FNN(
        [
            {"in_features": 28 * 28, "out_features": 500},
            {"in_features": 500, "out_features": 10},
        ]
    )
    model.initialize("normal")

    # Each trial logs to its own sub-directory so runs do not get mixed together.
    writer = SummaryWriter(f"./results/MNIST_FNN_TRIAL_NORMAL/trial_{trial.number}")
    try:
        trainer = Trainer(
            model=model,
            lr=params["learning_rate"],
            optimizer=optimizers[params["optimizer"]],
            writer=writer,
        )
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)
    finally:
        # Also runs when the trial ends with an exception.
        writer.close()

    return accuracy


# Create the output directory up front so a missing folder cannot discard a finished study.
study_path = os.path.join("results", "optuna", "mnist_fnn_normal.pkl")
os.makedirs(os.path.dirname(study_path), exist_ok=True)

# Seed the sampler as well, so the sequence of suggested parameters is repeatable.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=R_SEED))
study.optimize(objective, n_trials=28)

joblib.dump(study, study_path)

In [None]:
# Reload the pickled study. The former second argument 'r' was not a file mode:
# it landed in joblib.load's mmap_mode parameter, which is not needed here.
study = joblib.load(os.path.join("results", "optuna", "mnist_fnn_normal.pkl"))

# Report the best trial's sampled parameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

optuna.visualization.plot_parallel_coordinate(study)

In [None]:
def objective(trial):
    """Train the FNN initialised with "normal_in_features" + softmax_init for one trial.

    Args:
        trial: Optuna trial used to sample learning rate, optimizer and batch
            size; it is also forwarded to ``Trainer.train``.

    Returns:
        Whatever ``Trainer.train`` returns; the study maximises this value.
    """
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna's categorical choices must be None/bool/int/float/str, so the
    # optimizer is sampled by name and resolved to its class afterwards.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": trial.suggest_categorical("optimizer", list(optimizers)),
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): trials are scored on test_loader while val_loader is unused;
    # selecting hyperparameters on the test split leaks it — confirm this is intended.
    train_loader, val_loader = get_mnist_loader(train=True, batch_size=params["batch_size"])
    test_loader = get_mnist_loader(train=False, batch_size=params["batch_size"])

    model = FNN(
        [
            {"in_features": 28 * 28, "out_features": 500},
            {"in_features": 500, "out_features": 10},
        ]
    )
    model.initialize(mode="normal_in_features", softmax_init=True)

    # Dedicated log directory: this study previously wrote into the directory
    # of the plain "normal" study. Each trial gets its own sub-directory.
    writer = SummaryWriter(f"./results/MNIST_FNN_TRIAL_NORMAL_IN_LAST/trial_{trial.number}")
    try:
        trainer = Trainer(
            model=model,
            lr=params["learning_rate"],
            optimizer=optimizers[params["optimizer"]],
            writer=writer,
        )
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)
    finally:
        # Also runs when the trial ends with an exception.
        writer.close()

    return accuracy


# Create the output directory up front so a missing folder cannot discard a finished study.
study_path = os.path.join("results", "optuna", "mnist_fnn_normal_in_last.pkl")
os.makedirs(os.path.dirname(study_path), exist_ok=True)

# Seed the sampler as well, so the sequence of suggested parameters is repeatable.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=R_SEED))
study.optimize(objective, n_trials=28)

joblib.dump(study, study_path)

In [None]:
# Reload the pickled study. The former second argument 'r' was not a file mode:
# it landed in joblib.load's mmap_mode parameter, which is not needed here.
study = joblib.load(os.path.join("results", "optuna", "mnist_fnn_normal_in_last.pkl"))

# Report the best trial's sampled parameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

optuna.visualization.plot_parallel_coordinate(study)

In [None]:
def objective(trial):
    """Train the FNN initialised with mode "uniform" on [0, 1] for one Optuna trial.

    Args:
        trial: Optuna trial used to sample learning rate, optimizer and batch
            size; it is also forwarded to ``Trainer.train``.

    Returns:
        Whatever ``Trainer.train`` returns; the study maximises this value.
    """
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna's categorical choices must be None/bool/int/float/str, so the
    # optimizer is sampled by name and resolved to its class afterwards.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": trial.suggest_categorical("optimizer", list(optimizers)),
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): trials are scored on test_loader while val_loader is unused;
    # selecting hyperparameters on the test split leaks it — confirm this is intended.
    train_loader, val_loader = get_mnist_loader(train=True, batch_size=params["batch_size"])
    test_loader = get_mnist_loader(train=False, batch_size=params["batch_size"])

    model = FNN(
        [
            {"in_features": 28 * 28, "out_features": 500},
            {"in_features": 500, "out_features": 10},
        ]
    )
    model.initialize("uniform", a=0, b=1)

    # Each trial logs to its own sub-directory so runs do not get mixed together.
    writer = SummaryWriter(f"./results/MNIST_FNN_TRIAL_UNIFORM/trial_{trial.number}")
    try:
        trainer = Trainer(
            model=model,
            lr=params["learning_rate"],
            optimizer=optimizers[params["optimizer"]],
            writer=writer,
        )
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)
    finally:
        # Also runs when the trial ends with an exception.
        writer.close()

    return accuracy


# Create the output directory up front so a missing folder cannot discard a finished study.
study_path = os.path.join("results", "optuna", "mnist_fnn_uniform.pkl")
os.makedirs(os.path.dirname(study_path), exist_ok=True)

# Seed the sampler as well, so the sequence of suggested parameters is repeatable.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=R_SEED))
study.optimize(objective, n_trials=28)

joblib.dump(study, study_path)

In [None]:
# Reload the pickled study. The former second argument 'r' was not a file mode:
# it landed in joblib.load's mmap_mode parameter, which is not needed here.
study = joblib.load(os.path.join("results", "optuna", "mnist_fnn_uniform.pkl"))

# Report the best trial's sampled parameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

optuna.visualization.plot_parallel_coordinate(study)

In [None]:
def objective(trial):
    """Train the FNN initialised with mode "uniform" on [-1.29, 2.5] for one Optuna trial.

    Args:
        trial: Optuna trial used to sample learning rate, optimizer and batch
            size; it is also forwarded to ``Trainer.train``.

    Returns:
        Whatever ``Trainer.train`` returns; the study maximises this value.
    """
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna's categorical choices must be None/bool/int/float/str, so the
    # optimizer is sampled by name and resolved to its class afterwards.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": trial.suggest_categorical("optimizer", list(optimizers)),
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): trials are scored on test_loader while val_loader is unused;
    # selecting hyperparameters on the test split leaks it — confirm this is intended.
    train_loader, val_loader = get_mnist_loader(train=True, batch_size=params["batch_size"])
    test_loader = get_mnist_loader(train=False, batch_size=params["batch_size"])

    model = FNN(
        [
            {"in_features": 28 * 28, "out_features": 500},
            {"in_features": 500, "out_features": 10},
        ]
    )
    # NOTE(review): these bounds differ from the a=-3.437, b=2.222 used by the
    # "unit variance and zero mean" runs in section 1.2.3 — confirm which pair is intended.
    model.initialize(mode="uniform", a=-1.29, b=2.5)

    # Dedicated log directory: this study previously wrote into the directory
    # of the uniform [0, 1] study. Each trial gets its own sub-directory.
    writer = SummaryWriter(f"./results/MNIST_FNN_TRIAL_UNIFORM_UNIT_VAR/trial_{trial.number}")
    try:
        trainer = Trainer(
            model=model,
            lr=params["learning_rate"],
            optimizer=optimizers[params["optimizer"]],
            writer=writer,
        )
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)
    finally:
        # Also runs when the trial ends with an exception.
        writer.close()

    return accuracy


# Create the output directory up front so a missing folder cannot discard a finished study.
study_path = os.path.join("results", "optuna", "mnist_fnn_uniform_unit_var.pkl")
os.makedirs(os.path.dirname(study_path), exist_ok=True)

# Seed the sampler as well, so the sequence of suggested parameters is repeatable.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=R_SEED))
study.optimize(objective, n_trials=28)

joblib.dump(study, study_path)

In [None]:
# Reload the pickled study. The former second argument 'r' was not a file mode:
# it landed in joblib.load's mmap_mode parameter, which is not needed here.
study = joblib.load(os.path.join("results", "optuna", "mnist_fnn_uniform_unit_var.pkl"))

# Report the best trial's sampled parameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

optuna.visualization.plot_parallel_coordinate(study)

## 2 MNIST - CNN

### 2.1 Normal Distribution

In [None]:
# MNIST loaders (batch size 32, flatten=False) for the CNN experiments.
(train_loader, val_loader), test_loader = (
    get_mnist_loader(train=True, batch_size=32, flatten=False),
    get_mnist_loader(train=False, batch_size=32, flatten=False),
)

In [None]:
# Baseline CNN (two 5x5 conv layers, one dense layer), no explicit initialize() call.
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)

config = ExperimentConfig(
    model_name="MNIST_CNN",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer) initialised with "normal".
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)

model.initialize("normal")

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer), "normal" with the mean shifted to -1.0.
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)

model.initialize("normal", mean=-1.0)

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL_MEAN_NEG",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer), "normal" with the mean shifted to +1.0.
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)

model.initialize("normal", mean=1.0)

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL_MEAN_POS",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer), "normal" with the mean shifted to -5.0.
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)

model.initialize("normal", mean=-5.0)

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL_MEAN_NEG_HIGH",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer), "normal" with the mean shifted to +5.0.
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)

model.initialize("normal", mean=5.0)

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL_MEAN_POS_HIGH",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer), "normal" with the mean shifted to -50.0.
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)

model.initialize("normal", mean=-50.0)

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL_MEAN_NEG_OVER",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer), "normal" with the mean shifted to +50.0.
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)

model.initialize("normal", mean=50.0)

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL_MEAN_POS_OVER",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer) initialised with "normal_in_features".
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)

model.initialize("normal_in_features")

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL_IN_FEATURES",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer), "normal" scaled by ELU_SCALE_NORMAL.
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)

model.initialize("normal", scale_factor=ELU_SCALE_NORMAL)

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL_ELU_SCALE",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer), "normal" with softmax_init enabled.
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)

model.initialize("normal", softmax_init=True)

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL_LAST_LAYER",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer), "normal_in_features" with softmax_init enabled.
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)

model.initialize("normal_in_features", softmax_init=True)

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL_IN_FEATURES_LAST_LAYER",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer), "normal" with scale_factor 0.1.
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)
model.initialize("normal", scale_factor=0.1)

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL_0.1_STD",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer), "normal" with scale_factor 0.5.
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)
model.initialize("normal", scale_factor=0.5)

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL_0.5_STD",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

In [None]:
# CNN (two 5x5 conv layers, one dense layer), "normal" with scale_factor 2.0.
conv_layers = [
    {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
    {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
]
dense_layers = [{"in_features": 32 * 28 * 28, "out_features": 10}]
model = CNN(conv_layers, dense_layers)
model.initialize("normal", scale_factor=2.0)

config = ExperimentConfig(
    model_name="MNIST_CNN_NORMAL_2.0_STD",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=20,
)

run_experiments(model, config)

#### 2.1.1 Finding The Point Of Break

In [None]:
# Sweep scale_factor over 10, 12, ..., 30 for the 1 -> 16 -> 32 channel CNN.
factor_range = range(10, 31, 2)

for factor in factor_range:
    # Re-seed every RNG so each run starts from the same random state.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    model = CNN(
        [
            {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
            {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
        ],
        [
            {"in_features": 32 * 28 * 28, "out_features": 10},
        ],
    )
    model.initialize(mode="normal", scale_factor=factor)

    writer = SummaryWriter(f"./results/MNIST_CNN_NORMAL_1_{factor}")
    try:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, train_summary=False)
    finally:
        # One writer per run: close it so pending events are flushed and the
        # event file is released before the next run opens its own.
        writer.close()

In [None]:
# Sweep scale_factor over 10, 12, ..., 30 for the narrower 1 -> 8 -> 16 channel CNN.
factor_range = range(10, 31, 2)

for factor in factor_range:
    # Re-seed every RNG so each run starts from the same random state.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    model = CNN(
        [
            {"in_channels": 1, "out_channels": 8, "kernel_size": 5, "stride": 1, "padding": 2},
            {"in_channels": 8, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
        ],
        [
            {"in_features": 16 * 28 * 28, "out_features": 10},
        ],
    )
    model.initialize(mode="normal", scale_factor=factor)

    writer = SummaryWriter(f"./results/MNIST_CNN_NORMAL_2_{factor}")
    try:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, train_summary=False)
    finally:
        # Flush and release this run's event file before the next run starts.
        writer.close()

In [None]:
# Sweep scale_factor over 10, 12, ..., 30 for the three-conv-layer CNN (1 -> 16 -> 16 -> 32).
factor_range = range(10, 31, 2)

for factor in factor_range:
    # Re-seed every RNG so each run starts from the same random state.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    model = CNN(
        [
            {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
            {"in_channels": 16, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
            {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
        ],
        [
            {"in_features": 32 * 28 * 28, "out_features": 10},
        ],
    )
    model.initialize(mode="normal", scale_factor=factor)

    writer = SummaryWriter(f"./results/MNIST_CNN_NORMAL_3_{factor}")
    try:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, train_summary=False)
    finally:
        # Flush and release this run's event file before the next run starts.
        writer.close()

In [None]:
# Same scale-factor sweep (10, 12, ..., 30) on the 1→16→32 CNN with two linear specs
# (32*28*28→100→10) instead of one.
factor_range = range(10, 31, 2)

for factor in factor_range:
    # Reset all three RNGs to the same seed before each run.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    model = CNN(
        [
            {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
            {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
        ],
        [
            {"in_features": 32 * 28 * 28, "out_features": 100},
            {"in_features": 100, "out_features": 10},
        ],
    )
    model.initialize(mode="normal", scale_factor=factor)

    # Close the writer when the run ends so the sweep does not accumulate open event files.
    with SummaryWriter(f"./results/MNIST_CNN_NORMAL_4_{factor}") as writer:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, train_summary=False)

In [None]:
# Downward sweep: the "normal" init is scaled by 1/10, 1/100, ..., 1/1_000_000 on the
# 1→16→32 CNN with one 32*28*28→10 linear spec.
factors = [10, 100, 1000, 10_000, 100_000, 1_000_000]

for factor in factors:
    # Reset all three RNGs to the same seed before each run.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Keep the divisor and the applied scale in separate names instead of rebinding the loop variable.
    scale = 1 / factor

    model = CNN(
        [
            {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
            {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
        ],
        [
            {"in_features": 32 * 28 * 28, "out_features": 10},
        ],
    )
    model.initialize(mode="normal", scale_factor=scale)

    # Close the writer when the run ends so the sweep does not accumulate open event files.
    with SummaryWriter(f"./results/MNIST_CNN_NORMAL_DOWN_{scale}") as writer:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, train_summary=False)

### 2.2 Uniform Distribution

In [None]:
# MNIST loaders for the uniform-distribution CNN runs: batches of 32, flatten=False.
(train_loader, val_loader), test_loader = (
    get_mnist_loader(train=True, batch_size=32, flatten=False),
    get_mnist_loader(train=False, batch_size=32, flatten=False),
)

#### 2.2.1 Zero To One Pre-Activations

In [None]:
# Experiment "MNIST_CNN_UNIFORM_0-1": conv specs 1→16→32 (5x5, stride 1, padding 2) plus one
# 32*28*28→10 linear spec, initialized in "uniform" mode with a=0, b=1; 20 epochs.
model = CNN(
    [
        {"in_channels": c_in, "out_channels": c_out, "kernel_size": 5, "stride": 1, "padding": 2}
        for c_in, c_out in ((1, 16), (16, 32))
    ],
    [{"in_features": 32 * 28 * 28, "out_features": 10}],
)
model.initialize("uniform", a=0, b=1)

config = ExperimentConfig(
    model_name="MNIST_CNN_UNIFORM_0-1", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=20,
)
run_experiments(model, config)

#### 2.2.2 Unit Variance And Zero Mean Pre-Activations

In [None]:
# Experiment "MNIST_CNN_UNIFORM_PRE_ACT": same 1→16→32 CNN, "uniform" mode on
# [-sqrt(3), sqrt(3)]; 20 epochs.
model = CNN(
    [
        {"in_channels": c_in, "out_channels": c_out, "kernel_size": 5, "stride": 1, "padding": 2}
        for c_in, c_out in ((1, 16), (16, 32))
    ],
    [{"in_features": 32 * 28 * 28, "out_features": 10}],
)
model.initialize(mode="uniform", a=-math.sqrt(3), b=math.sqrt(3))

config = ExperimentConfig(
    model_name="MNIST_CNN_UNIFORM_PRE_ACT", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=20,
)
run_experiments(model, config)

In [None]:
# Experiment "MNIST_CNN_UNIFORM_PRE_ACT_DET": same 1→16→32 CNN, "deterministic" mode with
# the bounds -sqrt(3) and sqrt(3); 20 epochs.
model = CNN(
    [
        {"in_channels": c_in, "out_channels": c_out, "kernel_size": 5, "stride": 1, "padding": 2}
        for c_in, c_out in ((1, 16), (16, 32))
    ],
    [{"in_features": 32 * 28 * 28, "out_features": 10}],
)
model.initialize(mode="deterministic", a=-math.sqrt(3), b=math.sqrt(3))

config = ExperimentConfig(
    model_name="MNIST_CNN_UNIFORM_PRE_ACT_DET", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=20,
)
run_experiments(model, config)

#### 2.2.3 Unit Variance And Zero Mean Activations

In [None]:
# Experiment "MNIST_CNN_UNIFORM_UNIT_VAR_ZERO_MEAN": same 1→16→32 CNN, "uniform" mode with
# a=-3.437, b=2.222; 20 epochs.
model = CNN(
    [
        {"in_channels": c_in, "out_channels": c_out, "kernel_size": 5, "stride": 1, "padding": 2}
        for c_in, c_out in ((1, 16), (16, 32))
    ],
    [{"in_features": 32 * 28 * 28, "out_features": 10}],
)
model.initialize(mode="uniform", a=-3.437, b=2.222)

config = ExperimentConfig(
    model_name="MNIST_CNN_UNIFORM_UNIT_VAR_ZERO_MEAN", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=20,
)
run_experiments(model, config)

In [None]:
# Experiment "MNIST_CNN_UNIFORM_UNIT_VAR_ZERO_MEAN_DET": same 1→16→32 CNN, "deterministic"
# mode with a=-3.437, b=2.222; 20 epochs.
model = CNN(
    [
        {"in_channels": c_in, "out_channels": c_out, "kernel_size": 5, "stride": 1, "padding": 2}
        for c_in, c_out in ((1, 16), (16, 32))
    ],
    [{"in_features": 32 * 28 * 28, "out_features": 10}],
)
model.initialize(mode="deterministic", a=-3.437, b=2.222)

config = ExperimentConfig(
    model_name="MNIST_CNN_UNIFORM_UNIT_VAR_ZERO_MEAN_DET", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=20,
)
run_experiments(model, config)

#### 2.2.5 Comparison

In [None]:
# TODO do several runs

### 2.3 Compare Our Approach To Others

In [None]:
def objective(trial):
    """Run one Optuna trial of the MNIST CNN without an explicit ``initialize`` call.

    Args:
        trial: Optuna trial used to sample the learning rate, optimizer and batch size;
            it is also handed to ``Trainer.train``.

    Returns:
        The value returned by ``Trainer.train`` after 15 epochs, which the study maximizes.
    """
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna only supports None/bool/int/float/str as categorical choices, so the optimizer
    # is sampled by name and mapped back to its class.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": optimizers[trial.suggest_categorical("optimizer", list(optimizers))],
        # `step` goes by keyword: newer Optuna releases warn on the positional form.
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): the objective is evaluated on test_loader while val_loader stays unused;
    # confirm that tuning against the test split is intended.
    train_loader, val_loader = get_mnist_loader(train=True, batch_size=params["batch_size"], flatten=False)
    test_loader = get_mnist_loader(train=False, batch_size=params["batch_size"], flatten=False)

    model = CNN(
        [
            {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
            {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
        ],
        [
            {"in_features": 32 * 28 * 28, "out_features": 10},
        ],
    )

    # One log directory per trial keeps the trials' curves apart; the context manager
    # closes the event file when the trial ends.
    with SummaryWriter(f"./results/MNIST_CNN_TRIAL/trial_{trial.number}") as writer:
        trainer = Trainer(model=model, lr=params["learning_rate"], optimizer=params["optimizer"], writer=writer)
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)

    return accuracy


study_path = os.path.join("results", "optuna", "mnist_cnn.pkl")
# Create the output directory up front so a missing folder cannot discard a finished study.
os.makedirs(os.path.dirname(study_path), exist_ok=True)

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=28)

joblib.dump(study, study_path)

In [None]:
# Reload the pickled study (joblib unpickles, so only load files this notebook wrote).
# joblib.load's second positional argument is mmap_mode, not a file mode, so none is passed.
study = joblib.load(os.path.join("results", "optuna", "mnist_cnn.pkl"))

# Print the best trial's sampled hyperparameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

optuna.visualization.plot_parallel_coordinate(study)

In [None]:
def objective(trial):
    """Run one Optuna trial of the MNIST CNN initialized with ``initialize("normal")``.

    Args:
        trial: Optuna trial used to sample the learning rate, optimizer and batch size;
            it is also handed to ``Trainer.train``.

    Returns:
        The value returned by ``Trainer.train`` after 15 epochs, which the study maximizes.
    """
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna only supports None/bool/int/float/str as categorical choices, so the optimizer
    # is sampled by name and mapped back to its class.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": optimizers[trial.suggest_categorical("optimizer", list(optimizers))],
        # `step` goes by keyword: newer Optuna releases warn on the positional form.
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): the objective is evaluated on test_loader while val_loader stays unused;
    # confirm that tuning against the test split is intended.
    train_loader, val_loader = get_mnist_loader(train=True, batch_size=params["batch_size"], flatten=False)
    test_loader = get_mnist_loader(train=False, batch_size=params["batch_size"], flatten=False)

    model = CNN(
        [
            {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
            {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
        ],
        [
            {"in_features": 32 * 28 * 28, "out_features": 10},
        ],
    )
    model.initialize("normal")

    # One log directory per trial keeps the trials' curves apart; the context manager
    # closes the event file when the trial ends.
    with SummaryWriter(f"./results/MNIST_CNN_TRIAL_NORMAL/trial_{trial.number}") as writer:
        trainer = Trainer(model=model, lr=params["learning_rate"], optimizer=params["optimizer"], writer=writer)
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)

    return accuracy


study_path = os.path.join("results", "optuna", "mnist_cnn_normal.pkl")
# Create the output directory up front so a missing folder cannot discard a finished study.
os.makedirs(os.path.dirname(study_path), exist_ok=True)

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=28)

joblib.dump(study, study_path)

In [None]:
# Reload the pickled study (joblib unpickles, so only load files this notebook wrote).
# joblib.load's second positional argument is mmap_mode, not a file mode, so none is passed.
study = joblib.load(os.path.join("results", "optuna", "mnist_cnn_normal.pkl"))

# Print the best trial's sampled hyperparameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

optuna.visualization.plot_parallel_coordinate(study)

In [None]:
def objective(trial):
    """Run one Optuna trial of the MNIST CNN with ``initialize("normal", scale_factor=ELU_SCALE_NORMAL)``.

    Args:
        trial: Optuna trial used to sample the learning rate, optimizer and batch size;
            it is also handed to ``Trainer.train``.

    Returns:
        The value returned by ``Trainer.train`` after 15 epochs, which the study maximizes.
    """
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna only supports None/bool/int/float/str as categorical choices, so the optimizer
    # is sampled by name and mapped back to its class.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": optimizers[trial.suggest_categorical("optimizer", list(optimizers))],
        # `step` goes by keyword: newer Optuna releases warn on the positional form.
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): the objective is evaluated on test_loader while val_loader stays unused;
    # confirm that tuning against the test split is intended.
    train_loader, val_loader = get_mnist_loader(train=True, batch_size=params["batch_size"], flatten=False)
    test_loader = get_mnist_loader(train=False, batch_size=params["batch_size"], flatten=False)

    model = CNN(
        [
            {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
            {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
        ],
        [
            {"in_features": 32 * 28 * 28, "out_features": 10},
        ],
    )
    model.initialize("normal", scale_factor=ELU_SCALE_NORMAL)

    # This study gets its own directory (it used to share "MNIST_CNN_TRIAL_NORMAL" with the
    # unscaled normal study), with one sub-directory per trial; the writer is closed per trial.
    with SummaryWriter(f"./results/MNIST_CNN_TRIAL_NORMAL_ELU/trial_{trial.number}") as writer:
        trainer = Trainer(model=model, lr=params["learning_rate"], optimizer=params["optimizer"], writer=writer)
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)

    return accuracy


study_path = os.path.join("results", "optuna", "mnist_cnn_normal_elu.pkl")
# Create the output directory up front so a missing folder cannot discard a finished study.
os.makedirs(os.path.dirname(study_path), exist_ok=True)

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=28)

joblib.dump(study, study_path)
# joblib.load's second positional argument is mmap_mode, not a file mode, so none is passed.
study = joblib.load(study_path)

# Print the best trial's sampled hyperparameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

optuna.visualization.plot_parallel_coordinate(study)

In [None]:
def objective(trial):
    """Run one Optuna trial of the MNIST CNN initialized with ``initialize("uniform", a=0, b=1)``.

    Args:
        trial: Optuna trial used to sample the learning rate, optimizer and batch size;
            it is also handed to ``Trainer.train``.

    Returns:
        The value returned by ``Trainer.train`` after 15 epochs, which the study maximizes.
    """
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna only supports None/bool/int/float/str as categorical choices, so the optimizer
    # is sampled by name and mapped back to its class.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": optimizers[trial.suggest_categorical("optimizer", list(optimizers))],
        # `step` goes by keyword: newer Optuna releases warn on the positional form.
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): the objective is evaluated on test_loader while val_loader stays unused;
    # confirm that tuning against the test split is intended.
    train_loader, val_loader = get_mnist_loader(train=True, batch_size=params["batch_size"], flatten=False)
    test_loader = get_mnist_loader(train=False, batch_size=params["batch_size"], flatten=False)

    model = CNN(
        [
            {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
            {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
        ],
        [
            {"in_features": 32 * 28 * 28, "out_features": 10},
        ],
    )
    model.initialize("uniform", a=0, b=1)

    # One log directory per trial keeps the trials' curves apart; the context manager
    # closes the event file when the trial ends.
    with SummaryWriter(f"./results/MNIST_CNN_TRIAL_UNIFORM/trial_{trial.number}") as writer:
        trainer = Trainer(model=model, lr=params["learning_rate"], optimizer=params["optimizer"], writer=writer)
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)

    return accuracy


study_path = os.path.join("results", "optuna", "mnist_cnn_uniform.pkl")
# Create the output directory up front so a missing folder cannot discard a finished study.
os.makedirs(os.path.dirname(study_path), exist_ok=True)

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=28)

joblib.dump(study, study_path)

In [None]:
# Reload the pickled study (joblib unpickles, so only load files this notebook wrote).
# joblib.load's second positional argument is mmap_mode, not a file mode, so none is passed.
study = joblib.load(os.path.join("results", "optuna", "mnist_cnn_uniform.pkl"))

# Print the best trial's sampled hyperparameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

optuna.visualization.plot_parallel_coordinate(study)

In [None]:
def objective(trial):
    """Run one Optuna trial of the MNIST CNN with ``initialize(mode="uniform", a=-1.29, b=2.5)``.

    Args:
        trial: Optuna trial used to sample the learning rate, optimizer and batch size;
            it is also handed to ``Trainer.train``.

    Returns:
        The value returned by ``Trainer.train`` after 15 epochs, which the study maximizes.
    """
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna only supports None/bool/int/float/str as categorical choices, so the optimizer
    # is sampled by name and mapped back to its class.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": optimizers[trial.suggest_categorical("optimizer", list(optimizers))],
        # `step` goes by keyword: newer Optuna releases warn on the positional form.
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): the objective is evaluated on test_loader while val_loader stays unused;
    # confirm that tuning against the test split is intended.
    train_loader, val_loader = get_mnist_loader(train=True, batch_size=params["batch_size"], flatten=False)
    test_loader = get_mnist_loader(train=False, batch_size=params["batch_size"], flatten=False)

    model = CNN(
        [
            {"in_channels": 1, "out_channels": 16, "kernel_size": 5, "stride": 1, "padding": 2},
            {"in_channels": 16, "out_channels": 32, "kernel_size": 5, "stride": 1, "padding": 2},
        ],
        [
            {"in_features": 32 * 28 * 28, "out_features": 10},
        ],
    )
    # NOTE(review): the unit-variance uniform experiments in section 2.2.3 use a=-3.437, b=2.222;
    # confirm that the different bounds here are intentional.
    model.initialize(mode="uniform", a=-1.29, b=2.5)

    # This study gets its own directory (it used to share "MNIST_CNN_TRIAL_UNIFORM" with the
    # a=0, b=1 study), with one sub-directory per trial; the writer is closed per trial.
    with SummaryWriter(f"./results/MNIST_CNN_TRIAL_UNIFORM_UNIT_VAR/trial_{trial.number}") as writer:
        trainer = Trainer(model=model, lr=params["learning_rate"], optimizer=params["optimizer"], writer=writer)
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)

    return accuracy


study_path = os.path.join("results", "optuna", "mnist_cnn_uniform_unit_var.pkl")
# Create the output directory up front so a missing folder cannot discard a finished study.
os.makedirs(os.path.dirname(study_path), exist_ok=True)

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=28)

joblib.dump(study, study_path)

In [None]:
# Reload the pickled study (joblib unpickles, so only load files this notebook wrote).
# joblib.load's second positional argument is mmap_mode, not a file mode, so none is passed.
study = joblib.load(os.path.join("results", "optuna", "mnist_cnn_uniform_unit_var.pkl"))

# Print the best trial's sampled hyperparameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

optuna.visualization.plot_parallel_coordinate(study)

## 3 CIFAR-10 - FNN

### 3.1 Normal Distribution

In [None]:
# CIFAR-10 loaders for the FNN runs of section 3.1: batches of 32.
(train_loader, val_loader), test_loader = (
    get_cifar10_loader(train=True, batch_size=32),
    get_cifar10_loader(train=False, batch_size=32),
)

In [None]:
# Experiment "CIFAR-10_FNN": FNN built from the layer specs 3072→512→256→128→10 with no
# explicit initialize call; 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL": 3072→512→256→128→10 FNN, initialize("normal"); 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize("normal")

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL_LAST_LAYER": 3072→512→256→128→10 FNN,
# initialize("normal", softmax_init=True); 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize("normal", softmax_init=True)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL_LAST_LAYER", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL_ELU_SCALE": 3072→512→256→128→10 FNN, "normal" mode scaled
# by ELU_SCALE_NORMAL; 30 epochs.
model = FNN(
    [
        {"in_features": 32 * 32 * 3, "out_features": 512},
        {"in_features": 512, "out_features": 256},
        {"in_features": 256, "out_features": 128},
        {"in_features": 128, "out_features": 10},
    ]
)
# Pass the scale by keyword, as every other call in this notebook does, so it cannot land on
# a different positional parameter of initialize().
model.initialize("normal", scale_factor=ELU_SCALE_NORMAL)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL_ELU_SCALE",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=30
)

run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL_IN_FEATURES": 3072→512→256→128→10 FNN,
# initialize("normal_in_features"); 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize("normal_in_features")

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL_IN_FEATURES", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL_IN_FEATURES_LAST_LAYER": 3072→512→256→128→10 FNN,
# initialize("normal_in_features", softmax_init=True); 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize("normal_in_features", softmax_init=True)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL_IN_FEATURES_LAST_LAYER", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL_MEAN_NEG": 3072→512→256→128→10 FNN,
# initialize("normal", mean=-1.0); 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize("normal", mean=-1.0)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL_MEAN_NEG", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL_MEAN_POS": 3072→512→256→128→10 FNN,
# initialize("normal", mean=1.0); 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize("normal", mean=1.0)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL_MEAN_POS", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL_MEAN_NEG_HIGH": 3072→512→256→128→10 FNN,
# initialize("normal", mean=-5.0); 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize("normal", mean=-5.0)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL_MEAN_NEG_HIGH", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL_MEAN_POS_HIGH": 3072→512→256→128→10 FNN,
# initialize("normal", mean=5.0); 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize("normal", mean=5.0)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL_MEAN_POS_HIGH", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL_MEAN_NEG_OVER": 3072→512→256→128→10 FNN,
# initialize("normal", mean=-50.0); 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize("normal", mean=-50.0)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL_MEAN_NEG_OVER", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL_MEAN_POS_OVER": 3072→512→256→128→10 FNN,
# initialize("normal", mean=50.0); 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize("normal", mean=50.0)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL_MEAN_POS_OVER", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL_0.1_STD": 3072→512→256→128→10 FNN,
# initialize("normal", scale_factor=0.1); 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize("normal", scale_factor=0.1)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL_0.1_STD", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL_0.5_STD": 3072→512→256→128→10 FNN,
# initialize("normal", scale_factor=0.5); 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize("normal", scale_factor=0.5)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL_0.5_STD", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_NORMAL_2.0_STD": 3072→512→256→128→10 FNN,
# initialize("normal", scale_factor=2.0); 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize("normal", scale_factor=2.0)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_NORMAL_2.0_STD", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

#### 3.1.1 Finding The Point Of Break

In [None]:
# Sweep the "normal" init scale factor over 10, 20, ..., 300 for the 3072→512→256→128→10 FNN;
# each run logs to its own TensorBoard directory.
factor_range = range(10, 301, 10)

for factor in factor_range:
    # Reset all three RNGs to the same seed before each run.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    model = FNN(
        [
            {"in_features": 32 * 32 * 3, "out_features": 512},
            {"in_features": 512, "out_features": 256},
            {"in_features": 256, "out_features": 128},
            {"in_features": 128, "out_features": 10},
        ]
    )
    model.initialize(mode="normal", scale_factor=factor)

    # Close the writer when the run ends so the sweep does not accumulate open event files.
    with SummaryWriter(f"./results/CIFAR-10_FNN_NORMAL_1_{factor}") as writer:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=30, train_summary=False)

In [None]:
# Same scale-factor sweep (10, 20, ..., 300) on a deeper FNN with six layer specs:
# 3072→512→256→256→128→128→10.
factor_range = range(10, 301, 10)

for factor in factor_range:
    # Reset all three RNGs to the same seed before each run.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    model = FNN(
        [
            {"in_features": 32 * 32 * 3, "out_features": 512},
            {"in_features": 512, "out_features": 256},
            {"in_features": 256, "out_features": 256},
            {"in_features": 256, "out_features": 128},
            {"in_features": 128, "out_features": 128},
            {"in_features": 128, "out_features": 10},
        ]
    )
    model.initialize(mode="normal", scale_factor=factor)

    # Close the writer when the run ends so the sweep does not accumulate open event files.
    with SummaryWriter(f"./results/CIFAR-10_FNN_NORMAL_2_{factor}") as writer:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=30, train_summary=False)

In [None]:
# Rebuild the CIFAR-10 loaders with batches of 8 for the sweeps that follow.
(train_loader, val_loader), test_loader = (
    get_cifar10_loader(train=True, batch_size=8),
    get_cifar10_loader(train=False, batch_size=8),
)

In [None]:
# Scale-factor sweep (10, 20, ..., 300) for the 3072→512→256→128→10 FNN, logged under the
# "CIFAR-10_FNN_NORMAL_3_*" run names.
factor_range = range(10, 301, 10)

for factor in factor_range:
    # Reset all three RNGs to the same seed before each run.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    model = FNN(
        [
            {"in_features": 32 * 32 * 3, "out_features": 512},
            {"in_features": 512, "out_features": 256},
            {"in_features": 256, "out_features": 128},
            {"in_features": 128, "out_features": 10},
        ]
    )
    model.initialize(mode="normal", scale_factor=factor)

    # Close the writer when the run ends so the sweep does not accumulate open event files.
    with SummaryWriter(f"./results/CIFAR-10_FNN_NORMAL_3_{factor}") as writer:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=30, train_summary=False)

In [None]:
# Scale-factor sweep (10, 20, ..., 300) for the six-spec FNN 3072→512→256→256→128→128→10,
# logged under the "CIFAR-10_FNN_NORMAL_4_*" run names.
factor_range = range(10, 301, 10)

for factor in factor_range:
    # Reset all three RNGs to the same seed before each run.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    model = FNN(
        [
            {"in_features": 32 * 32 * 3, "out_features": 512},
            {"in_features": 512, "out_features": 256},
            {"in_features": 256, "out_features": 256},
            {"in_features": 256, "out_features": 128},
            {"in_features": 128, "out_features": 128},
            {"in_features": 128, "out_features": 10},
        ]
    )
    model.initialize(mode="normal", scale_factor=factor)

    # Close the writer when the run ends so the sweep does not accumulate open event files.
    with SummaryWriter(f"./results/CIFAR-10_FNN_NORMAL_4_{factor}") as writer:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=30, train_summary=False)

In [None]:
# Downward sweep: the "normal" init is scaled by 1/10, 1/100, ..., 1/1_000_000 on the
# 3072→512→256→128→10 FNN.
factors = [10, 100, 1000, 10_000, 100_000, 1_000_000]

for factor in factors:
    # Reset all three RNGs to the same seed before each run.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Keep the divisor and the applied scale in separate names instead of rebinding the loop variable.
    scale = 1 / factor

    model = FNN(
        [
            {"in_features": 32 * 32 * 3, "out_features": 512},
            {"in_features": 512, "out_features": 256},
            {"in_features": 256, "out_features": 128},
            {"in_features": 128, "out_features": 10},
        ]
    )
    model.initialize(mode="normal", scale_factor=scale)

    # Close the writer when the run ends so the sweep does not accumulate open event files.
    with SummaryWriter(f"./results/CIFAR-10_FNN_NORMAL_DOWN_{scale}") as writer:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=30, train_summary=False)

### 3.2 Uniform Distribution

In [None]:
# CIFAR-10 loaders for the uniform-distribution FNN runs: back to batches of 32.
(train_loader, val_loader), test_loader = (
    get_cifar10_loader(train=True, batch_size=32),
    get_cifar10_loader(train=False, batch_size=32),
)

#### 3.2.1 Zero To One Pre-Activations

In [None]:
# Experiment "CIFAR-10_FNN_UNIFORM_0-1": 3072→512→256→128→10 FNN, "uniform" mode with
# a=0, b=1; 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize(mode="uniform", a=0, b=1)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_UNIFORM_0-1", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

#### 3.2.2 Unit Variance And Zero Mean Pre-Activations

In [None]:
# Experiment "CIFAR-10_FNN_UNIFORM_PRE_ACT": 3072→512→256→128→10 FNN, "uniform" mode on
# [-sqrt(3), sqrt(3)]; 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize(mode="uniform", a=-math.sqrt(3), b=math.sqrt(3))

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_UNIFORM_PRE_ACT", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_UNIFORM_PRE_ACT_DET": 3072→512→256→128→10 FNN, "deterministic"
# mode with the bounds -sqrt(3) and sqrt(3); 30 epochs.
model = FNN(
    [
        {"in_features": 32 * 32 * 3, "out_features": 512},
        {"in_features": 512, "out_features": 256},
        {"in_features": 256, "out_features": 128},
        {"in_features": 128, "out_features": 10},
    ]
)
# Use the exact sqrt(3) bounds, like the matching uniform cell and the MNIST counterpart,
# instead of the rounded literal 1.73.
model.initialize(mode="deterministic", a=-math.sqrt(3), b=math.sqrt(3))

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_UNIFORM_PRE_ACT_DET",
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=30
)

run_experiments(model, config)

#### 3.2.3 Unit Variance and Zero Mean Activations

In [None]:
# Experiment "CIFAR-10_FNN_UNIFORM_UNIT_VAR_ZERO_MEAN": 3072→512→256→128→10 FNN, "uniform"
# mode with a=-3.437, b=2.222; 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize(mode="uniform", a=-3.437, b=2.222)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_UNIFORM_UNIT_VAR_ZERO_MEAN", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

In [None]:
# Experiment "CIFAR-10_FNN_UNIFORM_UNIT_VAR_ZERO_MEAN_DET": 3072→512→256→128→10 FNN,
# "deterministic" mode with a=-3.437, b=2.222; 30 epochs.
model = FNN(
    [
        {"in_features": n_in, "out_features": n_out}
        for n_in, n_out in ((32 * 32 * 3, 512), (512, 256), (256, 128), (128, 10))
    ]
)
model.initialize(mode="deterministic", a=-3.437, b=2.222)

config = ExperimentConfig(
    model_name="CIFAR-10_FNN_UNIFORM_UNIT_VAR_ZERO_MEAN_DET", train_loader=train_loader,
    val_loader=val_loader, test_loader=test_loader, epochs=30,
)
run_experiments(model, config)

#### 3.2.5 Comparison

### 3.3 Compare Our Approach To Others

In [None]:
def objective(trial):
    """Run one Optuna trial of the CIFAR-10 FNN without an explicit ``initialize`` call.

    Args:
        trial: Optuna trial used to sample the learning rate, optimizer and batch size;
            it is also handed to ``Trainer.train``.

    Returns:
        The value returned by ``Trainer.train`` after 15 epochs, which the study maximizes.
    """
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna only supports None/bool/int/float/str as categorical choices, so the optimizer
    # is sampled by name and mapped back to its class.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": optimizers[trial.suggest_categorical("optimizer", list(optimizers))],
        # `step` goes by keyword: newer Optuna releases warn on the positional form.
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): the objective is evaluated on test_loader while val_loader stays unused;
    # confirm that tuning against the test split is intended.
    train_loader, val_loader = get_cifar10_loader(train=True, batch_size=params["batch_size"])
    test_loader = get_cifar10_loader(train=False, batch_size=params["batch_size"])

    model = FNN(
        [
            {"in_features": 32 * 32 * 3, "out_features": 512},
            {"in_features": 512, "out_features": 256},
            {"in_features": 256, "out_features": 128},
            {"in_features": 128, "out_features": 10},
        ]
    )

    # One log directory per trial keeps the trials' curves apart; the context manager
    # closes the event file when the trial ends.
    with SummaryWriter(f"./results/CIFAR_FNN_TRIAL/trial_{trial.number}") as writer:
        trainer = Trainer(model=model, lr=params["learning_rate"], optimizer=params["optimizer"], writer=writer)
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)

    return accuracy


study_path = os.path.join("results", "optuna", "cifar_fnn.pkl")
# Create the output directory up front so a missing folder cannot discard a finished study.
os.makedirs(os.path.dirname(study_path), exist_ok=True)

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=28)

joblib.dump(study, study_path)

In [None]:
# Reload the pickled study (joblib unpickles, so only load files this notebook wrote).
# joblib.load's second positional argument is mmap_mode, not a file mode, so none is passed.
study = joblib.load(os.path.join("results", "optuna", "cifar_fnn.pkl"))

# Print the best trial's sampled hyperparameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

optuna.visualization.plot_parallel_coordinate(study)

In [None]:
def objective(trial):
    """Run one Optuna trial of the CIFAR-10 FNN with ``initialize("normal", softmax_init=True)``.

    Args:
        trial: Optuna trial used to sample the learning rate, optimizer and batch size;
            it is also handed to ``Trainer.train``.

    Returns:
        The value returned by ``Trainer.train`` after 15 epochs, which the study maximizes.
    """
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna only supports None/bool/int/float/str as categorical choices, so the optimizer
    # is sampled by name and mapped back to its class.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": optimizers[trial.suggest_categorical("optimizer", list(optimizers))],
        # `step` goes by keyword: newer Optuna releases warn on the positional form.
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): the objective is evaluated on test_loader while val_loader stays unused;
    # confirm that tuning against the test split is intended.
    train_loader, val_loader = get_cifar10_loader(train=True, batch_size=params["batch_size"])
    test_loader = get_cifar10_loader(train=False, batch_size=params["batch_size"])

    model = FNN(
        [
            {"in_features": 32 * 32 * 3, "out_features": 512},
            {"in_features": 512, "out_features": 256},
            {"in_features": 256, "out_features": 128},
            {"in_features": 128, "out_features": 10},
        ]
    )
    model.initialize("normal", softmax_init=True)

    # One log directory per trial keeps the trials' curves apart; the context manager
    # closes the event file when the trial ends.
    with SummaryWriter(f"./results/CIFAR_FNN_TRIAL_NORMAL/trial_{trial.number}") as writer:
        trainer = Trainer(model=model, lr=params["learning_rate"], optimizer=params["optimizer"], writer=writer)
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)

    return accuracy


study_path = os.path.join("results", "optuna", "cifar_fnn_normal.pkl")
# Create the output directory up front so a missing folder cannot discard a finished study.
os.makedirs(os.path.dirname(study_path), exist_ok=True)

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=28)

joblib.dump(study, study_path)

In [None]:
# Load the study before touching it: plotting first would fail in a fresh kernel (no `study`
# yet) or silently plot whichever study was left over from an earlier cell.
# joblib.load's second positional argument is mmap_mode, not a file mode, so none is passed.
study = joblib.load(os.path.join("results", "optuna", "cifar_fnn_normal.pkl"))

# Print the best trial's sampled hyperparameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

optuna.visualization.plot_parallel_coordinate(study)

In [None]:
def objective(trial):
    """Optuna objective: train one CIFAR-10 FNN initialised with "uniform", a=0, b=1.

    Args:
        trial: Optuna trial used to sample the learning rate, the optimizer and
            the batch size; it is also forwarded to ``Trainer.train``.

    Returns:
        The value returned by ``Trainer.train``, which the study maximizes
        (treated as an accuracy here).
    """
    # Same seeds for every trial, so trials differ only by their hyperparameters.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna recommends None/bool/int/float/str categorical choices (it warns
    # on anything else), so sample the optimizer by name and resolve the class.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": optimizers[trial.suggest_categorical("optimizer", list(optimizers))],
        # ``step`` must be passed by keyword; the positional form is deprecated.
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): the validation split is unused and trials are scored on the
    # test loader -- confirm this is intended for hyperparameter selection.
    train_loader, _val_loader = get_cifar10_loader(train=True, batch_size=params["batch_size"])
    test_loader = get_cifar10_loader(train=False, batch_size=params["batch_size"])

    model = FNN(
        [
            {"in_features": 32 * 32 * 3, "out_features": 512},
            {"in_features": 512, "out_features": 256},
            {"in_features": 256, "out_features": 128},
            {"in_features": 128, "out_features": 10},
        ]
    )
    model.initialize("uniform", a=0, b=1)

    # The context manager closes the writer even when the trial raises.
    # All trials share this one log directory.
    with SummaryWriter("./results/CIFAR_FNN_TRIAL_UNIFORM") as writer:
        trainer = Trainer(model=model, lr=params["learning_rate"], optimizer=params["optimizer"], writer=writer)
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)

    return accuracy


# Make sure the output directory exists before spending time on the search.
os.makedirs(os.path.join("results", "optuna"), exist_ok=True)

# Seed the sampler as well; otherwise the search itself is not reproducible
# even though every trial is seeded.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=R_SEED))
study.optimize(objective, n_trials=28)

joblib.dump(study, os.path.join("results", "optuna", "cifar_fnn_uniform.pkl"))

In [None]:
# Reload the persisted study first so this cell also works on a fresh kernel.
# (The former leading plot call used `study` before it was loaded and its
# figure was discarded anyway.) joblib.load's second positional argument is
# ``mmap_mode``, not a file mode, so the stray 'r' is dropped.
study = joblib.load(os.path.join("results", "optuna", "cifar_fnn_uniform.pkl"))

# Report the best trial's sampled hyperparameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

# Kept as the last expression so the notebook can display the returned figure.
optuna.visualization.plot_parallel_coordinate(study)

## 4 CIFAR-10 CNN

In [None]:
# CIFAR-10 loaders shared by the CNN experiments below: batch size 32, with
# flatten=False passed to the loader factory.
train_loader, val_loader = get_cifar10_loader(
    train=True,
    flatten=False,
    batch_size=32,
)
test_loader = get_cifar10_loader(
    train=False,
    flatten=False,
    batch_size=32,
)

In [None]:
# Baseline: a CIFARCNN left as constructed (no explicit initialize call),
# handed to run_experiments for 30 epochs under the name "CIFAR-10_CNN".
model = CIFARCNN()

config = ExperimentConfig(
    model_name="CIFAR-10_CNN",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

### 4.1 Normal Distribution

In [None]:
# CIFARCNN initialised in "normal" mode with the initializer's defaults.
model = CIFARCNN()
model.initialize(mode="normal")

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# CIFARCNN in "normal" mode with softmax_init=True.
model = CIFARCNN()
model.initialize(mode="normal", softmax_init=True)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL_LAST_LAYER",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# CIFARCNN in "normal" mode with ELU_SCALE_NORMAL (sqrt(1.615)) passed as the
# second positional argument of initialize.
model = CIFARCNN()
model.initialize(
    "normal",
    ELU_SCALE_NORMAL,
)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL_ELU_SCALE",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# CIFARCNN initialised in "normal_in_features" mode.
model = CIFARCNN()
model.initialize(mode="normal_in_features")

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL_IN_FEATURES",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# Mean-shift experiment: "normal" mode with mean=-1.0.
model = CIFARCNN()
model.initialize(mode="normal", mean=-1.0)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL_MEAN_NEG",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# Mean-shift experiment: "normal" mode with mean=1.0.
model = CIFARCNN()
model.initialize(mode="normal", mean=1.0)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL_MEAN_POS",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# Mean-shift experiment: "normal" mode with mean=-5.0.
model = CIFARCNN()
model.initialize(mode="normal", mean=-5.0)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL_MEAN_NEG_HIGH",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# Mean-shift experiment: "normal" mode with mean=5.0.
model = CIFARCNN()
model.initialize(mode="normal", mean=5.0)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL_MEAN_POS_HIGH",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# Mean-shift experiment: "normal" mode with mean=-50.0.
model = CIFARCNN()
model.initialize(mode="normal", mean=-50.0)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL_MEAN_NEG_OVER",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# Mean-shift experiment: "normal" mode with mean=50.0.
model = CIFARCNN()
model.initialize(mode="normal", mean=50.0)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL_MEAN_POS_OVER",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# Scale experiment: "normal" mode with scale_factor=0.1.
model = CIFARCNN()
model.initialize(mode="normal", scale_factor=0.1)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL_0.1_STD",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# Scale experiment: "normal" mode with scale_factor=0.5.
model = CIFARCNN()
model.initialize(mode="normal", scale_factor=0.5)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL_0.5_STD",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# Scale experiment: "normal" mode with scale_factor=2.0.
model = CIFARCNN()
model.initialize(mode="normal", scale_factor=2.0)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL_2.0_STD",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# CIFARCNN in "normal_in_features" mode combined with softmax_init=True.
model = CIFARCNN()
model.initialize(mode="normal_in_features", softmax_init=True)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_NORMAL_IN_FEATURES_LAST_LAYER",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

### RESNET

In [None]:
# Baseline: a ResNet50 left as constructed (no explicit initialize call),
# handed to run_experiments for 40 epochs under the name "CIFAR-10_RESNET".
model = ResNet50()

config = ExperimentConfig(
    model_name="CIFAR-10_RESNET",
    epochs=40,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# ResNet50 initialised in "normal" mode with the initializer's defaults.
model = ResNet50()
model.initialize(mode="normal")

config = ExperimentConfig(
    model_name="CIFAR-10_RESNET_NORMAL",
    epochs=40,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# ResNet50 in "normal" mode with ELU_SCALE_NORMAL (sqrt(1.615)) passed as the
# second positional argument of initialize.
model = ResNet50()
model.initialize(
    "normal",
    ELU_SCALE_NORMAL,
)

config = ExperimentConfig(
    model_name="CIFAR-10_RESNET_NORMAL_ELU_SCALE",
    epochs=40,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# ResNet50 in "uniform" mode with bounds a=0, b=1.
model = ResNet50()
model.initialize("uniform", a=0, b=1)

config = ExperimentConfig(
    model_name="CIFAR-10_RESNET_UNIFORM_0-1",
    epochs=40,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# ResNet50 in "uniform" mode with symmetric bounds -sqrt(3) .. sqrt(3).
model = ResNet50()
model.initialize(
    "uniform",
    a=-math.sqrt(3),
    b=math.sqrt(3),
)

config = ExperimentConfig(
    model_name="CIFAR-10_RESNET_UNIFORM_PRE_ACT",
    epochs=40,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# ResNet50 in "deterministic" mode with bounds -1.73 .. 1.73
# (1.73 is sqrt(3) truncated to two decimals).
model = ResNet50()
model.initialize("deterministic", a=-1.73, b=1.73)

config = ExperimentConfig(
    model_name="CIFAR-10_RESNET_UNIFORM_PRE_ACT_DET",
    epochs=40,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# ResNet50 in "uniform" mode with asymmetric bounds a=-3.437, b=2.222.
model = ResNet50()
model.initialize("uniform", a=-3.437, b=2.222)

config = ExperimentConfig(
    model_name="CIFAR-10_RESNET_UNIFORM_UNIT_VAR_ZERO_MEAN",
    epochs=40,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# ResNet50 in "deterministic" mode with asymmetric bounds a=-3.437, b=2.222.
model = ResNet50()
model.initialize("deterministic", a=-3.437, b=2.222)

config = ExperimentConfig(
    model_name="CIFAR-10_RESNET_UNIFORM_UNIT_VAR_ZERO_MEAN_DET",
    epochs=40,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

#### 4.1.1 Finding The Point of Break

In [None]:
# Sweep the "normal" init scale factor upward: 10, 20, ..., 300.
factor_range = range(10, 301, 10)

for factor in factor_range:
    # Re-seed before every run so runs differ only by the scale factor.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    model = CIFARCNN()
    model.initialize(mode="normal", scale_factor=factor)

    # One log directory per factor. The context manager closes each writer
    # after its run instead of leaving 30 writers open until the kernel exits.
    with SummaryWriter(f"./results/CIFAR-10_CNN_NORMAL_1_{factor}") as writer:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=30, train_summary=False)

In [None]:
# Sweep the "normal" init scale factor downward: 1/10, 1/100, ..., 1/1_000_000.
factors = [10, 100, 1000, 10_000, 100_000, 1_000_000]

for factor in factors:
    # Re-seed before every run so runs differ only by the scale factor.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Use a separate name instead of overwriting the loop variable.
    scale = 1 / factor

    model = CIFARCNN()
    model.initialize(mode="normal", scale_factor=scale)

    # One log directory per scale (path text unchanged). The context manager
    # closes each writer after its run instead of leaking it.
    with SummaryWriter(f"./results/CIFAR-10_CNN_NORMAL_DOWN_{scale}") as writer:
        trainer = Trainer(model=model, lr=0.001, writer=writer)
        trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=30, train_summary=False)

### 4.2 Uniform Distribution

In [None]:
# Rebuild the CIFAR-10 loaders for the uniform-distribution experiments with
# the same arguments as the CNN section's loader cell (batch size 32,
# flatten=False).
train_loader, val_loader = get_cifar10_loader(
    train=True,
    flatten=False,
    batch_size=32,
)
test_loader = get_cifar10_loader(
    train=False,
    flatten=False,
    batch_size=32,
)

#### 4.2.1 Zero To One Pre-Activations

In [None]:
# CIFARCNN in "uniform" mode with bounds a=0, b=1.
model = CIFARCNN()
model.initialize("uniform", a=0, b=1)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_UNIFORM_0-1",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

#### 4.2.2 Unit Variance And Zero Mean Pre-Activations

In [None]:
# CIFARCNN in "uniform" mode with symmetric bounds -sqrt(3) .. sqrt(3).
model = CIFARCNN()
model.initialize(
    "uniform",
    a=-math.sqrt(3),
    b=math.sqrt(3),
)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_UNIFORM_PRE_ACT",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# CIFARCNN in "deterministic" mode with bounds -1.73 .. 1.73
# (1.73 is sqrt(3) truncated to two decimals).
model = CIFARCNN()
model.initialize("deterministic", a=-1.73, b=1.73)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_UNIFORM_PRE_ACT_DET",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

#### 4.2.3 Unit Variance and Zero Mean Activations

In [None]:
# CIFARCNN in "uniform" mode with asymmetric bounds a=-3.437, b=2.222.
model = CIFARCNN()
model.initialize("uniform", a=-3.437, b=2.222)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_UNIFORM_UNIT_VAR_ZERO_MEAN",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

In [None]:
# CIFARCNN in "deterministic" mode with asymmetric bounds a=-3.437, b=2.222.
model = CIFARCNN()
model.initialize("deterministic", a=-3.437, b=2.222)

config = ExperimentConfig(
    model_name="CIFAR-10_CNN_UNIFORM_UNIT_VAR_ZERO_MEAN_DET",
    epochs=30,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)

run_experiments(model, config)

#### 4.2.5 Comparison

### 4.3 Compare Our Approach To Others

In [None]:
def objective(trial):
    """Optuna objective: train one CIFARCNN left as constructed (no initialize call).

    Args:
        trial: Optuna trial used to sample the learning rate, the optimizer and
            the batch size; it is also forwarded to ``Trainer.train``.

    Returns:
        The value returned by ``Trainer.train``, which the study maximizes
        (treated as an accuracy here).
    """
    # Same seeds for every trial, so trials differ only by their hyperparameters.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna recommends None/bool/int/float/str categorical choices (it warns
    # on anything else), so sample the optimizer by name and resolve the class.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": optimizers[trial.suggest_categorical("optimizer", list(optimizers))],
        # ``step`` must be passed by keyword; the positional form is deprecated.
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): the validation split is unused and trials are scored on the
    # test loader -- confirm this is intended for hyperparameter selection.
    train_loader, _val_loader = get_cifar10_loader(train=True, batch_size=params["batch_size"], flatten=False)
    test_loader = get_cifar10_loader(train=False, batch_size=params["batch_size"], flatten=False)

    model = CIFARCNN()

    # The context manager closes the writer even when the trial raises.
    # All trials share this one log directory.
    with SummaryWriter("./results/CIFAR_CNN_TRIAL") as writer:
        trainer = Trainer(model=model, lr=params["learning_rate"], optimizer=params["optimizer"], writer=writer)
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)

    return accuracy


# Make sure the output directory exists before spending time on the search.
os.makedirs(os.path.join("results", "optuna"), exist_ok=True)

# Seed the sampler as well; otherwise the search itself is not reproducible
# even though every trial is seeded.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=R_SEED))
study.optimize(objective, n_trials=28)

joblib.dump(study, os.path.join("results", "optuna", "cifar_cnn.pkl"))

In [None]:
# Reload the persisted study first so this cell also works on a fresh kernel.
# (The former leading plot call used `study` before it was loaded and its
# figure was discarded anyway.) joblib.load's second positional argument is
# ``mmap_mode``, not a file mode, so the stray 'r' is dropped.
study = joblib.load(os.path.join("results", "optuna", "cifar_cnn.pkl"))

# Report the best trial's sampled hyperparameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

# Kept as the last expression so the notebook can display the returned figure.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
def objective(trial):
    """Optuna objective: train one CIFARCNN initialised in "normal_in_features" mode.

    Args:
        trial: Optuna trial used to sample the learning rate, the optimizer and
            the batch size; it is also forwarded to ``Trainer.train``.

    Returns:
        The value returned by ``Trainer.train``, which the study maximizes
        (treated as an accuracy here).
    """
    # Same seeds for every trial, so trials differ only by their hyperparameters.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna recommends None/bool/int/float/str categorical choices (it warns
    # on anything else), so sample the optimizer by name and resolve the class.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": optimizers[trial.suggest_categorical("optimizer", list(optimizers))],
        # ``step`` must be passed by keyword; the positional form is deprecated.
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): the validation split is unused and trials are scored on the
    # test loader -- confirm this is intended for hyperparameter selection.
    train_loader, _val_loader = get_cifar10_loader(train=True, batch_size=params["batch_size"], flatten=False)
    test_loader = get_cifar10_loader(train=False, batch_size=params["batch_size"], flatten=False)

    model = CIFARCNN()
    model.initialize("normal_in_features")

    # The context manager closes the writer even when the trial raises.
    # All trials share this one log directory.
    with SummaryWriter("./results/CIFAR_CNN_TRIAL_NORMAL") as writer:
        trainer = Trainer(model=model, lr=params["learning_rate"], optimizer=params["optimizer"], writer=writer)
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)

    return accuracy


# Make sure the output directory exists before spending time on the search.
os.makedirs(os.path.join("results", "optuna"), exist_ok=True)

# Seed the sampler as well; otherwise the search itself is not reproducible
# even though every trial is seeded.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=R_SEED))
study.optimize(objective, n_trials=28)

joblib.dump(study, os.path.join("results", "optuna", "cifar_cnn_normal.pkl"))

In [None]:
# Reload the persisted study first so this cell also works on a fresh kernel.
# (The former leading plot call used `study` before it was loaded and its
# figure was discarded anyway.) joblib.load's second positional argument is
# ``mmap_mode``, not a file mode, so the stray 'r' is dropped.
study = joblib.load(os.path.join("results", "optuna", "cifar_cnn_normal.pkl"))

# Report the best trial's sampled hyperparameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

# Kept as the last expression so the notebook can display the returned figure.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
def objective(trial):
    """Optuna objective: train one CIFARCNN initialised with 'uniform', a=0, b=1.

    Args:
        trial: Optuna trial used to sample the learning rate, the optimizer and
            the batch size; it is also forwarded to ``Trainer.train``.

    Returns:
        The value returned by ``Trainer.train``, which the study maximizes
        (treated as an accuracy here).
    """
    # Same seeds for every trial, so trials differ only by their hyperparameters.
    torch.manual_seed(R_SEED)
    random.seed(R_SEED)
    np.random.seed(R_SEED)

    # Optuna recommends None/bool/int/float/str categorical choices (it warns
    # on anything else), so sample the optimizer by name and resolve the class.
    optimizers = {"Adam": Adam, "RMSprop": RMSprop, "SGD": SGD}
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
        "optimizer": optimizers[trial.suggest_categorical("optimizer", list(optimizers))],
        # ``step`` must be passed by keyword; the positional form is deprecated.
        "batch_size": trial.suggest_int("batch_size", 4, 64, step=2),
    }

    # NOTE(review): the validation split is unused and trials are scored on the
    # test loader -- confirm this is intended for hyperparameter selection.
    train_loader, _val_loader = get_cifar10_loader(train=True, batch_size=params["batch_size"], flatten=False)
    test_loader = get_cifar10_loader(train=False, batch_size=params["batch_size"], flatten=False)

    model = CIFARCNN()
    model.initialize('uniform', a=0, b=1)

    # The context manager closes the writer even when the trial raises.
    # All trials share this one log directory.
    with SummaryWriter("./results/CIFAR_CNN_TRIAL_UNIFORM") as writer:
        trainer = Trainer(model=model, lr=params["learning_rate"], optimizer=params["optimizer"], writer=writer)
        accuracy = trainer.train(train_loader=train_loader, test_loader=test_loader, num_epochs=15, trial=trial,
                                 train_summary=False)

    return accuracy


# Make sure the output directory exists before spending time on the search.
os.makedirs(os.path.join("results", "optuna"), exist_ok=True)

# Seed the sampler as well; otherwise the search itself is not reproducible
# even though every trial is seeded.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=R_SEED))
study.optimize(objective, n_trials=28)

joblib.dump(study, os.path.join("results", "optuna", "cifar_cnn_uniform.pkl"))

In [None]:
# Reload the persisted study first so this cell also works on a fresh kernel.
# (The former leading plot call used `study` before it was loaded and its
# figure was discarded anyway.) joblib.load's second positional argument is
# ``mmap_mode``, not a file mode, so the stray 'r' is dropped.
study = joblib.load(os.path.join("results", "optuna", "cifar_cnn_uniform.pkl"))

# Report the best trial's sampled hyperparameters and its objective value.
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")
print(f"final accuracy: {study.best_trial.value}")

# Kept as the last expression so the notebook can display the returned figure.
optuna.visualization.plot_parallel_coordinate(study)