# Sweep attempt

In [1]:
!pip install wandb



In [2]:
import wandb
import numpy as np
import sys
import torch
import torch.utils.data as Data
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import LearningRateMonitor
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
import torch.nn as nn

In [3]:
from funcs import regression_system
from funcs import fcnn
from funcs import dataset

In [4]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mamf2288[0m ([33mfagerheim[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
save_name ='sweep_test'
save_path = 'data/sweeps/'

In [6]:
inputs = ['grad_B','FCOR', 'Nsquared', 'HML', 'TAU', 'Q', 'HBL', 'div', 'vort', 'strain']

In [7]:
submeso_dataset=dataset.SubmesoDataset(inputs,res='1_4')

In [8]:
train_loader=DataLoader(
    submeso_dataset,
    #num_workers=1,
    batch_size=64,
    sampler=SubsetRandomSampler(submeso_dataset.train_ind))

In [9]:
test_loader=DataLoader(
    submeso_dataset,
    #num_workers=1,
    batch_size=len(submeso_dataset.test_ind),
    sampler=submeso_dataset.test_ind)

In [10]:
if torch.cuda.is_available():
    print("CUDA Available")
    device = torch.device('cuda')
else:
    print('CUDA Not Available')
    device = torch.device('cpu')

CUDA Available


In [11]:
def train(config=None):
    with wandb.init(project='submeso_ML', config=config):
        config = wandb.config

        wandb.init(project="submeso_ML",config=config)
        model=fcnn.FCNN(config)

        system=regression_system.RegressionSystem(model,wandb.config["lr"],wandb.config["wd"])
        wandb.watch(model, log_freq=1)
        wandb_logger = WandbLogger()

        trainer = pl.Trainer(
            accelerator="auto",
            max_epochs=config["epochs"],
            enable_progress_bar=False,
            logger=wandb_logger,
            )
        trainer.fit(system, train_loader, test_loader)

        wandb.finish()

In [16]:
sweep_config = {
    'method': 'bayes',
    'name': 'sweep_test',
    'metric': {
    'name': 'test_loss',      #what's the name of loss here? it's definted in regression_system right?
    'goal': 'minimize'},
    'parameters': {
        'seed': {
            'value': 123},
        'lr': {
            'distribution': 'uniform',
            'min': 0,
            'max': 0.1},
        'wd': {
            'distribution': 'uniform',
            'min': 0,
            'max': 1},
        'batch_size': {
            'values': [16,32,64,128,256,512,1024]},
        'input_channels': {
            'value': len(inputs)},
        'output_channels': {
            'value': 1},
        'activation': {
            'value': 'ReLU'},
        'save_name': {
            'value': save_name},
        'save_path': {
            'value': save_path},
        'arch': {
            'value': 'fcnn'},
        'conv_layers': {
            'values': [1,2,3,4,5,6,7,8]},
        'kernel': {
            'values': [2,3,4,5,6,7,8]},
        'kernel_hidden': {
            'values': [2,3,4,5,6,7,8]},
        'epochs': {
            'value': 100}}}

In [17]:
'''seed=123
batch_size=256
input_channels=len(inputs)
output_channels=1
conv_layers = 2
kernel = 5
kernel_hidden = 3
activation="ReLU"
arch="fcnn"
epochs=100
save_path=BASE
save_name="test-3.pt"
lr=0.0025
wd=0.023133758465751404''';

In [18]:
sweep_id = wandb.sweep(sweep_config,project='submeso_ML')
#sweep_id = wandb.sweep(sweep=sweep_configuration, project="%s" % project_name)

Create sweep with ID: hrqcjn74
Sweep URL: https://wandb.ai/fagerheim/submeso_ML/sweeps/hrqcjn74


In [19]:
wandb.agent(sweep_id, function=train, project='submeso_ML',count=5)
#wandb.agent(sweep_id, function=train, count=30)

[34m[1mwandb[0m: Agent Starting Run: t5u3hlgy with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	arch: fcnn
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	conv_layers: 5
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	input_channels: 10
[34m[1mwandb[0m: 	kernel: 8
[34m[1mwandb[0m: 	kernel_hidden: 2
[34m[1mwandb[0m: 	lr: 0.08789217430250001
[34m[1mwandb[0m: 	output_channels: 1
[34m[1mwandb[0m: 	save_name: sweep_test
[34m[1mwandb[0m: 	save_path: data/sweeps/
[34m[1mwandb[0m: 	seed: 123
[34m[1mwandb[0m: 	wd: 0.8379778298556239
cat: /sys/module/amdgpu/initstate: No such file or directory
ERROR:root:Driver not initialized (amdgpu not found in modules)




VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113712611101315, max=1.0…

cat: /sys/module/amdgpu/initstate: No such file or directory
ERROR:root:Driver not initialized (amdgpu not found in modules)


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params
--------------------------------------
0 | network   | FCNN    | 127 K 
1 | criterion | MSELoss | 0     
--------------------------------------
127 K     Trainable params
0         Non-trainable params
127 K     Total params
0.511     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(
`Trainer.fit` stopped: `max_epochs=100` reached.


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,▅▁▂▁▁▁▂█████████████████████████████████
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid_loss,█▁▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,99.0
train_loss,0.98681
trainer/global_step,12699.0
valid_loss,1.06522


[34m[1mwandb[0m: Agent Starting Run: zgo1n60g with config:
[34m[1mwandb[0m: 	activation: Tanh
[34m[1mwandb[0m: 	arch: fcnn
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	conv_layers: 2
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	input_channels: 10
[34m[1mwandb[0m: 	kernel: 3
[34m[1mwandb[0m: 	kernel_hidden: 4
[34m[1mwandb[0m: 	lr: 0.023603147852734294
[34m[1mwandb[0m: 	output_channels: 1
[34m[1mwandb[0m: 	save_name: sweep_test
[34m[1mwandb[0m: 	save_path: data/sweeps/
[34m[1mwandb[0m: 	seed: 123
[34m[1mwandb[0m: 	wd: 0.1433691453207162
cat: /sys/module/amdgpu/initstate: No such file or directory
ERROR:root:Driver not initialized (amdgpu not found in modules)




VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113596811123392, max=1.0…

cat: /sys/module/amdgpu/initstate: No such file or directory
ERROR:root:Driver not initialized (amdgpu not found in modules)


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params
--------------------------------------
0 | network   | FCNN    | 176 K 
1 | criterion | MSELoss | 0     
--------------------------------------
176 K     Trainable params
0         Non-trainable params
176 K     Total params
0.706     Total estimated model params size (MB)
  rank_zero_warn(


Error: wrong ReLU parameter:
Error: wrong ReLU parameter:
Error: wrong ReLU parameter:


  rank_zero_warn(
`Trainer.fit` stopped: `max_epochs=100` reached.


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid_loss,▅▅▂▄▁▃▃▂▃▂▂▂▃▃▃▂▂▄▁█▂▃▁▂▆▁▆▂▁▃▃▄▂▁▁▃▂▂▁▃

0,1
epoch,99.0
train_loss,0.69924
trainer/global_step,12699.0
valid_loss,0.7536


[34m[1mwandb[0m: Agent Starting Run: mlpraxmk with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	arch: fcnn
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	conv_layers: 5
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	input_channels: 10
[34m[1mwandb[0m: 	kernel: 3
[34m[1mwandb[0m: 	kernel_hidden: 4
[34m[1mwandb[0m: 	lr: 0.023626695699969436
[34m[1mwandb[0m: 	output_channels: 1
[34m[1mwandb[0m: 	save_name: sweep_test
[34m[1mwandb[0m: 	save_path: data/sweeps/
[34m[1mwandb[0m: 	seed: 123
[34m[1mwandb[0m: 	wd: 0.7931532195177394
cat: /sys/module/amdgpu/initstate: No such file or directory
ERROR:root:Driver not initialized (amdgpu not found in modules)




VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112605888875906, max=1.0…

cat: /sys/module/amdgpu/initstate: No such file or directory
ERROR:root:Driver not initialized (amdgpu not found in modules)


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params
--------------------------------------
0 | network   | FCNN    | 193 K 
1 | criterion | MSELoss | 0     
--------------------------------------
193 K     Trainable params
0         Non-trainable params
193 K     Total params
0.772     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(
`Trainer.fit` stopped: `max_epochs=100` reached.


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▁▁▁▁▁▁▁▁▁▂▂▁▁▁▁▁▁▁▂▁▁▁▁▁▂▁▂▁▁▁▁▂▁▁▂▁▁▁▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid_loss,▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▂▁▁▁▁▁▁▁▁

0,1
epoch,99.0
train_loss,0.58821
trainer/global_step,12699.0
valid_loss,1.01476


[34m[1mwandb[0m: Agent Starting Run: lny6n5dj with config:
[34m[1mwandb[0m: 	activation: LeakyReLu
[34m[1mwandb[0m: 	arch: fcnn
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	conv_layers: 7
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	input_channels: 10
[34m[1mwandb[0m: 	kernel: 7
[34m[1mwandb[0m: 	kernel_hidden: 5
[34m[1mwandb[0m: 	lr: 0.04328639362356693
[34m[1mwandb[0m: 	output_channels: 1
[34m[1mwandb[0m: 	save_name: sweep_test
[34m[1mwandb[0m: 	save_path: data/sweeps/
[34m[1mwandb[0m: 	seed: 123
[34m[1mwandb[0m: 	wd: 0.8217430879391501
cat: /sys/module/amdgpu/initstate: No such file or directory
ERROR:root:Driver not initialized (amdgpu not found in modules)




VBox(children=(Label(value='0.035 MB of 0.035 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113040177770017, max=1.0…

cat: /sys/module/amdgpu/initstate: No such file or directory
ERROR:root:Driver not initialized (amdgpu not found in modules)


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params
--------------------------------------
0 | network   | FCNN    | 397 K 
1 | criterion | MSELoss | 0     
--------------------------------------
397 K     Trainable params
0         Non-trainable params
397 K     Total params
1.589     Total estimated model params size (MB)
  rank_zero_warn(


Error: wrong ReLU parameter:
Error: wrong ReLU parameter:
Error: wrong ReLU parameter:
Error: wrong ReLU parameter:
Error: wrong ReLU parameter:
Error: wrong ReLU parameter:


  rank_zero_warn(
`Trainer.fit` stopped: `max_epochs=100` reached.


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▄▁▁▂▁▂▅▁▁▁▁▂▅▁▁▁█▁▂▁

0,1
epoch,99.0
train_loss,0.7838
trainer/global_step,12699.0
valid_loss,0.95327


[34m[1mwandb[0m: Agent Starting Run: zw17cbqv with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	arch: fcnn
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	conv_layers: 6
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	input_channels: 10
[34m[1mwandb[0m: 	kernel: 4
[34m[1mwandb[0m: 	kernel_hidden: 4
[34m[1mwandb[0m: 	lr: 0.03176759889086589
[34m[1mwandb[0m: 	output_channels: 1
[34m[1mwandb[0m: 	save_name: sweep_test
[34m[1mwandb[0m: 	save_path: data/sweeps/
[34m[1mwandb[0m: 	seed: 123
[34m[1mwandb[0m: 	wd: 0.6382839418705073
cat: /sys/module/amdgpu/initstate: No such file or directory
ERROR:root:Driver not initialized (amdgpu not found in modules)




VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112379366669404, max=1.0…

cat: /sys/module/amdgpu/initstate: No such file or directory
ERROR:root:Driver not initialized (amdgpu not found in modules)


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params
--------------------------------------
0 | network   | FCNN    | 218 K 
1 | criterion | MSELoss | 0     
--------------------------------------
218 K     Trainable params
0         Non-trainable params
218 K     Total params
0.874     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(
`Trainer.fit` stopped: `max_epochs=100` reached.


VBox(children=(Label(value='0.048 MB of 0.048 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid_loss,▁▁▁▂▁▁▁▁▁█▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,99.0
train_loss,0.98509
trainer/global_step,12699.0
valid_loss,1.06415


## Problems with Sweep

Not sure how many of these points are connected, but figured I'd make a list of everything that's currently not working correctly:

- **Activation Function:** `Error: wrong ReLU parameter:` seems to be returned for LeakyRelu.
- **Locked Parameters:** `wandb: WARNING Config item 'conv_layers' was locked by 'sweep' (ignored update)` seems to be returned for every run. I did some research into this and found [an example](https://github.com/wandb/wandb/issues/2641) of the same issue that seemed to be related to using the config.update() method, but I don't see that called anywhere. [Another example](https://github.com/wandb/wandb/issues/5769) seems to suggest it's a problem with versions downloaded.
- **Strange Looking Loss:** Now we're staring to get to the more prominent problems. In particular, when tracking the training and validation losses in wandb, a lot of things look off:
  - Validation losses are gigantic, ranging anywhere from 0.5 to dozens to even a few hundred in one cases
  - Training losses seem more reasonable (most are below 1), however there doesn't seem to be a decrease with epoch, in other words the model doesn't really seem to be training at all.
  - The plot showing each mode, its hyperparameters, and test losses also isn't displaying correctly because every run I've had is at a test loss of null.

## My Thoughts

- **Acvivation Function:** I feel like the best thing to do here is only use the relu function until I can clear other problems. I don't imaging using a different function would make a huge difference either, but maybe I'm wrong.
- **Locked Parameters:** Not entirely sure what this means, but it doesn't seem to be critically effecting anything, because each run has a different set of hyperparameters. I install pip everytime I run the notebook, so I don't imagine it's a problem with updates. Similarly, I'm fine to let this go until the more glaring errors are addressed.
- **Strange Looking Loss:** Fixing this issue seems absolutely essential and should be the first priority. It seems to me like something is wrong with how the loss function is assigned and optimized. What seems really odd is that training loss has generally reasonable values but validation loss is way off? Things to look into:
  - Config is assigned slightly differently in the wandb notebook compared to Abigail's code, and I went with wandb because Abigail's caused errors (I believe something along the lines of `ERROR: config not assigned`
  - How is the loss function assigned? Both in the training loader vs the torch_model notebook, and also the assignment of metrics in sweep_config
  - The dataset file is up to date with changed normalization code right?