# Train the model

In [1]:
!pip install wandb



In [2]:
import os
import sys

import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.utils.data as Data
import wandb
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.loggers import WandbLogger
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

In [3]:
# Authenticate this session with Weights & Biases before any run is created.
# The value shown below the cell is the return value of login().
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mamf2288[0m ([33mfagerheim[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [64]:
# Filesystem locations used by this notebook.
BASE = "/ML_project/"
PATH_NN = "data/"

# Where the trained network is written: <save_path><save_name>.
save_path = "models/"
save_name = "in_all.pt"

In [65]:
from funcs import regression_system
from funcs import fcnn
from funcs import dataset

In [66]:
# Batch size for the local network's X,Y pairs (state, subgrid fluxes).
# NOTE(review): the pairs themselves are not built in this cell; presumably
# dataset.SubmesoDataset provides them — confirm.
BATCH_SIZE = 64  # Number of samples in each batch


In [79]:
# Every candidate input feature name.
all_inputs = [
    'grad_B',
    'FCOR',
    'Nsquared',
    'HML',
    'TAU',
    'Q',
    'HBL',
    'div',
    'vort',
    'strain',
]

# Features selected for this run. This is the same list object as `all_inputs`;
# a subset such as ['strain', 'vort', 'grad_B', 'HBL', 'HML'] can be assigned here instead.
inputs = all_inputs

In [80]:
# Display the full list of candidate input feature names.
all_inputs

['grad_B',
 'FCOR',
 'Nsquared',
 'HML',
 'TAU',
 'Q',
 'HBL',
 'div',
 'vort',
 'strain']

In [81]:
# Build the dataset from `inputs`, the selected feature list, rather than `all_inputs`.
# The model's input_channels is len(inputs), so both must come from the same list;
# otherwise choosing a feature subset would give the network a mismatched channel count.
submeso_dataset = dataset.SubmesoDataset(inputs, res='1_4')

In [82]:
# Training loader: draws the dataset's training indices in random order,
# BATCH_SIZE samples at a time (the declared constant instead of a repeated literal 64).
train_loader = DataLoader(
    submeso_dataset,
    batch_size=BATCH_SIZE,
    sampler=SubsetRandomSampler(submeso_dataset.train_ind),
)

In [83]:
# Held-out loader: iterates the dataset's test indices as given, with the batch size
# set to the number of test indices so they arrive in a single batch.
test_loader = DataLoader(
    dataset=submeso_dataset,
    sampler=submeso_dataset.test_ind,
    batch_size=len(submeso_dataset.test_ind),
)

In [84]:
# Report whether CUDA is usable and pick the matching torch device.
# The Lightning Trainer in this notebook chooses its accelerator separately (accelerator="auto").
print("CUDA Available" if torch.cuda.is_available() else 'CUDA Not Available')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

CUDA Available


In [85]:
# Hyperparameters for this run; they are gathered into the wandb config dict below.
seed = 123

# BATCH_SIZE (64) is the batch size the training DataLoader actually uses. The previous
# hard-coded 256 was never passed to any loader, so the logged config misreported it.
batch_size = BATCH_SIZE

input_channels = len(inputs)  # one input channel per selected feature
output_channels = 1
conv_layers = 3
kernel = 5
kernel_hidden = 3
activation = "ReLU"
arch = "fcnn"
epochs = 100
lr = 0.00024594159283761457
wd = 0.023133758465751404

# save_path / save_name are defined once in the paths cell; the self-assignments
# that used to sit here had no effect and were removed.

In [86]:
# Run configuration handed to wandb.init() and to the FCNN constructor.
# Keys keep their original names and insertion order.
config = dict(
    seed=seed,
    lr=lr,
    wd=wd,
    batch_size=batch_size,
    input_channels=input_channels,
    output_channels=output_channels,
    activation=activation,
    save_name=save_name,
    save_path=save_path,
    arch=arch,
    conv_layers=conv_layers,
    kernel=kernel,
    kernel_hidden=kernel_hidden,
    epochs=epochs,
)

In [87]:
# Preview the file the trained model will be written to.
f'{config["save_path"]}{config["save_name"]}'

'models/in_all.pt'

In [88]:
# Apply the configured seed before the network is created. Previously `seed` was only
# recorded in the config and never applied, so weight initialisation (and torch-driven
# batch sampling) differed from run to run.
pl.seed_everything(config["seed"])

# Start the wandb run and build the network from the same config dict.
wandb.init(project="submeso_ML", config=config)
model = fcnn.FCNN(config)

# Count parameters once and reuse the numbers.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

# Written to the local dict only (after wandb.init); wandb receives the counts
# through the explicit wandb.config.update call below.
config["learnable parameters"] = total_params
wandb.config.update({
    "Model Parameters": total_params,
    "Trainable Parameters": trainable_params,
})

# Wrap the network in the Lightning regression system, using the learning rate and
# weight decay as stored in the wandb config.
system = regression_system.RegressionSystem(model, wandb.config["lr"], wandb.config["wd"])
wandb.watch(model, log_freq=1)
wandb_logger = WandbLogger()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112857688890978, max=1.0…

cat: /sys/module/amdgpu/initstate: No such file or directory
ERROR:root:Driver not initialized (amdgpu not found in modules)


  rank_zero_warn(


In [89]:
# Lightning trainer: automatic accelerator selection, metrics sent to the wandb logger,
# progress bar disabled, run length and root directory taken from the config.
trainer = pl.Trainer(
    accelerator="auto",
    logger=wandb_logger,
    enable_progress_bar=False,
    max_epochs=config["epochs"],
    default_root_dir=model.config["save_path"],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [90]:
# Create the output directory before training starts, so a missing folder cannot
# make torch.save fail after the whole training run has finished.
if config["save_path"]:
    os.makedirs(config["save_path"], exist_ok=True)

# Train; test_loader is passed as the validation dataloader.
trainer.fit(system, train_loader, test_loader)

# Persist the whole model object (not just a state_dict) at <save_path><save_name>.
torch.save(model, config["save_path"] + config["save_name"])

# Close the wandb run so the logged data is flushed and uploaded.
wandb.finish()

project_name = "submeso_ML"

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params
--------------------------------------
0 | network   | FCNN    | 162 K 
1 | criterion | MSELoss | 0     
--------------------------------------
162 K     Trainable params
0         Non-trainable params
162 K     Total params
0.649     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(
`Trainer.fit` stopped: `max_epochs=100` reached.


VBox(children=(Label(value='0.133 MB of 0.133 MB uploaded (0.008 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid_loss,█▆▅▅▄▄▄▃▃▂▂▃▂▂▂▂▂▂▂▃▂▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,99.0
train_loss,0.19984
trainer/global_step,12699.0
valid_loss,0.21681
