## Lightning ANN 

In [1]:
import wandb
import numpy as np
import sys
import torch
import torch.utils.data as Data
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import LearningRateMonitor

In [2]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mabigail-bodner[0m ([33mabodner[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
BASE = '/scratch/ab10313/pleiades/'
PATH_NN= BASE+'NN_data_smooth/'

In [4]:
import systems.regression_system as regression_system
import models.ann as ann
#import lightning.util.performance as performance
#import util.misc as misc
#import pyqg_explorer.dataset.forcing_dataset as forcing_dataset

In [5]:
# load preprocessed data into input and output channels

# X INPUT
grad_B = np.load(PATH_NN+'grad_B.npy')
FCOR = np.load(PATH_NN+'FCOR.npy')
Nsquared = np.load(PATH_NN+'Nsquared.npy')
HML = np.load(PATH_NN+'HML.npy')
TAU = np.load(PATH_NN+'TAU.npy')
Q = np.load(PATH_NN+'Q.npy')
HBL = np.load(PATH_NN+'HBL.npy')
div = np.load(PATH_NN+'div.npy')
vort = np.load(PATH_NN+'vort.npy')
strain = np.load(PATH_NN+'strain.npy')

# note different reshaping of input/output for ANN
X_input = np.stack([FCOR, grad_B, HML, Nsquared, TAU, Q, HBL, div, vort, strain],axis=0).reshape(grad_B.shape[0],grad_B.shape[1],grad_B.shape[2],10) 
print('X input shape:')
print( X_input.shape)
print('')


# Y OUTPUT
WB_sg = np.load(PATH_NN+'WB_sg.npy')
WB_sg_mean = np.load(PATH_NN+'WB_sg_mean.npy')
WB_sg_std = np.load(PATH_NN+'WB_sg_std.npy')
              
Y_output = np.tile(WB_sg,(1,1,1,1)).reshape(WB_sg.shape[0],WB_sg.shape[1],WB_sg.shape[2],1) 
print('Y output shape:')
print(Y_output.shape)
print('')

np.isnan(X_input).any()
np.isnan(Y_output).any()

X input shape:
(8450, 40, 40, 10)

Y output shape:
(8450, 40, 40, 1)



False

In [6]:
# TRAIN AND TEST ONLY
# randomnly generate train, test and validation time indecies 
import random
time_ind = X_input.shape[0]
rand_ind = np.arange(time_ind)
rand_seed = 14
random.Random(rand_seed).shuffle(rand_ind)
train_percent = 0.9
test_percent = 0.1 
print(f"Dataset: train {np.round(train_percent*100)}%, test {np.round(test_percent*100)}%")
train_ind, test_ind =  rand_ind[:round(train_percent*time_ind)], rand_ind[round((train_percent)*time_ind):]                                                                        

# check no overlapping indecies
if np.intersect1d(train_ind, test_ind).any():
    print('overlapping indecies')
else:
    print ('no overlapping indecies')
    

Dataset: train 90.0%, test 10.0%
no overlapping indecies


In [7]:
# Define X,Y pairs (state, subgrid fluxes) for local network.local_torch_dataset = Data.TensorDataset(
BATCH_SIZE = 64  # Number of sample in each batch


###### training dataset #######
torch_dataset_train = Data.TensorDataset(
    torch.flatten(torch.from_numpy(X_input[train_ind]).float(), start_dim=0, end_dim=2) ,
    torch.flatten(torch.from_numpy(Y_output[train_ind]).float(), start_dim=0, end_dim=2) ,
)

train_loader = Data.DataLoader(
    dataset=torch_dataset_train, batch_size=BATCH_SIZE, shuffle=True
)
print('TRAIN')
print('X input shape:')
print(torch.flatten(torch.from_numpy(X_input[train_ind]).float(), start_dim=0, end_dim=2).shape)
print('Y output shape:')
print( torch.flatten(torch.from_numpy(Y_output[train_ind]).float(), start_dim=0, end_dim=2).shape)
print('')

###### test dataset #######
torch_dataset_test = Data.TensorDataset(
    torch.flatten(torch.from_numpy(X_input[test_ind]).float(), start_dim=0, end_dim=2),
    torch.flatten(torch.from_numpy(Y_output[test_ind]).float(), start_dim=0, end_dim=2)    
)

BATCH_SIZE_TEST = len(torch_dataset_test)

test_loader = Data.DataLoader(
    dataset=torch_dataset_test, batch_size=BATCH_SIZE_TEST, shuffle=False
)

print('TEST')
print('X input shape:')
print(torch.flatten(torch.from_numpy(X_input[test_ind]).float(), start_dim=0, end_dim=2).shape)
print('Y output shape:')
print( torch.flatten(torch.from_numpy(Y_output[test_ind]).float(), start_dim=0, end_dim=2).shape)
print('')



TRAIN
X input shape:
torch.Size([12168000, 10])
Y output shape:
torch.Size([12168000, 1])

TEST
X input shape:
torch.Size([1352000, 10])
Y output shape:
torch.Size([1352000, 1])



In [8]:

# use GPUs if available
if torch.cuda.is_available():
    print("CUDA Available")
    device = torch.device('cuda')
else:
    print('CUDA Not Available')
    device = torch.device('cpu')

CUDA Not Available


In [12]:
seed=123
batch_size=256
input_size=10
output_size=1
hidden_size = [10,10,10]
activation="ReLU"
arch="ann"
epochs=1
nz=24
save_path=BASE+"models"
save_name="ann_test.pt"
lr=0.0001
wd=0.01

## Wandb config file
config={"seed":seed,
        "lr":lr,
        "wd":wd,
        "batch_size":batch_size,
        "input_size":input_size,
        "output_size":output_size,
        "activation":activation,
        "save_name":save_name,
        "save_path":save_path,
        "arch":arch,
        "hidden_size":hidden_size,
        "epochs":epochs}

In [13]:
model=ann.ANN(config)
system=regression_system.RegressionSystem(model)

## Store the number of learanble parameters
config["learnable parameters"]=sum(p.numel() for p in model.parameters())

## Initialise wandb run - pass config dictionary storing the model parameters
wandb.init(project="submeso_ML",config=config)
wandb.watch(model, log_freq=1)

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

[]

In [None]:
trainer = pl.Trainer(
    default_root_dir=model.config["save_path"],
    accelerator="auto",
    max_epochs=model.config["epochs"],
    #callbacks=pbar.ProgressBar(),
    logger=WandbLogger()
)

trainer.fit(system, train_loader, test_loader)
#wandb.log({"loss": loss,"epoch": epoch,"R-squared": r2, "correlation": corr })

model.save_model()

  rank_zero_warn(
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name      | Type    | Params
--------------------------------------
0 | network   | ANN     | 341   
1 | criterion | MSELoss | 0     
--------------------------------------
341       Trainable params
0         Non-trainable params
341       Total params
0.001     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]