Here we want to train an individual model for every sim, running each for up to 750 iterations, save the models in scratch on NERSC, and be able to load each model's individual parameters as well. Then we can build maps / interpolators and also do a t-SNE-based visualization, similar to the idea in the parametric neural networks paper, to show whether the relation to the flux rope parameters can be learnt implicitly.

Load environment variables

In [1]:
import os
from pathlib import Path

In [2]:
os.getcwd()

'/global/u1/a/ajivani/WLROM/EdgeSS'

In [3]:
os.path.join(os.getcwd(), "logs")

'/global/u1/a/ajivani/WLROM/EdgeSS/logs'

In [4]:
import logging

# stealing this from: 
# https://github.com/rtqichen/torchdiffeq/blob/master/examples/odenet_mnist.py#L250C1-L274C18
def get_logger(logpath, package_files=None, displaying=True, saving=True, debug=False):
    """Configure the root logger to write to a file and/or the console.

    Calling this repeatedly (for example by re-running the notebook cell)
    reuses handlers that are already attached to the root logger instead of
    stacking duplicates, so every message is emitted once per destination.

    Args:
        logpath: Path of the log file, opened in append mode when ``saving``.
        package_files: Unused; accepted only so existing callers keep working.
        displaying: If true, attach a console ``StreamHandler``.
        saving: If true, attach a ``FileHandler`` for ``logpath``.
        debug: If true log at ``logging.DEBUG``, otherwise at ``logging.INFO``.

    Returns:
        The root ``logging.Logger``.
    """
    level = logging.DEBUG if debug else logging.INFO
    logger = logging.getLogger()
    logger.setLevel(level)

    if saving:
        # FileHandler stores the absolute path in ``baseFilename``.
        target = os.path.abspath(os.fspath(logpath))
        file_handler = next(
            (
                h
                for h in logger.handlers
                if isinstance(h, logging.FileHandler) and h.baseFilename == target
            ),
            None,
        )
        if file_handler is None:
            file_handler = logging.FileHandler(logpath, mode="a")
            logger.addHandler(file_handler)
        file_handler.setLevel(level)

    if displaying:
        candidate = logging.StreamHandler()
        # Exact type match: FileHandler (and other subclasses) must not count
        # as an existing console handler.
        console_handler = next(
            (
                h
                for h in logger.handlers
                if type(h) is logging.StreamHandler and h.stream is candidate.stream
            ),
            candidate,
        )
        if console_handler is candidate:
            logger.addHandler(candidate)
        console_handler.setLevel(level)

    return logger

In [5]:
# Log training progress to a file in the current working directory (console output uses get_logger's defaults).
logger = get_logger(logpath=os.path.join(os.getcwd(), "individual_nn_training.log"))

In [6]:
print(os.environ['PSCRATCH'])

/pscratch/sd/a/ajivani


In [7]:
scratch_dir = os.environ["PSCRATCH"]
scratch_dir

'/pscratch/sd/a/ajivani'

In [8]:
def makedirs(dirname):
    """Create ``dirname``, including any missing parent directories.

    Prints a confirmation when the directory is created. If the path already
    exists, the ``FileExistsError`` is caught and a notice is printed instead.
    """
    try:
        os.makedirs(dirname)
    except FileExistsError:
        print("Path already exists")
    else:
        print("Dir '{}' created to save individual models".format(dirname))

In [9]:
trained_models = os.path.join(scratch_dir, "trained_models_1d_edge")

In [10]:
makedirs(trained_models)

Path already exists


Load Packages

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim

In [12]:
adjoint=True

In [13]:
if adjoint:
    from torchdiffeq import odeint_adjoint as odeint
else:
    from torchdiffeq import odeint

In [14]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [15]:
runModel=True

In [16]:
import numpy as np
import scipy.linalg as la
import scipy.sparse as sparse
import matplotlib.pyplot as plt

In [17]:
import time
import re

In [18]:
%matplotlib inline

In [19]:
# Global matplotlib defaults for every figure in this notebook.
plt.rc("axes.spines", right=True, top=True)
plt.rc("figure", dpi=300, 
       figsize=(9, 3)
      )
plt.rc("font", family="serif")
plt.rc("legend", edgecolor="none", frameon=True)
# Applied last, on top of the rc settings above.
plt.style.use("dark_background")

In [20]:
import edge_utils as edut

Load a sample model and see how to access its parameters.

In [21]:
# sample_model = torch.load("dydt_Ay_global_model_5000_steps.pkl") 
# NOTE: this does not work; we probably shouldn't pickle whole model files (save the state_dict instead).

Set up the training loop for all sims and set up a logger

In [22]:
# Sim IDs excluded from the train / test / calibration splits (dropped from sd_2161 via np.setdiff1d below).
# NOTE(review): the reason these particular sims are excluded is not recorded here.
sims_to_remove = np.array([33, 39, 63, 73, 113, 128, 131, 142, 193, 218, 253, 264, 273, 312, 313, 324])

In [23]:
ed_2161, sd_2161 = edut.load_edge_data_blobfree(2161)

In [24]:
# 512-point grid over [0, 360]; entries 160 and 320 bound the angular window.
# NOTE(review): the 1.2 * 180 - 360 shift is presumably a CR2161-specific
# angular offset - confirm against edge_utils.
angle_grid_512 = np.linspace(0, 360, 512)
theta_s_2161 = angle_grid_512[160] + 1.2 * 180 - 360
theta_e_2161 = angle_grid_512[320] + 1.2 * 180 - 360
print("Range of angles for CR2161: {} {}".format(theta_s_2161, theta_e_2161))

Range of angles for CR2161: -31.279843444227026 81.44031311154595


In [25]:
def getRValuesAllSims(edge_data_matrix):
    """
    Precompute the r values of every sim in one pass.

    Calls ``edut.getRValues`` once per index along the third axis of
    ``edge_data_matrix`` and stores the first returned array in the matching
    slice of the output, so training never has to recompute them.

    Returns an array with the same shape as ``edge_data_matrix``.
    """
    all_r_values = np.zeros(edge_data_matrix.shape)
    for sim_idx in range(edge_data_matrix.shape[2]):
        # The second return value (theta values) is not needed here.
        sim_r_values, _ = edut.getRValues(edge_data_matrix, simIdx=sim_idx, minStartIdx=0)
        all_r_values[:, :, sim_idx] = sim_r_values
    return all_r_values

In [26]:
rd_2161 = getRValuesAllSims(ed_2161)

In [27]:
nTimes, nTheta_2161, nSims_2161 = ed_2161.shape
nTimes, nTheta_2161, nSims_2161

(90, 160, 278)

In [28]:
# Uniform grid with one entry per theta sample, spanning the CR2161 angle
# range with both endpoints rounded up to whole numbers by np.ceil.
theta_grid = np.linspace(np.ceil(theta_s_2161), np.ceil(theta_e_2161), nTheta_2161)

In [29]:
sd_modified = np.setdiff1d(sd_2161, sims_to_remove)

In [30]:
len(sd_modified)

262

In [31]:
## for reproducibility, we will just load these from file instead of regenerating each time because
## kernel will have to be restarted very frequently (long training process)
# from numpy.random import Generator, PCG64
# rng = Generator(PCG64())

# nTrain = int(np.floor(0.6 * len(sd_modified)))
# nCalib = int(np.floor(0.2 * len(sd_modified)))
# nTest = len(sd_modified) - nTrain - nCalib

# nTrain, nTest, nCalib

In [32]:
# sd_train = np.sort(np.random.choice(sd_modified, nTrain, replace=False))
# sd_test_calib = np.setdiff1d(sd_modified, sd_train)
# sd_calib = np.sort(np.random.choice(sd_test_calib, nCalib, replace=False))
# sd_test = np.setdiff1d(sd_test_calib, sd_calib)

# sd_test, sd_calib

In [33]:
### LOAD FROM FILE FOR NOW FOR REPRODUCIBILITY
# `trained_models` already includes the scratch prefix, so join onto it
# directly rather than prepending `scratch_dir` a second time.
sd_train = np.load(os.path.join(trained_models, "sd_train.npy"))
sd_test  = np.load(os.path.join(trained_models, "sd_test.npy"))
sd_calib = np.load(os.path.join(trained_models, "sd_calib.npy"))

In [34]:
# Sanity check: count the positions where the sorted union of the three splits
# matches sd_modified. The count equals len(sd_modified) when the splits
# together reproduce sd_modified exactly.
np.sum(np.sort(np.hstack((sd_test, sd_calib, sd_train))) == sd_modified)

262

In [35]:
# Position of each train / test sim ID within sd_2161 (first match per ID).
# NOTE(review): raises IndexError if an ID is missing from sd_2161.
train_sd_idx = np.array([np.where(sd_2161 == i)[0][0] for i in sd_train])
test_sd_idx  = np.array([np.where(sd_2161 == i)[0][0] for i in sd_test])

In [36]:
train_sd_idx_modified = np.array([np.where(sd_modified == i)[0][0] for i in sd_train])

In [37]:
test_sd_idx_modified = np.array([np.where(sd_modified == i)[0][0] for i in sd_test])

In [38]:
input_dim = rd_2161.shape[1]

In [39]:
input_dim

160

In [40]:
def getDataForSim(edge_data_matrix, r_data_matrix, sim_data, sid):
    """
    Build the torch tensors needed to train and evaluate a model on one sim.

    The valid time window for the sim comes from ``edut.getTMinTMax``. The time
    axis is rebuilt with a step of 2 between ``tMin`` and ``tMax`` and rescaled
    by ``(t - tMin) / (tMax - tMin)``. The training portion ends at the valid
    time closest to ``tMin + floor(2/3 * (tMax - tMin))``.

    Parameters
    ----------
    edge_data_matrix : passed through to ``edut.getTMinTMax``
    r_data_matrix : array indexed as [time, theta, sim] holding the r values
    sim_data : array of sim IDs, used to look up the position of ``sid``
    sid : ID of the sim to extract

    Returns
    -------
    y0_train_torch : float32 tensor, shape (1, n_theta); first valid time step
    y_train_torch : float32 tensor, shape (n_train, 1, n_theta); training data
    y_full_torch : float32 tensor, shape (n_valid, 1, n_theta); all valid data
    t_train_torch : float32 tensor, shape (n_train,); scaled training times
    t_scaled_torch : float32 tensor, shape (n_valid,); scaled valid times
    sim_index : position of ``sid`` within ``sim_data``

    All tensors are moved to the module-level ``device``.
    """
    sim_index = np.argwhere(sim_data == sid)[0][0]

    tMinIdx, tMin, tMaxIdx, tMax = edut.getTMinTMax(edge_data_matrix, simIdx=sim_index)

    # Rows of this sim that fall inside the valid time window.
    r_sim_valid = r_data_matrix[tMinIdx:(tMaxIdx + 1), :, sim_index]

    valid_times = np.arange(tMin, tMax + 2, step=2)
    tAllScaled = (valid_times - tMin) / (tMax - tMin)

    # Use argmin rather than an exact match, because the 2/3 mark need not
    # land on a sampled time.
    tTrainEnd = tMin + np.floor((2 / 3) * (tMax - tMin))
    trainEndIdx = np.argmin(np.abs(valid_times - tTrainEnd))
    nTrain = trainEndIdx + 1

    y0_train_torch = torch.from_numpy(np.float32(r_sim_valid[0, :]))
    y0_train_torch = y0_train_torch.reshape((1, len(y0_train_torch))).to(device)

    # Insert a singleton axis so the data is laid out as (time, 1, theta).
    y_train_torch = torch.from_numpy(
        np.expand_dims(np.float32(r_sim_valid[:nTrain, :]), axis=1)
    ).to(device)
    y_full_torch = torch.from_numpy(
        np.expand_dims(np.float32(r_sim_valid), axis=1)
    ).to(device)

    t_train_torch = torch.tensor(np.float32(tAllScaled[:nTrain])).to(device)
    t_scaled_torch = torch.tensor(np.float32(tAllScaled)).to(device)

    return y0_train_torch, y_train_torch, y_full_torch, t_train_torch, t_scaled_torch, sim_index

In [41]:
def get_batch(torch_train_data, torch_train_time, batch_time=5, batch_size=10):
    """
    Draw a random mini-batch of short windows from one training series.

    ``batch_size`` distinct start indices are sampled without replacement from
    ``range(len(torch_train_time) - batch_time)``. Returns, all on ``device``:
    the states at the start indices (M, D), the time stamps of each window
    (batch_size, batch_time), and the states along each window (T, M, D).
    """
    n_starts = len(torch_train_time) - batch_time
    start_candidates = np.arange(n_starts, dtype=np.int64)
    s = torch.from_numpy(np.random.choice(start_candidates, batch_size, replace=False))

    # Each window has its own time stamps, filled one row per sampled start.
    batch_t = torch.zeros((batch_size, batch_time))
    for row, start in enumerate(s):
        batch_t[row, :] = torch_train_time[start:(start + batch_time)]

    window_states = [torch_train_data[s + offset] for offset in range(batch_time)]
    batch_y = torch.stack(window_states, dim=0)  # (T, M, D)
    batch_y0 = torch_train_data[s]  # (M, D)

    return batch_y0.to(device), batch_t.to(device), batch_y.to(device)

In [42]:
class ODEFunc(nn.Module):
    """Right-hand side ``dy/dt = net1(y)`` of the neural ODE.

    ``net1`` is a three-layer MLP with Tanh activations that maps a state of
    size ``dim`` back to size ``dim``. The time argument of ``forward`` is
    ignored, so the learned dynamics do not depend on ``t``.

    Args:
        dim: Size of the state vector (input and output width).
        hidden_dim: Width of the two hidden layers. Defaults to 100, which
            reproduces the original fixed architecture and keeps saved
            ``state_dict`` files loadable.
    """

    def __init__(self, dim, hidden_dim=100):
        super().__init__()

        self.net1 = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, dim),
        )

        # Small Gaussian weights and zero biases for every linear layer.
        for m in self.net1.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, mean=0, std=0.1)
                nn.init.constant_(m.bias, val=0)

    def forward(self, t, y):
        """Return ``net1(y)``; ``t`` is accepted for the odeint interface but unused."""
        return self.net1(y)

In [43]:
class RunningAverageMeter:
    """Track the latest value and an exponentially weighted running average.

    Every call to ``update`` also appends the new running average to
    ``losses``.
    """

    def __init__(self, momentum=0.99):
        self.losses = []
        self.momentum = momentum
        self.reset()

    def reset(self):
        """Clear the current value and average; the ``losses`` history is kept."""
        self.val, self.avg = None, 0

    def update(self, val):
        """Fold ``val`` into the running average and record the result."""
        if self.val is not None:
            new_weight = 1 - self.momentum
            self.avg = self.avg * self.momentum + val * new_weight
        else:
            # The first value after a reset seeds the average directly.
            self.avg = val
        self.val = val
        self.losses.append(self.avg)

In [44]:
def getAbsoluteAvgBatchLoss(prediction_tensor, target_tensor):
    """
    L1 loss summed over the trajectory and averaged over everything else.

    Both tensors are of dims (n_times, n_batch, 1, len_traj) or
    (n_times, 1, len_traj); any rank works as long as the trajectory is the
    last axis. The absolute difference is summed over that axis and the
    result is averaged over all remaining axes (time, batch, ...).

    The earlier version swapped the time and batch axes first. That swap
    cannot change a mean taken over all remaining entries, so it is omitted.

    Returns a 0-dim tensor holding the loss.
    """
    abs_diff = torch.abs(prediction_tensor - target_tensor)
    per_trajectory_sums = torch.sum(abs_diff, dim=-1)
    return torch.mean(per_trajectory_sums)

In [45]:
# Stealing from: 
# github.com/rtqichen/torchdiffeq/blob/84e220ac9ea9367c14933d8f141fc2791034ec88/
# examples/odenet_mnist.py#L241C1-L242C73

def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

In [46]:
# Sketch of what each loop would look like:

def runNeuralODE(simID, model_save_path, niters=750, test_freq=5, batch_time=5, batch_size=10):
    """
    Train one neural ODE on a single sim and save its weights.

    Data for ``simID`` is pulled from the module-level ``ed_2161``,
    ``rd_2161`` and ``sd_2161`` arrays via ``getDataForSim``. Each iteration
    draws a mini-batch of short windows from the training portion, integrates
    every window separately (each has its own time stamps) and takes an Adam
    step on the batch loss. Every ``test_freq`` iterations the loss over the
    full valid series is logged.

    Parameters
    ----------
    simID : ID of the sim to train on
    model_save_path : directory where the ``state_dict`` file is written
    niters : number of optimizer steps; also written into the file name
    test_freq : log the full-series loss every this many iterations
    batch_time : number of time steps per training window
    batch_size : number of windows per mini-batch

    The weights are saved as ``dydt_Ay_<niters>_steps_sim_<simID>.pt``. With
    the default ``niters=750`` this matches the file names used so far.
    """
    (y0_train_torch, y_train_torch, y_full_torch,
     t_train_torch, t_scaled_torch, _) = getDataForSim(ed_2161, rd_2161, sd_2161, simID)

    time_meter = RunningAverageMeter(0.97)
    loss_meter = RunningAverageMeter(0.97)

    # Take the state width from the data instead of hard-coding it.
    func = ODEFunc(y0_train_torch.shape[-1]).to(device)
    optimizer = optim.Adam(func.parameters(), lr=1e-3)
    end = time.time()

    logger.info(func)
    logger.info('Number of parameters: {}'.format(count_parameters(func)))

    for itr in range(1, niters + 1):
        optimizer.zero_grad()
        batch_y0, batch_t, batch_y = get_batch(y_train_torch,
                                               t_train_torch,
                                               batch_time=batch_time,
                                               batch_size=batch_size)
        pred_y = torch.zeros_like(batch_y)

        # Windows start at different times, so each one is integrated on its own.
        for i in range(batch_size):
            pred_y[:, i, :, :] = odeint(func,
                                        batch_y0[i, :, :],
                                        batch_t[i, :]).to(device)

        loss = getAbsoluteAvgBatchLoss(pred_y, batch_y)
        loss.backward()
        optimizer.step()

        time_meter.update(time.time() - end)
        loss_meter.update(loss.item())

        if itr % test_freq == 0:
            with torch.no_grad():
                # Roll out from the initial condition over the whole valid
                # series, including the part held out from training.
                pred_y_full_series = odeint(func,
                                            y0_train_torch,
                                            t_scaled_torch)
                loss_full_series = getAbsoluteAvgBatchLoss(pred_y_full_series,
                                                           y_full_torch)

                logger.info("Iter {:04d} | Total Loss {:.6f} | Sim {:03d} ".format(itr,
                                                                loss_full_series.item(),
                                                                simID))

        end = time.time()

    model_dict_name = "dydt_Ay_{}_steps_sim_{:03d}.pt".format(niters, simID)
    torch.save(func.state_dict(), os.path.join(model_save_path, model_dict_name))
    logger.info("Saved Model for Sim {:03d}\n".format(simID))
    torch.cuda.empty_cache()

### Call training process for one or more sims and save model once trained

In [47]:
trained_models

'/pscratch/sd/a/ajivani/trained_models_1d_edge'

In [48]:
sd_train

array([ 31,  32,  35,  37,  40,  42,  43,  45,  46,  47,  48,  49,  52,
        53,  57,  59,  60,  61,  62,  66,  67,  70,  71,  74,  75,  76,
        77,  78,  79,  80,  83,  84,  85,  87,  90,  91,  92,  94,  95,
        96,  97,  98, 100, 101, 102, 106, 108, 111, 112, 114, 120, 121,
       123, 124, 130, 132, 135, 136, 138, 139, 140, 141, 143, 147, 148,
       149, 150, 151, 154, 155, 156, 157, 158, 159, 161, 164, 174, 178,
       180, 183, 184, 185, 190, 192, 194, 196, 197, 198, 199, 202, 204,
       205, 209, 210, 211, 212, 215, 216, 217, 219, 220, 221, 222, 223,
       229, 230, 232, 234, 235, 236, 238, 240, 241, 242, 243, 244, 246,
       248, 249, 251, 252, 254, 256, 257, 260, 261, 263, 271, 274, 275,
       276, 277, 278, 279, 281, 282, 284, 287, 288, 289, 290, 291, 293,
       298, 299, 301, 303, 305, 311, 314, 315, 317, 318, 319, 323, 325,
       329])

In [49]:
# SIMS STILL PENDING
# 32, 37, 40, 47, 52, 53, 61, 62, 78, 79, 87, 91, 92, 95, 98, 100 (rest all are correct)

In [50]:
sd_train[140]

290

In [51]:
sd_train[156]

329

In [52]:
pending_sims = np.array([32, 37, 40, 47, 52, 53, 61, 62, 78, 79, 87, 91, 92, 95, 98, 100])

We probably need some sort of script that can launch jobs on multiple GPUs, or else try running models in parallel on the same GPU. Abandoning this idea for now, as the existing implementations are very specific
and may or may not apply to this scenario.

In [53]:
# Train and save one model per sim that is still pending. Swap in the
# commented-out loop header to run over the tail of the training set instead.
# for sim in sd_train[140:]:
for sim in pending_sims:
    runNeuralODE(sim, trained_models, niters=750, test_freq=5)

ODEFunc(
  (net1): Sequential(
    (0): Linear(in_features=160, out_features=100, bias=True)
    (1): Tanh()
    (2): Linear(in_features=100, out_features=100, bias=True)
    (3): Tanh()
    (4): Linear(in_features=100, out_features=160, bias=True)
  )
)
Number of parameters: 42360
Iter 0005 | Total Loss 343.554352 | Sim 032 
Iter 0010 | Total Loss 309.172302 | Sim 032 
Iter 0015 | Total Loss 273.139252 | Sim 032 
Iter 0020 | Total Loss 234.920319 | Sim 032 
Iter 0025 | Total Loss 195.499725 | Sim 032 
Iter 0030 | Total Loss 156.908218 | Sim 032 
Iter 0035 | Total Loss 121.750351 | Sim 032 
Iter 0040 | Total Loss 90.421333 | Sim 032 
Iter 0045 | Total Loss 64.205299 | Sim 032 
Iter 0050 | Total Loss 47.325859 | Sim 032 
Iter 0055 | Total Loss 39.042061 | Sim 032 
Iter 0060 | Total Loss 35.162102 | Sim 032 
Iter 0065 | Total Loss 32.035881 | Sim 032 
Iter 0070 | Total Loss 29.533352 | Sim 032 
Iter 0075 | Total Loss 28.328007 | Sim 032 
Iter 0080 | Total Loss 27.382954 | Sim 032 
Iter 0