In [1]:
import numpy as np
import torch
import torch_geometric as pyg
import graphPINN
import math

from time import time
from scipy.io import savemat, loadmat
import os

from hanging_threads import start_monitoring

In [2]:
# os.environ['MKL_THREADING_LAYER'] = 'GNU' # fixes a weird intel multiprocessing error with numpy

# Output directory for this run; it is later passed to train() as `checkpointfile`
# and used as the prefix of the saved model / loss file names.
folder = "C:\\Users\\NASA\\Documents\\ML_checkpoints\\2023-10-10\\"
# exist_ok=True makes the cell safe to re-run and removes the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(folder, exist_ok=True)
# Project logger, used as a callable in the cells below.
# NOTE(review): presumably writes its log file inside `folder` — confirm in graphPINN.debug.
logfn = graphPINN.debug.Logfn(folder)

In [3]:
# Pick the compute backend once and reuse the label in the log lines.
backend = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(backend)

# Log the total memory of every visible CUDA device (no iterations on CPU-only hosts).
for j in range(torch.cuda.device_count()):
    total_bytes = torch.cuda.get_device_properties(j).total_memory
    logfn(f"{j}: {graphPINN.debug.pretty_size(total_bytes)} of {backend} memory")

# Dataset hyper-parameters; they select the on-disk dataset directory and are
# forwarded unchanged to MHSDataset.
k = 100
l = 100
bd = 200

# NOTE(review): `ddp` is not read by any cell visible here — confirm whether it is still needed.
ddp = True

dataset_root = f'D:\\v4_set_k={k}_l={l}_bd={bd}'
dataset = graphPINN.data.MHSDataset(dataset_root, k=k, l=l, bd=bd)

0: 48.0 GiB of cuda memory
1: 48.0 GiB of cuda memory


In [4]:
# Layer widths for the two kernel networks.
#
# Propagation kernel, input width 14:
#   B_0 + x,y_0 + F_0 + x,y,z_node + F_node = 14
#   (earlier variant: propdesign = [12,6,3])
propdesign = [14, 6, 3]
# Convolution kernel, input width 18:
#   P_k + x,y,z_k + F_k + P_node + x,y,z_node + F_node = 18
#   (earlier variant: convdesign = [18,9,6,3])
convdesign = [18, 12, 3]

# Both kernels are given the same activation class.
activation = torch.nn.ReLU

# Construction order is kept as prop-kernel, prop-graph, conv-kernel, conv-graph
# so that any random initialisation happens in the same sequence.
propkernel = graphPINN.KernelNN(propdesign, activation)
propgraph = graphPINN.BDPropGraph(propkernel)
convkernel = graphPINN.KernelNN(convdesign, activation)
convgraph = graphPINN.ConvGraph(convkernel)

# Full model composed of the propagation graph followed by the convolution graph arguments.
model = graphPINN.FullModel(propgraph, convgraph)

In [5]:
# Fixed-seed generator so the 80/10/10 split is identical on every run.
split_rng = torch.Generator().manual_seed(314)
trainset, validset, testset = torch.utils.data.random_split(
    dataset,
    [0.8, 0.1, 0.1],
    generator=split_rng,
)
# Small-sample split kept for quick smoke tests:
#     trainset, validset, testset = torch.utils.data.random_split(dataset,[0.01, 0.005, 0.985],generator=torch.Generator().manual_seed(314))

epochs = 5

# Record the sizes of the training and validation subsets.
for subset in (trainset, validset):
    logfn(len(subset))

4594
574


In [9]:
# Value forwarded to train() as `lossindex` and also recorded in the saved loss
# file. It is defined once here because the name was previously read below
# without ever being assigned in the notebook (NameError on a fresh kernel).
lossindex = [-1]

# Train the model; train() returns the two loss histories and a state dict,
# which is then loaded back into `model`.
# NOTE(review): `ddp` is set to True in the configuration cell, but DDP is
# explicitly disabled here — confirm which is intended.
training_loss, validation_loss, state_dict = graphPINN.learn.train(
    model, trainset, validset, use_tqdm=True, lossindex=lossindex,
    epochs=epochs, logfn=logfn, checkpointfile=folder, use_ddp=False,
    optmethod=torch.optim.LBFGS)
model.load_state_dict(state_dict)

# Payload for the .mat file: loss histories plus the loss index as text.
lossdict = {
    'trainloss': training_loss.numpy(),
    'validloss': validation_loss.numpy(),
    'index_array': str(lossindex),
}
logfn(f'training loss:\n{lossdict["trainloss"]}')
logfn(f'validation loss:\n{lossdict["validloss"]}')

# Shared suffix for both output files.
# NOTE(review): `params` is the product of the layer widths, not a true count
# of trainable parameters; kept as-is so existing file names do not change.
param_tag = math.prod(convdesign) + math.prod(propdesign)
run_tag = f'trainsize-{len(trainset)}_k-{k}_params-{param_tag}'

torch.save(model, f'{folder}model_{run_tag}.pt')
savemat(f'{folder}loss_{epochs}_{run_tag}.mat', lossdict)

-- starting index 0 = -1 --


  0%|          | 0/4594 [00:00<?, ?it/s]

--vec:0.1374366581439972, mhs:0.05237735062837601, div:0.015456970781087875--
  iter 1/4594, loss 0.20527097582817078
--vec:0.04796988517045975, mhs:0.0870659202337265, div:0.006511620711535215--
  iter 2/4594, loss 0.14154741168022156
--vec:0.05644085258245468, mhs:0.17706474661827087, div:0.008398346602916718--
  iter 3/4594, loss 0.24190396070480347
--vec:0.054155923426151276, mhs:0.13004478812217712, div:0.012861459515988827--
  iter 4/4594, loss 0.1970621645450592
--vec:0.04105019569396973, mhs:0.07264723628759384, div:0.008229188621044159--
  iter 5/4594, loss 0.12192662805318832
--vec:0.058447204530239105, mhs:0.13542506098747253, div:0.00864883791655302--
  iter 6/4594, loss 0.20252110064029694
--vec:9592769536.0, mhs:9.644118193213958e+19, div:2518644480.0--
  iter 7/4594, loss 9.644118193213958e+19
--vec:nan, mhs:nan, div:nan--
NaN found! dataset [tensor([8417], device='cuda:0')]
tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan

AssertionError: Nan found

In [None]:
# Development helper: re-execute the top-level graphPINN module so that source
# edits are picked up without restarting the kernel.
# NOTE(review): importlib.reload() re-runs only the module it is given;
# submodules that are already imported (e.g. graphPINN.debug, graphPINN.learn)
# and objects built earlier (such as `model`) presumably keep the old code
# unless they are reloaded / rebuilt as well — confirm before relying on this.
import importlib
importlib.reload(graphPINN)