In [1]:
from astropy.table import Table
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import h5py
import matplotlib.pyplot as plt
from time import time

In [2]:
torch.cuda.current_device()

0

In [3]:
torch.cuda.get_device_name(0)

'NVIDIA A100-PCIE-40GB MIG 3g.20gb'

In [4]:
# defining the Dataset class
class train_set(Dataset):
    def __init__(self,file):
        fn = h5py.File(file, 'r')
        self.f = fn
        
    def __len__(self):
        return self.f['group_1']['data'].shape[1]
  
    def __getitem__(self, index):
        # get data
        dset = self.f['group_1']['data']
        x = dset[:].T
        x = torch.Tensor(x[index])
        

        # get label
        ydset = self.f['group_1']['label']
        y = ydset[:].T
        y = torch.Tensor(y[index])
        # torch.from_numpy(y[index]) does not work since y is doubles and not floats.
        
        # get error in label # comment out for non-error label runs
        errdset = self.f['group_1']['e_label']
        err = errdset[:].T
        err = torch.Tensor(err[index])
        return (x,y,err)

class test_set(Dataset):
    def __init__(self,file):
        fn = h5py.File(file, 'r')
        self.f = fn
        
    def __len__(self):
        return self.f['group_2']['data'].shape[1]
  
    def __getitem__(self, index):
        # get data
        dset = self.f['group_2']['data']
        x = dset[:].T
        x = torch.from_numpy(x[index])

        # get label
        ydset = self.f['group_2']['label']
        y = ydset[:].T
        y = torch.from_numpy(y[index])
        
        # get error in label # comment out for non-error label runs
        errdset = self.f['group_2']['e_label']
        err = errdset[:].T
        err = torch.from_numpy(err[index])
        return (x.float(),y.float(),err.float())

In [5]:
# training_data = train_set("/arc/home/aydanmckay/mydata.h5")
# test_data = test_set("/arc/home/aydanmckay/mydata.h5")
training_data = train_set("/arc/home/aydanmckay/mydataelabelssmallscalecuts.h5")
test_data = test_set("/arc/home/aydanmckay/mydataelabelssmallscalecuts.h5")
##################################################################################
# training_data = test_set("/arc/home/aydanmckay/mydataelabelssmallscalecuts.h5")
# test_data = test_set("/arc/home/aydanmckay/mydataelabelssmallscalecuts.h5")

In [6]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(3, 64),
            nn.LeakyReLU(),
            nn.Linear(64, 128),
            nn.LeakyReLU(),
            nn.Linear(128, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 110)
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [7]:
batchlen = 16
train_dataloader = DataLoader(
    training_data,
    batch_size=batchlen,
    # shuffle=True
)
test_dataloader = DataLoader(
    test_data,
    batch_size=batchlen,
    # shuffle=True
)

In [8]:
len(train_dataloader)*batchlen

45008

In [9]:
model = Net()
print(model)

Net(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=3, out_features=64, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
    (4): Linear(in_features=128, out_features=256, bias=True)
    (5): LeakyReLU(negative_slope=0.01)
    (6): Linear(in_features=256, out_features=512, bias=True)
    (7): LeakyReLU(negative_slope=0.01)
    (8): Linear(in_features=512, out_features=110, bias=True)
  )
)


In [10]:
lr = 1e-2
epochs = 20

In [11]:
# loss_fn = nn.L1Loss()
loss_fn = nn.MSELoss()
# loss_fn = nn.GaussianNLLLoss()

In [12]:
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=0.9
)
# optimizer = torch.optim.Adam(
#     model.parameters(),
#     lr=lr
# )

In [13]:
def res(preds,dataloader,epoch,resi='rel'):
    fig, axs = plt.subplots(110)
    fig.set_figheight(600)
    
    with torch.no_grad():
        for num, (X, y, z) in enumerate(dataloader):
            if resi == 'rel':
                residual = (y-preds[num])/y
                string = 'Relative Residual'
            elif resi == 'err':
                residual = (y-preds[num])/z
                string = 'Residual Over Label Error'
            elif resi == 'res':
                residual = y-preds[num]
                string = 'Residual'
            for it in range(len(y.T)):
                axs[it].plot(y.T[it],residual.T[it],'k.',alpha=0.1)
                axs[it].set_xlabel('Observed XP Coefficient Value')
                axs[it].set_ylabel('Relative Residual')
                axs[it].set_title('XP Coefficient '+str(it+1)+' '+string)
                
    plt.savefig('/arc/home/aydanmckay/torchplots/test'+resi+'residualsWL1smallepoch'+str(epoch)+'scalecutsep5.png')
    plt.close()
    
def corrplot(preds,dataloader,epoch):
    fig, axs = plt.subplots(110)
    fig.set_figheight(600)
    
    with torch.no_grad():
        for num, (X, y, z) in enumerate(dataloader):
            pred = preds[num]
            # print(pred)
            for it in range(len(y.T)):
                axs[it].plot(y.T[it],pred.T[it],'k.',alpha=0.1)
                axs[it].set_xlabel('Observed')
                axs[it].set_ylabel('Predicted')
                axs[it].set_title('XP Coefficient '+str(it+1))
                
    plt.savefig('/arc/home/aydanmckay/torchplots/testcorrWL1smallepoch'+str(epoch)+'scalecutsep5.png')
    plt.close()

In [14]:
def train(dataloader, model, loss_fn, optimizer,epoch):
    model.train()
    size = len(dataloader.dataset)
    running_loss = 0.
    for batch, (X, y, z) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred,y)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        if (batch % 100 == 0) and (batch != 0):
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {running_loss/batch:>7f}  [{current:>5d}/{size:>5d}]")
            
    print(f"loss: {running_loss/len(dataloader):>7f}  [45000/{size:>5d}]")
    return running_loss/len(dataloader)

def test(dataloader, model, loss_fn,epoch):
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # preds = []
    with torch.no_grad():
        for X, y, z in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred == y).type(torch.float).sum().item()
            # preds.append(pred)
    
    # preds = np.array(preds)
    # res(preds,dataloader,epoch,resi='res')
    # res(preds,dataloader,epoch,resi='rel')
    # res(preds,dataloader,epoch,resi='err')
    # corrplot(preds,dataloader,epoch)
    
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss

In [None]:
valloss = []
traloss = []
for t in range(epochs):
    t0 = time()
    print(f"Epoch {t+1}\n-------------------------------")
    trainloss = train(train_dataloader, model, loss_fn, optimizer,t)
    testloss = test(test_dataloader, model, loss_fn,t)
    valloss.append(testloss)
    traloss.append(trainloss)
    if valloss[t] > valloss[t-1]:
        print('Validation loss increase.')
        break
    # torch.save({
    #             'epoch': t,
    #             'model_state_dict': model.state_dict(),
    #             'optimizer_state_dict': optimizer.state_dict(),
    #             'loss': tloss,
    #             }, '/arc/home/aydanmckay/ml/torchnn/checkpoints/checkpointWGLsmallepoch'+str(t)+'scalecutsep5.pth')
    # torch.save(model.state_dict(), "/arc/home/aydanmckay/torchmodel/torchmodelWsmallscalecutsep5iter"+str(t)+".pth")
    t1 = time()
    print(f'Elapsed epoch time: {t1-t0:.2f} s')
print("Training completed")

Epoch 1
-------------------------------
loss: 1.001409  [ 1600/45000]
loss: 0.979415  [ 3200/45000]
loss: 0.990593  [ 4800/45000]
loss: 0.988683  [ 6400/45000]
loss: 0.994332  [ 8000/45000]
loss: 0.987078  [ 9600/45000]
loss: 0.970611  [11200/45000]
loss: 0.961671  [12800/45000]
loss: 0.946789  [14400/45000]
loss: 0.940683  [16000/45000]
loss: 0.933641  [17600/45000]
loss: 0.928824  [19200/45000]
loss: 0.922830  [20800/45000]
loss: 0.915777  [22400/45000]
loss: 0.913579  [24000/45000]
loss: 0.907119  [25600/45000]
loss: 0.907263  [27200/45000]
loss: 0.905170  [28800/45000]
loss: 0.901415  [30400/45000]
loss: 0.901980  [32000/45000]
loss: 0.896823  [33600/45000]
loss: 0.894377  [35200/45000]
loss: 0.889752  [36800/45000]
loss: 0.886499  [38400/45000]
loss: 0.884921  [40000/45000]
loss: 0.883924  [41600/45000]
loss: 0.880981  [43200/45000]
loss: 0.877998  [44800/45000]
loss: 0.877228  [45000/45000]
Test Error: 
 Accuracy: 0.0%, Avg loss: 0.840261 

Elapsed epoch time: 1844.37 s
Epoch 2
-

In [None]:
plt.plot(range(1,epochs+1),traloss,'-o',label='Train Loss')
plt.plot(range(1,epochs+1),valloss,'-o',label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (L2)')
# plt.ylim(0.65,0.67)
plt.legend(fancybox=True)
plt.title('Test Loss per Epoch')
plt.savefig('/arc/home/aydanmckay/torchplots/lossL2smallscalecutsbl16lr-2SGDep20.png')
plt.show()

In [None]:
torch.save(model.state_dict(), "/arc/home/aydanmckay/torchmodel/torchmodelL2smallscalecutsbl16lr-2SGDep20.pth")