In [4]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch import optim
from adahessian import Adahessian, get_params_grad
import torch.optim.lr_scheduler as lr_scheduler

## Writing the data-loader function for the GEO dataset (can be changed to work for both the training and validation sets)

In [5]:
def GEOdataLoader(path, bSize= 16):
    """
    Build a shuffled DataLoader over the training arrays stored under `path`.

    `path` is the parent folder that holds "X_tr.npy" (inputs) and "Y_tr.npy"
    (targets). The shape of each array is printed, both arrays are converted
    to float32 tensors, and they are served in mini-batches of `bSize`.
    """
    features, targets = (
        np.load(os.path.join(path, fname)) for fname in ("X_tr.npy", "Y_tr.npy")
    )
    for arr in (features, targets):
        print(arr.shape)
    dataset = torch.utils.data.TensorDataset(
        *(torch.from_numpy(arr).float() for arr in (features, targets))
    )
    return torch.utils.data.DataLoader(dataset, batch_size=bSize, shuffle=True)

# Build the shuffled training DataLoader from the .npy files in the local LGdata folder
trainLoader= GEOdataLoader(path="/home/aryamanj/Downloads/LGdata")

(88807, 943)
(88807, 4760)


In [6]:
# standardised class for all network architectures
class Network(nn.Module):
    """
    Fully connected network with two hidden layers.

    Layer order: Linear -> Tanh -> Dropout -> Linear -> Tanh -> Dropout -> Linear.

    Args:
        size1: width of the first hidden layer.
        size2: width of the second hidden layer.
        drop: dropout probability used after each hidden activation.
        in_features: number of input features (default 943, the value that
            used to be hard-coded).
        out_features: number of outputs (default 4760, the value that used to
            be hard-coded).
    """
    def __init__(self, size1, size2, drop, in_features=943, out_features=4760):
        super(Network, self).__init__()
        # The attribute name `net` and the layer order are kept so that
        # state_dict keys ("net.0.weight", ...) stay the same.
        self.net= nn.Sequential(
            nn.Linear(in_features, size1),
            nn.Tanh(),
            nn.Dropout(p= drop),
            nn.Linear(size1, size2),
            nn.Tanh(),
            nn.Dropout(p= drop),
            nn.Linear(size2, out_features)
        )

    def forward(self, input):
        """Return the network output for a batch `input` whose last dimension is `in_features`."""
        return self.net(input)

In [7]:
# network class for the wide-medium architecture
class wideM(nn.Module):
    """
    Fully connected network with a single (wide) hidden layer.

    Layer order: Linear -> Tanh -> Dropout -> Linear.

    Args:
        size1: width of the hidden layer.
        drop: dropout probability used after the hidden activation.
        in_features: number of input features (default 943, the value that
            used to be hard-coded).
        out_features: number of outputs (default 4760, the value that used to
            be hard-coded).
    """
    def __init__(self, size1, drop, in_features=943, out_features=4760):
        super(wideM, self).__init__()
        # The attribute name `net` and the layer order are kept so that
        # state_dict keys stay the same.
        self.net= nn.Sequential(
            nn.Linear(in_features, size1),
            nn.Tanh(),
            nn.Dropout(p= drop),
            nn.Linear(size1, out_features)
        )

    def forward(self, input):
        """Return the network output for a batch `input` whose last dimension is `in_features`."""
        return self.net(input)

In [8]:
# Defining the tiltedLC and its corresponding torch class

def tiltedLC(x, y, tau, h):
    """
    Tilted (quantile-weighted) log-cosh loss.

    Each element contributes ``w * log(cosh((y - x) / h))``, where the weight
    ``w`` is ``tau`` for positive errors, ``1 - tau`` for negative errors and
    0.5 for an exactly-zero error. The mean over all elements is multiplied
    by ``h``.

    Args:
        x: predictions (tensor).
        y: targets (tensor broadcastable against ``x``).
        tau: tilt / quantile level.
        h: smoothing scale that divides the error inside the log-cosh.

    Returns:
        Scalar tensor holding the loss.
    """
    import math  # local import: only needed for the log(2) constant below

    e= y-x # errors
    ind= (torch.sign(e)+1)/2 # 1 for e>0, 0 for e<0, 0.5 at e==0
    quantFactor= (1-tau)*(1-ind) + tau*ind
    # Stable log-cosh: log(cosh(z)) = |z| + log(1 + exp(-2|z|)) - log(2).
    # Evaluating cosh directly overflows to inf for large |z|, which makes the
    # whole mean inf; exp(-2|z|) stays in (0, 1] and cannot overflow.
    z= torch.abs(e/h)
    logcosh= z + torch.log1p(torch.exp(-2*z)) - math.log(2.0)
    loss= quantFactor*logcosh
    loss= torch.mean(loss)*h
    return loss

class TiltedLC(nn.Module):
    """Module wrapper so the tilted log-cosh loss can be used as a criterion object."""

    # No __init__ override: the module holds no parameters or state of its
    # own, so nn.Module's default constructor is all that is needed.
    def forward(self, x, y, tau, h):
        """Delegate to `tiltedLC` with the same arguments and return its result."""
        value = tiltedLC(x, y, tau, h)
        return value

## Instantiations and training loops with constant learning rates

In [72]:
# Loss, model and optimizer for the small architecture
# (two hidden layers of 300 units, dropout 0.1, constant-LR SGD).
size1= size2= 300
tau, h= 0.5, 0.4  # tilt level and smoothing scale passed to the loss
device= 'cuda' if torch.cuda.is_available() else 'cpu'
model1= Network(size1, size2, 0.1).to(device)
criterion1= TiltedLC()
optimizer1= optim.SGD(model1.parameters(), lr=0.1)

In [77]:
# Baseline training loop: constant-LR SGD on the small architecture.
# A stopping threshold (0.5040) is not used for now; every architecture is
# trained for the same number of epochs so the loss curves can be compared.
ls_a= []  # mean batch loss of each epoch
N_EPOCHS= 500
for epoch in range(N_EPOCHS):
    epoch_loss= 0.0
    for batch in trainLoader:
        inputs, labels= batch
        inputs, labels= inputs.to(device), labels.to(device)
        optimizer1.zero_grad()
        outputs= model1(inputs)
        loss= criterion1(outputs, labels, tau, h)
        loss.backward()
        optimizer1.step()
        epoch_loss+= loss.item()
    mean_loss= epoch_loss/len(trainLoader)
    ls_a.append(mean_loss)
    print("Epoch: {} Loss: {}".format(epoch, mean_loss))

Epoch: 0 Loss: 0.13386895350536082
Epoch: 1 Loss: 0.1336473765160732
Epoch: 2 Loss: 0.13338832228092692
Epoch: 3 Loss: 0.13315786012597522
Epoch: 4 Loss: 0.13294416327660974
Epoch: 5 Loss: 0.13271940638865035
Epoch: 6 Loss: 0.13252280629437113
Epoch: 7 Loss: 0.132285691769547
Epoch: 8 Loss: 0.13209931786490697
Epoch: 9 Loss: 0.13190648027893187
Epoch: 10 Loss: 0.1317071426512851
Epoch: 11 Loss: 0.1315119389885504
Epoch: 12 Loss: 0.13133860254095517
Epoch: 13 Loss: 0.1311479967038517
Epoch: 14 Loss: 0.13099940435115634
Epoch: 15 Loss: 0.13081443068052795
Epoch: 16 Loss: 0.13064387180562106
Epoch: 17 Loss: 0.13048930255314656
Epoch: 18 Loss: 0.1303272681222523
Epoch: 19 Loss: 0.13016274789124432
Epoch: 20 Loss: 0.13001288522865037
Epoch: 21 Loss: 0.12986963122030826
Epoch: 22 Loss: 0.12971601760848064
Epoch: 23 Loss: 0.12958107533819546
Epoch: 24 Loss: 0.12943074332526344
Epoch: 25 Loss: 0.12929609544216847
Epoch: 26 Loss: 0.1291717951525076
Epoch: 27 Loss: 0.1290396464190984
Epoch: 28 L

In [73]:
# Loss, model and optimizer for the medium architecture
# (two hidden layers of 1000 units, dropout 0.1, constant-LR SGD).
size1= size2= 1000
tau, h= 0.5, 0.4  # tilt level and smoothing scale passed to the loss
device= 'cuda' if torch.cuda.is_available() else 'cpu'
model2= Network(size1, size2, 0.1).to(device)
criterion2= TiltedLC()
optimizer2= optim.SGD(model2.parameters(), lr=0.1)

In [78]:
# Constant-LR SGD training loop for the medium architecture.
# A stopping threshold (0.5040) is not used for now; every architecture is
# trained for the same number of epochs so the loss curves can be compared.
ls_b= []  # mean batch loss of each epoch
N_EPOCHS= 500
for epoch in range(N_EPOCHS):
    epoch_loss= 0.0
    for batch in trainLoader:
        inputs, labels= batch
        inputs, labels= inputs.to(device), labels.to(device)
        optimizer2.zero_grad()
        outputs= model2(inputs)
        loss= criterion2(outputs, labels, tau, h)
        loss.backward()
        optimizer2.step()
        epoch_loss+= loss.item()
    mean_loss= epoch_loss/len(trainLoader)
    ls_b.append(mean_loss)
    print("Epoch: {} Loss: {}".format(epoch, mean_loss))

Epoch: 0 Loss: 0.22437675972121016
Epoch: 1 Loss: 0.18908993501724197
Epoch: 2 Loss: 0.17391823054331743
Epoch: 3 Loss: 0.1645287473692247
Epoch: 4 Loss: 0.1580595013389736
Epoch: 5 Loss: 0.15333088454234237
Epoch: 6 Loss: 0.14964595208740775
Epoch: 7 Loss: 0.14657005375756849
Epoch: 8 Loss: 0.14395680926690507
Epoch: 9 Loss: 0.1417040350850134
Epoch: 10 Loss: 0.13972982083343424
Epoch: 11 Loss: 0.13798714435045795
Epoch: 12 Loss: 0.13643127892161677
Epoch: 13 Loss: 0.13504712467090738
Epoch: 14 Loss: 0.13380356526116038
Epoch: 15 Loss: 0.13268313399871562
Epoch: 16 Loss: 0.13165820268650696
Epoch: 17 Loss: 0.13072460318305346
Epoch: 18 Loss: 0.12984753040107944
Epoch: 19 Loss: 0.12906806141411886
Epoch: 20 Loss: 0.12830473296579167
Epoch: 21 Loss: 0.12760660851179983
Epoch: 22 Loss: 0.12694354597334817
Epoch: 23 Loss: 0.12632951623731808
Epoch: 24 Loss: 0.1257495993664084
Epoch: 25 Loss: 0.12520848965455855
Epoch: 26 Loss: 0.12469043632607872
Epoch: 27 Loss: 0.1241981894102661
Epoch: 

In [74]:
# Loss, model and optimizer for the large architecture
# (two hidden layers of 3000 units, dropout 0.1, constant-LR SGD).
size1= size2= 3000
tau, h= 0.5, 0.4  # tilt level and smoothing scale passed to the loss
device= 'cuda' if torch.cuda.is_available() else 'cpu'
model3= Network(size1, size2, 0.1).to(device)
criterion3= TiltedLC()
optimizer3= optim.SGD(model3.parameters(), lr=0.1)

In [None]:
# Constant-LR SGD training loop for the large architecture.
# A stopping threshold (0.5040) is not used for now; every architecture is
# trained for the same number of epochs so the loss curves can be compared.
ls_c= []  # mean batch loss of each epoch
N_EPOCHS= 500
for epoch in range(N_EPOCHS):
    epoch_loss= 0.0
    for batch in trainLoader:
        inputs, labels= batch
        inputs, labels= inputs.to(device), labels.to(device)
        optimizer3.zero_grad()
        outputs= model3(inputs)
        loss= criterion3(outputs, labels, tau, h)
        loss.backward()
        optimizer3.step()
        epoch_loss+= loss.item()
    mean_loss= epoch_loss/len(trainLoader)
    ls_c.append(mean_loss)
    print("Epoch: {} Loss: {}".format(epoch, mean_loss))

In [83]:
# Loss, model and optimizer for the wide-medium architecture
# (one hidden layer of 2000 units, dropout 0.2, constant-LR SGD).
size1= 2000
tau, h= 0.5, 0.4  # tilt level and smoothing scale passed to the loss
device= 'cuda' if torch.cuda.is_available() else 'cpu'
model4= wideM(size1, 0.2).to(device)
criterion4= TiltedLC()
optimizer4= optim.SGD(model4.parameters(), lr=0.1)

In [84]:
# Constant-LR SGD training loop for the wide-medium architecture.
# A stopping threshold (0.5040) is not used for now; every architecture is
# trained for the same number of epochs so the loss curves can be compared.
ls_d= []  # mean batch loss of each epoch
N_EPOCHS= 500
for epoch in range(N_EPOCHS):
    epoch_loss= 0.0
    for batch in trainLoader:
        inputs, labels= batch
        inputs, labels= inputs.to(device), labels.to(device)
        optimizer4.zero_grad()
        outputs= model4(inputs)
        loss= criterion4(outputs, labels, tau, h)
        loss.backward()
        optimizer4.step()
        epoch_loss+= loss.item()
    mean_loss= epoch_loss/len(trainLoader)
    ls_d.append(mean_loss)
    print("Epoch: {} Loss: {}".format(epoch, mean_loss))

Epoch: 0 Loss: 0.209590157707024
Epoch: 1 Loss: 0.1704578223560892
Epoch: 2 Loss: 0.15676667146388742
Epoch: 3 Loss: 0.14901596169373169
Epoch: 4 Loss: 0.14367494402630138
Epoch: 5 Loss: 0.13961779341904285
Epoch: 6 Loss: 0.13637957213252885
Epoch: 7 Loss: 0.1337346183063962
Epoch: 8 Loss: 0.13150652691687018
Epoch: 9 Loss: 0.12961816856574848
Epoch: 10 Loss: 0.12796683837928938
Epoch: 11 Loss: 0.1265219940304477
Epoch: 12 Loss: 0.12522861315197525
Epoch: 13 Loss: 0.12408294251423187
Epoch: 14 Loss: 0.12303413431658614
Epoch: 15 Loss: 0.12208520993925906
Epoch: 16 Loss: 0.12122030708810656
Epoch: 17 Loss: 0.12041974248751458
Epoch: 18 Loss: 0.11967730591681987
Epoch: 19 Loss: 0.11899999151695227
Epoch: 20 Loss: 0.11835443277473687
Epoch: 21 Loss: 0.11776173981170013
Epoch: 22 Loss: 0.11720020041457599
Epoch: 23 Loss: 0.11667675201344374
Epoch: 24 Loss: 0.11619489071170955
Epoch: 25 Loss: 0.11572806608288208
Epoch: 26 Loss: 0.115306452062995
Epoch: 27 Loss: 0.11489788307824278
Epoch: 28

In [82]:
# Persist the per-epoch losses of the four constant-LR SGD runs as .npy files
A, B, C, D= map(np.asarray, (ls_a, ls_b, ls_c, ls_d))

np.save("/home/aryamanj/Downloads/ls_a.npy", A)
np.save("/home/aryamanj/Downloads/ls_b.npy", B)
np.save("/home/aryamanj/Downloads/ls_c.npy", C)
np.save("/home/aryamanj/Downloads/ls_d.npy", D)

## Instantiations and training loops using adaH as the optimizer algorithm

In [11]:
# Small architecture (2 x 300 hidden units, dropout 0.1) trained with AdaHessian
size1= size2= 300
tau, h= 0.5, 0.4  # tilt level and smoothing scale passed to the loss
device= 'cuda' if torch.cuda.is_available() else 'cpu'
model5= Network(size1, size2, 0.1).to(device)
criterion5= TiltedLC()
optimizer5= Adahessian(model5.parameters(), lr=.15)
# Step-decay schedule: the LR is multiplied by 0.1 at scheduler steps 30 and 45
scheduler5= lr_scheduler.MultiStepLR(optimizer5, milestones=[30, 45], gamma=.1, last_epoch=-1)

In [12]:
# Training loop for the small architecture with AdaHessian.
# A stopping threshold (0.5040) is not used for now; training simply runs for
# a fixed number of epochs so that runs can be compared.
ls_e= []  # mean batch loss of each epoch
N_EPOCHS= 100
for epoch in range(N_EPOCHS):
    epoch_loss= 0.0
    for inputs, labels in trainLoader:
        inputs= inputs.to(device)
        labels= labels.to(device)
        optimizer5.zero_grad()
        outputs= model5(inputs)
        loss= criterion5(outputs, labels, tau, h)
        # create_graph=True keeps the gradients differentiable; presumably the
        # AdaHessian step needs this for its second-order estimate (confirm
        # against the adahessian package in use).
        loss.backward(create_graph=True)
        _, gradsH = get_params_grad(model5)
        optimizer5.step(gradsH)
        epoch_loss+= loss.item()
    # Advance the MultiStepLR schedule once per epoch. Without this call the
    # scheduler is never stepped and the LR never decays at its milestones.
    scheduler5.step()
    ls_e.append((epoch_loss/len(trainLoader)))
    print("Epoch: {} Loss: {}".format(epoch,
           epoch_loss/len(trainLoader)))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /opt/conda/conda-bld/pytorch_1616554794034/work/torch/csrc/utils/python_arg_parser.cpp:1005.)
  exp_avg.mul_(beta1).add_(1 - beta1, grad)


Epoch: 0 Loss: 0.16603829289149474
Epoch: 1 Loss: 0.16330337790439117
Epoch: 2 Loss: 0.16223583395680616
Epoch: 3 Loss: 0.16146252985082002
Epoch: 4 Loss: 0.16090061644421377
Epoch: 5 Loss: 0.16058397461134902
Epoch: 6 Loss: 0.16024065135277363
Epoch: 7 Loss: 0.15990553915747607
Epoch: 8 Loss: 0.15965878882938153
Epoch: 9 Loss: 0.15938243182677017
Epoch: 10 Loss: 0.15918316465281213
Epoch: 11 Loss: 0.15904777714758817
Epoch: 12 Loss: 0.1588188072938658
Epoch: 13 Loss: 0.15868130317815166
Epoch: 14 Loss: 0.15861690969316622
Epoch: 15 Loss: 0.15847898997706006
Epoch: 16 Loss: 0.1583482665661605
Epoch: 17 Loss: 0.1583395162182893
Epoch: 18 Loss: 0.15824741795940928
Epoch: 19 Loss: 0.15810321528451804
Epoch: 20 Loss: 0.1580750745006605
Epoch: 21 Loss: 0.15792126501547155
Epoch: 22 Loss: 0.15787840801823183
Epoch: 23 Loss: 0.1578085291758247
Epoch: 24 Loss: 0.15776998399775471
Epoch: 25 Loss: 0.1576643417349615
Epoch: 26 Loss: 0.15766901335502997
Epoch: 27 Loss: 0.15766645248001535
Epoch: 2

KeyboardInterrupt: 

In [16]:
# Medium architecture (2 x 1000 hidden units, dropout 0.1) trained with AdaHessian
size1= size2= 1000
tau, h= 0.5, 0.4  # tilt level and smoothing scale passed to the loss
device= 'cuda' if torch.cuda.is_available() else 'cpu'
model6= Network(size1, size2, 0.1).to(device)
criterion6= TiltedLC()
optimizer6= Adahessian(model6.parameters(), lr=.15)
# Step-decay schedule: the LR is multiplied by 0.1 at scheduler steps 30 and 45
scheduler6= lr_scheduler.MultiStepLR(optimizer6, milestones=[30, 45], gamma=.1, last_epoch=-1)

In [None]:
# Training loop for the medium architecture with AdaHessian.
ls_f= []  # mean batch loss of each epoch
N_EPOCHS= 100
for epoch in range(N_EPOCHS):
    epoch_loss= 0.0
    for inputs, labels in trainLoader:
        inputs= inputs.to(device)
        labels= labels.to(device)
        optimizer6.zero_grad()
        outputs= model6(inputs)
        loss= criterion6(outputs, labels, tau, h)
        # create_graph=True keeps the gradients differentiable; presumably the
        # AdaHessian step needs this for its second-order estimate (confirm
        # against the adahessian package in use).
        loss.backward(create_graph=True)
        _, gradsH = get_params_grad(model6)
        optimizer6.step(gradsH)
        epoch_loss+= loss.item()
    # Advance the MultiStepLR schedule once per epoch. Without this call the
    # scheduler is never stepped and the LR never decays at its milestones.
    scheduler6.step()
    ls_f.append((epoch_loss/len(trainLoader)))
    print("Epoch: {} Loss: {}".format(epoch,
           epoch_loss/len(trainLoader)))

In [None]:
# Wide-medium architecture (1 x 2000 hidden units, dropout 0.2) trained with AdaHessian
size1= 2000
tau, h= 0.5, 0.4  # tilt level and smoothing scale passed to the loss
device= 'cuda' if torch.cuda.is_available() else 'cpu'
model7= wideM(size1, 0.2).to(device)
criterion7= TiltedLC()
optimizer7= Adahessian(model7.parameters(), lr=.15)
# Step-decay schedule: the LR is multiplied by 0.1 at scheduler steps 30 and 45
scheduler7= lr_scheduler.MultiStepLR(optimizer7, milestones=[30, 45], gamma=.1, last_epoch=-1)

In [None]:
# Training loop for the wide-M architecture with AdaHessian.
ls_g= []  # mean batch loss of each epoch
N_EPOCHS= 100
for epoch in range(N_EPOCHS):
    epoch_loss= 0.0
    for inputs, labels in trainLoader:
        inputs= inputs.to(device)
        labels= labels.to(device)
        optimizer7.zero_grad()
        outputs= model7(inputs)
        loss= criterion7(outputs, labels, tau, h)
        # create_graph=True keeps the gradients differentiable; presumably the
        # AdaHessian step needs this for its second-order estimate (confirm
        # against the adahessian package in use).
        loss.backward(create_graph=True)
        _, gradsH = get_params_grad(model7)
        optimizer7.step(gradsH)
        epoch_loss+= loss.item()
    # Advance the MultiStepLR schedule once per epoch. Without this call the
    # scheduler is never stepped and the LR never decays at its milestones.
    scheduler7.step()
    ls_g.append((epoch_loss/len(trainLoader)))
    print("Epoch: {} Loss: {}".format(epoch,
           epoch_loss/len(trainLoader)))

In [None]:
# Persist the per-epoch losses of the three AdaHessian runs as .npy files.
E= np.asarray(ls_e)
# Deliberately not named `F`: the notebook imports `torch.nn.functional as F`,
# and rebinding `F` to an array breaks every later `F.<function>` call.
F_losses= np.asarray(ls_f)
G= np.asarray(ls_g)

np.save("/home/aryamanj/Downloads/ls_e.npy", E)
# The file name previously lacked the dot ("ls_fnpy"); np.save appends ".npy"
# to names without that extension, so the file ended up as "ls_fnpy.npy".
np.save("/home/aryamanj/Downloads/ls_f.npy", F_losses)
np.save("/home/aryamanj/Downloads/ls_g.npy", G)

I see the following steps to incorporating LALR into the code:
1. Be able to compute the $K_z$: 
    * This is defined to be the largest activation in the penultimate layer
    * Hence, our network needs a method that allows us to obtain its penultimate output and then simply take a supremum over it
2. For now, a possible Lipschitz constant (LC) of the log-cosh seems to be just the same as for the check loss, namely, $\frac{K_z}{m}\times \max(\tau, 1-\tau)$ -- these kinds of LCs are all expressible in the form $C \cdot \frac{K_z}{m}$ (where $C$ is some constant which we know)

In [2]:
# LALR code:

# A new network class for LALR training, that supports returning penultimate activations
class LALRnetwork(nn.Module):
    """
    Two-hidden-layer network that also exposes its penultimate activations.

    Args:
        size1: width of the first hidden layer.
        size2: width of the second hidden layer (the penultimate layer).
        drop: dropout probability of both dropout layers.

    NOTE(review): dropout is applied *before* tanh here, whereas `Network`
    applies it after the activation. The original order is kept; confirm it
    is intended.
    """
    def __init__(self, size1, size2, drop):
        super(LALRnetwork, self).__init__()
        self.l1= nn.Linear(943, size1)
        self.l2= nn.Dropout(p= drop)
        self.l3= nn.Linear(size1, size2)
        self.l4= nn.Dropout(p= drop)
        self.l5= nn.Linear(size2, 4760)

    def forward(self, x):
        """Return the network output: the last linear layer applied to `penU(x)`."""
        # Reuse penU so the hidden stack is defined in one place only.
        return self.l5(self.penU(x))

    def penU(self, x):
        """Return the penultimate activations (output of the second tanh)."""
        # torch.tanh is used rather than the functional alias so the class
        # does not depend on the module-level name `F`.
        x= torch.tanh(self.l2(self.l1(x)))
        x= torch.tanh(self.l4(self.l3(x)))
        return x

def computeLR(model, input, factor=None, bSize= 16, flag= 0):
    """
    Compute the Lipschitz-adaptive learning rate (LALR) for the coming epoch.

    Kz is the largest norm, over all batches, of the penultimate activations
    returned by `model.penU` (for a 2-D batch this is the Frobenius norm of
    the whole batch). The Lipschitz-constant estimate is
    ``Kz / bSize * max(tau, 1 - tau)`` and the returned learning rate is its
    inverse.

    Args:
        model: network exposing a `penU(inputs)` method (e.g. LALRnetwork).
        input: a DataLoader to scan. Any other value falls back to the
            notebook-level `trainLoader`, which was the previous behaviour.
        factor: unused. It now defaults to None so that calls which omit it
            (such as the one in `train_adaptive_lr`) no longer raise TypeError.
        bSize: batch size used in the Lipschitz-constant estimate.
        flag: if truthy, the estimate is scaled by 0.01 before inversion
            (i.e. the returned LR is 100 times larger).

    Returns:
        The learning rate as a float, or 0.1 when the estimate is zero.

    Side effects:
        Leaves `model` in eval mode; callers must call `model.train()` again.
        Reads the notebook-level globals `device` and `tau`.
    """
    loader= input if isinstance(input, torch.utils.data.DataLoader) else trainLoader
    Kz = 0.0
    model.eval()
    with torch.no_grad():
        for batch in loader:
            # Only the inputs are needed; the labels are not moved to the device.
            inputs= batch[0].to(device)
            op= model.penU(inputs)
            activ= np.linalg.norm(op.detach().cpu().numpy())
            if activ > Kz:
                Kz= activ
    LR= Kz/bSize*(max(tau,1-tau))
    if LR==0:
        # No activation signal at all: fall back to a fixed default LR.
        return 0.1
    if flag:
        return float(1/(LR*0.01))
    return float(1/LR)


In [None]:
# writing training loops for networks with LALR

ls_e= []  # mean batch loss of every epoch trained through train_adaptive_lr
def train_adaptive_lr(model,optimizer,loader, epochs, verbose=False):
    """
    Train `model` for one pass over `loader` with a Lipschitz-adaptive LR.

    The learning rate is recomputed with `computeLR` before the pass and
    written into the optimizer's first parameter group. The mean batch loss
    is appended to the notebook-level list `ls_e` and printed.

    Args:
        model: network exposing `penU` (needed by `computeLR`).
        optimizer: torch optimizer whose first param group receives the LR.
        loader: DataLoader yielding (inputs, labels) batches.
        epochs: value printed as the epoch number.
            NOTE(review): the name suggests an epoch count, but the function
            performs a single pass and previously printed the global `epoch`;
            it is treated here as the index of the current epoch. Confirm.
        verbose: if True, also print the learning rate that was computed.

    Returns:
        The mean batch loss of the pass (previously nothing was returned).

    Relies on the notebook-level globals `criterion1`, `tau`, `h` and `device`.
    """
    epoch_loss= 0.0
    # Use the loader's own batch size when it has one; 16 was the hard-coded value.
    batch_size= getattr(loader, "batch_size", None) or 16
    # `factor` is passed explicitly (None) so the call matches computeLR's
    # positional signature.
    lr_val= computeLR(model, loader, None, bSize=batch_size, flag=0)
    optimizer.param_groups[0]['lr'] = lr_val
    if verbose:
        print("Epoch: {} LR: {}".format(epochs, lr_val))
    model.train()
    # Iterate the loader that was passed in, not the global trainLoader.
    for inputs, labels in loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs= model(inputs)
        loss= criterion1(outputs, labels, tau, h)
        # Gradients come from the loss tensor; optimizers have no backward().
        loss.backward()
        optimizer.step()
        epoch_loss+= loss.item()
    mean_loss= epoch_loss/len(loader)
    ls_e.append(mean_loss)
    print("Epoch: {} Loss: {}".format(epochs, mean_loss))
    return mean_loss