In [1]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch import optim
from adahessian import Adahessian, get_params_grad
import torch.optim.lr_scheduler as lr_scheduler
from fastai.vision import *

## Writing the data-loader class for the GEO-dataset (can be changed to wrok for both the training and validation sets)

In [5]:
def GEOdataLoader(path, bSize= 16):
    """
    Specify the path to the parent folder containing the .npy files
    """
    X_tr= np.load(os.path.join(path, "X_tr.npy"))
    Y_tr= np.load(os.path.join(path, "Y_tr.npy"))
    X_va= np.load(os.path.join(path, "X_va.npy"))
    Y_va= np.load(os.path.join(path, "Y_va.npy"))
    print(X_tr.shape)
    print(Y_tr.shape)
    print(X_va.shape)
    print(Y_va.shape)
    test_data= torch.utils.data.TensorDataset(torch.from_numpy(X_tr).float(), torch.from_numpy(Y_tr).float())
    val_data= torch.utils.data.TensorDataset(torch.from_numpy(X_va).float(), torch.from_numpy(Y_va).float())
    trainLoader= torch.utils.data.DataLoader(test_data, batch_size=bSize, shuffle=True) 
    valLoader= torch.utils.data.DataLoader(val_data, batch_size=bSize, shuffle=True) 
    return (trainLoader, valLoader)

(88807, 943)
(88807, 4760)


In [6]:
# standardised class for all network architectures
class Network(nn.Module):
    def __init__(self, size1, size2, drop):
        super(Network, self).__init__()
        self.net= nn.Sequential(
            nn.Linear(943, size1),
            nn.Tanh(),
            nn.Dropout(p= drop),
            nn.Linear(size1, size2),
            nn.Tanh(),
            nn.Dropout(p= drop),
            nn.Linear(size2, 4760) 
        )

    def forward(self, input):
        return self.net(input)

In [7]:
# network class for the wide-medium architecture
class wideM(nn.Module):
    def __init__(self, size1, drop):
        super(wideM, self).__init__()
        self.net= nn.Sequential(
            nn.Linear(943, size1),
            nn.Tanh(),
            nn.Dropout(p= drop),
            nn.Linear(size1, 4760) 
        )

    def forward(self, input):
        return self.net(input)

In [8]:
# defining all the criterions to be used in the following experiments:


def tiltedLC(x, y, tau, h):
    e= y-x # errors
    ind= (torch.sign(e)+1)/2 # the division in the log-cosh is only about the origin
    quantFactor= (1-tau)*(1-ind) + tau*ind
    loss= quantFactor*torch.log(torch.cosh(e/h))
    loss= torch.mean(loss)*h
    return loss

def check_loss(x, y, tau): # the x,*args way to pass arguments to this function is an idiom for the scipy.optimize() library y = args[0][0]
    e = y-x
    # below I(e<0)
    ind = (np.sign(-e)+1)/2
    loss = np.mean(e*(tau-ind))
    return loss


class TiltedLC(nn.Module):
    def __init__(self):
        super(TiltedLC, self).__init__()
    def forward(self, x, y, tau, h):
        return tiltedLC(x, y, tau, h)

class CheckLoss(nn.Module):
    def __init__(self):
        super(CheckLoss, self).__init__()
    def forward(self, x, y, tau):
        return check_loss(x, y, tau)

In [None]:
# global initialisations:
h= 0.4 # smoothing parameter for the log-cosh 
device= ('cuda' if torch.cuda.is_available() else 'cpu')
trainLoader, valLoader= GEOdataLoader("/home/aryamanj/Downloads/LGdata")
criterion= TiltedLC()
criterion1= CheckLoss()
criterion2= nn.L1Loss()
N_EPOCHS= 500

## Instantiations and training loops start for constant LR's

In [72]:
# instantiating objects for all constantLR tests:
size1,size2= 300,300
model_CLR_S= Network(size1, size2, 0.1).to(device)
optimizer_CLR_S= optim.SGD(model_CLR_S.parameters(), lr= 0.1)
lossList_CLR_S= []

size1, size2= 1000, 1000
model_CLR_M= Network(size1, size2, 0.1).to(device)
optimizer_CLR_M= optim.SGD(model_CLR_M.parameters(), lr= 0.1)
lossList_CLR_M= []

size1, size2= 3000, 3000
model_CLR_L= Network(size1, size2, 0.1).to(device)
optimizer_CLR_L= optim.SGD(model_CLR_L.parameters(), lr= 0.1)
lossList_CLR_L= []

size1= 2000
model_CLR_WM= wideM(size1, 0.1).to(device)
optimizer_CLR_WM= optim.SGD(model_CLR_WM.parameters(), lr= 0.1)
lossList_CLR_WM= []

In [77]:
def trainConstantLR(model, optimizer, criterion, tau, N_EPOCHS, lossList):
    """
    The fundamental loop used for both constantLR training
    """
    for epoch in range(N_EPOCHS):
        # training loop
        epoch_loss = 0.0
        model.train()
        for inputs, labels in trainLoader: 
            inputs= inputs.to(device) 
            labels= labels.to(device)
            optimizer.zero_grad() 
            outputs= model(inputs) 
            loss= criterion(outputs, labels, tau, h) 
            loss.backward() 
            optimizer.step() 
            epoch_loss+= loss.item() 
        lossList.append((epoch_loss/len(trainLoader)))
    
        # validation loop
        val_loss= 0.0
        model.eval()
        for inputs, labels in valLoader:
            inputs= inputs.to(device)
            labels= labels.to(device)
            outputs= model(inputs)
            loss= criterion(outputs, labels, tau, h)
            val_loss+= loss.item()
        print("Epoch: {} Train Loss: {} Val Loss: {} LR: {}".format(epoch,epoch_loss/len(trainLoader),
            val_loss/len(valLoader),optimizer.param_groups[0]['lr']))

def trainadaH(model, optimizer, criterion, tau, N_EPOCHS, lossList):
    """
    The fundamental loop used for adaH training
    """
    for epoch in range(N_EPOCHS):
        epoch_loss= 0.0
        for inputs, labels in trainLoader: 
            inputs= inputs.to(device) 
            labels= labels.to(device)
            optimizer.zero_grad() 
            outputs= model(inputs) 
            loss= criterion(outputs, labels, tau, h) 
            loss.backward(create_graph=True)
            _, gradsH = get_params_grad(model)
            optimizer.step(gradsH)
            epoch_loss+= loss.item() 
        lossList.append((epoch_loss/len(trainLoader)))
        
        # validation loop
        val_loss= 0.0
        model.eval()
        for inputs, labels in valLoader:
            inputs= inputs.to(device)
            labels= labels.to(device)
            outputs= model(inputs)
            loss= criterion(outputs, labels, tau, h)
            val_loss+= loss.item()
        print("Epoch: {} Train Loss: {} Val Loss: {} LR: {}".format(epoch,epoch_loss/len(trainLoader),
            val_loss/len(valLoader),optimizer.param_groups[0]['lr']))

Epoch: 0 Loss: 0.13386895350536082
Epoch: 1 Loss: 0.1336473765160732
Epoch: 2 Loss: 0.13338832228092692
Epoch: 3 Loss: 0.13315786012597522
Epoch: 4 Loss: 0.13294416327660974
Epoch: 5 Loss: 0.13271940638865035
Epoch: 6 Loss: 0.13252280629437113
Epoch: 7 Loss: 0.132285691769547
Epoch: 8 Loss: 0.13209931786490697
Epoch: 9 Loss: 0.13190648027893187
Epoch: 10 Loss: 0.1317071426512851
Epoch: 11 Loss: 0.1315119389885504
Epoch: 12 Loss: 0.13133860254095517
Epoch: 13 Loss: 0.1311479967038517
Epoch: 14 Loss: 0.13099940435115634
Epoch: 15 Loss: 0.13081443068052795
Epoch: 16 Loss: 0.13064387180562106
Epoch: 17 Loss: 0.13048930255314656
Epoch: 18 Loss: 0.1303272681222523
Epoch: 19 Loss: 0.13016274789124432
Epoch: 20 Loss: 0.13001288522865037
Epoch: 21 Loss: 0.12986963122030826
Epoch: 22 Loss: 0.12971601760848064
Epoch: 23 Loss: 0.12958107533819546
Epoch: 24 Loss: 0.12943074332526344
Epoch: 25 Loss: 0.12929609544216847
Epoch: 26 Loss: 0.1291717951525076
Epoch: 27 Loss: 0.1290396464190984
Epoch: 28 L

In [None]:
# Calls to training loops themselves:
trainConstantLR(model_CLR_S, optimizer_CLR_S, criterion, tau, N_EPOCHS, lossList_CLR_S)

In [None]:
trainConstantLR(model_CLR_M, optimizer_CLR_M, criterion, tau, N_EPOCHS, lossList_CLR_M)

In [None]:
trainConstantLR(model_CLR_L, optimizer_CLR_L, criterion, tau, N_EPOCHS, lossList_CLR_M)

In [None]:
trainConstantLR(model_CLR_WM, optimizer_CLR_WM, criterion, tau, N_EPOCHS, lossList_CLR_WM)

In [82]:
# saving the EPOCH-losses
A= np.asarray(ls_a)
B= np.asarray(ls_b)
C= np.asarray(ls_c)
D= np.asarray(ls_d)

np.save("/home/aryamanj/Downloads/ls_a.npy", A)
np.save("/home/aryamanj/Downloads/ls_b.npy", B)
np.save("/home/aryamanj/Downloads/ls_c.npy", C)
np.save("/home/aryamanj/Downloads/ls_d.npy", D)

## Instantiations and training loops using adaH as the optimizer algorithm

In [None]:
# instantiating objects for all adaH tests:
size1,size2= 300,300
model_adaH_S= Network(size1, size2, 0.1).to(device)
optimizer_adaH_S= optim.SGD(model_adaH_S.parameters(), lr= 0.1)
lossList_adaH_S= []

size1, size2= 1000, 1000
model_adaH_M= Network(size1, size2, 0.1).to(device)
optimizer_adaH_M= optim.SGD(model_adaH_M.parameters(), lr= 0.1)
lossList_adaH_M= []

size1, size2= 3000, 3000
model_adaH_L= Network(size1, size2, 0.1).to(device)
optimizer_adaH_L= optim.SGD(model_adaH_L.parameters(), lr= 0.1)
lossList_adaH_L= []

size1= 2000
model_adaH_WM= wideM(size1, 0.1).to(device)
optimizer_adaH_WM= optim.SGD(model_adaH_WM.parameters(), lr= 0.1)
lossList_adaH_WM= []

In [None]:
trainadaH(model_adaH_S, optimizer_adaH_S, criterion, tau, N_EPOCHS, lossList_adaH_S)

In [None]:
trainadaH(model_adaH_M, optimizer_adaH_M, criterion, tau, N_EPOCHS, lossList_adaH_M)

In [None]:
trainadaH(model_adaH_L, optimizer_adaH_L, criterion, tau, N_EPOCHS, lossList_adaH_L)

In [None]:
trainadaH(model_adaH_WM, optimizer_adaH_WM, criterion, tau, N_EPOCHS, lossList_adaH_WM)

In [22]:
E= np.asarray(ls_e)
F= np.asarray(ls_f)
G= np.asarray(ls_g)

np.save("/home/aryamanj/Downloads/ls_e.npy", E)
np.save("/home/aryamanj/Downloads/ls_f.npy", F)
np.save("/home/aryamanj/Downloads/ls_g.npy", G)

I see the following steps to incorporating LALR into the code:
1. Be able to compute the $K_z$: 
    * This is defined to be the largest activation in the penultimate layer
    * Hence, our network needs a method that allows us to obtain it's penultimate output and then simply take a supremum over it 
2. For now, a possible LC of the log-cosh seems to be just the same as for the check, namely, $\frac{K_z}{m}\times \max(\tau, 1-\tau)$ -- these kinds of LC's are all expressable in the form: $C.\frac{K_z}{m}$ (where C is some constant which we know)

The next expression that we want to try out, is: $\frac{1}{m}\tanh(g(0)-y_k).K_z: \text{ } z_k^{[L]}=0$, or, exploiting the continuity of the regression problem, we can also write: $\frac{1}{m}\tanh(g(0)-y_k).K_z,\text{ where: } k= \argmin_j z_j^{[L]}$

Hence, for implementing the above expression in addition to the penultimate function, we also need access to the set of pre-activation values, and be able to compute the $\argmin$ from among them, since in this network, we do not have any activation function acting on the final layer, we can just compute the minimum feature in the final layer

In [52]:
# A new network class for LALR training, that supports returning penultimate activations
class LALRnetwork(nn.Module):
    def __init__(self, size1, size2, drop):
        super(LALRnetwork, self).__init__()
        self.l1= nn.Linear(943, size1)
        self.l2= nn.Dropout(p= drop)
        self.l3= nn.Linear(size1, size2)
        self.l4= nn.Dropout(p= drop)
        self.l5= nn.Linear(size2, 4760)
        # creating separate heads for all the different quantiles
        self.fc1= create_head(4760, 4760, ps=drop)
        self.fc2= create_head(4760, 4760, ps=drop)
        self.fc3= create_head(4760, 4760, ps=drop)
        self.fc4= create_head(4760, 4760, ps=drop)
        self.fc5= create_head(4760, 4760, ps=drop)
        self.fc6= create_head(4760, 4760, ps=drop)
        self.fc7= create_head(4760, 4760, ps=drop)
        self.fc8= create_head(4760, 4760, ps=drop)
        self.fc9= create_head(4760, 4760, ps=drop)

    def forward(self, x):
        x= F.tanh(self.l2(self.l1(x)))
        x= self.l5(F.tanh(self.l4(self.l3(x))))
        q1= self.fc1(x)
        q2= self.fc2(x)
        q3= self.fc3(x)
        q4= self.fc4(x)
        q5= self.fc5(x)
        q6= self.fc6(x)
        q7= self.fc7(x)
        q8= self.fc8(x)
        q9= self.fc9(x)
        return [q1,q2,q3,q4,q5,q6,q7,q8,q9]
    
    def penU(self, x):
        x= F.tanh(self.l2(self.l1(x)))
        x= F.tanh(self.l4(self.l3(x)))
        return x

class LALRwideM(nn.Module):
    def __init__(self, size1, drop):
        super(LALRwideM, self).__init__()
        self.l1= nn.Linear(943, size1)
        self.l2= nn.Dropout(p= drop)
        self.l3= nn.Linear(size1, 4760)

    def forward(self,x):
        x= F.tanh(self.l1(x))
        return self.l3(self.l2(x))
        
    def penU(self, x):
        x= F.tanh(self.l1(x))
        return self.l2(x)


def computeLR(model, bSize= 16):
    """
    Takes in a network of the LALRnetwork class(during some arbitrary EPOCH of training) and the current input, and returns Kz for the EPOCH
    """
    Kz = 0.0
    z_k= 0.0
    model.eval()
    with torch.no_grad():
        for i,j in enumerate(trainLoader):
            inputs,labels= j[0],j[1]
            inputs= inputs.to(device)
            labels= labels.to(device)
            op1= model.penU(inputs)
            op2= model(inputs)
            # activ1= np.linalg.norm(op1.detach().cpu().numpy())
            # activ2= np.linalg.norm(op2.detach().cpu().numpy())
            activ1= torch.max(op1)
            activ2, arg2= torch.min(op2)
            if activ1 > Kz:
                Kz= activ1
            if activ2 < z_k:
                z_k= activ2
                argMin= arg2
    LR= (1/bSize)*np.tanh(-op2[argMin])*Kz
    if LR==0:
        return 0.1
    return 1/LR


In [45]:
# writing training loop for networks with LALR

def train_adaptive_lr(model,optimizer, criterion, epochs, ls_list):
    for epoch in range(epochs):
        epoch_loss = 0.0
        lr_val= computeLR(model, bSize=16)
        optimizer.param_groups[0]['lr']= lr_val
        for inputs, labels in trainLoader: 
            inputs= inputs.to(device) 
            labels= labels.to(device)
            optimizer.zero_grad() 
            outputs= model(inputs) 
            loss= criterion(outputs, labels, tau, h) 
            loss.backward() 
            optimizer.step() 
            epoch_loss+= loss.item() 
        ls_list.append((epoch_loss/len(trainLoader)))
        print("Epoch: {} Loss: {} LR: {}".format(epoch,
            epoch_loss/len(trainLoader), optimizer.param_groups[0]['lr']))

In [53]:
# instantiating LALR models
model_lr_1= LALRnetwork(size1=300, size2=300, drop=0.1).to(device)
optimizer_lr_1= optim.SGD(model_lr_1.parameters(), lr= 0.1)
loss_list_lr_1= []

model_lr_2= LALRnetwork(size1=1000, size2=1000, drop=0.1).to(device)
optimizer_lr_2= optim.SGD(model_lr_2.parameters(), lr= 0.1)
loss_list_lr_2= []


model_lr_3= LALRwideM(size1=2000,drop=0.1).to(device)
optimizer_lr_3= optim.SGD(model_lr_3.parameters(), lr= 0.1)
loss_list_lr_3= []

In [40]:
train_adaptive_lr(model_lr_1,optimizer_lr_1, criterion, 100,loss_list_lr_1)



Epoch: 0 Loss: 0.17367235173047973 LR: 1.6327996884124587
Epoch: 1 Loss: 0.1438976589505081 LR: 0.6847130045882568
Epoch: 2 Loss: 0.13761624940702066 LR: 0.700449054833888
Epoch: 3 Loss: 0.13314414638680308 LR: 0.6964594313141763
Epoch: 4 Loss: 0.12983695297783676 LR: 0.705482966403094
Epoch: 5 Loss: 0.12724609879722812 LR: 0.7041584560673653
Epoch: 6 Loss: 0.12514590478828203 LR: 0.7123995500692348
Epoch: 7 Loss: 0.12340140272917736 LR: 0.6974392913040496
Epoch: 8 Loss: 0.1219351899809632 LR: 0.712972342876764
Epoch: 9 Loss: 0.12065037253890418 LR: 0.7236611147731506
Epoch: 10 Loss: 0.11950898959016697 LR: 0.7302756085689559
Epoch: 11 Loss: 0.1184839633505015 LR: 0.7299473932986149
Epoch: 12 Loss: 0.11757076266697346 LR: 0.7263564012705294
Epoch: 13 Loss: 0.1167418597649613 LR: 0.732737926467025
Epoch: 14 Loss: 0.11598094526448136 LR: 0.7012487894345484
Epoch: 15 Loss: 0.11533269464534761 LR: 0.7363674365192464
Epoch: 16 Loss: 0.11469170159771515 LR: 0.7397070626675529
Epoch: 17 Loss:

In [55]:
train_adaptive_lr(model_lr_2,optimizer_lr_2, criterion, 100,loss_list_lr_2)

Epoch: 0 Loss: 0.2674305171842855 LR: 0.9126519997249622
Epoch: 1 Loss: 0.2674271901262449 LR: 0.9352691898299412
Epoch: 2 Loss: 0.2674272424051341 LR: 0.9262805662578283
Epoch: 3 Loss: 0.26743391728781296 LR: 0.9057811818054404
Epoch: 4 Loss: 0.26742825945837606 LR: 0.9015939795771736
Epoch: 5 Loss: 0.2674365390304874 LR: 0.9329796987076587
Epoch: 6 Loss: 0.26742701939506974 LR: 0.8969772091488466
Epoch: 7 Loss: 0.26742735472622703 LR: 0.9006793795554802
Epoch: 8 Loss: 0.2674325669892005 LR: 0.9256528839203082
Epoch: 9 Loss: 0.2674266588331407 LR: 0.9313245655132905
Epoch: 10 Loss: 0.2674270817217065 LR: 0.9233213428398747
Epoch: 11 Loss: 0.26742708029628465 LR: 0.9172196045963499
Epoch: 12 Loss: 0.2674366643145973 LR: 0.933775326875597
Epoch: 13 Loss: 0.2674257676250504 LR: 0.9363839735456096
Epoch: 14 Loss: 0.2674297019825865 LR: 0.9179157425802812
Epoch: 15 Loss: 0.26742823457120884 LR: 0.9263275156422387
Epoch: 16 Loss: 0.2674285500592057 LR: 0.9302631551223366
Epoch: 17 Loss: 0.2

In [56]:
train_adaptive_lr(model_lr_3, optimizer_lr_3, criterion, 100, loss_list_lr_3)

Epoch: 0 Loss: 0.2766984735812352 LR: 0.3429063828778339
Epoch: 1 Loss: 0.2766971208586003 LR: 0.34261955077124656
Epoch: 2 Loss: 0.2766984212916083 LR: 0.3458309937426772
Epoch: 3 Loss: 0.27669959257495574 LR: 0.34909062822853176
Epoch: 4 Loss: 0.2767024405946218 LR: 0.33833348713457956
Epoch: 5 Loss: 0.27669661724713446 LR: 0.34781446297450036
Epoch: 6 Loss: 0.2766980450151418 LR: 0.343522984846034
Epoch: 7 Loss: 0.2767026562950371 LR: 0.3470735479450581
Epoch: 8 Loss: 0.27669903356800035 LR: 0.34617273691945266
Epoch: 9 Loss: 0.2766997841774204 LR: 0.34503833975043874
Epoch: 10 Loss: 0.27669807655427764 LR: 0.34747707182064125
Epoch: 11 Loss: 0.276699979640067 LR: 0.34331283018648623
Epoch: 12 Loss: 0.2767020834821599 LR: 0.3523759372066897
Epoch: 13 Loss: 0.2766986167005667 LR: 0.34504600362198007
Epoch: 14 Loss: 0.27669969829239943 LR: 0.3453722606796819
Epoch: 15 Loss: 0.27670550601393484 LR: 0.3423404908108156
Epoch: 16 Loss: 0.27669683603730594 LR: 0.34144203460606926
Epoch: 17

In [57]:
H= np.asarray(ls_h)
I= np.asarray(ls_i)
J= np.asarray(ls_j)

np.save("/home/aryamanj/Downloads/ls_h.npy", H)
np.save("/home/aryamanj/Downloads/ls_i.npy", I)
np.save("/home/aryamanj/Downloads/ls_j.npy", J)