In [70]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch import optim
from adahessian import Adahessian, get_params_grad
import torch.optim.lr_scheduler as lr_scheduler
from fastai.vision.all import *

## Writing the data-loader function for the GEO dataset (can be changed to work for both the training and validation sets)

In [71]:
def GEOdataLoader(path, bSize= 16):
    """
    Build shuffled training and validation DataLoaders from .npy arrays.

    `path` is the parent folder containing X_tr.npy, Y_tr.npy, X_va.npy and
    Y_va.npy; `bSize` is the batch size used by both loaders.
    Returns the tuple (trainLoader, valLoader).
    """
    def _as_dataset(x_file, y_file):
        # load one (features, targets) pair and wrap it as float32 tensors
        features= np.load(os.path.join(path, x_file))
        targets= np.load(os.path.join(path, y_file))
        return torch.utils.data.TensorDataset(
            torch.from_numpy(features).float(), torch.from_numpy(targets).float())

    train_set= _as_dataset("X_tr.npy", "Y_tr.npy")
    val_set= _as_dataset("X_va.npy", "Y_va.npy")
    return tuple(
        torch.utils.data.DataLoader(split, batch_size=bSize, shuffle=True)
        for split in (train_set, val_set)
    )

In [3]:
# standardised class for all network architectures
class Network(nn.Module):
    """
    Two-hidden-layer tanh MLP mapping 943 input features to 4760 outputs.

    `size1` and `size2` are the hidden widths; `drop` is the dropout
    probability applied after each tanh.
    """
    def __init__(self, size1, size2, drop):
        super().__init__()
        widths= [943, size1, size2]
        stack= []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            # hidden stage: affine map -> tanh -> dropout
            stack.extend([nn.Linear(fan_in, fan_out), nn.Tanh(), nn.Dropout(p= drop)])
        # linear read-out, no activation on the final layer
        stack.append(nn.Linear(size2, 4760))
        self.net= nn.Sequential(*stack)

    def forward(self, input):
        prediction= self.net(input)
        return prediction

In [4]:
# network class for the wide-medium architecture
class wideM(nn.Module):
    """
    Single-hidden-layer ("wide-medium") tanh MLP: 943 -> size1 -> 4760.

    `drop` is the dropout probability applied after the tanh.
    """
    def __init__(self, size1, drop):
        super().__init__()
        hidden_stage= [nn.Linear(943, size1), nn.Tanh(), nn.Dropout(p= drop)]
        readout= nn.Linear(size1, 4760)
        self.net= nn.Sequential(*hidden_stage, readout)

    def forward(self, input):
        prediction= self.net(input)
        return prediction

In [72]:
# defining all the criterions to be used in the following experiments:


def tiltedLC(x, y, tau, h):
    """
    Tilted (quantile-weighted) log-cosh loss, averaged over all elements.

    Parameters
    ----------
    x : torch.Tensor
        Predictions.
    y : torch.Tensor
        Targets; must broadcast against `x`.
    tau : float
        Quantile level. Positive errors (y > x) are weighted by `tau`,
        negative errors by `1 - tau`, and exact zeros by 0.5.
    h : float
        Smoothing parameter. Accepted for interface compatibility, but the
        formula below does not use it.
        NOTE(review): confirm whether the log-cosh was meant to be scaled by `h`.

    Returns
    -------
    torch.Tensor
        Scalar (0-dim) mean loss.
    """
    import math

    e= y-x # errors
    ind= (torch.sign(e)+1)/2 # 1 where e > 0, 0 where e < 0, 0.5 at e == 0
    quantFactor= (1-tau)*(1-ind) + tau*ind
    # log(cosh(e)) == |e| + log(1 + exp(-2|e|)) - log(2).
    # The naive torch.log(torch.cosh(e)) becomes inf as soon as cosh(e)
    # overflows (|e| of roughly 89 in float32), which poisons the whole mean.
    abs_e= torch.abs(e)
    logcosh= abs_e + torch.log1p(torch.exp(-2.0*abs_e)) - math.log(2.0)
    return torch.mean(quantFactor*logcosh)

def check_loss(x, y, tau):
    """
    Check (pinball) loss: mean of e*(tau - I(e<0)) with e = y - x.

    Accepts NumPy arrays or Python scalars (the original use case: an
    objective for scipy.optimize, where the targets arrive through *args)
    as well as torch tensors. Tensor inputs are processed with torch ops so
    the result stays on the autograd graph; the NumPy-only version raised on
    tensors that require grad.

    Parameters
    ----------
    x : array-like or torch.Tensor
        Predictions.
    y : array-like or torch.Tensor
        Targets.
    tau : float
        Quantile level.

    Returns
    -------
    NumPy float for NumPy inputs, 0-dim torch.Tensor for tensor inputs.
    """
    e = y-x
    if torch.is_tensor(e):
        # below I(e<0); at e == 0 the indicator is 0.5 but the term is 0 anyway
        ind = (torch.sign(-e)+1)/2
        return torch.mean(e*(tau-ind))
    # below I(e<0)
    ind = (np.sign(-e)+1)/2
    loss = np.mean(e*(tau-ind))
    return loss


class TiltedLC(nn.Module):
    """Parameter-free nn.Module wrapper that forwards its arguments to `tiltedLC`."""

    def __init__(self):
        super().__init__()

    def forward(self, x, y, tau, h):
        # all the work happens in the functional form
        value= tiltedLC(x, y, tau, h)
        return value

class CheckLoss(nn.Module):
    """Parameter-free nn.Module wrapper that forwards its arguments to `check_loss`."""

    def __init__(self):
        super().__init__()

    def forward(self, x, y, tau):
        # all the work happens in the functional form
        value= check_loss(x, y, tau)
        return value

In [73]:
# global initialisations:
# NOTE: this cell needs `multiQuantileLoss`, which is defined in a later cell of
# this notebook. On a fresh kernel, run that cell first (or move it above this one).
h= 0.4 # smoothing parameter for the log-cosh; passed to the criterion by the training loops
device= ('cuda' if torch.cuda.is_available() else 'cpu')
# device= 'cpu'
# the loaders are read as globals by the training loops and by computeLR
trainLoader, valLoader= GEOdataLoader("/home/aryamanj/Downloads/LGdata")
# the argument is stored as num_tasks only; forward() always evaluates nine quantile levels
criterion= multiQuantileLoss(3)
# alternative criteria; none of the training calls in the cells shown here uses them
criterion1= CheckLoss()
criterion2= nn.L1Loss()
criterion3= nn.MSELoss()
N_EPOCHS= 500

## Instantiations and training loops start for constant LR's

In [74]:
# instantiating objects for all constantLR tests:
# NOTE: `LALRnetwork` and `LALRwideM` are defined in a later cell of this notebook;
# on a fresh kernel, run that cell before this one.
# Each configuration gets its own model, SGD optimizer (lr 0.1) and two empty lists
# that the training loop fills with per-epoch training / validation losses.

# small: two hidden layers of 300 units, dropout 0.1
size1,size2= 300,300
model_CLR_S= LALRnetwork(size1, size2, 0.1).to(device)
optimizer_CLR_S= optim.SGD(model_CLR_S.parameters(), lr= 0.1)
lossList_CLR_S= []
valList_CLR_S= []

# medium: two hidden layers of 1000 units
size1, size2= 1000, 1000
model_CLR_M= LALRnetwork(size1, size2, 0.1).to(device)
optimizer_CLR_M= optim.SGD(model_CLR_M.parameters(), lr= 0.1)
lossList_CLR_M= []
valList_CLR_M= []

# large: two hidden layers of 3000 units
size1, size2= 3000, 3000
model_CLR_L= LALRnetwork(size1, size2, 0.1).to(device)
optimizer_CLR_L= optim.SGD(model_CLR_L.parameters(), lr= 0.1)
lossList_CLR_L= []
valList_CLR_L= []

# wide-medium: a single hidden layer of 2000 units
size1= 2000
model_CLR_WM= LALRwideM(size1, 0.1).to(device)
optimizer_CLR_WM= optim.SGD(model_CLR_WM.parameters(), lr= 0.1)
lossList_CLR_WM= []
valList_CLR_WM= []

In [None]:
def trainConstantLR(model, optimizer, criterion, epochs, ls_list, valList):
    """
    Training loop used for constantLR training.

    Reads the notebook globals `trainLoader`, `valLoader`, `device` and `h`.

    Parameters
    ----------
    model : nn.Module
        Returns either a list/tuple of per-quantile output tensors or a
        single output tensor.
    optimizer : torch.optim.Optimizer
        Stepped once per training batch with its current learning rate.
    criterion : callable
        Called as criterion(outputs, labels, h, running) and expected to
        return (loss, updated_running), where `running` holds nine
        per-quantile running sums.
    epochs : int
        Number of passes over `trainLoader`.
    ls_list, valList : list
        Extended in place with one list of nine mean per-quantile losses per
        epoch (training and validation respectively).
    """
    def _prepare(outputs, labels):
        # multi-head models return a list: concatenate the heads along the
        # feature axis and tile the labels so both have the same width
        if isinstance(outputs, (list, tuple)):
            labels= torch.tile(labels, (1, len(outputs)))
            outputs= torch.cat(outputs, dim=1)
        return outputs, labels

    def _as_floats(values):
        # running sums are kept as plain floats so that no autograd graph is
        # carried from one batch into the next backward() call
        return [float(v.detach()) if torch.is_tensor(v) else float(v) for v in values]

    for epoch in range(epochs):
        epoch_loss= [0.0]*9
        # training loop
        model.train()
        for inputs, labels in trainLoader:
            inputs= inputs.to(device)
            labels= labels.to(device)
            optimizer.zero_grad()
            outputs, labels= _prepare(model(inputs), labels)
            loss, epoch_loss= criterion(outputs, labels, h, epoch_loss)
            loss.backward()
            optimizer.step()
            epoch_loss= _as_floats(epoch_loss)

        # validation loop (no gradients needed)
        val_loss= [0.0]*9
        model.eval()
        with torch.no_grad():
            for inputs, labels in valLoader:
                inputs= inputs.to(device)
                labels= labels.to(device)
                outputs, labels= _prepare(model(inputs), labels)
                _, val_loss= criterion(outputs, labels, h, val_loss)
                val_loss= _as_floats(val_loss)

        ls_list.append([x/len(trainLoader) for x in epoch_loss])
        valList.append([x/len(valLoader) for x in val_loss])
        # index the entry just appended so a non-empty ls_list cannot mis-report
        print("Epoch: {} Loss-0.1: {} Loss-0.5: {} Loss-0.9: {}".format(epoch,
            ls_list[-1][0], ls_list[-1][4], ls_list[-1][8]))

def trainadaH(model, optimizer, criterion, epochs, ls_list, valList):
    """
    Training loop used for adaH training.

    Reads the notebook globals `trainLoader`, `valLoader`, `device` and `h`.
    The backward pass is run with create_graph=True and the gradients
    collected by `get_params_grad(model)` are handed to `optimizer.step`, so
    `optimizer` must accept that argument (an Adahessian optimizer, not SGD).

    Parameters
    ----------
    model : nn.Module
        Returns either a list/tuple of per-quantile output tensors or a
        single output tensor.
    optimizer : optimizer whose step() takes the gradient list
    criterion : callable
        Called as criterion(outputs, labels, h, running) and expected to
        return (loss, updated_running), where `running` holds nine
        per-quantile running sums.
    epochs : int
        Number of passes over `trainLoader`.
    ls_list, valList : list
        Extended in place with one list of nine mean per-quantile losses per
        epoch (training and validation respectively).
    """
    def _prepare(outputs, labels):
        # multi-head models return a list: concatenate the heads along the
        # feature axis and tile the labels so both have the same width
        if isinstance(outputs, (list, tuple)):
            labels= torch.tile(labels, (1, len(outputs)))
            outputs= torch.cat(outputs, dim=1)
        return outputs, labels

    def _as_floats(values):
        # running sums are kept as plain floats so that no autograd graph is
        # carried from one batch into the next backward() call
        return [float(v.detach()) if torch.is_tensor(v) else float(v) for v in values]

    for epoch in range(epochs):
        epoch_loss= [0.0]*9
        # training loop
        model.train()
        for inputs, labels in trainLoader:
            inputs= inputs.to(device)
            labels= labels.to(device)
            optimizer.zero_grad()
            outputs, labels= _prepare(model(inputs), labels)
            loss, epoch_loss= criterion(outputs, labels, h, epoch_loss)
            # the graph of the gradients is needed for the Hessian information
            loss.backward(create_graph=True)
            _, gradsH = get_params_grad(model)
            optimizer.step(gradsH)
            epoch_loss= _as_floats(epoch_loss)
        ls_list.append([x/len(trainLoader) for x in epoch_loss])

        # validation loop (no gradients needed)
        val_loss= [0.0]*9
        model.eval()
        with torch.no_grad():
            for inputs, labels in valLoader:
                inputs= inputs.to(device)
                labels= labels.to(device)
                outputs, labels= _prepare(model(inputs), labels)
                _, val_loss= criterion(outputs, labels, h, val_loss)
                val_loss= _as_floats(val_loss)
        valList.append([x/len(valLoader) for x in val_loss])
        # index the entry just appended so a non-empty ls_list cannot mis-report
        print("Epoch: {} Loss-0.1: {} Loss-0.5: {} Loss-0.9: {}".format(epoch,
            ls_list[-1][0], ls_list[-1][4], ls_list[-1][8]))

In [None]:
# Calls to training loops themselves:
# small constant-LR model; per-epoch losses are appended to lossList_CLR_S / valList_CLR_S
trainConstantLR(model_CLR_S, optimizer_CLR_S, criterion, N_EPOCHS, lossList_CLR_S, valList_CLR_S)

In [None]:
# medium constant-LR model; per-epoch losses are appended to lossList_CLR_M / valList_CLR_M
trainConstantLR(model_CLR_M, optimizer_CLR_M, criterion, N_EPOCHS, lossList_CLR_M, valList_CLR_M)

In [None]:
# large constant-LR model; losses must go to lossList_CLR_L (the list saved as
# lossList_CLR_L.npy), not to the medium model's lossList_CLR_M
trainConstantLR(model_CLR_L, optimizer_CLR_L, criterion, N_EPOCHS, lossList_CLR_L, valList_CLR_L)

In [None]:
# wide-medium constant-LR model; per-epoch losses are appended to lossList_CLR_WM / valList_CLR_WM
trainConstantLR(model_CLR_WM, optimizer_CLR_WM, criterion, N_EPOCHS, lossList_CLR_WM, valList_CLR_WM)

In [82]:
# saving the EPOCH-losses
# Each list holds one row of per-quantile training losses per epoch; only the
# training lists are saved here, the valList_* lists are not written out.
A= np.asarray(lossList_CLR_S)
B= np.asarray(lossList_CLR_M)
C= np.asarray(lossList_CLR_L)
D= np.asarray(lossList_CLR_WM)

# NOTE: absolute, user-specific output paths
np.save("/home/aryamanj/Downloads/lossList_CLR_S.npy", A)
np.save("/home/aryamanj/Downloads/lossList_CLR_M.npy", B)
np.save("/home/aryamanj/Downloads/lossList_CLR_L.npy", C)
np.save("/home/aryamanj/Downloads/lossList_CLR_WM.npy", D)

## Instantiations and training loops using adaH as the optimizer algorithm

In [75]:
# instantiating objects for all adaH tests:
# The optimizers have to be Adahessian instances: trainadaH calls
# optimizer.step(gradsH), and optim.SGD would treat that gradient list as a
# closure and try to call it.
# NOTE: `LALRnetwork` and `LALRwideM` are defined in a later cell of this notebook;
# on a fresh kernel, run that cell before this one.

# small: two hidden layers of 300 units, dropout 0.1
size1,size2= 300,300
model_adaH_S= LALRnetwork(size1, size2, 0.1).to(device)
optimizer_adaH_S= Adahessian(model_adaH_S.parameters(), lr= 0.1)
lossList_adaH_S= []
valList_adaH_S= []

# medium: two hidden layers of 1000 units
size1, size2= 1000, 1000
model_adaH_M= LALRnetwork(size1, size2, 0.1).to(device)
optimizer_adaH_M= Adahessian(model_adaH_M.parameters(), lr= 0.1)
lossList_adaH_M= []
valList_adaH_M= []

# large: two hidden layers of 3000 units
size1, size2= 3000, 3000
model_adaH_L= LALRnetwork(size1, size2, 0.1).to(device)
optimizer_adaH_L= Adahessian(model_adaH_L.parameters(), lr= 0.1)
lossList_adaH_L= []
valList_adaH_L= []

# wide-medium: a single hidden layer of 2000 units
size1= 2000
model_adaH_WM= LALRwideM(size1, 0.1).to(device)
optimizer_adaH_WM= Adahessian(model_adaH_WM.parameters(), lr= 0.1)
lossList_adaH_WM= []
valList_adaH_WM= []

In [None]:
# small adaH model; per-epoch losses are appended to lossList_adaH_S / valList_adaH_S
trainadaH(model_adaH_S, optimizer_adaH_S, criterion, N_EPOCHS, lossList_adaH_S, valList_adaH_S)

In [None]:
# medium adaH model; per-epoch losses are appended to lossList_adaH_M / valList_adaH_M
trainadaH(model_adaH_M, optimizer_adaH_M, criterion, N_EPOCHS, lossList_adaH_M, valList_adaH_M)

In [None]:
# large adaH model; per-epoch losses are appended to lossList_adaH_L / valList_adaH_L
trainadaH(model_adaH_L, optimizer_adaH_L, criterion, N_EPOCHS, lossList_adaH_L, valList_adaH_L)

In [None]:
# wide-medium adaH model; per-epoch losses are appended to lossList_adaH_WM / valList_adaH_WM
trainadaH(model_adaH_WM, optimizer_adaH_WM, criterion, N_EPOCHS, lossList_adaH_WM, valList_adaH_WM)

In [22]:
# saving the per-epoch adaH training losses (validation lists are not written out)
E= np.asarray(lossList_adaH_S)
# trailing underscore keeps the `F` alias of torch.nn.functional intact
F_= np.asarray(lossList_adaH_M)
G= np.asarray(lossList_adaH_L)
# NOTE: the name `H` is assigned again by the LALR save cell further down
H= np.asarray(lossList_adaH_WM)

# NOTE: absolute, user-specific output paths
np.save("/home/aryamanj/Downloads/lossList_adaH_S.npy", E)
np.save("/home/aryamanj/Downloads/lossList_adaH_M.npy", F_)
np.save("/home/aryamanj/Downloads/lossList_adaH_L.npy", G)
np.save("/home/aryamanj/Downloads/lossList_adaH_WM.npy", H)

I see the following steps to incorporating LALR into the code:
1. Be able to compute the $K_z$: 
    * This is defined to be the largest activation in the penultimate layer
    * Hence, our network needs a method that returns its penultimate output, over which we then simply take the supremum
2. For now, a possible Lipschitz constant (LC) of the log-cosh loss seems to be the same as for the check loss, namely $\frac{K_z}{m}\times \max(\tau, 1-\tau)$ -- LCs of this kind are all expressible in the form $C\cdot\frac{K_z}{m}$ (where $C$ is some constant which we know)

The next expression that we want to try out is $\frac{1}{m}\tanh(g(0)-y_k)\cdot K_z$ for the index $k$ with $z_k^{[L]}=0$, or, exploiting the continuity of the regression problem, $\frac{1}{m}\tanh(g(0)-y_k)\cdot K_z$, where $k= \operatorname{argmin}_j z_j^{[L]}$

Hence, to implement the above expression we need, in addition to the penultimate-output method, access to the set of pre-activation values and the ability to compute the $\operatorname{argmin}$ among them. Since this network has no activation function on the final layer, we can simply compute the minimum feature in the final layer.

In [68]:
# A new network class for LALR training, that supports returning penultimate activations
class LALRnetwork(nn.Module):
    """
    Two-hidden-layer tanh network with nine linear output heads (fc1..fc9),
    all fed by the same penultimate representation.

    Parameters
    ----------
    size1, size2 : int
        Widths of the first and second hidden layers.
    drop : float
        Dropout probability applied after each tanh.
    """
    def __init__(self, size1, size2, drop):
        super(LALRnetwork, self).__init__()
        self.l1= nn.Linear(943, size1)
        self.l2= nn.Dropout(p= drop)
        self.l3= nn.Linear(size1, size2)
        self.l4= nn.Dropout(p= drop)
        # creating separate heads for all the different quantiles; attribute
        # names fc1..fc9 (and their creation order) are kept so state_dict
        # keys and seeded initialisation stay the same
        for k in range(1, 10):
            setattr(self, "fc{}".format(k), nn.Linear(size2, 4760))

    def penU(self, x):
        """
        Return the penultimate activations, i.e. exactly the tensor the heads
        receive in `forward`: linear -> tanh -> dropout, twice.

        Dropout used to be applied before the tanh here but after it in
        `forward`, so the two only agreed in eval mode.
        """
        x= self.l2(torch.tanh(self.l1(x)))
        x= self.l4(torch.tanh(self.l3(x)))
        return x

    def forward(self, x):
        """Return a list of nine tensors, the outputs of fc1..fc9 in order."""
        z= self.penU(x)
        return [getattr(self, "fc{}".format(k))(z) for k in range(1, 10)]

# the loss-function wrapper for multi-class quantile learning
class multiQuantileLoss(nn.Module):
    """
    Loss wrapper that evaluates the tilted log-cosh loss at the nine quantile
    levels 0.1, 0.2, ..., 0.9 and keeps per-level running sums.

    NOTE(review): every level is applied to the whole of `x` / `y`. If `x`
    is the concatenation of per-quantile heads, each head is therefore
    trained on the average of all nine tilted losses rather than on its own
    level -- confirm that this is intended.
    """
    TAUS= (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)

    def __init__(self, num_tasks):
        super(multiQuantileLoss, self).__init__()
        # stored only; forward() always uses the nine levels in TAUS
        self.num_tasks= num_tasks

    def forward(self, x, y, h, loss_list):
        """
        Parameters
        ----------
        x, y : torch.Tensor
            Predictions and targets, passed unchanged to `tiltedLC`.
        h : float
            Smoothing parameter, passed unchanged to `tiltedLC`.
        loss_list : sequence of 9 numbers (or 0-dim tensors)
            Running per-level sums from the previous batches.

        Returns
        -------
        list
            [loss, running]: `loss` is the mean of the nine losses of the
            current batch (the tensor to call backward() on); `running` is
            `loss_list` plus the current per-level losses, as plain floats.
        """
        batch_losses= [tiltedLC(x, y, tau, h) for tau in self.TAUS]
        # The running sums are detached floats. Adding the previous tensors
        # into the loss kept every earlier batch's graph alive and made the
        # next backward() traverse graphs that had already been freed.
        running= [
            float(prev.detach() if torch.is_tensor(prev) else prev) + cur.detach().item()
            for prev, cur in zip(loss_list, batch_losses)
        ]
        return [sum(batch_losses)/len(batch_losses), running]


class LALRwideM(nn.Module):
    """
    Single-hidden-layer tanh network (943 -> size1 -> 4760) that also exposes
    its penultimate activations through `penU`.
    """
    def __init__(self, size1, drop):
        super().__init__()
        self.l1= nn.Linear(943, size1)
        self.l2= nn.Dropout(p= drop)
        self.l3= nn.Linear(size1, 4760)

    def penU(self, x):
        # hidden representation after tanh and dropout
        hidden= torch.tanh(self.l1(x))
        return self.l2(hidden)

    def forward(self,x):
        # the read-out layer acts on exactly what penU returns
        return self.l3(self.penU(x))


def computeLR(model, bSize= 16):
    """
    Compute the learning rate for the coming epoch from one pass over the
    global `trainLoader` (inputs are moved to the global `device`).

    With Kz the largest penultimate activation (`model.penU`) and z_k the
    smallest negative value of the first output head over the whole pass, the
    rate is 1 / ((1/bSize) * tanh(-z_k) * Kz).

    Parameters
    ----------
    model : nn.Module
        Must provide `penU(inputs)`; `model(inputs)` may return a list/tuple
        of heads (the first one is used) or a single tensor.
    bSize : int
        Batch size m used in the 1/m factor.

    Returns
    -------
    float
        The learning rate, or the fallback 0.1 when the expression above is
        zero (no negative output, no positive activation, or an empty loader).
    """
    import math

    Kz= 0.0       # largest penultimate activation seen so far
    z_k= 0.0      # smallest first-head output seen so far (only negatives are tracked)
    min_pos= None # (row, column) of z_k inside the batch where it was found
    model.eval()
    with torch.no_grad():
        for inputs, _ in trainLoader:
            inputs= inputs.to(device)
            op1= model.penU(inputs)
            op2= model(inputs)
            head= op2[0] if isinstance(op2, (list, tuple)) else op2
            Kz= max(Kz, torch.max(op1).item())
            # minimum over features for every sample, then over the batch
            row_min, row_arg= torch.min(head, dim= 1)
            val, indx= torch.min(row_min, dim= 0)
            if val.item() < z_k:
                z_k= val.item()
                min_pos= (int(indx), int(row_arg[indx]))
    if min_pos is not None:
        print(min_pos[0], min_pos[1])
    # Use the tracked minimum itself. Re-indexing the last batch's outputs
    # with indices found in a different batch read an unrelated value, and
    # raised NameError when no negative output had been seen.
    LR= (1/bSize)*math.tanh(-z_k)*Kz
    if LR==0:
        return 0.1
    return 1/LR


In [69]:
# writing training loop for networks with LALR
def train_adaptive_lr(model,optimizer, criterion, epochs, ls_list, valList):
    """
    Training loop used for LALR training.

    Before every epoch the learning rate of the optimizer's first parameter
    group is replaced by `computeLR(model, bSize=16)`. Reads the notebook
    globals `trainLoader`, `valLoader`, `device` and `h`.

    Parameters
    ----------
    model : nn.Module
        Returns either a list/tuple of per-quantile output tensors or a
        single output tensor; must also satisfy what `computeLR` needs.
    optimizer : torch.optim.Optimizer
    criterion : callable
        Called as criterion(outputs, labels, h, running) and expected to
        return (loss, updated_running), where `running` holds nine
        per-quantile running sums.
    epochs : int
        Number of passes over `trainLoader`.
    ls_list, valList : list
        Extended in place with one list of nine mean per-quantile losses per
        epoch (training and validation respectively).
    """
    def _prepare(outputs, labels):
        # multi-head models return a list: concatenate the heads along the
        # feature axis and tile the labels so both have the same width
        if isinstance(outputs, (list, tuple)):
            labels= torch.tile(labels, (1, len(outputs)))
            outputs= torch.cat(outputs, dim=1)
        return outputs, labels

    def _as_floats(values):
        # running sums are kept as plain floats so that no autograd graph is
        # carried from one batch into the next backward() call
        return [float(v.detach()) if torch.is_tensor(v) else float(v) for v in values]

    for epoch in range(epochs):
        epoch_loss= [0.0]*9
        # plain float so the optimizer never holds a tensor-valued lr
        lr_val= float(computeLR(model, bSize=16))
        optimizer.param_groups[0]['lr']= lr_val
        # training loop
        model.train()
        for inputs, labels in trainLoader:
            inputs= inputs.to(device)
            labels= labels.to(device)
            optimizer.zero_grad()
            outputs, labels= _prepare(model(inputs), labels)
            loss, epoch_loss= criterion(outputs, labels, h, epoch_loss)
            loss.backward()
            optimizer.step()
            epoch_loss= _as_floats(epoch_loss)

        # validation loop (no gradients needed)
        val_loss= [0.0]*9
        model.eval()
        with torch.no_grad():
            for inputs, labels in valLoader:
                inputs= inputs.to(device)
                labels= labels.to(device)
                outputs, labels= _prepare(model(inputs), labels)
                _, val_loss= criterion(outputs, labels, h, val_loss)
                val_loss= _as_floats(val_loss)

        ls_list.append([x/len(trainLoader) for x in epoch_loss])
        valList.append([x/len(valLoader) for x in val_loss])
        # index the entry just appended so a non-empty ls_list cannot mis-report
        print("Epoch: {} Loss-0.1: {} Loss-0.5: {} Loss-0.9: {} LR: {}".format(epoch,
            ls_list[-1][0], ls_list[-1][4], ls_list[-1][8], optimizer.param_groups[0]['lr']))

In [62]:
# instantiating LALR models
# Each configuration gets its own model, SGD optimizer and two empty lists for
# per-epoch training / validation losses. The initial lr of 0.1 is replaced by
# train_adaptive_lr at the start of every epoch.

# small: two hidden layers of 300 units
model_lr_1= LALRnetwork(size1=300, size2=300, drop=0.1).to(device)
optimizer_lr_1= optim.SGD(model_lr_1.parameters(), lr= 0.1)
loss_list_lr_1= []
valList_LR_S= []

# medium: two hidden layers of 1000 units
model_lr_2= LALRnetwork(size1=1000, size2=1000, drop=0.1).to(device)
optimizer_lr_2= optim.SGD(model_lr_2.parameters(), lr= 0.1)
loss_list_lr_2= []
valList_LR_M= []

# large: two hidden layers of 3000 units
model_lr_3= LALRnetwork(size1=3000, size2=3000, drop=0.1).to(device)
optimizer_lr_3= optim.SGD(model_lr_3.parameters(), lr= 0.1)
loss_list_lr_3= []
valList_LR_L= []


# wide-medium: a single hidden layer of 2000 units
model_lr_4= LALRwideM(size1=2000,drop=0.1).to(device)
optimizer_lr_4= optim.SGD(model_lr_4.parameters(), lr= 0.1)
loss_list_lr_4= []
valList_LR_WM= []

In [None]:
# small LALR model; per-epoch losses are appended to loss_list_lr_1 / valList_LR_S
train_adaptive_lr(model_lr_1,optimizer_lr_1, criterion, N_EPOCHS, loss_list_lr_1, valList_LR_S)

In [None]:
# medium LALR model; per-epoch losses are appended to loss_list_lr_2 / valList_LR_M
train_adaptive_lr(model_lr_2,optimizer_lr_2, criterion, N_EPOCHS, loss_list_lr_2, valList_LR_M)

In [None]:
# large LALR model; per-epoch losses are appended to loss_list_lr_3 / valList_LR_L
train_adaptive_lr(model_lr_3, optimizer_lr_3, criterion, N_EPOCHS, loss_list_lr_3, valList_LR_L)

In [None]:
# wide-medium LALR model; per-epoch losses are appended to loss_list_lr_4 / valList_LR_WM
train_adaptive_lr(model_lr_4, optimizer_lr_4, criterion, N_EPOCHS, loss_list_lr_4, valList_LR_WM)

In [57]:
# saving the per-epoch LALR training losses (validation lists are not written out)
# NOTE: `H` was already assigned in the adaH save cell (wide-medium losses);
# this assignment overwrites it.
H= np.asarray(loss_list_lr_1)
I= np.asarray(loss_list_lr_2)
J= np.asarray(loss_list_lr_3)
K= np.asarray(loss_list_lr_4)

# NOTE: absolute, user-specific output paths
np.save("/home/aryamanj/Downloads/lossList_LR_S.npy", H)
np.save("/home/aryamanj/Downloads/lossList_LR_M.npy", I)
np.save("/home/aryamanj/Downloads/lossList_LR_L.npy", J)
np.save("/home/aryamanj/Downloads/lossList_LR_WM.npy", K)