In [1]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch import optim
from adahessian import Adahessian, get_params_grad
import torch.optim.lr_scheduler as lr_scheduler

## Data-loader function for the GEO dataset (builds both the training and validation loaders)

In [2]:
def GEOdataLoader(path, bSize= 16):
    """
    Build shuffled mini-batch loaders for the training and validation splits.

    path:  parent folder containing X_tr.npy, Y_tr.npy, X_va.npy and Y_va.npy
    bSize: batch size used by both loaders
    Returns the tuple (trainLoader, valLoader).
    """
    def _as_dataset(x_name, y_name):
        # load one (inputs, targets) pair, inputs first, and cast both to float tensors
        tensors= [torch.from_numpy(np.load(os.path.join(path, name))).float()
                  for name in (x_name, y_name)]
        return torch.utils.data.TensorDataset(*tensors)

    train_set= _as_dataset("X_tr.npy", "Y_tr.npy")
    val_set= _as_dataset("X_va.npy", "Y_va.npy")
    # both loaders reshuffle on every pass, the validation one included
    return tuple(
        torch.utils.data.DataLoader(split, batch_size=bSize, shuffle=True)
        for split in (train_set, val_set)
    )

In [3]:
# defining all the criterions to be used in the following experiments:
def tiltedLC(x, y, tau, h):
    """
    Tilted (quantile-weighted) log-cosh loss, averaged over all elements.

    x:   predictions (torch tensor)
    y:   targets (torch tensor, combined with x as y - x)
    tau: quantile level; elements with positive error (y > x) are weighted by
         tau, elements with negative error by (1 - tau)
    h:   smoothing parameter; accepted so the call signature stays unchanged,
         but the formula below does not read it
         # NOTE(review): confirm whether h was meant to scale the error
    Returns a 0-dim tensor.
    """
    import math  # function-scope import: math is not among the notebook-level imports

    e= y-x # errors
    # ind is 1 where e > 0, 0 where e < 0 and 0.5 at e == 0 (where log-cosh is 0 anyway)
    ind= (torch.sign(e)+1)/2
    quantFactor= (1-tau)*(1-ind) + tau*ind
    # log(cosh(e)) = |e| + log(1 + exp(-2|e|)) - log(2).
    # This form only exponentiates non-positive numbers, so it stays finite for
    # large |e|, whereas torch.cosh overflows to inf and makes the loss inf.
    abs_e= torch.abs(e)
    log_cosh= abs_e + torch.log1p(torch.exp(-2*abs_e)) - math.log(2.0)
    return torch.mean(quantFactor*log_cosh)

def check_loss(x, y, tau):
    """
    Check (pinball) loss: mean over all elements of e * (tau - I(e < 0)), e = y - x.

    x:   predictions (torch tensor, numpy array or scalar)
    y:   targets of the same kind as x
    tau: quantile level
    Torch inputs are handled with torch ops so the result is a tensor that stays
    on its device and supports backward(); numpy inputs keep the numpy path.
    The indicator evaluates to 0.5 at e == 0, where the loss term is 0 anyway.
    """
    e = y-x
    if torch.is_tensor(e):
        # below I(e<0), computed with torch so autograd and CUDA tensors work
        ind = (torch.sign(-e)+1)/2
        return torch.mean(e*(tau-ind))
    # below I(e<0)
    ind = (np.sign(-e)+1)/2
    loss = np.mean(e*(tau-ind))
    return loss


class TiltedLC(nn.Module):
    """Module wrapper so the tilted log-cosh loss can be called like any torch criterion."""

    def __init__(self):
        super().__init__()

    def forward(self, x, y, tau, h):
        # delegate unchanged to the functional form
        loss= tiltedLC(x, y, tau, h)
        return loss

class CheckLoss(nn.Module):
    """Module wrapper so the check (pinball) loss can be called like any torch criterion."""

    def __init__(self):
        super().__init__()

    def forward(self, x, y, tau):
        # delegate unchanged to the functional form
        loss= check_loss(x, y, tau)
        return loss

In [4]:
# global initialisations (read as module-level names by the training loops below):
h= 0.4 # smoothing parameter forwarded to the log-cosh criterion
tau= 0.5 # quantile level forwarded to the tilted criteria
device= ('cuda' if torch.cuda.is_available() else 'cpu')
# device= 'cpu'
# The data folder may be overridden with the GEO_DATA_DIR environment variable,
# so the notebook runs on other machines; the default is the original location.
DATA_DIR= os.environ.get("GEO_DATA_DIR", "/home/aryamanj/Downloads/LGdata")
trainLoader, valLoader= GEOdataLoader(DATA_DIR)
criterion= TiltedLC()       # tilted log-cosh, called as criterion(outputs, labels, tau, h)
criterion1= CheckLoss()     # check loss, called as criterion1(outputs, labels, tau)
criterion2= nn.L1Loss()
criterion3= nn.MSELoss()    # takes (outputs, labels) only
N_EPOCHS= 500

In [11]:
# A new network class for LALR training, that supports returning penultimate activations
class LALRnetwork(nn.Module):
    """
    MLP with two tanh hidden layers, dropout after each hidden activation and a
    linear output layer.

    size1, size2: widths of the first and second hidden layers
    drop:         dropout probability used by both dropout layers
    in_features:  input width (default 943, the value previously hard-coded)
    out_features: output width (default 4760, the value previously hard-coded)

    penU(x) returns exactly the tensor that forward(x) feeds into the output
    layer. It used to apply dropout before tanh, which disagreed with forward()
    and with LALRwideM.penU whenever the module was in training mode; in eval
    mode, where dropout is the identity, results are unchanged.
    """
    def __init__(self, size1, size2, drop, in_features= 943, out_features= 4760):
        super(LALRnetwork, self).__init__()
        self.l1= nn.Linear(in_features, size1)
        self.l2= nn.Dropout(p= drop)
        self.l3= nn.Linear(size1, size2)
        self.l4= nn.Dropout(p= drop)
        self.l5= nn.Linear(size2, out_features)

    def forward(self, x):
        # single code path: the output layer acts on the penultimate activations
        return self.l5(self.penU(x))

    def penU(self, x):
        """Penultimate activations: the input of the final linear layer l5."""
        # torch.tanh is the non-deprecated spelling of F.tanh
        x= self.l2(torch.tanh(self.l1(x)))
        x= self.l4(torch.tanh(self.l3(x)))
        return x

class LALRwideM(nn.Module):
    """
    MLP with one tanh hidden layer, dropout after the activation and a linear
    output layer.

    size1:        width of the hidden layer
    drop:         dropout probability
    in_features:  input width (default 943, the value previously hard-coded)
    out_features: output width (default 4760, the value previously hard-coded)
    """
    def __init__(self, size1, drop, in_features= 943, out_features= 4760):
        super(LALRwideM, self).__init__()
        self.l1= nn.Linear(in_features, size1)
        self.l2= nn.Dropout(p= drop)
        self.l3= nn.Linear(size1, out_features)

    def forward(self, x):
        # single code path: the output layer acts on the penultimate activations
        return self.l3(self.penU(x))

    def penU(self, x):
        """Penultimate activations: the input of the final linear layer l3."""
        # torch.tanh is the non-deprecated spelling of F.tanh
        x= torch.tanh(self.l1(x))
        return self.l2(x)



## Instantiations and training loops for constant learning rates

In [12]:
# Models, optimizers and loss histories for the constant-LR (SGD, lr=0.1) runs
# and the L-BFGS run. Every model uses dropout 0.1. size1/size2 stay
# module-level names and are re-assigned before each group, as before.
size1, size2= 300, 300
model_CLR_S= LALRnetwork(size1=size1, size2=size2, drop=0.1).to(device)
optimizer_CLR_S= optim.SGD(model_CLR_S.parameters(), lr=0.1)
lossList_CLR_S, valList_CLR_S= [], []

# L-BFGS baseline on the same small architecture (optimizer defaults)
model_LBFGS_S= LALRnetwork(size1=size1, size2=size2, drop=0.1).to(device)
optimizer_LBFGS_S= optim.LBFGS(model_LBFGS_S.parameters())
lossList_LBFGS_S, valList_LBFGS_S= [], []

size1, size2= 1000, 1000
model_CLR_M= LALRnetwork(size1=size1, size2=size2, drop=0.1).to(device)
optimizer_CLR_M= optim.SGD(model_CLR_M.parameters(), lr=0.1)
lossList_CLR_M, valList_CLR_M= [], []

size1, size2= 3000, 3000
model_CLR_L= LALRnetwork(size1=size1, size2=size2, drop=0.1).to(device)
optimizer_CLR_L= optim.SGD(model_CLR_L.parameters(), lr=0.1)
lossList_CLR_L, valList_CLR_L= [], []

# single wide hidden layer
size1= 2000
model_CLR_WM= LALRwideM(size1=size1, drop=0.1).to(device)
optimizer_CLR_WM= optim.SGD(model_CLR_WM.parameters(), lr=0.1)
lossList_CLR_WM, valList_CLR_WM= [], []

In [23]:
def trainConstantLR(model, optimizer, criterion, tau, epochs, ls_list, valList):
    """
    Training loop used for constantLR

    model:     network to train (switched between train() and eval() each epoch)
    optimizer: optimizer stepped once per mini-batch
    criterion: training loss, called as criterion(outputs, labels, tau, h)
    tau:       quantile level forwarded to criterion
    epochs:    number of passes over trainLoader
    ls_list:   list that receives the mean training loss of every epoch
    valList:   list that receives the mean validation loss of every epoch

    Reads the module-level names trainLoader, valLoader, device, h and criterion3.
    Validation is scored with criterion3 (nn.MSELoss in this notebook), which takes
    only (outputs, labels); passing tau and h to it raised a TypeError.
    """
    for epoch in range(epochs):
        epoch_loss= 0.0
        # training loop
        model.train()
        for inputs, labels in trainLoader:
            inputs= inputs.to(device)
            labels= labels.to(device)
            optimizer.zero_grad()
            outputs= model(inputs)
            loss= criterion(outputs, labels, tau, h)
            loss.backward()
            optimizer.step()
            epoch_loss+= loss.item()
        ls_list.append(epoch_loss/len(trainLoader))

        # validation loop (no gradients needed, so no autograd graph is built)
        val_loss= 0.0
        model.eval()
        with torch.no_grad():
            for inputs, labels in valLoader:
                inputs= inputs.to(device)
                labels= labels.to(device)
                outputs= model(inputs)
                loss= criterion3(outputs, labels)
                val_loss+= loss.item()
        valList.append(val_loss/len(valLoader))
        print("Epoch: {} Training loss: {} Validation loss: {}".format(epoch, epoch_loss/len(trainLoader), val_loss/len(valLoader)))

def trainLBFGS(model, optimizer, criterion, tau, epochs, ls_list, valList):
    """
    Training loop used for LBFGS and conjugate gradient training

    model:     network to train (switched between train() and eval() each epoch)
    optimizer: closure-based optimizer; optimizer.step(closure) must return the loss
    criterion: training loss, called as criterion(outputs, labels, tau, h)
    tau:       quantile level forwarded to criterion
    epochs:    number of passes over trainLoader
    ls_list:   list that receives the mean training loss of every epoch
    valList:   list that receives the mean validation loss of every epoch

    Reads the module-level names trainLoader, valLoader, device, h and criterion3.
    The training loss used to be left at 0.0 and ls_list was never filled; the
    value returned by optimizer.step(closure) is now accumulated instead.
    """
    for epoch in range(epochs):
        epoch_loss= 0.0
        # training loop
        model.train()
        for inputs, labels in trainLoader:
            inputs= inputs.to(device)
            labels= labels.to(device)
            def closure():
                # re-evaluates the loss; the optimizer may call this several times per step
                optimizer.zero_grad()
                outputs= model(inputs)
                loss= criterion(outputs, labels, tau, h)
                loss.backward()
                return loss
            loss= optimizer.step(closure)
            epoch_loss+= float(loss)
        ls_list.append(epoch_loss/len(trainLoader))

        # validation loop (no gradients needed, so no autograd graph is built)
        val_loss= 0.0
        model.eval()
        with torch.no_grad():
            for inputs, labels in valLoader:
                inputs= inputs.to(device)
                labels= labels.to(device)
                outputs= model(inputs)
                loss= criterion3(outputs, labels)
                val_loss+= loss.item()
        valList.append(val_loss/len(valLoader))
        print("Epoch: {} Training loss: {} Validation loss: {}".format(epoch, epoch_loss/len(trainLoader), val_loss/len(valLoader)))

In [24]:
# Run the L-BFGS loop on the small (300-300) network for N_EPOCHS epochs.
# NOTE(review): the TypeError recorded as this cell's output reports a 2-argument
# forward() receiving 4 arguments; the criterion calls inside the trainLBFGS shown
# in this notebook match their signatures, so the output is presumably stale -- re-run to confirm.
trainLBFGS(model_LBFGS_S, optimizer_LBFGS_S, criterion, tau,  N_EPOCHS, lossList_LBFGS_S, valList_LBFGS_S)

TypeError: forward() takes 3 positional arguments but 5 were given

In [None]:
# Calls to the training loops themselves, one cell per architecture.
# Small (300-300) network, SGD with constant lr:
trainConstantLR(model_CLR_S, optimizer_CLR_S, criterion, tau,  N_EPOCHS, lossList_CLR_S, valList_CLR_S)

In [None]:
# Medium (1000-1000) network, SGD with constant lr:
trainConstantLR(model_CLR_M, optimizer_CLR_M, criterion, tau, N_EPOCHS, lossList_CLR_M, valList_CLR_M)

In [None]:
# Large (3000-3000) network, SGD with constant lr.
# The training losses go to lossList_CLR_L; this call used to pass lossList_CLR_M,
# which appended the L run to the M history and left lossList_CLR_L empty.
trainConstantLR(model_CLR_L, optimizer_CLR_L, criterion, tau, N_EPOCHS, lossList_CLR_L, valList_CLR_L)

In [None]:
# Wide single-hidden-layer (2000) network, SGD with constant lr:
trainConstantLR(model_CLR_WM, optimizer_CLR_WM, criterion, tau, N_EPOCHS, lossList_CLR_WM, valList_CLR_WM)

In [None]:
# Persist the per-epoch training losses of the constant-LR runs as .npy files.
# (Only the training histories are written here; the valList_* lists are not.)
A, B, C, D= (
    np.asarray(history)
    for history in (lossList_CLR_S, lossList_CLR_M, lossList_CLR_L, lossList_CLR_WM)
)

for _target, _losses in (
    ("/home/aryamanj/Downloads/D-GEX_second/lossList_CLR_S.npy", A),
    ("/home/aryamanj/Downloads/D-GEX_second/lossList_CLR_M.npy", B),
    ("/home/aryamanj/Downloads/D-GEX_second/lossList_CLR_L.npy", C),
    ("/home/aryamanj/Downloads/D-GEX_second/lossList_CLR_WM.npy", D),
):
    np.save(_target, _losses)

In [None]:
# Write the state_dict of every constant-LR model to its own .pt file.
for _net, _target in (
    (model_CLR_S, "/home/aryamanj/Downloads/D-GEX_second/CLR_S.pt"),
    (model_CLR_M, "/home/aryamanj/Downloads/D-GEX_second/CLR_M.pt"),
    (model_CLR_L, "/home/aryamanj/Downloads/D-GEX_second/CLR_L.pt"),
    (model_CLR_WM, "/home/aryamanj/Downloads/D-GEX_second/CLR_WM.pt"),
):
    torch.save(_net.state_dict(), _target)

## Instantiations and training loops using adaH as the optimizer algorithm

In [23]:
def trainadaH(model, optimizer, criterion, tau, epochs, ls_list, valList):
    """
    Training loop used for adaH

    model:     network to train (switched between train() and eval() each epoch)
    optimizer: optimizer whose step() takes the gradient list from get_params_grad
    criterion: loss used for training and validation, called as
               criterion(outputs, labels, tau, h)
    tau:       quantile level forwarded to criterion
    epochs:    number of passes over trainLoader
    ls_list:   list that receives the mean training loss of every epoch
    valList:   list that receives the mean validation loss of every epoch

    Reads the module-level names trainLoader, valLoader, device and h.
    No learning-rate scheduler is stepped inside this loop.
    """
    for epoch in range(epochs):
        epoch_loss= 0.0
        # training loop
        model.train()
        for inputs, labels in trainLoader:
            inputs= inputs.to(device)
            labels= labels.to(device)
            optimizer.zero_grad()
            outputs= model(inputs)
            loss= criterion(outputs, labels, tau, h)
            # create_graph=True keeps the graph of the backward pass so that the
            # optimizer can differentiate through the gradients
            loss.backward(create_graph=True)
            _, gradsH = get_params_grad(model)
            optimizer.step(gradsH)
            epoch_loss+= loss.item()
        ls_list.append(epoch_loss/len(trainLoader))

        # validation loop (no gradients needed, so no autograd graph is built)
        val_loss= 0.0
        model.eval()
        with torch.no_grad():
            for inputs, labels in valLoader:
                inputs= inputs.to(device)
                labels= labels.to(device)
                outputs= model(inputs)
                loss= criterion(outputs, labels, tau, h)
                val_loss+= loss.item()
        valList.append(val_loss/len(valLoader))
        print("Epoch: {} Training-loss: {} Validation loss: {}".format(epoch,epoch_loss/len(trainLoader), val_loss/len(valLoader)))

In [24]:
# Models, Adahessian optimizers (lr=0.15), MultiStepLR schedulers and loss
# histories for the adaH runs. Every model uses dropout 0.1.
# NOTE(review): the schedulers are only constructed here; the trainadaH loop does
# not take or step a scheduler, so the milestones presumably never take effect.
size1, size2= 300, 300
model_adaH_S= LALRnetwork(size1=size1, size2=size2, drop=0.1).to(device)
optimizer_adaH_S= Adahessian(model_adaH_S.parameters(), lr=.15)
scheduler_adaH_S= lr_scheduler.MultiStepLR(optimizer_adaH_S, milestones=[30,45], gamma=.1, last_epoch=-1)
lossList_adaH_S, valList_adaH_S= [], []

size1, size2= 1000, 1000
model_adaH_M= LALRnetwork(size1=size1, size2=size2, drop=0.1).to(device)
optimizer_adaH_M= Adahessian(model_adaH_M.parameters(), lr=.15)
scheduler_adaH_M= lr_scheduler.MultiStepLR(optimizer_adaH_M, milestones=[30,45], gamma=.1, last_epoch=-1)
lossList_adaH_M, valList_adaH_M= [], []

size1, size2= 3000, 3000
model_adaH_L= LALRnetwork(size1=size1, size2=size2, drop=0.1).to(device)
optimizer_adaH_L= Adahessian(model_adaH_L.parameters(), lr=.15)
scheduler_adaH_L= lr_scheduler.MultiStepLR(optimizer_adaH_L, milestones=[30,45], gamma=.1, last_epoch=-1)
lossList_adaH_L, valList_adaH_L= [], []

# single wide hidden layer
size1= 2000
model_adaH_WM= LALRwideM(size1=size1, drop=0.1).to(device)
optimizer_adaH_WM= Adahessian(model_adaH_WM.parameters(), lr=.15)
scheduler_adaH_WM= lr_scheduler.MultiStepLR(optimizer_adaH_WM, milestones=[30,45], gamma=.1, last_epoch=-1)
lossList_adaH_WM, valList_adaH_WM= [], []


In [None]:
# Small (300-300) network trained with the Adahessian optimizer:
trainadaH(model_adaH_S, optimizer_adaH_S, criterion, tau, N_EPOCHS, lossList_adaH_S, valList_adaH_S)

In [None]:
# Medium (1000-1000) network trained with the Adahessian optimizer:
trainadaH(model_adaH_M, optimizer_adaH_M, criterion, tau, N_EPOCHS, lossList_adaH_M, valList_adaH_M)

In [None]:
# Large (3000-3000) network trained with the Adahessian optimizer:
trainadaH(model_adaH_L, optimizer_adaH_L, criterion, tau, N_EPOCHS, lossList_adaH_L, valList_adaH_L)

In [None]:
# Wide single-hidden-layer (2000) network trained with the Adahessian optimizer:
trainadaH(model_adaH_WM, optimizer_adaH_WM, criterion, tau, N_EPOCHS, lossList_adaH_WM, valList_adaH_WM)

In [None]:
# Persist the per-epoch training losses of the adaH runs as .npy files.
# F_ (not F) is used so that torch.nn.functional, imported as F, is not shadowed.
E, F_, G, H= (
    np.asarray(history)
    for history in (lossList_adaH_S, lossList_adaH_M, lossList_adaH_L, lossList_adaH_WM)
)

for _target, _losses in (
    ("/home/aryamanj/Downloads/D-GEX_second/lossList_adaH_S.npy", E),
    ("/home/aryamanj/Downloads/D-GEX_second/lossList_adaH_M.npy", F_),
    ("/home/aryamanj/Downloads/D-GEX_second/lossList_adaH_L.npy", G),
    ("/home/aryamanj/Downloads/D-GEX_second/lossList_adaH_WM.npy", H),
):
    np.save(_target, _losses)

In [None]:
# Write the state_dict of every adaH model to its own .pt file.
for _net, _target in (
    (model_adaH_S, "/home/aryamanj/Downloads/D-GEX_second/adaH_S.pt"),
    (model_adaH_M, "/home/aryamanj/Downloads/D-GEX_second/adaH_M.pt"),
    (model_adaH_L, "/home/aryamanj/Downloads/D-GEX_second/adaH_L.pt"),
    (model_adaH_WM, "/home/aryamanj/Downloads/D-GEX_second/adaH_WM.pt"),
):
    torch.save(_net.state_dict(), _target)

I see the following steps to incorporating LALR into the code:
1. Be able to compute the $K_z$: 
    * This is defined to be the largest activation in the penultimate layer
    * Hence, our network needs a method that returns its penultimate output, over which we then simply take a supremum
2. For now, a possible Lipschitz constant (LC) of the log-cosh seems to be the same as for the check loss, namely $\frac{K_z}{m}\times \max(\tau, 1-\tau)$ -- LCs of this kind are all expressible in the form $C\cdot\frac{K_z}{m}$ (where $C$ is some constant which we know)

The next expression that we want to try out is $\frac{1}{m}\tanh(g(0)-y_k)\cdot K_z$, where $k$ is such that $z_k^{[L]}=0$; or, exploiting the continuity of the regression problem, we can also write $\frac{1}{m}\tanh(g(0)-y_k)\cdot K_z$, where $k= \operatorname{argmin}_j z_j^{[L]}$.

Hence, to implement the above expression we need, in addition to the penultimate-activation method, access to the set of pre-activation values and the ability to compute the $\operatorname{argmin}$ over them. Since this network has no activation function acting on the final layer, we can simply compute the minimum feature of the final layer's output.

In [35]:
# writing training loop for networks with LALR
def train_adaptive_lr(model,optimizer, criterion, tau, epochs, ls_list, valList):
    """
    Training loop used for LALR training

    At the start of every epoch the learning rate is recomputed with
    computeLR(model, bSize=...) and written into the optimizer.

    model:     network to train; must provide what computeLR needs
    optimizer: optimizer stepped once per mini-batch; its lr is overwritten each epoch
    criterion: loss used for training and validation, called as
               criterion(outputs, labels, tau, h)
    tau:       quantile level forwarded to criterion
    epochs:    number of passes over trainLoader
    ls_list:   list that receives the mean training loss of every epoch
    valList:   list that receives the mean validation loss of every epoch

    Reads the module-level names trainLoader, valLoader, device, h and computeLR.
    """
    # use the loader's real batch size instead of a hard-coded 16; fall back to 16
    # when the loader does not expose one (e.g. it was built from a batch_sampler)
    batch_size= getattr(trainLoader, 'batch_size', None) or 16
    for epoch in range(epochs):
        epoch_loss= 0.0
        # float() also accepts a 0-dim tensor, so the stored lr is always a plain number
        lr_val= float(computeLR(model, bSize=batch_size))
        for group in optimizer.param_groups:
            group['lr']= lr_val
        # training loop
        model.train()
        for inputs, labels in trainLoader:
            inputs= inputs.to(device)
            labels= labels.to(device)
            optimizer.zero_grad()
            outputs= model(inputs)
            loss= criterion(outputs, labels, tau, h)
            loss.backward()
            optimizer.step()
            epoch_loss+= loss.item()
        ls_list.append(epoch_loss/len(trainLoader))

        # validation loop (no gradients needed, so no autograd graph is built)
        val_loss= 0.0
        model.eval()
        with torch.no_grad():
            for inputs, labels in valLoader:
                inputs= inputs.to(device)
                labels= labels.to(device)
                outputs= model(inputs)
                loss= criterion(outputs, labels, tau, h)
                val_loss+= loss.item()
        valList.append(val_loss/len(valLoader))
        print("Epoch: {} Training Loss: {} Validation loss: {} LR: {}".format(epoch, epoch_loss/len(trainLoader), val_loss/len(valLoader), optimizer.param_groups[0]['lr']))


def computeLR(model, bSize= 16):
    """
    Compute the adaptive learning rate for the coming epoch from one pass over
    trainLoader.

    With K_z the largest penultimate activation (model.penU) and z_min the
    smallest final-layer output (model(inputs)) over the whole pass, the
    estimate is LC = (1/bSize) * tanh(-z_min) * K_z and the function returns 1/LC.

    model: network exposing penU(inputs) and forward(inputs); it is left in eval mode
    bSize: batch size m used in the 1/m factor
    Returns a Python float. 0.1 is returned when LC is not a positive finite
    number (zero, negative or an empty loader), because a non-positive value
    would make SGD step against the descent direction.
    # NOTE(review): falling back to 0.1 for negative LC (rather than using |LC|) is an
    # assumption -- confirm against the intended derivation.

    Reads the module-level names trainLoader and device. The minimum is tracked
    across all batches; previously only the last batch's minimum was used.
    """
    import math  # function-scope import: math is not among the notebook-level imports

    Kz= 0.0              # running maximum, floored at 0 as in the original initialisation
    z_min= float('inf')  # running minimum of the final-layer outputs
    model.eval()
    with torch.no_grad():
        for inputs, _ in trainLoader:
            inputs= inputs.to(device)
            # reduce each batch to its extreme values, then fold into the running extremes
            Kz= max(Kz, torch.max(model.penU(inputs)).item())
            z_min= min(z_min, torch.min(model(inputs)).item())

    LR= (1/bSize)*math.tanh(-z_min)*Kz
    if not math.isfinite(LR) or LR <= 0:
        return 0.1
    return 1/LR

In [36]:
# Models, SGD optimizers and loss histories for the adaptive-LR (LALR) runs.
# The lr=0.1 given here is only the initial value: train_adaptive_lr overwrites
# the optimizer's lr at the start of every epoch. Every model uses dropout 0.1.
model_lr_1= LALRnetwork(300, 300, 0.1).to(device)
optimizer_lr_1= optim.SGD(model_lr_1.parameters(), lr=0.1)
loss_list_lr_1, valList_LR_S= [], []

model_lr_2= LALRnetwork(1000, 1000, 0.1).to(device)
optimizer_lr_2= optim.SGD(model_lr_2.parameters(), lr=0.1)
loss_list_lr_2, valList_LR_M= [], []

model_lr_3= LALRnetwork(3000, 3000, 0.1).to(device)
optimizer_lr_3= optim.SGD(model_lr_3.parameters(), lr=0.1)
loss_list_lr_3, valList_LR_L= [], []

# single wide hidden layer
model_lr_4= LALRwideM(2000, 0.1).to(device)
optimizer_lr_4= optim.SGD(model_lr_4.parameters(), lr=0.1)
loss_list_lr_4, valList_LR_WM= [], []

In [None]:
# Small (300-300) network trained with the per-epoch adaptive learning rate:
train_adaptive_lr(model_lr_1,optimizer_lr_1, criterion, tau, N_EPOCHS, loss_list_lr_1, valList_LR_S)

In [None]:
# Medium (1000-1000) network trained with the per-epoch adaptive learning rate:
train_adaptive_lr(model_lr_2,optimizer_lr_2, criterion, tau, N_EPOCHS, loss_list_lr_2, valList_LR_M)

In [None]:
# Large (3000-3000) network trained with the per-epoch adaptive learning rate:
train_adaptive_lr(model_lr_3, optimizer_lr_3, criterion, tau, N_EPOCHS, loss_list_lr_3, valList_LR_L)

In [None]:
# Wide single-hidden-layer (2000) network trained with the per-epoch adaptive learning rate:
train_adaptive_lr(model_lr_4, optimizer_lr_4, criterion, tau, N_EPOCHS, loss_list_lr_4, valList_LR_WM)

In [None]:
# Persist the per-epoch training losses of the adaptive-LR runs as .npy files.
# Note that the name H is re-bound here (it is also assigned in the adaH saving cell).
H, I, J, K= (
    np.asarray(history)
    for history in (loss_list_lr_1, loss_list_lr_2, loss_list_lr_3, loss_list_lr_4)
)

for _target, _losses in (
    ("/home/aryamanj/Downloads/D-GEX_second/lossList_LR_S.npy", H),
    ("/home/aryamanj/Downloads/D-GEX_second/lossList_LR_M.npy", I),
    ("/home/aryamanj/Downloads/D-GEX_second/lossList_LR_L.npy", J),
    ("/home/aryamanj/Downloads/D-GEX_second/lossList_LR_WM.npy", K),
):
    np.save(_target, _losses)

In [None]:
# Write the state_dict of every adaptive-LR model to its own .pt file.
for _net, _target in (
    (model_lr_1, "/home/aryamanj/Downloads/D-GEX_second/LR_S.pt"),
    (model_lr_2, "/home/aryamanj/Downloads/D-GEX_second/LR_M.pt"),
    (model_lr_3, "/home/aryamanj/Downloads/D-GEX_second/LR_L.pt"),
    (model_lr_4, "/home/aryamanj/Downloads/D-GEX_second/LR_WM.pt"),
):
    torch.save(_net.state_dict(), _target)
