**Author:** Boris Kundu

**Problem Statement**: Comparison of different adaptive learning optimizers.

**Dataset** Iris

In [34]:
#Import packages
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import datasets

In [35]:
#Read data
iris = datasets.load_iris()

In [36]:
#Define input parameters
n1 = len(iris.feature_names)  # input size
k = len(iris.target_names)    # output size
n2 = 5                        # hidden layer size

In [37]:
#Initialize weights and biases
W1 = torch.randn(n1, n2, dtype=torch.double, requires_grad=True)
b1 = torch.randn(n2, dtype=torch.double, requires_grad=True)
W2 = torch.randn(n2, k, dtype=torch.double, requires_grad=True)
b2 = torch.randn(k, dtype=torch.double, requires_grad=True)

In [38]:
#Define input and output features
X = torch.tensor(iris["data"])
target = torch.tensor(iris["target"], dtype=torch.long)

In [39]:
# Initialize Accumulated Squared Gradient - AdaGrad
rW1 = torch.zeros_like(W1)
rb1 = torch.zeros_like(b1)
rW2 = torch.zeros_like(W2)
rb2 = torch.zeros_like(b2)

In [40]:
#Define system parameters
alpha = 0.9 #Momentum
eta = 0.01 #Learning rate
epochs = 1000 #Iterations
delta = 1e-7 #To avoid divide by zero in case gradient is 0
rho = 0.9 #For RMSprop
rho1 = 0.9 #For Adam
rho2 = 0.999 #For Adam

In [41]:
#Make copies of initial weights for AdaGrad.
adaW1 = W1
adab1 = b1
adaW2 = W2
adab2 = b2

In [42]:
#Train using AdaGrad
for i in range(epochs):
    o1 = X.matmul(adaW1) + adab1
    h = o1.sigmoid()
    o2 = h.matmul(adaW2) + adab2
    L = F.cross_entropy(o2, target)
    if (i%100 == 0):
        print(f'Loss:{L.item()} at Epoch:{i}')
    adaW1.grad = None
    adab1.grad = None
    adaW2.grad = None
    adab2.grad = None
    L.backward()
    rb2 = rb2 + adab2.grad.square()
    adab2.data = adab2.data - eta * adab2.grad.div(delta + rb2.sqrt())
    rW2 = rW2 + adaW2.grad.square()
    adaW2.data = adaW2.data - eta * adaW2.grad.div(delta + rW2.sqrt())
    rb1 = rb1 + adab1.grad.square()
    adab1.data = adab1.data - eta * adab1.grad.div(delta + rb1.sqrt())
    rW1 = rW1 + adaW1.grad.square()
    adaW1.data = adaW1.data - eta * adaW1.grad.div(delta + rW1.sqrt())

Loss:1.4532194599800896 at Epoch:0
Loss:0.9694441040324958 at Epoch:100
Loss:0.7634343085339962 at Epoch:200
Loss:0.675112804981835 at Epoch:300
Loss:0.6321122797349757 at Epoch:400
Loss:0.603050860637645 at Epoch:500
Loss:0.5805252328733098 at Epoch:600
Loss:0.5618992145257825 at Epoch:700
Loss:0.5459118800821069 at Epoch:800
Loss:0.5318506364220132 at Epoch:900


In [43]:
#Function to predict
def predict(features,target_class,wt1,bs1,wt2,bs2,opt):
    o1 = features.matmul(wt1) + bs1
    h = o1.sigmoid()
    o2 = h.matmul(wt2) + bs2
    ypred = o2.argmax(axis=1)
    print(f'Predictions using {opt} is:\n{ypred}')
    matches = torch.eq(ypred, target).int().sum()
    print(f'Matches using {opt} is:{matches.item()}')

In [44]:
#Predict using AdaGrad
predict(X,target,adaW1,adab1,adaW2,adab2,'AdaGrad')

Predictions using AdaGrad is:
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2,
        2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])
Matches using AdaGrad is:113


In [45]:
#Make copies of initial weights for RMSProp.
rmsW1 = W1
rmsb1 = b1
rmsW2 = W2
rmsb2 = b2

In [46]:
# Initialize Accumulated Squared Gradient - RMSProp
rW1 = torch.zeros_like(W1)
rb1 = torch.zeros_like(b1)
rW2 = torch.zeros_like(W2)
rb2 = torch.zeros_like(b2)

In [47]:
#Train using RMSProp
for i in range(epochs):
    o1 = X.matmul(rmsW1) + rmsb1
    h = o1.sigmoid()
    o2 = h.matmul(rmsW2) + rmsb2
    L = F.cross_entropy(o2, target)
    if (i%100 == 0):
        print(f'Loss:{L.item()} at Epoch:{i}')
    rmsW1.grad = None
    rmsb1.grad = None
    rmsW2.grad = None
    rmsb2.grad = None
    L.backward()
    rb2 = rho * rb2 + (1 - rho) * rmsb2.grad.square()
    rmsb2.data = rmsb2.data - eta * rmsb2.grad.div(delta + rb2.sqrt())
    rW2 = rho * rW2 + (1 - rho) * rmsW2.grad.square()
    rmsW2.data = rmsW2.data - eta * rmsW2.grad.div(delta + rW2.sqrt())
    rb1 = rho * rb1 + (1 - rho) * rmsb1.grad.square()
    rmsb1.data = rmsb1.data - eta * rmsb1.grad.div(delta + rb1.sqrt())
    rW1 = rho * rW1 + (1 - rho) * rmsW1.grad.square()
    rmsW1.data = rmsW1.data - eta * rmsW1.grad.div(delta + rW1.sqrt())

Loss:0.519269887816031 at Epoch:0
Loss:0.19723521340853803 at Epoch:100
Loss:0.10321460260039333 at Epoch:200
Loss:0.07032723341672055 at Epoch:300
Loss:0.05336319054557142 at Epoch:400
Loss:0.04899767935117585 at Epoch:500
Loss:0.046967942103228875 at Epoch:600
Loss:0.04578384541635465 at Epoch:700
Loss:0.044984337305388325 at Epoch:800
Loss:0.04438882637883274 at Epoch:900


In [48]:
#Predict using RMSProp
predict(X,target,rmsW1,rmsb1,rmsW2,rmsb2,'RMSProp')

Predictions using RMSProp is:
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])
Matches using RMSProp is:148


In [49]:
#For Nesterov
vW1 = torch.zeros_like(W1)
vb1 = torch.zeros_like(b1)
vW2 = torch.zeros_like(W2)
vb2 = torch.zeros_like(b2)

In [50]:
# Initialize Accumulated Squared Gradient - RMSProp Nesterov
rW1 = torch.zeros_like(W1)
rb1 = torch.zeros_like(b1)
rW2 = torch.zeros_like(W2)
rb2 = torch.zeros_like(b2)

In [51]:
#Make copies of initial weights for RMSProp with Nesterov.
rmsnW1 = W1
rmsnb1 = b1
rmsnW2 = W2
rmsnb2 = b2

In [52]:
#Train using RMSProp with Nesterov
for i in range(epochs):
    tW1 = rmsnW1.data
    tb1 = rmsnb1.data
    tW2 = rmsnW2.data
    tb2 = rmsnb2.data
    rmsnW1.data = rmsnW1.data + alpha * vW1
    rmsnb1.data = rmsnb1.data + alpha * vb1
    rmsnW2.data = rmsnW2.data + alpha * vW2
    rmsnb2.data = rmsnb2.data + alpha * vb2
    o1 = X.matmul(rmsnW1) + rmsnb1
    h = o1.sigmoid()
    o2 = h.matmul(rmsnW2) + rmsnb2
    L = F.cross_entropy(o2, target)
    if (i%100 == 0):
        print(f'Loss:{L.item()} at Epoch:{i}')
    rmsnW1.grad = None
    rmsnb1.grad = None
    rmsnW2.grad = None
    rmsnb2.grad = None
    L.backward()
    rb2 = rho * rb2 + (1 - rho) * rmsnb2.grad.square()
    vb2 = alpha * vb2 - eta * rmsnb2.grad.div(rb2.sqrt())
    rmsnb2.data = tb2 + vb2
    rW2 = rho * rW2 + (1 - rho) * rmsnW2.grad.square()
    vW2 = alpha * vW2 - eta * rmsnW2.grad.div(rW2.sqrt())
    rmsnW2.data = tW2 + vW2
    rb1 = rho * rb1 + (1 - rho) * rmsnb1.grad.square()
    vb1 = alpha * vb1 - eta * rmsnb1.grad.div(rb1.sqrt())
    rmsnb1.data = tb1 + vb1
    rW1 = rho * rW1 + (1 - rho) * rmsnW1.grad.square()
    vW1 = alpha * vW1 - eta * rmsnW1.grad.div(rW1.sqrt())
    rmsnW1.data = tW1 + vW1

Loss:0.04391518756187686 at Epoch:0
Loss:0.04204065220747992 at Epoch:100
Loss:0.04304365578020279 at Epoch:200
Loss:0.042231424007413546 at Epoch:300
Loss:0.04157183575045348 at Epoch:400
Loss:0.040993215778672674 at Epoch:500
Loss:0.040457779671888044 at Epoch:600
Loss:0.039944235736047856 at Epoch:700
Loss:0.03943958616417627 at Epoch:800
Loss:0.038934878128612446 at Epoch:900


In [53]:
#Predict using RMSProp with Nesterov
predict(X,target,rmsnW1,rmsnb1,rmsnW2,rmsnb2,'RMSProp with Nesterov')

Predictions using RMSProp with Nesterov is:
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])
Matches using RMSProp with Nesterov is:149


In [54]:
# Initialize for Adam
rW1 = torch.zeros_like(W1)
rb1 = torch.zeros_like(b1)
rW2 = torch.zeros_like(W2)
rb2 = torch.zeros_like(b2)
sW1 = torch.zeros_like(W1)
sb1 = torch.zeros_like(b1)
sW2 = torch.zeros_like(W2)
sb2 = torch.zeros_like(b2)

In [55]:
#Make copies of initial weights for Adam
admW1 = W1
admb1 = b1
admW2 = W2
admb2 = b2

In [56]:
#Train Adam
rho1t = rho1
rho2t = rho2
for i in range(epochs):
    o1 = X.matmul(admW1) + admb1
    h = o1.sigmoid()
    o2 = h.matmul(admW2) + admb2
    L = F.cross_entropy(o2, target)
    if (i%100 == 0):
        print(f'Loss:{L.item()} at Epoch:{i}')
    admW1.grad = None
    admb1.grad = None
    admW2.grad = None
    admb2.grad = None
    L.backward()
    sb2 = rho1 * sb2 + (1 - rho1) * admb2.grad
    rb2 = rho2 * rb2 + (1 - rho2) * admb2.grad.square()
    admb2.data = admb2.data - eta * sb2.div(1 - rho1t).div(delta + rb2.div(1 - rho2t).sqrt())
    sW2 = rho1 * sW2 + (1 - rho1) * admW2.grad
    rW2 = rho2 * rW2 + (1 - rho2) * admW2.grad.square()
    admW2.data = admW2.data - eta * sW2.div(1 - rho1t).div(delta + rW2.div(1 - rho2t).sqrt())
    sb1 = rho1 * sb1 + (1 - rho1) * admb1.grad
    rb1 = rho2 * rb1 + (1 - rho2) * admb1.grad.square()
    admb1.data = admb1.data - eta * sb1.div(1 - rho1t).div(delta + rb1.div(1 - rho2t).sqrt())
    sW1 = rho1 * sW1 + (1 - rho1) * admW1.grad
    rW1 = rho2 * rW1 + (1 - rho2) * admW1.grad.square()
    admW1.data = admW1.data - eta * sW1.div(1 - rho1t).div(delta + rW1.div(1 - rho2t).sqrt())
    rho1t = rho1t * rho1
    rho2t = rho2t * rho2

Loss:0.0361660668124977 at Epoch:0
Loss:0.03553527912940638 at Epoch:100
Loss:0.0352079712652821 at Epoch:200
Loss:0.03477218096981501 at Epoch:300
Loss:0.03421487848021305 at Epoch:400
Loss:0.033533424141350604 at Epoch:500
Loss:0.03275584079750848 at Epoch:600
Loss:0.031936878233067706 at Epoch:700
Loss:0.03112675729748629 at Epoch:800
Loss:0.030352907888884905 at Epoch:900


In [57]:
#Predict using Adam
predict(X,target,admW1,admb1,admW2,admb2,'Adam')

Predictions using Adam is:
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])
Matches using Adam is:149
