**Author:** Boris Kundu

**Problem Statement:** Classify Iris types using GD with standard Momentum and Nesterov Momentum (PyTorch)

**Dataset:** Iris

In [48]:
#Import packages
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import datasets

In [49]:
#Read data
iris = datasets.load_iris()

In [50]:
#Define input parameters
n1 = len(iris.feature_names)  # input size
k = len(iris.target_names)    # output size
n2 = 5                        # hidden layer size

In [51]:
#Initialize weights and biases
W1 = torch.randn(n1, n2, dtype=torch.double, requires_grad=True)
b1 = torch.randn(n2, dtype=torch.double, requires_grad=True)
W2 = torch.randn(n2, k, dtype=torch.double, requires_grad=True)
b2 = torch.randn(k, dtype=torch.double, requires_grad=True)

In [52]:
#Define input and output features
X = torch.tensor(iris["data"])
target = torch.tensor(iris["target"], dtype=torch.long)

In [53]:
#Initialize velocity for weights and biases
vW1 = torch.zeros_like(W1)
vb1 = torch.zeros_like(b1)
vW2 = torch.zeros_like(W2)
vb2 = torch.zeros_like(b2)

In [54]:
#Define system parameters
alpha = 0.9 #Momentum
eta = 0.01 #Learning rate
epochs = 1000 #Iterations

In [55]:
#Train model with GD using momentum
for i in range(epochs):
    o1 = X.matmul(W1) + b1
    h = o1.sigmoid()
    o2 = h.matmul(W2) + b2
    L = F.cross_entropy(o2, target)
    if (i%100 == 0):
        print(f'Loss:{L.item()} at Epoch:{i}')
    W1.grad = None
    b1.grad = None
    W2.grad = None
    b2.grad = None
    L.backward()
    vb2 = alpha * vb2 - eta * b2.grad
    b2.data = b2.data + vb2
    vW2 = alpha * vW2 - eta * W2.grad
    W2.data = W2.data + vW2
    vb1 = alpha * vb1 - eta * b1.grad
    b1.data = b1.data + vb1
    vW1 = alpha * vW1 - eta * W1.grad
    W1.data = W1.data + vW1

Loss:1.4118387685376868 at Epoch:0
Loss:0.8137970121631003 at Epoch:100
Loss:0.6509584799482667 at Epoch:200
Loss:0.5837409190877758 at Epoch:300
Loss:0.5493200902891285 at Epoch:400
Loss:0.5283783838507435 at Epoch:500
Loss:0.5135961149868734 at Epoch:600
Loss:0.5012734750808205 at Epoch:700
Loss:0.4889235772845755 at Epoch:800
Loss:0.4748257145459746 at Epoch:900


In [56]:
#Make predictions
o1 = X.matmul(W1) + b1
h = o1.sigmoid()
o2 = h.matmul(W2) + b2
ypred = o2.argmax(axis=1)

In [57]:
#Display results
print(f'Predictions using GD with momentum:\n{ypred}')
matches = torch.eq(ypred, target).int().sum()
print(f'Matches using GD with momentum:{matches.item()}')

Predictions using GD with momentum:
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1,
        2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])
Matches using GD with momentum:140


In [58]:
#Reinitialize weights and biases
W1 = torch.randn(n1, n2, dtype=torch.double, requires_grad=True)
b1 = torch.randn(n2, dtype=torch.double, requires_grad=True)
W2 = torch.randn(n2, k, dtype=torch.double, requires_grad=True)
b2 = torch.randn(k, dtype=torch.double, requires_grad=True)

In [59]:
#Reinitialize velocity for weights and biases
vW1 = torch.zeros_like(W1)
vb1 = torch.zeros_like(b1)
vW2 = torch.zeros_like(W2)
vb2 = torch.zeros_like(b2)

In [60]:
#Train model with GD using Nesterov momentum
for i in range(epochs):
    # Saving current
    tW1 = W1.data  
    tb1 = b1.data
    tW2 = W2.data
    tb2 = b2.data
    # Move without gradient
    W1.data = W1.data + alpha * vW1  
    b1.data = b1.data + alpha * vb1
    W2.data = W2.data + alpha * vW2
    b2.data = b2.data + alpha * vb2
    o1 = X.matmul(W1) + b1
    h = o1.sigmoid()
    o2 = h.matmul(W2) + b2
    L = F.cross_entropy(o2, target) # not exactly the loss, but for gradient
    if (i%100 == 0):
        print(f'Loss:{L.item()} at Epoch:{i}')
    W1.grad = None
    b1.grad = None
    W2.grad = None
    b2.grad = None
    L.backward()
    vb2 = alpha * vb2 - eta * b2.grad
    b2.data = tb2 + vb2
    vW2 = alpha * vW2 - eta * W2.grad
    W2.data = tW2 + vW2
    vb1 = alpha * vb1 - eta * b1.grad
    b1.data = tb1 + vb1
    vW1 = alpha * vW1 - eta * W1.grad
    W1.data = tW1 + vW1  # move from saved current state

Loss:1.334649500255506 at Epoch:0
Loss:0.8107889651532572 at Epoch:100
Loss:0.6382366482800941 at Epoch:200
Loss:0.5378399577173137 at Epoch:300
Loss:0.482266359523972 at Epoch:400
Loss:0.4316132989969166 at Epoch:500
Loss:0.3778874609395956 at Epoch:600
Loss:0.32436055604498126 at Epoch:700
Loss:0.27688551891405416 at Epoch:800
Loss:0.2383225891867438 at Epoch:900


In [61]:
#Make predictions using new parameters
o1 = X.matmul(W1) + b1
h = o1.sigmoid()
o2 = h.matmul(W2) + b2
ypred = o2.argmax(axis=1)

In [62]:
#Display results
print(f'Predictions using GD with Nesterov momentum:\n{ypred}')
matches = torch.eq(ypred, target).int().sum()
print(f'Matches using GD with Nesterov momentum:{matches.item()}')

Predictions using GD with Nesterov momentum:
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
        2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])
Matches using GD with Nesterov momentum:145
