In [1]:
import torch
from torch.optim import Optimizer

In [2]:
dtype = torch.float
device = torch.device("cpu")

### Custom SGD Optimizer

In [5]:
class MySGD(Optimizer):
    def __init__(self, params, lr=1e-3):
        # sanity checks
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")

        defaults = dict(lr=lr)
        super(MySGD, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(SGD, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    @torch.no_grad()
    def step(self):
        loss = None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad
                p.add_(d_p, alpha=-group['lr'])

        return loss

In [6]:
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = MySGD(model.parameters(), lr=learning_rate)
for t in range(500):
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t-1, loss.item())

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()        

98 2.0202033519744873
198 0.028193345293402672
298 0.0010272630024701357
398 5.838064680574462e-05
498 3.7830916426173644e-06


### Custom SGD with Momentum and Dampening

In [3]:
class MySGD2(Optimizer):
    def __init__(self, params, lr, momentum=0, dampening=0):
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        if momentum < 0.0:
            raise ValueError(f"Invalid momentum value: {momentum}")

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening)
        super(MySGD2, self).__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        loss = None

        for group in self.param_groups:
            momentum = group['momentum']
            dampening = group['dampening']
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)

                p.add_(d_p, alpha=-group['lr'])

        return loss

In [4]:
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
momentum = 0.9
optimizer = MySGD2(
    model.parameters(), 
    lr=learning_rate,
    momentum=momentum
)
for t in range(500):
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t-1, loss.item())

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()        

98 2.389493227005005
198 0.04467420279979706
298 0.0016932786675170064
398 8.230946696130559e-05
498 4.497599093156168e-06
