In [1]:
% matplotlib inline

import torch
import torch.nn as nn
import numpy as np

# Read the preprocessed arrays from disk
data_path = '../data/processed/'
X, y = (np.load(data_path + fname) for fname in ('X.npy', 'y.npy'))
print("X has shape: {}\ny has shape: {}".format(X.shape, y.shape))

# Float32 torch views of the data, used by the cells below
X_torch = torch.from_numpy(X).float()
y_torch = torch.from_numpy(y).float()

# Global optimum of the loss, a.k.a. c from appendix A in the paper.
# `@` is the matrix-multiplication operator (Python 3.5+). `*` and `@` share
# precedence and associate left-to-right, so the 1/n scaling is applied to the
# transposed targets before each product.
inv_n = 1 / len(y_torch)
y_row = inv_n * y_torch.transpose(0, 1)  # targets as a scaled row vector
cross_cov = y_row @ X_torch  # (1, 128)
y_cov = y_row @ y_torch
global_opt = -0.5 * (cross_cov @ cross_cov.transpose(0, 1)) + 0.5 * y_cov
global_opt

X has shape: (2565, 128)
y has shape: (2565, 1)


tensor([[1.9375e-11]])

In [4]:
import sys
sys.path.append('../src/models/')
from linear_nn import three_layer_nn, fro_loss, train

# Set seed (once, before the sweep; each model below draws from the same
# random stream in sequence)
seed = 521
torch.manual_seed(seed)

# Initialize constants
learning_rates = [1e-4, 1e-3, 1e-2, 1e-1, 1e0]
eps = 1e-5
# Second positional argument of three_layer_nn -- presumably the standard
# deviation for the 'normal' initialisation; confirm in linear_nn.
std = 1e-1

# Running minimum over the sweep. These must be initialised once, *before* the
# loop: resetting num_iter on every pass makes the comparison below always
# succeed, so it would end up holding the last run's count, not the minimum.
num_iter = np.inf
best_learning_rate = None

for learning_rate in learning_rates: # Find optimal learning_rate
    # Build a fresh model and loss for every learning rate so that runs do not
    # continue from each other's weights.
    model = three_layer_nn('normal', std, False, p=0.1)
    loss_fn = fro_loss()

    # Train with vanilla gradient descent. `train` returns the number of
    # iterations it ran and the final loss value; it is given eps and
    # global_opt so it can stop once the loss is within eps of the optimum.
    # NOTE(review): a run that reports a very large iteration count with a
    # loss still above eps has probably hit an iteration cap inside `train`
    # rather than converged -- confirm in linear_nn.
    train_iter, loss = train(model, loss_fn, X_torch, y_torch, learning_rate, eps, global_opt)
    if num_iter > train_iter: # Find min train_iter
        num_iter = train_iter
        best_learning_rate = learning_rate
    print("(Learning Rate, Total Iterations, Loss) = ({}, {}, {}).".format(learning_rate, train_iter, loss))

(Learning Rate, Total Iterations, Loss) = (0.0001, 100001, 0.00016575879999436438).
(Learning Rate, Total Iterations, Loss) = (0.001, 18195, 9.997145753004588e-06).
(Learning Rate, Total Iterations, Loss) = (0.01, 1821, 9.968571248464286e-06).
(Learning Rate, Total Iterations, Loss) = (0.1, 183, 9.948667866410688e-06).
(Learning Rate, Total Iterations, Loss) = (1.0, 32, 9.365488949697465e-06).


In [None]:
# learning_rate = 1e-1
# eps = 1e-5
# loss = np.inf
# t = 0
# while torch.abs(global_opt - loss) > eps:
#     W = model() # W_N * W_{N - 1} * ... * W_1

#     # Compute and print the loss. We pass the end-to-end weight product W
#     # together with the inputs X_torch and targets y_torch, and the loss
#     # function returns a Tensor containing the loss.
#     loss = loss_fn(W, X_torch, y_torch)
#     print(t, loss.item())

#     # Zero the gradients before running the backward pass.
#     # In pytorch, gradients are accumulated with .backward(), hence,
#     # we need to zero them out each round
#     model.zero_grad()

#     # Backward pass: compute gradient of the loss with respect to all the learnable
#     # parameters of the model. Internally, the parameters of each Module are stored
#     # in Tensors with requires_grad=True, so this call will compute gradients for
#     # all learnable parameters in the model.
#     loss.backward()
    

#     # Update the weights using gradient descent. Each parameter is a Tensor, so
#     # we can access its gradients like we did before.
#     with torch.no_grad():
#         for param in model.parameters():
#             if param.grad is None:
#                 continue
#             param.data -= learning_rate * param.grad
#     t+=1