In [1]:
import numpy as np
from numpy import asarray
from numpy import arange
from numpy.random import rand
from numpy.random import seed
import matplotlib.pyplot as plt
import seaborn as sns
from func_autograd import *
from sklearn.model_selection import train_test_split
import pandas as pd

def MSE(y_data, y_model):
    """Return the mean squared error between observed and predicted values.

    Both inputs are converted to NumPy arrays and reshaped to column vectors
    before subtracting, so a 1-D array paired with an (n, 1) array is compared
    element-wise instead of broadcasting to an (n, n) matrix.

    Parameters
    ----------
    y_data : array-like
        Observed values (ndarray, list, pandas Series, ...).
    y_model : array-like
        Predicted values with the same number of elements as ``y_data``.

    Returns
    -------
    float
        ``sum((y_data - y_model)**2) / size(y_model)``.
    """
    # np.asarray lets plain sequences and Series through; they have no
    # ndarray-style .reshape of their own.
    y_data = np.asarray(y_data).reshape(-1, 1)
    y_model = np.asarray(y_model).reshape(-1, 1)
    return np.sum((y_data - y_model) ** 2) / np.size(y_model)


def generate_data(noise=True, step_size=0.05 , FrankesFunction=True):
    """Sample a 2-D test surface on a regular grid over [0, 1) x [0, 1).

    Parameters
    ----------
    noise : bool
        Forwarded to the surface function as its ``noise`` argument.
    step_size : float
        Spacing of the grid points along both axes.
    FrankesFunction : bool
        When true the surface is ``FrankeFunction``; otherwise ``TestFunction``.

    Returns
    -------
    tuple of 1-D arrays
        Flattened x coordinates, flattened y coordinates and surface values.
    """
    # The same half-open range is used along both axes.
    ticks = np.arange(0, 1, step_size)
    grid_x, grid_y = np.meshgrid(ticks, ticks)

    # Pick the surface once, then evaluate it on the whole grid.
    surface = FrankeFunction if FrankesFunction else TestFunction
    z = surface(grid_x, grid_y, noise=noise).flatten()

    return grid_x.flatten(), grid_y.flatten(), z

def TestFunction(x, y, noise=False):
    """Evaluate ``x**2 + y**2 + 2*x*y``, optionally with additive Gaussian noise.

    Parameters
    ----------
    x, y : array-like or scalar
        Evaluation points. ``x.shape`` is only read when ``noise`` is true.
    noise : bool
        When true, add normal noise (mean 0, standard deviation 0.1) drawn
        from the global NumPy RNG with the shape of ``x``.

    Returns
    -------
    Same shape as the broadcast inputs.
    """
    # The noise is drawn before the polynomial is evaluated; without noise a
    # plain scalar zero is added.
    random_noise = np.random.normal(0, 0.1 , x.shape) if noise else 0
    polynomial = x**2 + y**2 + 2*x*y
    return polynomial + random_noise

def FrankeFunction(x, y, noise=False):
    """Evaluate the four-term exponential surface, optionally with Gaussian noise.

    Parameters
    ----------
    x, y : array-like or scalar
        Evaluation points. ``x.shape`` is only read when ``noise`` is true.
    noise : bool
        When true, add normal noise (mean 0, standard deviation 0.1) drawn
        from the global NumPy RNG with the shape of ``x``.

    Returns
    -------
    Sum of the four exponential terms plus the noise term.
    """
    random_noise = np.random.normal(0, 0.1 , x.shape) if noise else 0

    # Every term works on the inputs scaled by 9, so scale once up front.
    sx, sy = 9*x, 9*y

    peak_a = 0.75*np.exp(-(0.25*(sx-2)**2) - 0.25*((sy-2)**2))
    peak_b = 0.75*np.exp(-((sx+1)**2)/49.0 - 0.1*(sy+1))
    peak_c = 0.5*np.exp(-(sx-7)**2/4.0 - 0.25*((sy-3)**2))
    dip = -0.2*np.exp(-(sx-4)**2 - (sy-7)**2)

    return peak_a + peak_b + peak_c + dip + random_noise

# Fix the randomness so Restart & Run All reproduces the same data: the noisy
# surface is drawn from the global NumPy RNG and train_test_split shuffles.
SEED = 2023
np.random.seed(SEED)

x, y, z = generate_data()
# create_X comes from func_autograd; presumably it builds a polynomial design
# matrix of degree 7 in x and y -- confirm against that module.
X = create_X(x, y, 7)
X_train, X_test, z_train, z_test = train_test_split(X, z, random_state=SEED)

In [7]:
# parameters
gamma = np.linspace(0.001, 0.1, 10)  # candidate learning rates
delta = np.linspace(0.05, 0.5, 10)   # candidate momentum coefficients
eta = np.linspace(0.7, 0.95, 5)      # candidate learning-rate decay factors (not used in this cell)

# Keep only design-matrix columns 1 and 2. The guard makes the cell safe to
# re-run: slicing an already-sliced matrix again would silently drop a column.
# NOTE: this overwrites X_train / X_test for every cell executed afterwards.
if X_train.shape[1] > 2:
    X_train = X_train[:, 1:3]
    X_test = X_test[:, 1:3]

trials = 10

# Vote counters: one slot per candidate value, incremented each time that
# candidate gives the lowest train MSE within a trial.
gd_gamma = np.zeros(len(gamma))
gd_mom_gamma = np.zeros(len(gamma))
gd_mom_delta = np.zeros(len(delta))

for t in range(trials):
    # NOTE(review): this draw is never passed to a model; kept so the global
    # RNG stream is unchanged. Confirm whether GradientDescend should receive it.
    initial_val = np.random.randn(X_train.shape[1],1)

    # plain gradient descent
    train_score = np.zeros(len(gamma))
    for i, gamma_i in enumerate(gamma):
        model = GradientDescend(momentum=False, learning_rate=gamma_i)
        scores = model.fit(X_train, z_train, X_test, z_test)

        pred_train = model.predict(X_train)
        train_score[i] = MSE(z_train, pred_train)

    # `min_mse` rather than `min`, so the builtin min() is not shadowed.
    i_min, min_mse = train_score.argmin(), train_score.min()
    gd_gamma[i_min] += 1
    print("gd: learning rate and minimal train MSE")
    print(gamma[i_min], min_mse)

    # adding momentum
    train_score = np.zeros((len(gamma), len(delta)))
    for j, delta_j in enumerate(delta):
        for i, gamma_i in enumerate(gamma):
            model = GradientDescend(learning_rate=gamma_i, delta_momentum=delta_j)
            scores = model.fit(X_train, z_train, X_test, z_test)

            pred_train = model.predict(X_train)
            train_score[i, j] = MSE(z_train, pred_train)

    # Convert the flat argmin into (learning-rate index, momentum index).
    i_min = np.unravel_index(train_score.argmin(), train_score.shape)
    min_mse = train_score.min()
    gd_mom_gamma[i_min[0]] += 1
    gd_mom_delta[i_min[1]] += 1
    print("gd with momentum: learning rate, momentum and minimal train MSE")
    print(gamma[i_min[0]], delta[i_min[1]], min_mse)


# Majority vote over the trials; argmax takes the first candidate on ties.
gd_gamma_opt = gamma[gd_gamma.argmax()]
gd_mom_gamma_opt = gamma[gd_mom_gamma.argmax()]
gd_mom_delta_opt = delta[gd_mom_delta.argmax()]

gd: learning rate and minimal train MSE
0.07800000000000001 0.21239183827522068
gd with momentum: learning rate, momentum and minimal train MSE
0.1 0.25 0.19463814788879574
gd: learning rate and minimal train MSE
0.001 0.1994109344156838
gd with momentum: learning rate, momentum and minimal train MSE
0.067 0.25 0.19296293237444687
gd: learning rate and minimal train MSE
0.012 0.2522201841012957
gd with momentum: learning rate, momentum and minimal train MSE
0.012 0.5 0.1998516160912237
gd: learning rate and minimal train MSE
0.012 0.20247244974048922
gd with momentum: learning rate, momentum and minimal train MSE
0.045000000000000005 0.3 0.1929014121801626
gd: learning rate and minimal train MSE
0.1 0.228224212595669
gd with momentum: learning rate, momentum and minimal train MSE
0.023000000000000003 0.25 0.1949698963266102
gd: learning rate and minimal train MSE
0.012 0.3498085559952122
gd with momentum: learning rate, momentum and minimal train MSE
0.1 0.35000000000000003 0.193149035

In [2]:
trials = 10

# parameters
gamma = np.linspace(0.001, 0.1, 10)           # candidate learning rates
delta = np.linspace(0.05, 0.5, 5)             # candidate momentum coefficients
eta = np.linspace(0.7, 0.95, 3)               # candidate learning-rate decay factors
batch_size = np.arange(50, len(X_train), 50)  # candidate mini-batch sizes

# Vote counters: one slot per candidate, incremented when it wins a trial.
sgd_mom_gamma = np.zeros(len(gamma))
sgd_mom_delta = np.zeros(len(delta))
sgd_mom_eta = np.zeros(len(eta))
sgd_mom_batchsize = np.zeros(len(batch_size))

# minibatch sgd with momentum and learning schedule
for t in range(trials):
    # Axes: (learning rate, momentum, decay, batch size).
    train_score = np.zeros((len(gamma), len(delta), len(eta), len(batch_size)))
    for j, delta_j in enumerate(delta):
        for i, gamma_i in enumerate(gamma):
            for h, eta_h in enumerate(eta):
                for b, batch_b in enumerate(batch_size):
                    model = GradientDescend(
                        optimizer="sgd",
                        method="gd",
                        learning_rate=gamma_i,
                        delta_momentum=delta_j,
                        learning_rate_decay_flag=True,
                        learning_rate_decay=eta_h,
                        batch_size=batch_b,
                    )
                    scores = model.fit(X_train, z_train, X_test, z_test)

                    pred_train = model.predict(X_train)
                    train_score[i, j, h, b] = MSE(z_train, pred_train)

    # `min_mse` rather than `min`, so the builtin min() is not shadowed.
    # The flat argmin is unravelled into one index per hyper-parameter axis.
    i_min = np.unravel_index(train_score.argmin(), train_score.shape)
    min_mse = train_score.min()
    sgd_mom_gamma[i_min[0]] += 1
    sgd_mom_delta[i_min[1]] += 1
    sgd_mom_eta[i_min[2]] += 1
    sgd_mom_batchsize[i_min[3]] += 1

    # The winning batch size is counted above but not part of the printout.
    print("sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE")
    print(gamma[i_min[0]], delta[i_min[1]], eta[i_min[2]], min_mse)


sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.07800000000000001 0.275 0.95 0.021652041391109985
sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.08900000000000001 0.05 0.95 0.02133296044271596
sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.05600000000000001 0.5 0.95 0.021785460768874502
sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.07800000000000001 0.5 0.825 0.021477614360114722
sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.05600000000000001 0.5 0.825 0.021019447572645084
sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.08900000000000001 0.3875 0.825 0.021388274421771533
sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.1 0.1625 0.95 0.020464675705762006
sgd with momentum: learning rate, momentum, learning 

In [None]:
# Majority vote over the trials: for each hyper-parameter take the candidate
# whose vote counter is largest (the first one wins on ties).
sgd_mom_gamma_opt, sgd_mom_delta_opt, sgd_mom_eta_opt, sgd_mom_batchsize_opt = (
    candidates[np.argmax(votes)]
    for candidates, votes in (
        (gamma, sgd_mom_gamma),
        (delta, sgd_mom_delta),
        (eta, sgd_mom_eta),
        (batch_size, sgd_mom_batchsize),
    )
)

In [9]:
# iterating over different batch sizes again
# since the grid search above resulted in the smallest possible batch size performing the best, we test for lower values of the batch size

batch_size = np.arange(15, 75, 10)
# Vote counter: one slot per candidate batch size.
sgd_mom_batchsize = np.zeros(len(batch_size))

for t in range(trials):
    train_score = np.zeros(len(batch_size))
    for b, batch_b in enumerate(batch_size):
        # All other hyper-parameters are held at the previously selected optima.
        model = GradientDescend(
            optimizer="sgd",
            method="gd",
            learning_rate=sgd_mom_gamma_opt,
            delta_momentum=sgd_mom_delta_opt,
            learning_rate_decay_flag=True,
            learning_rate_decay=sgd_mom_eta_opt,
            batch_size=batch_b,
        )
        scores = model.fit(X_train, z_train, X_test, z_test)

        pred_train = model.predict(X_train)
        train_score[b] = MSE(z_train, pred_train)

    # `min_mse` rather than `min`, so the builtin min() is not shadowed.
    i_min, min_mse = train_score.argmin(), train_score.min()
    sgd_mom_batchsize[i_min] += 1

    print("sgd with momentum: batch size and minimal train MSE")
    print(batch_size[i_min], min_mse)


# Overrides the batch size chosen by the coarse grid search with the refined one.
sgd_mom_batchsize_opt = batch_size[sgd_mom_batchsize.argmax()]


sgd with momentum: batch size and minimal train MSE
25 0.19220425837946933
sgd with momentum: batch size and minimal train MSE
55 0.19220426023675682
sgd with momentum: batch size and minimal train MSE
15 0.19220425883127062
sgd with momentum: batch size and minimal train MSE
15 0.1922042575055339
sgd with momentum: batch size and minimal train MSE
15 0.1922042573207149
sgd with momentum: batch size and minimal train MSE
25 0.19220426356636414
sgd with momentum: batch size and minimal train MSE
25 0.19220425925588405
sgd with momentum: batch size and minimal train MSE
25 0.19220425691727347
sgd with momentum: batch size and minimal train MSE
15 0.19220425713879302
sgd with momentum: batch size and minimal train MSE
15 0.1922042583487819


In [10]:
# storing the optimal parameters
%store gd_gamma_opt
%store gd_mom_gamma_opt
%store sgd_mom_gamma_opt
%store gd_mom_delta_opt
%store sgd_mom_delta_opt
%store sgd_mom_eta_opt
%store sgd_mom_batchsize_opt

Stored 'gd_gamma_opt' (float64)
Stored 'gd_mom_gamma_opt' (float64)
Stored 'sgd_mom_gamma_opt' (float64)
Stored 'gd_mom_delta_opt' (float64)
Stored 'sgd_mom_delta_opt' (float64)
Stored 'sgd_mom_eta_opt' (float64)
Stored 'sgd_mom_batchsize_opt' (int64)


In [13]:
trials = 100
batch_size = np.arange(50, len(X_train), 50)  # candidate mini-batch sizes

# Vote counter: one slot per candidate batch size.
sgd_adam_batchsize = np.zeros(len(batch_size))

for t in range(trials):
    train_score = np.zeros(len(batch_size))
    for b, batch_b in enumerate(batch_size):
        # NOTE(review): Adam reuses the learning rate selected for SGD with
        # momentum; confirm that this is intended rather than a separate search.
        model = GradientDescend(
            optimizer="sgd",
            method="adam",
            learning_rate=sgd_mom_gamma_opt,
            batch_size=batch_b,
        )
        scores = model.fit(X_train, z_train, X_test, z_test)

        pred_train = model.predict(X_train)
        train_score[b] = MSE(z_train, pred_train)

    # `min_mse` rather than `min`, so the builtin min() is not shadowed.
    i_min, min_mse = train_score.argmin(), train_score.min()
    sgd_adam_batchsize[i_min] += 1

    # Label corrected: this cell evaluates Adam, not momentum SGD.
    print("sgd with adam: batch size and minimal train MSE")
    print(batch_size[i_min], min_mse)


# Majority vote over the trials; argmax takes the first candidate on ties.
sgd_adam_batchsize_opt = batch_size[sgd_adam_batchsize.argmax()]

sgd with momentum: batch size and minimal train MSE
50 0.2085220476302492
sgd with momentum: batch size and minimal train MSE
150 0.4990548441463926
sgd with momentum: batch size and minimal train MSE
50 0.3870432741514932
sgd with momentum: batch size and minimal train MSE
150 0.26023795909132935
sgd with momentum: batch size and minimal train MSE
200 0.2948245035450071
sgd with momentum: batch size and minimal train MSE
200 0.19944632128347944
sgd with momentum: batch size and minimal train MSE
250 0.21106404888831723
sgd with momentum: batch size and minimal train MSE
200 0.20212834558655748
sgd with momentum: batch size and minimal train MSE
50 0.2545491645558134
sgd with momentum: batch size and minimal train MSE
150 0.22077881929849288
sgd with momentum: batch size and minimal train MSE
100 0.25516452020942115
sgd with momentum: batch size and minimal train MSE
150 0.3135825761868059
sgd with momentum: batch size and minimal train MSE
150 0.2148554773105451
sgd with momentum: bat