In [17]:
import numpy as np
from numpy import asarray
from numpy import arange
from numpy.random import rand
from numpy.random import seed
import matplotlib.pyplot as plt
import seaborn as sns
from func_autograd import *
from sklearn.model_selection import train_test_split
import pandas as pd

def MSE(y_data, y_model):
    """Return the mean squared error between observed and predicted values.

    Both arguments must be NumPy arrays (their ``reshape`` method is used).
    Each is flattened into a single column before subtraction, so a 1-D
    array and an ``(n, 1)`` array of the same length are compared element by
    element instead of being broadcast against each other.

    The squared residuals are summed and divided by the number of elements
    in ``y_model``.
    """
    count = np.size(y_model)
    # Force both inputs into column vectors so the subtraction is elementwise.
    residual = y_data.reshape(-1, 1) - y_model.reshape(-1, 1)
    return np.sum(residual**2) / count


def generate_data(noise=True, step_size=0.05 , FrankesFunction=True):
    """Sample a test surface on a regular grid and return flat coordinate arrays.

    The grid is ``np.arange(0, 1, step_size)`` along both axes.

    Parameters
    ----------
    noise : bool
        Forwarded to the surface function as its ``noise`` argument.
    step_size : float
        Spacing between neighbouring grid points along each axis.
    FrankesFunction : bool
        If true the surface is ``FrankeFunction``, otherwise ``TestFunction``.

    Returns
    -------
    tuple
        ``(x, y, z)`` as flattened 1-D arrays of equal length.
    """
    axis_x = np.arange(0, 1, step_size)
    axis_y = np.arange(0, 1, step_size)
    grid_x, grid_y = np.meshgrid(axis_x, axis_y)

    # Choose the surface once, then evaluate it on the whole mesh.
    surface = FrankeFunction if FrankesFunction else TestFunction
    z = surface(grid_x, grid_y, noise=noise).flatten()

    return grid_x.flatten(), grid_y.flatten(), z

def TestFunction(x, y, noise=False):
    """Evaluate ``x**2 + y**2 + 2*x*y``, optionally with additive Gaussian noise.

    When ``noise`` is true, a sample from ``np.random.normal(0, 0.1)`` with
    the shape of ``x`` is added; ``x`` must then expose ``.shape``. When
    ``noise`` is false, plain scalars are accepted as well.
    """
    random_noise = np.random.normal(0, 0.1 , x.shape) if noise else 0
    return  x**2 + y**2 + 2*x*y + random_noise

def FrankeFunction(x, y, noise=False):
    """Evaluate the Franke test surface at ``(x, y)``.

    The surface is the sum of four exponential terms (three positive bumps
    and one negative dip). When ``noise`` is true, a sample from
    ``np.random.normal(0, 0.1)`` with the shape of ``x`` is added; ``x`` must
    then expose ``.shape``.
    """
    # The noise is drawn before the terms are evaluated, as in the original.
    random_noise = np.random.normal(0, 0.1 , x.shape) if noise else 0

    bump_a = 0.75*np.exp(-(0.25*(9*x-2)**2) - 0.25*((9*y-2)**2))
    bump_b = 0.75*np.exp(-((9*x+1)**2)/49.0 - 0.1*(9*y+1))
    bump_c = 0.5*np.exp(-(9*x-7)**2/4.0 - 0.25*((9*y-3)**2))
    dip = -0.2*np.exp(-(9*x-4)**2 - (9*y-7)**2)

    return bump_a + bump_b + bump_c + dip + random_noise

# Sample the surface with generate_data's defaults, spelled out for the reader.
x, y, z = generate_data(noise=True, step_size=0.05, FrankesFunction=True)

# Design matrix from create_X (imported via func_autograd); the third
# argument is presumably the polynomial degree -- TODO confirm.
X = create_X(x, y, 7)

# Split with scikit-learn's default settings. No random_state is passed, so
# the partition differs between runs.
X_train, X_test, z_train, z_test = train_test_split(X, z)

In [18]:
# Hyperparameter grids for the gradient-descent experiments.

# Number of times each search is repeated.
trials = 10

# Candidate values passed as `learning_rate`.
gamma = np.linspace(0.01, 0.13, num=7)
# Candidate values passed as `delta_momentum`.
delta = np.linspace(0.6, 1, num=7)
# Candidate values passed as `learning_rate_decay`.
eta = np.linspace(0.9, 0.999, num=5)
# Candidate mini-batch sizes: 10, 15, ..., 45.
batch_size = np.arange(10, 50, 5)

In [19]:
# Restrict the design matrix to columns 1 and 2 (presumably the linear x and y
# terms produced by create_X -- TODO confirm the column order).
# The shape guard makes this cell safe to re-run: without it a second run
# would slice the already-reduced matrix again and silently drop a feature.
if X_train.shape[1] > 2:
    X_train = X_train[:, 1:3]
if X_test.shape[1] > 2:
    X_test = X_test[:, 1:3]

# Vote counters: entry i counts the trials in which candidate i gave the
# lowest training MSE.
gd_gamma = np.zeros(len(gamma))
gd_mom_gamma = np.zeros(len(gamma))
gd_mom_delta = np.zeros(len(delta))

for t in range(trials):

    # plain gradient descent: scan the learning rate only
    train_score = np.zeros(len(gamma))
    for i in range(len(gamma)):
        model = GradientDescend(momentum=False, learning_rate=gamma[i])
        scores = model.fit(X_train, z_train, X_test, z_test)

        pred_train = model.predict(X_train)
        # Arguments follow MSE's (y_data, y_model) signature.
        train_score[i] = MSE(z_train, pred_train)

    # `min_mse` replaces the former `min`, which shadowed the builtin for
    # the rest of the kernel session.
    i_min, min_mse = train_score.argmin(), train_score.min()
    gd_gamma[i_min] += 1
    print("gd: learning rate and minimal train MSE")
    print(gamma[i_min], min_mse)

    # adding momentum: scan learning rate (axis 0) and momentum (axis 1)
    train_score = np.zeros((len(gamma), len(delta)))
    for j in range(len(delta)):
        for i in range(len(gamma)):
            model = GradientDescend(learning_rate=gamma[i], delta_momentum=delta[j])
            scores = model.fit(X_train, z_train, X_test, z_test)

            pred_train = model.predict(X_train)
            train_score[i, j] = MSE(z_train, pred_train)

    min_mse = train_score.min()
    # Convert the flat argmin into a (gamma index, delta index) pair.
    i_min = np.unravel_index(train_score.argmin(), train_score.shape)
    gd_mom_gamma[i_min[0]] += 1
    gd_mom_delta[i_min[1]] += 1
    print("gd with momentum: learning rate, momentum and minimal train MSE")
    print(gamma[i_min[0]], delta[i_min[1]], min_mse)


# Majority vote over all trials (argmax returns the first index on ties).
gd_gamma_opt = gamma[gd_gamma.argmax()]
gd_mom_gamma_opt = gamma[gd_mom_gamma.argmax()]
gd_mom_delta_opt = delta[gd_mom_delta.argmax()]

gd: learning rate and minimal train MSE
0.01 0.2863758610694571
gd with momentum: learning rate, momentum and minimal train MSE
0.03 0.8666666666666667 0.2120873032357589
gd: learning rate and minimal train MSE
0.09 0.22176743384087502
gd with momentum: learning rate, momentum and minimal train MSE
0.05 0.6666666666666666 0.19393262597249009
gd: learning rate and minimal train MSE
0.05 0.24385173298825374
gd with momentum: learning rate, momentum and minimal train MSE
0.01 0.9333333333333333 0.20514091483236238
gd: learning rate and minimal train MSE
0.11 0.19322159143780693
gd with momentum: learning rate, momentum and minimal train MSE
0.13 1.0 0.19313378686619076
gd: learning rate and minimal train MSE
0.09 0.3382354761410309
gd with momentum: learning rate, momentum and minimal train MSE
0.13 0.9333333333333333 0.19465612959365095
gd: learning rate and minimal train MSE
0.05 0.4127567597132159
gd with momentum: learning rate, momentum and minimal train MSE
0.11 0.9333333333333333 0

In [20]:
# parameters
# Number of repetitions of the SGD grid search in the next cell.

trials = 10

In [21]:
# minibatch sgd with momentum and learning schedule
# Vote counters: entry i counts the trials in which candidate i was part of
# the combination with the lowest training MSE.
sgd_mom_gamma = np.zeros(len(gamma))
sgd_mom_delta = np.zeros(len(delta))
sgd_mom_eta = np.zeros(len(eta))
sgd_mom_batchsize = np.zeros(len(batch_size))

for t in range(trials):
    # Axes: (learning rate, momentum, learning-rate decay, batch size).
    train_score = np.zeros((len(gamma), len(delta), len(eta), len(batch_size)))

    # np.ndindex walks the grid in the same order as four nested loops with
    # delta outermost, then gamma, eta and batch size innermost.
    for j, i, h, b in np.ndindex(len(delta), len(gamma), len(eta), len(batch_size)):
        model = GradientDescend(
            optimizer="sgd",
            method="gd",
            learning_rate=gamma[i],
            delta_momentum=delta[j],
            learning_rate_decay_flag=True,
            learning_rate_decay=eta[h],
            batch_size=batch_size[b],
        )
        scores = model.fit(X_train, z_train, X_test, z_test)

        pred_train = model.predict(X_train)
        # Arguments follow MSE's (y_data, y_model) signature.
        train_score[i, j, h, b] = MSE(z_train, pred_train)

    # `min_mse` replaces the former `min`, which shadowed the builtin for
    # the rest of the kernel session.
    min_mse = train_score.min()
    # Convert the flat argmin into one index per hyperparameter axis.
    i_min = np.unravel_index(train_score.argmin(), train_score.shape)
    sgd_mom_gamma[i_min[0]] += 1
    sgd_mom_delta[i_min[1]] += 1
    sgd_mom_eta[i_min[2]] += 1
    sgd_mom_batchsize[i_min[3]] += 1

    print("sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE")
    print(gamma[i_min[0]], delta[i_min[1]], eta[i_min[2]], min_mse)


sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.05 0.7333333333333333 0.9742500000000001 0.19182313305870666
sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.05 0.8666666666666667 0.9742500000000001 0.1918231330596769
sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.11 0.8 0.9742500000000001 0.19182313306013377
sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.11 0.7333333333333333 0.9742500000000001 0.19182313306095455
sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.11 0.9333333333333333 0.9742500000000001 0.19182313305904775
sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.03 0.9333333333333333 0.9742500000000001 0.19182313306017537
sgd with momentum: learning rate, momentum, learning rate decay and minimal train MSE
0.09 0.6 0.9742500000000001 0.191823

In [22]:
# Majority vote over the grid-search trials (argmax returns the first index
# on ties).
sgd_mom_gamma_opt = gamma[sgd_mom_gamma.argmax()]
sgd_mom_delta_opt = delta[sgd_mom_delta.argmax()]
sgd_mom_eta_opt = eta[sgd_mom_eta.argmax()]
sgd_mom_batchsize_opt = batch_size[sgd_mom_batchsize.argmax()]

# iterating over different batch sizes again, with the other hyperparameters
# fixed at the winners chosen above
# NOTE(review): the original remark said lower batch sizes are tested here
# because the smallest candidate won the grid search. `batch_size` is not
# redefined in this cell, so the same candidates are scanned again -- confirm
# whether a smaller range was intended.

# Reset the vote counter so only this second scan decides the batch size.
sgd_mom_batchsize = np.zeros(len(batch_size))

for t in range(trials):
    train_score = np.zeros(len(batch_size))
    for b in range(len(batch_size)):
        model = GradientDescend(
            optimizer="sgd",
            method="gd",
            learning_rate=sgd_mom_gamma_opt,
            delta_momentum=sgd_mom_delta_opt,
            learning_rate_decay_flag=True,
            learning_rate_decay=sgd_mom_eta_opt,
            batch_size=batch_size[b],
        )
        scores = model.fit(X_train, z_train, X_test, z_test)

        pred_train = model.predict(X_train)
        # Arguments follow MSE's (y_data, y_model) signature.
        train_score[b] = MSE(z_train, pred_train)

    # `min_mse` replaces the former `min`, which shadowed the builtin for
    # the rest of the kernel session.
    i_min, min_mse = train_score.argmin(), train_score.min()
    sgd_mom_batchsize[i_min] += 1

    print("sgd with momentum: batch size and minimal train MSE")
    print(batch_size[i_min], min_mse)


# Overrides the batch size picked from the grid search at the top of the cell.
sgd_mom_batchsize_opt = batch_size[sgd_mom_batchsize.argmax()]

In [23]:
# Settings for the Adam batch-size scan in the next cell.
# More repetitions than in the earlier searches.
trials = 100
# Candidate batch sizes: multiples of 50 strictly below the number of
# training rows.
batch_size = np.arange(50, X_train.shape[0], 50)

In [24]:
# Vote counter: entry b counts the trials in which batch_size[b] gave the
# lowest training MSE with the Adam update rule.
sgd_adam_batchsize = np.zeros(len(batch_size))

for t in range(trials):
    train_score = np.zeros(len(batch_size))
    for b in range(len(batch_size)):
        # The learning rate is reused from the SGD-with-momentum search.
        model = GradientDescend(
            optimizer="sgd",
            method="adam",
            learning_rate=sgd_mom_gamma_opt,
            batch_size=batch_size[b],
        )
        scores = model.fit(X_train, z_train, X_test, z_test)

        pred_train = model.predict(X_train)
        # Arguments follow MSE's (y_data, y_model) signature.
        train_score[b] = MSE(z_train, pred_train)

    # `min_mse` replaces the former `min`, which shadowed the builtin for
    # the rest of the kernel session.
    i_min, min_mse = train_score.argmin(), train_score.min()
    sgd_adam_batchsize[i_min] += 1

    # The label used to read "sgd with momentum" although this run uses Adam.
    print("sgd with adam: batch size and minimal train MSE")
    print(batch_size[i_min], min_mse)


# Majority vote over all trials (argmax returns the first index on ties).
sgd_adam_batchsize_opt = batch_size[sgd_adam_batchsize.argmax()]

sgd with momentum: batch size and minimal train MSE
150 0.2244975467003096
sgd with momentum: batch size and minimal train MSE
250 0.21330132671042373
sgd with momentum: batch size and minimal train MSE
100 0.3107925740275043
sgd with momentum: batch size and minimal train MSE
200 0.4034309187208384
sgd with momentum: batch size and minimal train MSE
150 0.20316900648946515
sgd with momentum: batch size and minimal train MSE
50 0.2549536895803511
sgd with momentum: batch size and minimal train MSE
200 0.22573655585188376
sgd with momentum: batch size and minimal train MSE
200 0.4096458113871427
sgd with momentum: batch size and minimal train MSE
200 0.2428556488878947
sgd with momentum: batch size and minimal train MSE
50 0.4350071577246852
sgd with momentum: batch size and minimal train MSE
200 0.2766847504487155
sgd with momentum: batch size and minimal train MSE
250 0.39538378569564986
sgd with momentum: batch size and minimal train MSE
150 0.23116178968447015
sgd with momentum: bat

In [25]:
# storing the optimal parameters
# IPython's %store magic persists each variable so that another notebook or
# kernel can restore it with `%store -r <name>`.
%store gd_gamma_opt
%store gd_mom_gamma_opt
%store sgd_mom_gamma_opt
%store gd_mom_delta_opt
%store sgd_mom_delta_opt
%store sgd_mom_eta_opt
%store sgd_mom_batchsize_opt
%store sgd_adam_batchsize_opt

Stored 'gd_gamma_opt' (float64)
Stored 'gd_mom_gamma_opt' (float64)
Stored 'sgd_mom_gamma_opt' (float64)
Stored 'gd_mom_delta_opt' (float64)
Stored 'sgd_mom_delta_opt' (float64)
Stored 'sgd_mom_eta_opt' (float64)
Stored 'sgd_mom_batchsize_opt' (int64)
Stored 'sgd_adam_batchsize_opt' (int64)
