In [1]:
backend = "pt_backend"

In [2]:
import time
import numpy as np

np.random.seed(0)

match backend:
    case "original_backend":
        from lib.original_backend.linear_algebra import Matrix
        from lib.original_backend.nn import  NN, ReLU, Linear
        from lib.original_backend.processing import ColumnNormalizer
        from lib.optimizers import SgdOptimizer, SgdWithMomentumOptimizer, AdaGradOptimizer, RmsPropOptimizer, AdamOptimizer

    case "np_backend":
        from lib.np_backend.linear_algebra import Matrix
        from lib.np_backend.nn import NN, ReLU, Linear
        from lib.np_backend.processing import ColumnNormalizer
        from lib.optimizers import SgdOptimizer, SgdWithMomentumOptimizer, AdaGradOptimizer, RmsPropOptimizer, AdamOptimizer

    case "pt_backend":
        from lib.pt_backend.linear_algebra import Matrix
        from lib.pt_backend.nn import NN, ReLU, Linear
        from lib.pt_backend.processing import ColumnNormalizer
        from lib.pt_backend.optimizers import SgdOptimizer, SgdWithMomentumOptimizer, AdaGradOptimizer, RmsPropOptimizer, AdamOptimizer

from lib.metrics.losses import mean_squared_error
from lib.gd_data_loaders import BatchDataLoader, StochasticDataLoader, MiniBatchDataLoader

In [3]:
# Boston House Price Dataset
# The Boston House Price Dataset involves the prediction of a house price in thousands of dollars given details of the house and its neighborhood.
# It is a regression problem. There are 506 observations with 13 input variables and 1 output variable. The variable names are as follows:

data = []
with open("data/boston_house_prices.data", "rt") as f:
    for line in f.readlines():
        data.append([float(v) for v in line.split()])
data = np.array(data)

In [4]:
np.random.shuffle(data)
split = int(len(data) * 0.8)
train_set = data[:split]
test_set = data[split:]

X_train, y_train = train_set[:, :-1], train_set[:, -1:]
X_test, y_test = test_set[:, :-1], test_set[:, -1:]
X_train = Matrix(X_train)
y_train = Matrix(y_train)
X_test = Matrix(X_test)
y_test = Matrix(y_test)
X_train.dims(), y_train.dims(), X_test.dims(), y_test.dims()

(torch.Size([404, 13]),
 torch.Size([404, 1]),
 torch.Size([102, 13]),
 torch.Size([102, 1]))

In [5]:
normalizer = ColumnNormalizer()
normalizer.fit(X_train)
X_train = normalizer.transform(X_train)
X_test = normalizer.transform(X_test)
X_train.dims(), X_test.dims()

(torch.Size([404, 13]), torch.Size([102, 13]))

In [6]:
def init_nn():
    return NN([
        Linear(13, 4),
        ReLU(),
        Linear(4, 1),
    ])

In [7]:
time_point = time.time()

data_loaders = [
    # BatchDataLoader(X_train, y_train),
    # StochasticDataLoader(X_train, y_train),
    MiniBatchDataLoader(X_train, y_train, 32)
]
optimizer_creators = [
    # lambda nn: SgdOptimizer(nn, 0.001),
    # lambda nn: SgdWithMomentumOptimizer(nn, 0.001, 0.9),
    lambda nn: AdaGradOptimizer(nn, 0.05),
    lambda nn: RmsPropOptimizer(nn, 0.01, 0.95),
    lambda nn: AdamOptimizer(nn, 0.01, 0.95, 0.95),
]

for data_loader in data_loaders:
    for optimizer_creator in optimizer_creators:
        nn = init_nn()
        optimizer = optimizer_creator(nn)
        print(f"gradient descent: {data_loader.__class__} | optimizer: {optimizer.__class__}")
        for i in range(2001):
            X_b, y_b = data_loader.get_batch()
            out = nn(X_b)
            loss = mean_squared_error(y_b, out)
            if i % 100 == 0:
                elapsed_time = int(time.time() - time_point)
                time_point = time.time()
                print(f"{i} | {loss.data:.2f} | {elapsed_time}s") 

            optimizer.step(loss)

        train_out = nn(X_train) 
        train_loss = mean_squared_error(y_train, train_out)
        test_out = nn(X_test) 
        test_loss = mean_squared_error(y_test, test_out)
        print(f"train loss: {train_loss.data:.2f}   test loss: {test_loss.data:.2f}") 

gradient descent: <class 'lib.gd_data_loaders.MiniBatchDataLoader'> | optimizer: <class 'lib.pt_backend.optimizers.AdaGradOptimizer'>
0 | 595.12 | 0s
100 | 87.60 | 0s
200 | 34.37 | 0s
300 | 20.49 | 0s
400 | 22.19 | 0s
500 | 42.83 | 0s
600 | 13.46 | 0s
700 | 28.37 | 0s
800 | 23.08 | 0s
900 | 34.97 | 0s
1000 | 25.05 | 0s
1100 | 22.15 | 0s
1200 | 14.42 | 0s
1300 | 17.76 | 0s
1400 | 14.82 | 0s
1500 | 15.67 | 0s
1600 | 16.74 | 0s
1700 | 30.43 | 0s
1800 | 14.51 | 0s
1900 | 24.72 | 0s
2000 | 29.81 | 0s
train loss: 19.99   test loss: 22.06
gradient descent: <class 'lib.gd_data_loaders.MiniBatchDataLoader'> | optimizer: <class 'lib.pt_backend.optimizers.RmsPropOptimizer'>
0 | 571.61 | 0s
100 | 59.30 | 0s
200 | 59.27 | 0s
300 | 10.24 | 0s
400 | 14.48 | 0s
500 | 14.31 | 0s
600 | 14.91 | 0s
700 | 11.01 | 0s
800 | 31.01 | 0s
900 | 20.74 | 0s
1000 | 13.60 | 0s
1100 | 17.26 | 0s
1200 | 17.99 | 0s
1300 | 8.14 | 0s
1400 | 18.23 | 0s
1500 | 19.44 | 0s
1600 | 12.64 | 0s
1700 | 11.24 | 0s
1800 | 11.59 | 0

In [8]:
[(round(float(v1),1), float(v2)) for v1, v2 in zip([v[0].data for v in out], [v[0].data for v in y_test])][:10]

[(15.8, 20.700000762939453),
 (14.5, 39.79999923706055),
 (9.7, 17.799999237060547),
 (23.5, 19.600000381469727),
 (16.7, 14.899999618530273),
 (9.2, 22.0),
 (16.4, 48.79999923706055),
 (40.7, 25.0),
 (16.4, 48.5),
 (17.5, 23.899999618530273)]

In [9]:
gradient descent: <class 'lib.gd_data_loaders.MiniBatchDataLoader'> | optimizer: <class 'lib.optimizers.AdaGradOptimizer'>
0 | 580.47 | 0s
100 | 205.61 | 11s
200 | 35.91 | 13s
300 | 31.69 | 11s
400 | 43.72 | 11s
500 | 78.20 | 12s
600 | 17.42 | 11s
700 | 38.73 | 11s
800 | 38.90 | 11s
900 | 9.81 | 11s
1000 | 18.67 | 12s
1100 | 20.35 | 13s
1200 | 11.91 | 11s
1300 | 9.77 | 13s
1400 | 14.41 | 12s
1500 | 18.66 | 15s
1600 | 16.21 | 12s
1700 | 11.52 | 13s
1800 | 9.95 | 12s
1900 | 12.19 | 11s
2000 | 15.00 | 11s
train loss: 20.91   test loss: 20.94
gradient descent: <class 'lib.gd_data_loaders.MiniBatchDataLoader'> | optimizer: <class 'lib.optimizers.RmsPropOptimizer'>
0 | 451.70 | 1s
100 | 96.17 | 15s
200 | 67.43 | 14s
300 | 14.20 | 15s
400 | 29.44 | 15s
500 | 8.97 | 15s
600 | 7.00 | 15s
700 | 22.93 | 16s
800 | 20.84 | 14s
900 | 29.82 | 14s
1000 | 8.39 | 16s
1100 | 10.35 | 15s
1200 | 15.34 | 15s
1300 | 7.63 | 16s
1400 | 9.06 | 15s
1500 | 6.94 | 15s
1600 | 13.59 | 15s
1700 | 16.01 | 16s
1800 | 7.27 | 15s
1900 | 10.61 | 15s
2000 | 14.75 | 15s
train loss: 11.29   test loss: 9.17
gradient descent: <class 'lib.gd_data_loaders.MiniBatchDataLoader'> | optimizer: <class 'lib.optimizers.AdamOptimizer'>
0 | 749.99 | 2s
100 | 131.75 | 15s
200 | 26.44 | 14s
300 | 47.31 | 15s
400 | 14.10 | 15s
500 | 16.89 | 14s
600 | 11.65 | 15s
700 | 30.11 | 15s
800 | 7.45 | 16s
900 | 7.41 | 16s
1000 | 9.48 | 15s
1100 | 7.70 | 16s
1200 | 11.16 | 16s
1300 | 7.07 | 15s
1400 | 8.96 | 16s
1500 | 11.53 | 16s
1600 | 6.08 | 16s
1700 | 9.46 | 15s
1800 | 23.97 | 16s
1900 | 7.37 | 15s
2000 | 8.11 | 15s
train loss: 12.47   test loss: 10.80

SyntaxError: invalid decimal literal (3551842227.py, line 2)