In [1]:
# Selects which implementation the import cell pulls in:
# True -> lib.fast.*, False -> lib.original.*
USE_FAST = True

In [2]:
import time
import numpy as np

# Seed NumPy's global RNG so np.random calls in this notebook
# (e.g. the shuffle used for the train/test split) are repeatable across runs.
np.random.seed(0)

if USE_FAST:
    from lib.fast.value import Value
    from lib.fast.linear_algebra import Vector, Matrix
    from lib.fast.nn import NN, Softmax, Linear
    from lib.fast.processing import OneHotEncoder, ColumnNormalizer
else:
    from lib.original.value import Value
    from lib.original.linear_algebra import Vector, Matrix
    from lib.original.nn import NN, Softmax, Linear
    from lib.original.processing import OneHotEncoder, ColumnNormalizer
    
from lib.metrics.losses import negative_log_likelihood
from lib.gd_data_loaders import BatchDataLoader, StochasticDataLoader, MiniBatchDataLoader
from lib.optimizers import SgdOptimizer, SgdWithMomentumOptimizer, AdaGradOptimizer, RmsPropOptimizer, AdamOptimizer

In [3]:
# The Iris dataset was used in R.A. Fisher's classic 1936 paper, The Use of Multiple Measurements in Taxonomic Problems, and can also be found on the UCI Machine Learning Repository.
# It includes three iris species with 50 samples each as well as some properties about each flower. One flower species is linearly separable from the other two, but the other two are not linearly separable from each other.

# Parse data/iris.data: each non-empty row holds comma-separated numeric
# features followed by the class label in the last column.
data = []
labels = []
with open("data/iris.data", "rt") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. trailing newlines at the end of the file) would
            # otherwise add an empty feature row and a bogus "\n" label.
            continue
        *features, label = line.split(",")
        data.append([float(v) for v in features])
        # The label is stored without its line terminator, so the last row
        # yields the same class string whether or not the file ends in "\n".
        labels.append(label)
data = np.array(data)

In [4]:
# Shuffle the row indices once, then cut at the 80% mark: rows before the cut
# form the training set, the remaining rows form the test set.
indeces = [*range(len(data))]
np.random.shuffle(indeces)
split = int(len(data) * 0.8)
train_idx, test_idx = indeces[:split], indeces[split:]

# Labels stay plain Python lists here; features are wrapped in Matrix.
y_train = [labels[row] for row in train_idx]
y_test = [labels[row] for row in test_idx]
X_train = Matrix(data[train_idx])
X_test = Matrix(data[test_idx])
X_train.dims(), X_test.dims()

((120, 4), (30, 4))

In [5]:
# Fit the encoder on the full label list, then encode both splits with it.
# NOTE: y_train / y_test are overwritten, so re-running this cell on its own
# would pass the already-encoded values back into transform().
ohe = OneHotEncoder()
ohe.fit(labels)
y_train, y_test = [ohe.transform(part) for part in (y_train, y_test)]
y_train.dims(), y_test.dims()

((120, 3), (30, 3))

In [6]:
# Fit the normalizer on the training features only, then apply the same
# fitted transform to both the training and the test features.
# NOTE: X_train / X_test are overwritten, so re-running this cell on its own
# would fit on (and transform) already-normalized values.
normalizer = ColumnNormalizer()
normalizer.fit(X_train)
X_train, X_test = [normalizer.transform(part) for part in (X_train, X_test)]
X_train.dims(), X_test.dims()

((120, 4), (30, 4))

In [7]:
def init_nn():
    """Build and return a new NN made of Linear(4, 3) followed by Softmax."""
    layers = [Linear(4, 3), Softmax()]
    return NN(layers)

In [8]:
# Train one new network per (data loader, optimizer) pair and print the final
# train/test loss of each run so the optimizers can be compared.
NUM_STEPS = 4000  # last step index; the loop runs steps 0..NUM_STEPS so the final step is logged
LOG_EVERY = 400   # print the current batch loss every LOG_EVERY steps

data_loaders = [
    # Alternative loaders, kept as switches for experiments:
    # BatchDataLoader(X_train, y_train),
    # StochasticDataLoader(X_train, y_train),
    MiniBatchDataLoader(X_train, y_train, 4)  # presumably a batch size of 4 - confirm against the loader
]
# Factories instead of instances: every optimizer is constructed around the
# specific network it updates, and each run gets its own network.
optimizer_creators = [
    lambda nn: SgdOptimizer(nn, 0.01),
    lambda nn: SgdWithMomentumOptimizer(nn, 0.01, 0.9),
    lambda nn: AdaGradOptimizer(nn, 0.1),
    lambda nn: RmsPropOptimizer(nn, 0.01, 0.95),
    lambda nn: AdamOptimizer(nn, 0.01, 0.95, 0.95),
]

for data_loader in data_loaders:
    for optimizer_creator in optimizer_creators:
        nn = init_nn()
        optimizer = optimizer_creator(nn)
        print(f"gradient descent: {data_loader.__class__} | optimizer: {optimizer.__class__}")

        # Restart the timer for every run; otherwise the step-0 line of each
        # later run would report the tail and evaluation of the previous run.
        time_point = time.time()
        for i in range(NUM_STEPS + 1):
            X_b, y_b = data_loader.get_batch()
            out = nn(X_b)
            loss = negative_log_likelihood(y_b, out)

            if i % LOG_EVERY == 0:
                # Whole seconds elapsed since the previous log line of this run.
                elapsed_time = int(time.time() - time_point)
                time_point = time.time()
                print(f"{i} | {loss.data:.2f} | {elapsed_time}s")

            # The loss is logged before the step, so the printed value is the
            # loss of the parameters as they were when the batch was evaluated.
            optimizer.step(loss)

        # Final evaluation on the complete train and test sets.
        train_out = nn(X_train)
        train_loss = negative_log_likelihood(y_train, train_out)
        test_out = nn(X_test)
        test_loss = negative_log_likelihood(y_test, test_out)
        print(f"train loss: {train_loss.data:.2f}   test loss: {test_loss.data:.2f}")

gradient descent: <class 'lib.gd_data_loaders.MiniBatchDataLoader'> | optimizer: <class 'lib.optimizers.SgdOptimizer'>
0 | 1.49 | 0s
400 | 0.72 | 1s
800 | 0.06 | 1s
1200 | 0.05 | 1s
1600 | 0.30 | 1s
2000 | 0.34 | 1s
2400 | 0.25 | 1s
2800 | 0.47 | 1s
3200 | 0.15 | 1s
3600 | 0.17 | 1s
4000 | 0.08 | 1s
train loss: 0.20   test loss: 0.18
gradient descent: <class 'lib.gd_data_loaders.MiniBatchDataLoader'> | optimizer: <class 'lib.optimizers.SgdWithMomentumOptimizer'>
0 | 2.64 | 0s
400 | 1.12 | 1s
800 | 0.31 | 1s
1200 | 0.23 | 1s
1600 | 0.36 | 1s
2000 | 0.04 | 1s
2400 | 0.34 | 1s
2800 | 0.32 | 1s
3200 | 0.23 | 1s
3600 | 0.19 | 1s
4000 | 0.16 | 1s
train loss: 0.21   test loss: 0.19
gradient descent: <class 'lib.gd_data_loaders.MiniBatchDataLoader'> | optimizer: <class 'lib.optimizers.AdaGradOptimizer'>
0 | 1.90 | 0s
400 | 0.06 | 1s
800 | 0.22 | 1s
1200 | 0.41 | 1s
1600 | 0.29 | 1s
2000 | 0.05 | 1s
2400 | 0.08 | 1s
2800 | 0.13 | 1s
3200 | 0.23 | 1s
3600 | 0.39 | 1s
4000 | 0.19 | 1s
train loss:

In [9]:
# Ad-hoc inspection: X_b is the most recently drawn batch left behind by the
# training loop in the previous cell, so this cell only works after that cell ran.
X_b.all_values()

[{1.06, -3.85},
 {0.57, -3.2},
 {1.12, -18.28},
 {1.2, -8.61},
 {-0.51, 5.36},
 {1.49, -2.0},
 {-1.29, 2.99},
 {-1.33, -0.7},
 {-1.24, 9.84},
 {-0.12, -9.23},
 {-1.35, 5.37},
 {-1.46, 2.88},
 {-0.39, -5.21},
 {-1.49, 10.74},
 {0.03, 15.59},
 {-0.13, 3.35}]