In [1]:
import time
import numpy as np

# Seed NumPy's global RNG so that steps drawing from np.random (such as the
# shuffled train/test split performed later with np.random.shuffle) give the
# same result on every fresh run.
# NOTE(review): whether the lib.* modules also draw from this RNG (e.g. for
# weight initialisation) is not visible from this notebook -- confirm.
np.random.seed(0)

from lib.value import Value
from lib.linear_algebra import Vector, Matrix
from lib.nn import NN, Softmax, Linear
from lib.processing import OneHotEncoder, ColumnNormalizer
from lib.metrics.losses import negative_log_likelihood
from lib.gd_data_loaders import BatchDataLoader, StochasticDataLoader, MiniBatchDataLoader
from lib.optimizers import SgdOptimizer, SgdWithMomentumOptimizer, AdaGradOptimizer, RmsPropOptimizer, AdamOptimizer

In [2]:
# The Iris dataset was used in R.A. Fisher's classic 1936 paper, "The Use of
# Multiple Measurements in Taxonomic Problems", and can also be found on the
# UCI Machine Learning Repository. It includes three iris species with 50
# samples each as well as some properties about each flower. One flower
# species is linearly separable from the other two, but the other two are not
# linearly separable from each other.

# Parse the CSV by hand: every column except the last is a numeric feature,
# the last column is the species label.
data = []
labels = []
with open("data/iris.data", "rt") as f:
    for line in f:
        # Lines read from a file keep their trailing newline; strip it so the
        # label is the bare species name (otherwise a final line without a
        # newline would produce a different label string for the same class).
        line = line.strip()
        if not line:
            # Blank lines (some copies of the file end with one) carry no
            # sample and would otherwise add an empty feature row.
            continue
        *features, label = line.split(",")
        data.append([float(v) for v in features])
        labels.append(label)
# Features become a 2-D float array (one row per sample); labels stay a list
# of strings aligned with the rows of `data`.
data = np.array(data)

In [3]:
# Shuffle the sample indices once, then cut them at the 80% mark: the first
# part selects the training rows, the remainder the test rows.
indeces = list(range(len(data)))
np.random.shuffle(indeces)
split = int(len(data) * 0.8)
train_idx, test_idx = indeces[:split], indeces[split:]

# Features are wrapped in the project's Matrix type; labels stay plain lists
# of strings at this point.
X_train, X_test = Matrix(data[train_idx]), Matrix(data[test_idx])
y_train = [labels[idx] for idx in train_idx]
y_test = [labels[idx] for idx in test_idx]
(X_train.dims(), X_test.dims())

((120, 4), (30, 4))

In [4]:
# Fit the encoder on the complete label list (train and test together), then
# replace each partition's string labels with their encoded form.
ohe = OneHotEncoder()
ohe.fit(labels)
y_train, y_test = ohe.transform(y_train), ohe.transform(y_test)
(y_train.dims(), y_test.dims())

((120, 3), (30, 3))

In [5]:
# Fit the normalizer on the training features only, then apply that same
# fitted transform to both the training and the test features.
normalizer = ColumnNormalizer()
normalizer.fit(X_train)
X_train, X_test = normalizer.transform(X_train), normalizer.transform(X_test)
(X_train.dims(), X_test.dims())

((120, 4), (30, 4))

In [6]:
def init_nn(n_features=4, n_classes=3):
    """Build a new network: one ``Linear`` layer followed by ``Softmax``.

    Args:
        n_features: First argument passed to ``Linear``. Defaults to 4, the
            number of feature columns in this notebook's data.
        n_classes: Second argument passed to ``Linear``. Defaults to 3, the
            number of one-hot label columns in this notebook's data.

    Returns:
        A newly constructed ``NN``; every call creates fresh layer objects, so
        separate experiments do not share a model instance.
    """
    return NN([
        Linear(n_features, n_classes),
        Softmax(),
    ])

In [7]:
# Benchmark every (data loader, optimizer) combination on the same model
# architecture, logging the batch loss periodically and the full train/test
# loss at the end of each run.

# Number of training steps per configuration. range(N_ITERATIONS) ends at
# index 4000, which is a multiple of LOG_EVERY, so the last step is logged.
N_ITERATIONS = 4001
# A progress line is printed on every LOG_EVERY-th step (including step 0).
LOG_EVERY = 400

# NOTE(review): the same loader instances are reused for every optimizer. If a
# loader keeps internal iteration state, later runs continue from wherever the
# previous run stopped -- confirm against lib.gd_data_loaders.
data_loaders = [
    BatchDataLoader(X_train, y_train),
    StochasticDataLoader(X_train, y_train),
    MiniBatchDataLoader(X_train, y_train, 32)
]
# Factories rather than instances: each optimizer is constructed around the
# network it will train, and a fresh network is created for every run.
# NOTE(review): the numeric arguments are presumably the learning rate
# followed by momentum/decay coefficients -- confirm against lib.optimizers.
optimizer_creators = [
    lambda nn: SgdOptimizer(nn, 0.01),
    lambda nn: SgdWithMomentumOptimizer(nn, 0.01, 0.9),
    lambda nn: AdaGradOptimizer(nn, 0.1),
    lambda nn: RmsPropOptimizer(nn, 0.01, 0.95),
    lambda nn: AdamOptimizer(nn, 0.01, 0.95, 0.95),
]

for data_loader in data_loaders:
    for optimizer_creator in optimizer_creators:
        nn = init_nn()
        optimizer = optimizer_creator(nn)
        print(f"gradient descent: {data_loader.__class__} | optimizer: {optimizer.__class__}")

        # Restart the timer for every configuration so the first log line of a
        # run does not include time spent evaluating the previous run.
        # perf_counter is used because it is meant for measuring intervals.
        time_point = time.perf_counter()
        for i in range(N_ITERATIONS):
            X_b, y_b = data_loader.get_batch()
            out = nn(X_b)
            loss = negative_log_likelihood(y_b, out)

            if i % LOG_EVERY == 0:
                # The logged loss is the loss on the current batch, computed
                # before this step's optimizer update is applied.
                now = time.perf_counter()
                elapsed_time = int(now - time_point)
                time_point = now
                print(f"{i} | {loss.data:.2f} | {elapsed_time}s")

            optimizer.step(loss)

        # Final evaluation on the complete training and test sets. After the
        # loops finish, `nn`, `train_out` and `test_out` refer to the last
        # configuration that was trained.
        train_out = nn(X_train)
        train_loss = negative_log_likelihood(y_train, train_out)
        test_out = nn(X_test)
        test_loss = negative_log_likelihood(y_test, test_out)
        print(f"train loss: {train_loss.data:.2f}   test loss: {test_loss.data:.2f}")

gradient descent: <class 'lib.gd_data_loaders.BatchDataLoader'> | optimizer: <class 'lib.optimizers.SgdOptimizer'>
0 | 1.71 | 0s
400 | 0.50 | 71s
800 | 0.41 | 72s
1200 | 0.35 | 72s
1600 | 0.32 | 72s
2000 | 0.29 | 69s
2400 | 0.26 | 67s
2800 | 0.24 | 69s
3200 | 0.23 | 76s
3600 | 0.21 | 80s
4000 | 0.20 | 72s
train loss: 0.20   test loss: 0.18
gradient descent: <class 'lib.gd_data_loaders.BatchDataLoader'> | optimizer: <class 'lib.optimizers.SgdWithMomentumOptimizer'>
0 | 2.64 | 0s
400 | 0.55 | 93s
800 | 0.45 | 102s
1200 | 0.38 | 88s
1600 | 0.34 | 92s
2000 | 0.30 | 94s
2400 | 0.27 | 93s
2800 | 0.25 | 107s
3200 | 0.23 | 104s
3600 | 0.22 | 106s
4000 | 0.21 | 114s
train loss: 0.21   test loss: 0.19
gradient descent: <class 'lib.gd_data_loaders.BatchDataLoader'> | optimizer: <class 'lib.optimizers.AdaGradOptimizer'>
0 | 2.73 | 0s
400 | 0.23 | 113s
800 | 0.16 | 88s
1200 | 0.13 | 113s
1600 | 0.11 | 112s
2000 | 0.10 | 124s
2400 | 0.09 | 136s
2800 | 0.08 | 85s
3200 | 0.08 | 81s
3600 | 0.07 | 78s
4

In [8]:
# For every test sample, pair the network output in column 0 (rounded to one
# decimal place) with the target value in column 0.
# `test_out` is whatever the training cell assigned last, i.e. the test output
# of the final configuration that was trained.
class0_pairs = [
    (round(float(pred_row[0].data), 1), float(true_row[0].data))
    for pred_row, true_row in zip(test_out.values, y_test)
]
class0_pairs

[(1.0, 1.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (1.0, 1.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 1.0)]