In [1]:
from feed_forward_neural_network import FeedForwardRegressor, FeedForwardClassifier

In [2]:
# single seed passed as random_state to the data generators, splits and estimators below
seed = 0

# Simplest example

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import r2_score, accuracy_score
from sklearn.datasets import make_regression, make_classification

### Create data from a regression model

In [4]:
# Synthetic regression problem: 1000 samples, 20 features (10 informative).
X, y = make_regression(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    random_state=seed,
)
# Only 100 samples are used for training; the rest is held out for testing.
train_size = 100
X_train, X_test, y_train, y_test = tts(X, y, train_size=train_size, random_state=seed)

### A Create an instance of FeedForwardRegressor

In [5]:
# no hyper-parameter is passed besides the random state
reg = FeedForwardRegressor(random_state=seed)

### B Fit training set

In [6]:
# re-bind the name to fit's return value, which is used as the fitted estimator below
reg = reg.fit(X_train, y_train)

### C Predict test set

In [7]:
# predictions for the held-out test samples
y_pred = reg.predict(X_test)

### D Evaluate prediction

In [8]:
# R2 computed from the stored predictions, next to the estimator's own score on the same test set
print(r2_score(y_test, y_pred), reg.score(X_test, y_test))

0.9959374736003573 0.9959374736003573


### E Clean up model (risk of GPU memory leaks otherwise)

In [9]:
# delete the stored model weights first, then drop the Python reference
reg.delete_model_weights()  # else gpu memory leaks
del reg

## Same for binary classification

In [10]:
# Binary classification problem: 1000 samples, 20 features (10 informative).
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    random_state=seed,
)
train_size = 100
X_train, X_test, y_train, y_test = tts(X, y, train_size=train_size, random_state=seed)

# Learning
clf = FeedForwardClassifier(random_state=seed)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]  # keep the second column only
y_decision = clf.decision_function(X_test)
# y_proba = logistic_func(y_decision)
# y_pred = y_proba >= 0.5

# Labels rebuilt by thresholding the probabilities and the decision values.
labels_from_proba = (y_proba > 0.5).astype(float)
labels_from_decision = (y_decision > 0.).astype(float)

# Accuracy of each way of obtaining labels, followed by the estimator's own score.
print(accuracy_score(y_test, y_pred),
      accuracy_score(y_test, labels_from_proba),
      accuracy_score(y_test, labels_from_decision),
      clf.score(X_test, y_test))
clf.delete_model_weights()  # else gpu memory leaks
del clf

0.7366666666666667 0.7366666666666667 0.7366666666666667 0.7366666666666667


## Same for multiclass classification

In [11]:
# Five-class problem: 1000 samples, 20 features (10 informative).
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_classes=5,
    random_state=seed,
)
train_size = 100
X_train, X_test, y_train, y_test = tts(X, y, train_size=train_size, random_state=seed)

# Learning
clf = FeedForwardClassifier(random_state=seed)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)
y_decision = clf.decision_function(X_test)
# y_proba = softmax(y_decision)
# y_pred = argmax(y_proba)

# Labels rebuilt from the per-class probabilities and decision values.
labels_from_proba = np.argmax(y_proba, axis=1)
labels_from_decision = np.argmax(y_decision, axis=1)

# Accuracy of each way of obtaining labels, followed by the estimator's own score.
print(accuracy_score(y_test, y_pred),
      accuracy_score(y_test, labels_from_proba),
      accuracy_score(y_test, labels_from_decision),
      clf.score(X_test, y_test))
clf.delete_model_weights()  # else gpu memory leaks
del clf

0.5533333333333333 0.5533333333333333 0.5533333333333333 0.5533333333333333


# Compare with Random Forests on a real-world dataset

In [12]:
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor as RF

### Load California Housing

In [13]:
X, y = fetch_california_housing(return_X_y=True)
# standardize data (the statistics are computed on the full dataset, before the split)
X = StandardScaler().fit_transform(X)
y = (y - y.mean()) / y.std()
# 80% of the samples go to the training set
train_samples_ratio = 0.8
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    train_size=train_samples_ratio,
    random_state=seed,
)

### Train and evaluate a Random Forest

In [14]:
# Baseline: random forest with only the random state set.
reg = RF(random_state=seed).fit(X_train, y_train)
print("RF", "R2-score :", reg.score(X_test, y_test))
del reg

RF R2-score : 0.7982270387164677


### Train and evaluate a FFNN with default parameters

In [15]:
# same train/test split as the random forest above, no hyper-parameter besides the random state
reg = FeedForwardRegressor(random_state=seed)
reg.fit(X_train, y_train)
print("FFNN", "R2-score :", reg.score(X_test, y_test))
reg.delete_model_weights()  # else gpu memory leaks
del reg

FFNN R2-score : 0.7985995275549246


# Try other architectures and hyper-parameters

### Use dictionaries of parameters provided in parameters_examples.py

In [16]:
# Each preset is a dictionary of keyword arguments for the estimators;
# see parameters_examples.py for details and other examples
from parameters_examples import mlp, glu, snn, mlpbatch
# mlp: quick results on small datasets
print("mlp", ":", mlp, "\n")

# glu: performances on small datasets for classification (default parameters)
print("glu", ":", glu, "\n")

# snn: performances on small datasets for regression (default parameters)
print("snn", ":", snn, "\n")

# mlpbatch: performances on medium to large datasets
print("mlpbatch", ":", mlpbatch)

mlp : {'lr_scheduler': 'OneCycleLR', 'lr_scheduler_params': {'max_lr': 0.01, 'total_steps': 200}, 'max_iter': 200, 'learning_rate': 0.001, 'hidden_nn': <class 'architectures.DenseLayers'>, 'hidden_params': {'width': 512, 'depth': 2, 'dropout': 0.2, 'batch_norm': True}} 

glu : {'lr_scheduler': 'OneCycleLR', 'lr_scheduler_params': {'max_lr': 0.01, 'total_steps': 500}, 'max_iter': 500, 'learning_rate': 0.001, 'hidden_nn': <class 'architectures.GLULayers'>, 'hidden_params': {'width': 512, 'depth': 3, 'dropout': 0.2, 'batch_norm': True}} 

snn : {'lr_scheduler': 'OneCycleLR', 'lr_scheduler_params': {'max_lr': 0.01, 'total_steps': 500}, 'max_iter': 500, 'learning_rate': 0.001, 'hidden_nn': <class 'architectures.DenseLayers'>, 'hidden_params': {'width': 512, 'depth': 2, 'activation': 'SELU', 'initializer_params': {'gain_type': 'linear'}}} 

mlpbatch : {'lr_scheduler': 'OneCycleLR', 'lr_scheduler_params': {'max_lr': 0.01, 'total_steps': 200}, 'max_iter': 200, 'epochs': True, 'max_runtime': 36

### Pick one

In [17]:
import copy

# mlpbatch is more appropriate for medium to large datasets, see parameters_examples.py for details
settings = "mlpbatch"
# Look the preset up by name explicitly instead of eval()-ing the string, and
# deep-copy it: data-specific keys are added to kwargs["hidden_params"] further
# down, which would otherwise modify the dictionary imported from
# parameters_examples for every later user of that preset.
presets = {"mlp": mlp, "glu": glu, "snn": snn, "mlpbatch": mlpbatch}
kwargs = copy.deepcopy(presets[settings])
print(kwargs)

{'lr_scheduler': 'OneCycleLR', 'lr_scheduler_params': {'max_lr': 0.01, 'total_steps': 200}, 'max_iter': 200, 'epochs': True, 'max_runtime': 3600, 'learning_rate': 0.001, 'hidden_nn': <class 'architectures.DenseLayers'>, 'hidden_params': {'width': 512, 'depth': 2, 'dropout': 0.2, 'batch_norm': True}}


### Add data-specific information (number of features, number of neurons on output layer)

In [18]:
multiclass = False
# number of neurons on output layer: one per class when multiclass, else a single one
if multiclass:
    output = len(set(y_train))
else:
    output = 1
# the architecture parameters also need the input dimension
kwargs["hidden_params"]["n_features"] = X_train.shape[1]
kwargs["hidden_params"]["output"] = output

### Pass the parameters during the initialization of the regressor

In [19]:
# the chosen parameter dictionary is unpacked into keyword arguments
reg = FeedForwardRegressor(random_state=seed, **kwargs)

### Train and evaluate

In [20]:
reg.fit(X_train, y_train)
# `settings` holds the name of the preset in use, so it labels the score
print(settings, "R2-score :", reg.score(X_test, y_test))
reg.delete_model_weights()  # else gpu memory leaks
del reg

mlpbatch R2-score : 0.811336902284114


# Perform meta-learning.

### Train n = 10 models with different seeds.

In [21]:
import copy

n_learners = 10

settings = "mlpbatch"
# Explicit name lookup instead of eval(); the preset is deep-copied so that the
# update below does not modify the dictionary imported from parameters_examples.
presets = {"mlp": mlp, "glu": glu, "snn": snn, "mlpbatch": mlpbatch}
kwargs = copy.deepcopy(presets[settings])
multiclass = False
# number of neurons on output layer
output = len(set(y_train)) if multiclass else 1
kwargs["hidden_params"].update(
    {"n_features": X_train.shape[1], "output": output})

# one row of test predictions per learner
predictions = np.zeros((n_learners, len(X_test)))
# learner index -> training record of that learner
records = {}
for i in range(n_learners):
    # only the random state changes from one learner to the next
    reg = FeedForwardRegressor(random_state=i, **kwargs)
    reg.fit(X_train, y_train)
    records[i] = reg.record
    predictions[i] = reg.predict(X_test).reshape(-1)
    reg.delete_model_weights()  # else gpu memory leaks
    del reg

### Perform model ensembling

In [22]:
# Average the learners' predictions sample by sample (rows are learners).
ensemble_prediction = np.mean(predictions, axis=0)
ensemble_r2 = r2_score(y_test, ensemble_prediction)
print("Ensemble", "R2-score :", ensemble_r2)

Ensemble R2-score : 0.8218575205416627


### Perform model selection (seed picking)

In [23]:
# Highest validation value recorded by each learner during training.
best_validation = [np.max(records[i]["validation"]) for i in range(n_learners)]
# Index of the learner whose best validation value is the largest.
best_seed = np.argmax(best_validation)
selected_prediction = predictions[best_seed]
selected_r2 = r2_score(y_test, selected_prediction)
print("Selection", "R2-score :", selected_r2)

Selection R2-score : 0.8143335868608732


# Use your own architecture on a dataset of tensor observations (eg: MNIST)

### Define a pytorch module
You can take inspiration from the architectures.py file. Here we will just copy [the pytorch example.](https://github.com/pytorch/examples/blob/master/mnist/main.py)

In [24]:
import torch
# Import the submodule explicitly: a plain `import importlib` does not
# guarantee that `importlib.util` is available as an attribute.
import importlib.util

# Use the GPU only when the torch.cuda module can be found and reports a device.
if importlib.util.find_spec('torch.cuda'):
    device = "cuda" if torch.cuda.is_available() else "cpu"
else:
    device = "cpu"


class BasicConvNet(torch.nn.Module):
    """Small convolutional network taking single-channel 28x28 inputs.

    Two 3x3 convolutions (32 then 64 channels), each followed by a ReLU,
    then 2x2 max-pooling, dropout, and two linear layers (9216 -> 128 -> 10).
    The 9216 input width of the first linear layer (64 * 12 * 12) fixes the
    spatial input size to 28x28.

    Parameters
    ----------
    device : str, optional
        Device on which the convolution and linear layers are created.
        Defaults to the module-level ``device`` as it was when the class
        was defined.
    """

    def __init__(self, device=device):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(1, 32, 3, 1, device=device)
        self.conv2 = torch.nn.Conv2d(32, 64, 3, 1, device=device)
        self.dropout1 = torch.nn.Dropout(0.25)
        self.dropout2 = torch.nn.Dropout(0.5)
        self.fc1 = torch.nn.Linear(9216, 128, device=device)
        self.fc2 = torch.nn.Linear(128, 10, device=device)

    def forward(self, x):
        """Return the 10 raw outputs of the last linear layer for each input.

        ``x`` is flattened from dimension 1 onwards before the linear
        layers, so dimension 0 is kept as the batch dimension.
        """
        x = self.conv1(x)
        x = torch.nn.functional.relu(x)
        x = self.conv2(x)
        x = torch.nn.functional.relu(x)
        x = torch.nn.functional.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = torch.nn.functional.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        return x

### Load MNIST

In [25]:
from sklearn.datasets import fetch_openml
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
train_samples_ratio = 0.8
# Stratified split, seeded like the earlier splits in this notebook so that
# the accuracy reported below can be reproduced.
X_train, X_test, y_train, y_test = tts(
    X, y, train_size=train_samples_ratio, stratify=y, random_state=seed)
# each flat row of 784 values becomes a 1 x 28 x 28 array
X_train, X_test = X_train.reshape(
    (-1, 1, 28, 28)), X_test.reshape((-1, 1, 28, 28))  # reshape as a tensor

### Pick training hyper-parameters
We will just copy [the pytorch documentation example.](https://github.com/pytorch/examples/blob/master/mnist/main.py)

In [26]:
# Training configuration for the MNIST example.
mnist_batch_size = 64
mnist_epochs = 14
kwargs = dict(
    optimizer="Adadelta",
    validation_fraction=False,
    early_stopping_criterion=False,
    learning_rate=1.0,
    optimizer_params={},
    lr_scheduler="StepLR",
    lr_scheduler_params=dict(step_size=1, gamma=0.7),
    batch_size=mnist_batch_size,
    # max_iter counts iterations: 14 epochs with batchsize 64
    max_iter=mnist_epochs * (len(y_train) // mnist_batch_size),
    hidden_nn=BasicConvNet,
    hidden_params={},
)

### Fit, predict and evaluate

In [27]:
# custom architecture and training parameters are passed through kwargs
clf = FeedForwardClassifier(random_state=seed, **kwargs)
clf.fit(X_train, y_train)
# score on the held-out test images
print(clf.score(X_test, y_test))
clf.delete_model_weights()  # else gpu memory leaks
del clf

0.9882142857142857


# Play with all the training parameters

In [28]:
# Exhaustive parameter list:
# (the block below is guarded by `if False` and never runs; it only lists the
# parameter names together with a short description of each)
if False:
    if False:
        # architecture of hidden layers, torch.nn.Module (if False, uses default mlp architecture).
        hidden_nn,
        hidden_params,  # hyper-parameters for hidden_nn initialization

        # learning parameters
        default_loss_function,  # set by the subclass
        optimizer,  # a torch optimizer, str
        learning_rate,  # float
        optimizer_params,  # except learning rate, dict
        lr_scheduler,  # a torch optimizer.lr_scheduler, str
        lr_scheduler_params,  # dict
        batch_size,  # if None or False: full batch, if int number of samples, if float share of samples

        # convergence parameters
        max_iter,  # number of iterations, not epochs (unless epochs is True, see next line), int
        epochs,  # if True, max_iter => max_iter * train_size / batch_size, bool
        max_runtime,  # imprecise, float or int
        # if None or False: no validation, if int number of samples, if float share of samples
        validation_fraction,
        should_stratify,  # validation split strategy, bool
        early_stopping_criterion,  # either "loss" or "validation", str
        convergence_tol,  # if None or False: always max_iter, else float
        divergence_tol,  # if None or False: always max_iter, else float

        # AdaCap Parameters
        # Tikhonov operator specific parameters
        adacap,
        # if None or False: regular FFNN, if int or float lambda initial value, if "max_variation" or "min_value" grid-search
        closeform_parameter_init,
        closeform_intercept,  # add unitary feature to covar matrix, bool

        # MuddlingLabelRegularization specific parameters
        n_permut,  # if int number of permutations, if None or False no permutations
        permutation_scale,  # weight of permutation term added to the loss, float

        # MLR additional regularization techniques
        dithering_scale,  # if float dithering white noise standard-deviation, if None or False no gaussian dithering
        # if float dithering structured noise standard-deviation, if None or False no structured noise dithering
        target_rotation_scale,

        # Target handling
        # center target around mean (behaves differently for binary clf), bool
        center_target,
        rescale_target,  # divide target by std before fitting, bool
        loss_imbalance,  # smaller weights on majority classes, bool

        random_state,  # scikit-learn random state, will also set torch generator using a different seed
        # delete validation samples, optimizer and lr scheduler after training
        release_train_memory,
        # repository in which temporary models will be saved (see PATH and REP)
        save_repository,
        verbose  # if False mute, if True print at each iteration, if int print if iter%verbose == 0

# Analysis

In [29]:
X, y = fetch_california_housing(return_X_y=True)
X, y = StandardScaler().fit_transform(X), (y - y.mean()) / \
    y.std()  # standardize data
train_samples_ratio = 0.8
# Seed the split like the earlier ones in this notebook, so that the training
# log and the objects inspected below can be reproduced.
X_train, X_test, y_train, y_test = tts(
    X, y, train_size=train_samples_ratio, random_state=seed)

### Print metrics values every 50 iterations

In [30]:
# release_train_memory=False: do not delete validation samples, optimizer and lr scheduler after training
# verbose=50: print the metrics when the iteration number is a multiple of 50
reg = FeedForwardRegressor(random_state=seed, release_train_memory=False, verbose=50)
reg = reg.fit(X_train, y_train)

|     iter |     loss |     time | validati |   lambda |      mlr |
|        0 | 8.01e-01 | 8.32e-03 | 3.16e-01 | 2.85e+04 | 1.95e-02 |
|       50 | 5.98e-01 | 1.45e-02 | 6.44e-01 | 2.77e+04 | 2.42e-02 |
|      100 | 5.10e-01 | 1.44e-02 | 7.31e-01 | 2.58e+04 | 2.13e-02 |
|      150 | 4.56e-01 | 1.45e-02 | 7.67e-01 | 2.10e+04 | 2.14e-02 |
|      200 | 4.47e-01 | 1.23e-02 | 7.79e-01 | 1.65e+04 | 1.81e-02 |
|      250 | 4.25e-01 | 1.26e-02 | 7.82e-01 | 1.24e+04 | 1.56e-02 |
|      300 | 4.19e-01 | 1.24e-02 | 7.89e-01 | 9.30e+03 | 1.66e-02 |
|      350 | 3.99e-01 | 1.24e-02 | 7.89e-01 | 7.39e+03 | 1.09e-02 |
|      400 | 3.93e-01 | 1.26e-02 | 7.91e-01 | 6.41e+03 | 9.52e-03 |
|      450 | 3.77e-01 | 1.26e-02 | 7.90e-01 | 6.05e+03 | 7.50e-03 |
|      500 | 3.77e-01 | 1.24e-02 | 7.68e-01 | 6.00e+03 | 9.24e-03 |


### Save learning dynamic

In [31]:
# training history kept on the estimator; it is accessed like a dictionary
record = reg.record
print(record.keys())  # you can then plot these with matplotlib

dict_keys(['loss', 'time', 'validation', 'lambda', 'mlr'])


### Hidden layers

In [32]:
# printing the hidden layers shows the module structure
hidden_layers = reg.hidden_layers
print(hidden_layers)
# drop the extra name; the estimator itself is still referenced by `reg`
del hidden_layers

DefaultDenseLayers(
  (model): Sequential(
    (0): Linear(in_features=8, out_features=512, bias=True)
    (1): SELU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): SELU()
  )
)


### Optimizer and Scheduler

In [33]:
# NOTE(review): presumably still available only because release_train_memory=False was used — confirm
optimizer = reg.optimizer_instance
print(optimizer)
# NOTE(review): cst_lr presumably flags a constant learning rate (no scheduler) — confirm
if not reg.cst_lr:
    lr_scheduler = reg.lr_scheduler_instance
    print(lr_scheduler)

Adam (
Parameter Group 0
    amsgrad: False
    base_momentum: 0.85
    betas: (0.9499979858085436, 0.999)
    eps: 1e-08
    initial_lr: 0.0004
    lr: 2.414183399679499e-07
    max_lr: 0.01
    max_momentum: 0.95
    min_lr: 4e-08
    weight_decay: 0
)
<torch.optim.lr_scheduler.OneCycleLR object at 0x7f01b02956a0>


### Re-use a network copy elsewhere

In [34]:
import torch
# location of the file written by the estimator;
# this file will be deleted with reg.delete_model_weights()
storage_path = reg.save_repository + reg.PATH
network_copy = torch.load(storage_path)
# write a copy to a separate path, so it is not the file delete_model_weights() removes
new_storage_path = "./saved_model.pt"
torch.save(network_copy, new_storage_path)

### Clean up

In [35]:
# presumably needed here because the estimator was created with release_train_memory=False — TODO confirm
reg._release_train_memory()
reg.delete_model_weights()  # else gpu memory leaks
del reg

In [36]:
import os
# remove the file at new_storage_path, which is not handled by delete_model_weights()
os.remove(new_storage_path)

# Remarks on batch-learning.
There are two ways in which the default behavior of the FeedForwardNeuralNetwork class differs from the standard approach in deep learning. Note however that in both cases changing one parameter is enough to fall back to the most classical setting:

- The FeedForwardNeuralNetwork class can perform batch-learning or stochastic gradient descent but by default, it tries to avoid doing so (which is most often the correct strategy for small tabular datasets). This corresponds to the parameter "batch\_size" which by default is equal to "False", meaning no batch-learning. However, to avoid GPU memory errors, some caps on the maximum size of weights and activations matrices are hard-coded: the width and height are at most 4096. This means that even if the parameter "batch\_size" is equal to False, batch-learning will be used if the training set (after subtracting the validation set, if any) is larger than 4096. Mini-batches of size smaller than 4096 can still be used by setting "batch\_size" to an int. Following the scikit-learn convention, you can also set "batch\_size" to a float between 0. and 1.; in that case the mini-batches will be of size int("batch\_size" * n\_samples).


- The cap on the maximum number of iterations (parameter "max_iter") corresponds by default to the maximum number of gradient updates, not epochs. This is set by the parameter "epochs" which by default is equal to "False", meaning the parameter "max\_iter" corresponds to gradient updates not number of epochs, which are different when using stochastic gradient descent or batch-learning. The default value enforces a soft constraint on the maximum training time with respect to the training set size, as at most, the gradients will be computed for max_iter times 4096 observations. It is possible nonetheless to set the maximum number of epochs instead, by setting the parameter "epochs" as True.

Both choices were set to minimize the risk that the user accidentally launches a task which will lead to a GPU memory error or take years to complete.