In [23]:
import numpy as np
import pandas as pd
from main import MLP_Classifier,Layer
from sklearn.datasets import make_classification
import optuna
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.model_selection import train_test_split

## MLP for multiclass classification (Python 3.13.11)

Before diving into images, let's see how our simple neural network performs on tabular data.

In [35]:
np.random.seed(42)

# Synthetic tabular dataset: 6000 samples, 4 features, 3 classes.
# make_classification is not given a random_state, so it draws from the
# global NumPy RNG seeded just above.
X, Y = make_classification(
    n_samples=6000,
    n_features=4,
    n_redundant=0,
    n_clusters_per_class=1,
    flip_y=0.1,     # label noise
    class_sep=1.0,  # class separation
    n_classes=3,    # number of classes
)

# Extra Gaussian jitter (mean 0, std 0.5) on top of the generated features.
noise = np.random.normal(0, 0.5, X.shape)
X = pd.DataFrame(X + noise)

# Targets: a single column when the problem is binary, otherwise one
# 0/1 indicator column per class.
Y = pd.Series(Y)
Y = pd.DataFrame(Y) if Y.nunique() == 2 else pd.get_dummies(Y).astype(int)


Define the objective function to optimise: minimise the cross-entropy (in the PDF this is stated as maximising the log-likelihood).

For example, we optimise over the batch size, the learning rate and the dropout rate (among the most important hyperparameters of a neural network).

We could also tune the layers themselves, but that is computationally expensive for large datasets.


In [39]:



def objective(trial):
    """Optuna objective: train a 3-hidden-layer MLP and return its loss.

    Three hyperparameters are sampled from ``trial``: ``batch_size``
    (int in [500, 800]), ``alpha`` (float in [0.01, 0.1]) and ``dropout``
    (float in [0.5, 0.9], used as the dropout setting of the last hidden
    layer). Everything else about the network is fixed.

    The model is trained on the notebook-level ``X`` / ``Y`` and the loss is
    evaluated on those same data, i.e. the returned value is a training loss.
    NOTE(review): this should be evaluated on a validation set instead.

    Returns:
        The value of ``model.loss(Y, model.y_hat)`` after training.
    """
    # Search space (sampled in the same order: batch_size, alpha, dropout).
    tuned = {
        "batch_size": trial.suggest_int("batch_size", 500, 800),
        "alpha": trial.suggest_float("alpha", 0.01, 0.1),
    }
    dropout_value = trial.suggest_float("dropout", 0.5, 0.9)

    # Settings shared by every hidden layer.
    common = {"activation_function": "relu", "initial": "he"}
    hidden_layers = (
        Layer(nb_neurons=20, regul=("l2", 0.1), batchnorm=True, **common),
        Layer(nb_neurons=10, regul=("l2", 0.1), **common),
        Layer(nb_neurons=30, regul=("dropout", dropout_value), **common),
    )

    network = MLP_Classifier(
        hidden_layers,
        max_iter=2000,
        thr=1e-5,
        seed=123,
        verbose=False,
        optim="adam",
        **tuned,
    )
    network.train(X, Y)

    return network.loss(Y, network.y_hat)


# The study is persisted in a local SQLite file; because load_if_exists=True,
# re-running this cell reuses the stored study rather than raising.
storage = "sqlite:///optuna_mlpsoftmax111.db"
study = optuna.create_study(
    study_name="MLP",
    storage=storage,
    load_if_exists=True,
    direction="minimize",  # the objective returns a loss
)
study.optimize(objective, n_trials=1)

print("Best Hyperparameters:", study.best_params)


[I 2026-01-05 20:45:37,409] A new study created in RDB with name: MLP
[I 2026-01-05 20:45:44,848] Trial 0 finished with value: 1.0988307655332203 and parameters: {'batch_size': 529, 'alpha': 0.013827387119549409, 'dropout': 0.7932257008509562}. Best is trial 0 with value: 1.0988307655332203.


Best Hyperparameters: {'batch_size': 529, 'alpha': 0.013827387119549409, 'dropout': 0.7932257008509562}


In [40]:
# Summary of the best trial: its objective value(s) and the sampled hyperparameters.
best_results = dict(
    [
        ("best value", study.best_trial.values),
        ("params", study.best_trial.params),
    ]
)
best_results


{'best value': [1.0988307655332203],
 'params': {'batch_size': 529,
  'alpha': 0.013827387119549409,
  'dropout': 0.7932257008509562}}

Run the model with the optimised hyperparameters.

In [41]:


# Hold out 25% of the data to evaluate the tuned model.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

# Same architecture as in the Optuna objective; dropout, learning rate (alpha)
# and batch size come from the best trial found above.
model = MLP_Classifier(
    (
        (
            Layer(
                nb_neurons=20,
                activation_function="relu",
                regul=("l2", 0.1),
                initial="he",
                batchnorm=True,
            ),
            Layer(
                nb_neurons=10,
                activation_function="relu",
                regul=("l2", 0.1),
                initial="he",
            ),
            Layer(
                nb_neurons=30,
                activation_function="relu",
                regul=("dropout", best_results["params"]["dropout"]),
                initial="he",
            ),
        )
    ),
    max_iter=2000,
    thr=1e-5,
    alpha=best_results["params"]["alpha"],
    seed=123,
    batch_size=best_results["params"]["batch_size"],
    verbose=True,
    optim="adam",
    nb_epochs_early_stopping=50
)

# Metric used both during training and for the final report.
fct=accuracy_score

model.train(X_train, y_train,X_test,y_test,fct)

# Use `fct` itself (not a hard-coded accuracy_score) so the printed label always
# matches the metric that is actually computed, and pass the arguments in
# sklearn's documented (y_true, y_pred) order.
print(f"final {fct.__name__}", fct(y_test, model.predict(X_test)))


-------------------------------------------------------------------------
iteration 0 : TRAIN accuracy_score  : 0.5955555555555555, loss : 1.0132966656360094
iteration 0 : TEST accuracy_score  : 0.5886666666666667, loss : 1.017541762998313
-------------------------------------------------------------------------
iteration 50 : TRAIN accuracy_score  : 0.748, loss : 0.6437553293458609
iteration 50 : TEST accuracy_score  : 0.7426666666666667, loss : 0.6585968766631872
early stopping at epoch 98
final accuracy_score 0.7546666666666667


## CNN or Early AlexNet (2012)

In this section we will not build the full AlexNet architecture, which is too heavy to compute locally; instead we will use a basic CNN to perform the calculations.

In [1]:
from layers import ConvLayer,MaxPoolLayer,Layer,FlatLayer
from cnn import CNN
import numpy as np 
from sklearn.metrics import accuracy_score,precision_score,recall_score

In [2]:
# CNN architecture. Author's notes:
#  - the CNN is trained with vanilla SGD only, for simplicity
#  - there is no batch normalisation in the convolutional part
#  - the MLP part still uses batch normalisation, as before
#  - a strict alternation is assumed:
#    (Conv -> Act -> MaxPool) repeated, then Flatten -> MLP
q = CNN(
    (
        # Three Conv + MaxPool stages that differ only in their channel counts;
        # layers are created in the order Conv, Pool, Conv, Pool, Conv, Pool.
        *(
            stage_layer
            for in_ch, out_ch in ((3, 16), (16, 32), (32, 64))
            for stage_layer in (
                ConvLayer(
                    in_channels=in_ch,
                    output_channels=out_ch,
                    kernel_size=3,
                    stride=1,
                    padding=True,
                    activation_function="relu",
                    initial="lecun",
                    law="normal",
                ),
                MaxPoolLayer(kernel_size=3, stride=2, padding=False),
            )
        ),
        FlatLayer(),
        # Fully connected part.
        Layer(
            nb_neurons=64,
            activation_function="relu",
            regul=("l2", 0.001),
            initial="he",
            batchnorm=True,
        ),
        Layer(
            nb_neurons=32,
            activation_function="relu",
            regul=("l2", 0.001),
            initial="he",
        ),
    ),
    max_iter=3,  # stop at 3, otherwise it takes too long to run
    thr=1e-5,
    alpha=0.001,
    seed=123,
    batch_size=100,
    verbose=True,
    nb_epochs_early_stopping=20,
)

Don't forget to normalise the input data, and consider using batch normalisation.


In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets import cifar10


# LOAD CIFAR10
(X_train_raw, Y_train_raw), (X_test_raw, Y_test_raw) = cifar10.load_data()

# Pool the official train and test parts; a new split is drawn further down.
X = np.concatenate((X_train_raw, X_test_raw), axis=0)   # (60000,32,32,3)
Y = np.concatenate((Y_train_raw, Y_test_raw), axis=0)   # (60000,1)

# Keep two classes only (0 airplane, 1 automobile). The classifier supports
# multiclass problems, but with only 1K images the results would be poor,
# so a binary task is used here.
mask = np.isin(Y.ravel(), [0, 1])
X, Y = X[mask], Y[mask]
Y = (Y == 1).astype(int)

# Balanced subsample of N observations (N // 2 per class): more would be too
# heavy for local computations.
N = 1000
pos = np.flatnonzero(Y == 1)
neg = np.flatnonzero(Y == 0)
np.random.seed(42)
np.random.shuffle(pos)
np.random.shuffle(neg)
k = N // 2
selected_idx = np.concatenate((pos[:k], neg[:k]))
np.random.shuffle(selected_idx)
X, Y = X[selected_idx], Y[selected_idx]
print("Using dataset:", X.shape, Y.shape)

# Scale pixel values to [0, 1].
X = X.astype(np.float32) / 255

# Stratified 80/20 train/test split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, stratify=Y, random_state=42
)

# Move the sample axis last: (N,H,W,C) -> (H,W,C,N).
X_train = np.moveaxis(X_train, 0, -1)   # (32,32,3,N_train)
X_test = np.moveaxis(X_test, 0, -1)     # (32,32,3,N_test)

# Labels as column vectors.
Y_train = Y_train.reshape(-1, 1)
Y_test = Y_test.reshape(-1, 1)

print("FINAL SHAPES")
print("X_train:", X_train.shape)
print("Y_train:", Y_train.shape)
print("X_test :", X_test.shape)
print("Y_test :", Y_test.shape)
print("Train balance:", np.bincount(Y_train.ravel()))
print("Test balance :", np.bincount(Y_test.ravel()))


2026-01-05 20:35:13.648143: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2026-01-05 20:35:13.649022: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-05 20:35:13.768242: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-01-05 20:35:15.992615: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To tur

Using dataset: (1000, 32, 32, 3) (1000, 1)
FINAL SHAPES
X_train: (32, 32, 3, 800)
Y_train: (800, 1)
X_test : (32, 32, 3, 200)
Y_test : (200, 1)
Train balance: [400 400]
Test balance : [100 100]


We train on the training set and evaluate on the test set. Because the architecture is weak (not AlexNet: few neurons and output channels) and we only use 1K observations and 2 labels, we should expect poor accuracy.
In any case this is not very important, as it depends on the available compute; what matters most are the theoretical results, the derivations and the OOP design applied in this project.

In [4]:
# Train the CNN on the training split; the test split and the metric
# (accuracy) are handed to train() as its evaluation inputs.
q.train(X_train, Y_train, X_test, Y_test, fct=accuracy_score)

(32, 32, 3, 1)
0 original image  shape: (32, 32, 3, 1)
1 conv layer shape: (32, 32, 16, 1)
2 maxpool layer shape: (15, 15, 16, 1)
3 conv layer shape: (15, 15, 32, 1)
4 maxpool layer shape: (7, 7, 32, 1)
5 conv layer shape: (7, 7, 64, 1)
6 maxpool layer shape: (3, 3, 64, 1)
7 flatten layer shape: (3, 3, 64, 1)
dummy res shape (1, 576)
the last dim is 1 as it is dummy, other shapes are kept as is 
(100, 576)
7 FlatLayer
6 MaxPoolLayer
5 ConvLayer
4 MaxPoolLayer
3 ConvLayer
2 MaxPoolLayer
1 ConvLayer
(100, 576)
7 FlatLayer
6 MaxPoolLayer
5 ConvLayer
4 MaxPoolLayer
3 ConvLayer
2 MaxPoolLayer
1 ConvLayer
(100, 576)
7 FlatLayer
6 MaxPoolLayer
5 ConvLayer
4 MaxPoolLayer
3 ConvLayer
2 MaxPoolLayer
1 ConvLayer
(100, 576)
7 FlatLayer
6 MaxPoolLayer
5 ConvLayer
4 MaxPoolLayer
3 ConvLayer
2 MaxPoolLayer
1 ConvLayer
(100, 576)
7 FlatLayer
6 MaxPoolLayer
5 ConvLayer
4 MaxPoolLayer
3 ConvLayer
2 MaxPoolLayer
1 ConvLayer
(100, 576)
7 FlatLayer
6 MaxPoolLayer
5 ConvLayer
4 MaxPoolLayer
3 ConvLayer
2 Ma