In [1]:
import numpy as np
import random
import pandas as pd
from sklearn.metrics import f1_score

In [2]:
np.random.seed(44)
random.seed(44)

In [3]:

class NeuralNetwork5:

    def __init__(self, layer_sizes, activation='sigmoid', output_activation='linear', init_method='normal',classification=True, bias_sd = 0.01):
        """
        Inicjalizuje sieć neuronową MLP.

        :param layer_sizes: Lista określająca liczbę neuronów w kolejnych warstwach, np. [1, 5, 1]
        :param activation: Funkcja aktywacji dla warstw ukrytych (domyślnie 'sigmoid')
        :param output_activation: Funkcja aktywacji dla warstwy wyjściowej (domyślnie 'linear')
        :param init_method: Metoda inicjalizacji wag, opcje:
                            'normal'  - N(0,1) (domyślnie),
                            'uniform' - U[0,1],
                            'he'      - inicjalizacja He,
                            'xavier'  - inicjalizacja Xavier.
                            W przypadku nieznanej metody używana jest inicjalizacja normalna.
        """
        self.layer_sizes = layer_sizes
        self.activation = activation
        self.output_activation = output_activation
        self.init_method = init_method
        self.classification = classification
        self.gradient_history = []

        # Inicjalizacja parametrów (wag i biasów) dla każdej warstwy poza wejściową
        self.params = []
        for i in range(len(layer_sizes) - 1):
            n_in = layer_sizes[i]
            n_out = layer_sizes[i+1]

            if init_method == 'normal':
                # Domyślna inicjalizacja: N(0,1)
                W = np.random.randn(n_out, n_in)
                b = np.random.randn(n_out, 1) * bias_sd
            elif init_method == 'uniform':
                # Inicjalizacja z U[0,1]
                W = np.random.rand(n_out, n_in)
                b = np.random.rand(n_out, 1) * bias_sd
            elif init_method == 'he':
                # Inicjalizacja He: dla warstw z ReLU lub jej wariantami
                W = np.random.randn(n_out, n_in) * np.sqrt(2 / n_in)
                b = np.random.randn(n_out, 1) * bias_sd
            elif init_method == 'xavier':
                # Inicjalizacja Xavier: dla warstw z sigmoid lub tanh
                W   = np.random.randn(n_out, n_in) * np.sqrt(1 / (n_in + n_out))
                b = np.random.randn(n_out, 1) * bias_sd
            else:
                # W przypadku nieznanej metody używamy domyślnej inicjalizacji N(0,1)
                W = np.random.randn(n_out, n_in)
                b = np.random.randn(n_out, 1) * bias_sd

            self.params.append({'W': W, 'b': b})

    def sigmoid(self, z):
        """
        Funkcja aktywacji sigmoidalnej z ograniczeniem zakresu dla stabilności numerycznej.
        """
        z = np.clip(z, -15, 15)
        return 1.0 / (1.0 + np.exp(-z))

    def relu(self, z):
        """
        Funkcja aktywacji ReLU
        """
        return np.maximum(0, z)

    def softmax(self, x):
        """
        Softmax activation function.
        """
        exps = np.exp(x - np.max(x, axis=0))
        return exps / np.sum(exps, axis=0)

    def linear(self, z):
        """
        Funkcja aktywacji liniowej.
        """
        return z

    def tanh(self, z):
        return np.tanh(z)

    def sigmoid_derivative(self, a):
        # a = sigmoid(z), więc d/dz sigmoid = a * (1 - a)
        return a * (1 - a)

    def relu_derivative(self, z):
        # Dla ReLU kluczowe jest z (a = relu(z) nie wystarcza do odróżnienia punktów 0)
        return (z > 0).astype(float)

    def tanh_derivative(self, a):
        # a = tanh(z), więc d/dz tanh(z) = 1 - a^2
        return 1 - a**2

    def forward(self, X):
        """
        Przeprowadza propagację w przód.

        :param X: Dane wejściowe w postaci macierzy [D x N] (D - cechy, N - liczba próbek)
        :return: Wynik propagacji (output sieci)
        """
        # Propagacja przez warstwy ukryte

        activation_func = getattr(self, self.activation)
        for i in range(len(self.layer_sizes) - 2):
            X = activation_func(self.params[i]['W'] @ X + self.params[i]['b'])

        # Warstwa wyjściowa z określoną funkcją aktywacji
        output_func = getattr(self, self.output_activation)
        return output_func(self.params[-1]['W'] @ X + self.params[-1]['b'])

    def compute_mse(self, y_true, y_pred):
        """
        Oblicza funkcję kosztu (MSE).

        MSE = (1/N) * sum((y_pred - y_true)^2)
        """
        y_true = y_true.flatten()
        y_pred = y_pred.flatten()
        N = y_true.shape[0]
        return (1 / N) * np.sum((y_pred - y_true) ** 2)

    def compute_cross_entropy(self, y, y_pred):
        """
        Computes the cross-entropy loss between the true and predicted values.
        :param y: True values
        :param y_pred: Predicted values
        :return: The computed cross-entropy loss
        """
        n = y.shape[0]
        return -np.sum(y * np.log(y_pred)) / n


    def calculate_gradient(self, x_batch, y_batch, clip_threshold=1.0):
        """
        Oblicza gradienty wag i biasów metodą backpropagation z gradient clipping.

        :param x_batch: Dane wejściowe [D x N], D - liczba cech, N - liczba próbek
        :param y_batch: Odpowiedzi dla batcha (wektor lub macierz [output_size x N])
        :param clip_threshold: Próg normy, powyżej którego gradienty są skalowane
        :return: Lista gradientów dla poszczególnych warstw,
                 gdzie każdy element to słownik {'W': dW, 'b': db}
        """
        # ----------------------
        # Forward pass
        # ----------------------
        activations = [x_batch]  # a^(0) = x_batch
        zs = []                  # z = W*a + b dla każdej warstwy

        num_layers = len(self.params)
        for i in range(num_layers):
            W = self.params[i]['W']
            b = self.params[i]['b']

            z = W @ activations[-1] + b
            zs.append(z)

            if i < num_layers - 1:
                # warstwy ukryte
                a = getattr(self, self.activation)(z)
            else:
                # warstwa wyjściowa
                a = getattr(self, self.output_activation)(z)

            activations.append(a)

        # Upewnij się, że y_batch ma taki sam kształt jak output sieci
        if y_batch.ndim == 1:
            y_batch = y_batch.reshape(activations[-1].shape)

        m = x_batch.shape[1]  # liczba próbek w batchu

        # ----------------------
        # Obliczenie "delta" od warstwy wyjściowej
        # ----------------------
        # Przykładowo: MSE -> d/dA_last = (2/m) * (A_last - y_batch)
        delta = (2.0 / m) * (activations[-1] - y_batch)

        # Modyfikacja delta przez pochodną funkcji aktywacji warstwy wyjściowej
        if self.output_activation == 'linear':
            # d/dz linear = 1 -> bez zmian
            pass
        elif self.output_activation == 'sigmoid':
            delta *= self.sigmoid_derivative(activations[-1])
        elif self.output_activation == 'relu':
            delta *= self.relu_derivative(zs[-1])
        elif self.output_activation == 'tanh':
            delta *= self.tanh_derivative(activations[-1])
        elif self.output_activation == 'softmax':
            delta = (activations[-1] - y_batch)
        else:
            pass  # Jeśli inna funkcja, wstawić własny wariant

        # ----------------------
        # Backprop: warstwa wyjściowa
        # ----------------------
        gradients = [None] * num_layers

        dW = delta @ activations[-2].T
        db = np.sum(delta, axis=1, keepdims=True)
        gradients[-1] = {'W': dW, 'b': db}

        # ----------------------
        # Backprop: warstwy ukryte
        # ----------------------
        for i in range(num_layers - 2, -1, -1):
            # W_(i+1).T @ delta_(i+1)
            delta = self.params[i+1]['W'].T @ delta

            # Pochodna funkcji aktywacji warstwy ukrytej
            if self.activation == 'sigmoid':
                delta *= self.sigmoid_derivative(activations[i+1])
            elif self.activation == 'relu':
                delta *= self.relu_derivative(zs[i])
            elif self.activation == 'tanh':
                delta *= self.tanh_derivative(activations[i+1])
            elif self.activation == 'linear':
                pass

            dW = delta @ activations[i].T
            db = np.sum(delta, axis=1, keepdims=True)
            gradients[i] = {'W': dW, 'b': db}

        # ----------------------
        # Gradient clipping
        # ----------------------
        for layer_grad in gradients:
            for key in layer_grad:
                grad_norm = np.linalg.norm(layer_grad[key])
                if grad_norm > clip_threshold:
                    layer_grad[key] *= (clip_threshold / grad_norm)

        return gradients


    def vector_to_gradients(self, grad_vector):
        """
        Odwraca funkcję gradients_to_vector.
        Na podstawie wektora grad_vector oraz oryginalnych kształtów parametrów,
        zwraca listę słowników gradientów w tej samej strukturze, co w self.params.

        :param grad_vector: Jednowymiarowy numpy array zawierający wszystkie gradienty.
        :return: Lista słowników gradientów, gdzie każdy słownik ma klucze 'W' oraz 'b'
                 z gradientami o odpowiednich kształtach.
        """
        gradients = []
        current_index = 0

        # Iterujemy przez warstwy korzystając z kształtów parametrów zapisanych w self.params
        for layer in self.params:
            layer_grad = {}

            # Kształt i liczba elementów gradientu dla wag (W)
            W_shape = layer['W'].shape
            W_size = np.prod(W_shape)
            # Wydzielamy fragment wektora dla wag i przekształcamy go do odpowiedniego kształtu
            W_grad = grad_vector[current_index: current_index + W_size].reshape(W_shape)
            current_index += W_size

            # Kształt i liczba elementów gradientu dla biasów (b)
            b_shape = layer['b'].shape
            b_size = np.prod(b_shape)
            # Wydzielamy fragment wektora dla biasów i przekształcamy go do odpowiedniego kształtu
            b_grad = grad_vector[current_index: current_index + b_size].reshape(b_shape)
            current_index += b_size

            layer_grad['W'] = W_grad
            layer_grad['b'] = b_grad

            gradients.append(layer_grad)

        return gradients


    def gradients_to_vector(self,gradients):
        """
        Zamienia listę słowników gradientów na jeden wektor.

        :param gradients: Lista słowników gradientów,
                          gdzie każdy słownik ma klucze 'W' (gradient wag)
                          oraz 'b' (gradient biasów).
        :return: Jednowymiarowy numpy array zawierający wszystkie gradienty.
        """
        grad_vector = []
        for layer_grad in gradients:
            # Spłaszczamy gradient wag i biasów i dodajemy do listy
            grad_vector.append(layer_grad['W'].ravel())
            grad_vector.append(layer_grad['b'].ravel())
        # Łączymy wszystkie spłaszczone elementy w jeden wektor
        return np.concatenate(grad_vector)


    def train(self,
              X_train,
              y_train,
              batch_size,
              epochs,
              learning_rate=0.01,
              verbose=250,
              optimizer='basic',
              beta=0.9,
              beta_1=0.9,
              beta_2=0.999,
              eps=1e-8,
              return_loss = True
              ):
        """
        Trenuje sieć neuronową metodą mini-batch gradient descent, wspierając
        różne optymalizatory (m.in. Adam, RMSProp).

        Parametry:
        ----------
        X_train : ndarray, shape [D, N]
            Dane treningowe, gdzie D to liczba cech, a N to liczba próbek.
        y_train : ndarray, shape [N] lub [1, N]
            Odpowiedzi (etykiety) dla próbek.
        batch_size : int
            Rozmiar mini-batcha.
        epochs : int
            Liczba epok treningowych.
        learning_rate : float
            Współczynnik uczenia (domyślnie 0.01).
        verbose : int
            Co ile epok wyświetlać informację o błędzie (domyślnie co 250).
        optimizer : {'basic', 'adam', 'RMSProp'}
            Wybór optymalizatora.
        beta : float
            Współczynnik momentum dla RMSProp (domyślnie 0.9).
        beta_1 : float
            Współczynnik momentum dla Adama (domyślnie 0.9).
        beta_2 : float
            Współczynnik dla średniej kwadratów gradientów w Adamie (domyślnie 0.999).
        eps : float
            Drobna stała zapobiegająca dzieleniu przez zero (domyślnie 1e-8).
        return_losses : bool
            Jeśli True, po zakończeniu treningu zwracana jest lista strat
            z każdej epoki (domyślnie False).

        Zwraca:
        -------
        Nic (lub listę strat, jeśli return_losses=True).
        """

        # Zmienna 't' dla Adama – zlicza łączną liczbę batchy (kroków optymalizacji).
        if optimizer == 'adam':
            t = 0


        # Liczba próbek
        num_samples = X_train.shape[1]

        # Pętla po epokach
        for epoch in range(epochs):
            # Losowa permutacja indeksów (shuffle)
            indices = np.random.permutation(num_samples)

            # Pętla po batchach
            for start_idx in range(0, num_samples, batch_size):
                end_idx = min(start_idx + batch_size, num_samples)
                batch_indices = indices[start_idx:end_idx]

                # Tworzymy batch
                X_batch = X_train[:, batch_indices]


                if self.classification:
                    y_batch = y_train[:,batch_indices]
                else:
                    y_batch = y_train[batch_indices]


                # Obliczamy gradienty (backprop + ewentualny clipping)
                gradients = self.calculate_gradient(X_batch, y_batch)

                # Jeśli używamy Adama lub RMSProp, przetwarzamy te gradienty
                if optimizer in ('adam', 'RMSProp'):
                    # Inkrementacja kroków optymalizacji
                    vector_gradient = self.gradients_to_vector(gradients)

                    if optimizer == 'adam':
                        t += 1
                        # Inicjalizacja m_t i v_t w pierwszym kroku
                        if epoch == 0 and start_idx == 0:
                            m_t = np.zeros_like(vector_gradient)
                            v_t = np.zeros_like(vector_gradient)
                        else:
                            # Aktualizacja pierwszego i drugiego momentu
                            m_t = beta_1 * m_t + (1 - beta_1) * vector_gradient
                            v_t = beta_2 * v_t + (1 - beta_2) * (vector_gradient ** 2)

                        # Korekta biasu (Adam)
                        m_t_hat = m_t / (1 - beta_1 ** t)
                        v_t_hat = v_t / (1 - beta_2 ** t)

                        # Obliczamy przyrost (update)
                        update = m_t_hat / (np.sqrt(v_t_hat) + eps)

                        # Zamiana wektora 'update' w strukturę gradientów
                        gradients = self.vector_to_gradients(update)

                    elif optimizer == 'RMSProp':
                        # Inicjalizacja w pierwszym kroku
                        if epoch == 0 and start_idx == 0:
                            gradient_squared_mean = np.zeros_like(vector_gradient)

                        # Aktualizacja wykładniczej średniej kwadratów gradientów
                        gradient_squared_mean = beta * gradient_squared_mean + (1 - beta) * (vector_gradient ** 2)

                        # Obliczamy przyrost
                        update = vector_gradient / (np.sqrt(gradient_squared_mean) + eps)

                        # Konwersja wektora 'update' na listę gradientów
                        gradients = self.vector_to_gradients(update)

                # Aktualizacja wag i biasów w sieci
                for i in range(len(self.params)):
                    self.params[i]['W'] -= learning_rate * gradients[i]['W']
                    self.params[i]['b'] -= learning_rate * gradients[i]['b']

            # Wyświetlamy, jeśli przypada kolej epoki zgodnie z verbose
            if epoch % verbose == verbose-1:
                y_pred = self.forward(X_train)
                if self.classification:
                    loss = self.compute_cross_entropy(y_train, y_pred)
                    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss}")
                else:
                    mse = self.compute_mse(y_train, y_pred)
                    print(f"Epoch {epoch + 1}/{epochs}, MSE: {mse}")


        if self.classification:
            final_loss = self.compute_cross_entropy(y_train, self.forward(X_train))
            print(f"Final loss: {final_loss}")
            if return_loss:
                return final_loss
        else:
            final_mse = self.compute_mse(y_train, self.forward(X_train))
            print(f"Final MSE: {final_mse}")
            if return_loss:
                return final_mse


In [4]:
def one_hot_encode(y):
    """
    This function one hot encodes the labels
    :param y: ndarray containing the labels
    :return: ndarray with the one hot encoded labels
    """
    one_hot = np.zeros((y.size, y.max() + 1))
    one_hot[np.arange(y.size), y] = 1
    return one_hot.T

In [5]:

def load_and_scale_data_regression(training_csv, test_csv, feature_col='x', target_col='y'):
    """
    Wczytuje dane z plików CSV oraz skaluje je (standaryzacja).

    Parametry:
      - training_csv: Ścieżka do pliku CSV z danymi treningowymi.
      - test_csv: Ścieżka do pliku CSV z danymi testowymi.
      - feature_col: Nazwa kolumny zawierającej cechy (domyślnie 'x').
      - target_col: Nazwa kolumny zawierającej etykiety (domyślnie 'y').

    Zwraca:
      - x_train_scaled: Zescalowane dane treningowe (features) [1 x N_train].
      - y_train_scaled: Zescalone etykiety treningowe.
      - x_test_scaled: Zescalowane dane testowe (features) [1 x N_test].
      - y_test: Oryginalne etykiety testowe (bez skalowania).
      - scaling_params: Krotka (x_mean, x_std, y_mean, y_std) – parametry skalowania,
                        które później można użyć do odwrotnej transformacji wyników.
    """
    # Wczytanie danych


    # Wyodrębnienie kolumn i przekształcenie do odpowiednich kształtów
    x_train = training_csv[[feature_col]].values.T
    y_train = training_csv[target_col].values

    x_test = test_csv[[feature_col]].values.T
    y_test = test_csv[target_col].values

    # Obliczanie średniej i odchylenia standardowego dla x oraz y
    x_mean = np.mean(x_train)
    x_std  = np.std(x_train)
    y_mean = np.mean(y_train)
    y_std  = np.std(y_train)

    # Skalowanie danych (standaryzacja)
    x_train_scaled = (x_train - x_mean) / x_std
    y_train_scaled = (y_train - y_mean) / y_std
    x_test_scaled  = (x_test - x_mean) / x_std

    scaling_params = (x_mean, x_std, y_mean, y_std)
    return x_train_scaled, y_train_scaled, x_test_scaled, y_test, scaling_params

In [6]:
def load_and_scale_data_for_classification(training_data, test_data, target_col='c'):
    """
    This function prepares the data for training and testing. It standardizes the features and one hot encodes the labels
    :param training_data: DataFrame containing training data.
    :param test_data: DataFrame containing test data.
    :param target_col: string, name of the column containing labels. Defaults to 'c'.
    :return: A tuple containing:
        - x_train_scaled (ndarray): Standardized training features [1 x N_train].
        - y_train (ndarray): One hot encoded training labels.
        - x_test_scaled (ndarray): Standardized test features [1 x N_test].
        - y_test (ndarray): Original test labels (not scaled).
    """
    x_train = training_data.drop(columns=[target_col]).values.T
    y_train = training_data[target_col].values
    x_test = test_data.drop(columns=[target_col]).values.T
    y_test = test_data[target_col].values
    x_mean = np.mean(x_train)
    x_std = np.std(x_train)
    x_train_scaled = (x_train - x_mean) / x_std
    x_test_scaled = (x_test - x_mean) / x_std
    if y_train.dtype == bool:
        y_train = y_train.astype(int)
        y_test = y_test.astype(int)
    y_train = one_hot_encode(y_train)
    return x_train_scaled, y_train, x_test_scaled, y_test

In [7]:
# Wczytywanie multimodal-large
multimodal_large_train = pd.read_csv("data/NN5/multimodal-large-training.csv")
multimodal_large_test  = pd.read_csv("data/NN5/multimodal-large-test.csv")

steps_large_train = pd.read_csv("data/NN5/steps-large-training.csv")
steps_large_test     = pd.read_csv("data/NN5/steps-large-test.csv")

rings3_train = pd.read_csv("data/NN5/rings3-regular-training.csv")
rings3_test = pd.read_csv("data/NN5/rings3-regular-test.csv")

rings5_train = pd.read_csv("data/NN5/rings5-regular-training.csv")
rings5_test = pd.read_csv("data/NN5/rings5-regular-test.csv")


In [8]:
import numpy as np
import pandas as pd
from itertools import product

def train_model_gridsearch(
        dfs,
        architectures,
        activations,
        classification,
        output_activation,
        batch_size=100,
        epochs=100,
        learning_rate=0.05,
        verbose=500,
        optimizer='basic'
):


    if not classification:
        x_train_scaled, y_train_scaled, *_ = load_and_scale_data_regression(
            dfs["train"], dfs["test"], 'x', 'y')
    else:
        x_train_scaled, y_train_scaled, *_ = load_and_scale_data_for_classification(dfs['train'], dfs['test'], target_col='c')

    # Przygotowanie ramki danych do wyników
    results = pd.DataFrame(index=activations, columns=architectures.keys())

    # Pętla treningowa
    for activation, (arch_name, layer_sizes) in product(activations, architectures.items()):
        losses = []
        for _ in range(5):
            model = NeuralNetwork5(
                layer_sizes=layer_sizes,
                activation=activation,
                output_activation=output_activation,
                init_method='normal',
                classification=classification
            )
            loss = model.train(
                x_train_scaled, y_train_scaled,
                batch_size=batch_size,
                epochs=epochs,
                learning_rate=learning_rate,
                verbose=verbose,
                optimizer=optimizer,
            )
            losses.append(loss)
        results.loc[activation, arch_name] = np.mean(losses)

    return results


# testy na multimodal

In [9]:
dfs_multimodal = {
    "train": multimodal_large_train,
    "test": multimodal_large_test
}

results_multimodal = train_model_gridsearch(
    dfs=dfs_multimodal,
    architectures = {
        "1-2-1": [1, 2, 1],
        "1-4-1": [1, 4, 1],
        "1-8-1": [1, 8, 1],
        "1-2-2-1": [1, 2, 2, 1],
        "1-4-4-1": [1, 4, 4, 1],
        "1-8-8-1": [1, 8, 8, 1],
        "1-2-2-2-1": [1, 2, 2, 2, 1],
        "1-4-4-4-1": [1, 4, 4, 4, 1],
        "1-8-8-8-1": [1, 8, 8, 8, 1],
        "1-16-1": [1, 16, 1],
        "1-16-16-1": [1, 16, 16, 1],
        "1-16-16-16-1": [1, 16, 16, 16, 1]
    },
    activations=["sigmoid", "relu", "tanh", "linear"],
    classification=False,
    output_activation="linear"
)




Final MSE: 0.36179028791868917
Final MSE: 0.3640700964903205
Final MSE: 0.3613372440136941
Final MSE: 0.3668197180242447
Final MSE: 0.3607030631021684
Final MSE: 0.30342944495256124
Final MSE: 0.29874031680344193
Final MSE: 0.36039267423698657
Final MSE: 0.3598365442886046
Final MSE: 0.3580559809508623
Final MSE: 0.3383582093469374
Final MSE: 0.24807677687933424
Final MSE: 0.35972653605691823
Final MSE: 0.3605600061777429
Final MSE: 0.3392672807660635
Final MSE: 0.3314142749285868
Final MSE: 0.3177880636368762
Final MSE: 0.43929063912633465
Final MSE: 0.33396226126558365
Final MSE: 0.5320497166965491
Final MSE: 0.29442738556319575
Final MSE: 0.27618275605697457
Final MSE: 0.31829788556742905
Final MSE: 0.3158345598998764
Final MSE: 0.2948219736091145
Final MSE: 0.15375618397560775
Final MSE: 0.24729556030825461
Final MSE: 0.2778574878367363
Final MSE: 0.26700689658473536
Final MSE: 0.276363949152779
Final MSE: 0.3315717929598721
Final MSE: 0.5075491966579554
Final MSE: 0.32061544775927

In [10]:
results_multimodal

Unnamed: 0,1-2-1,1-4-1,1-8-1,1-2-2-1,1-4-4-1,1-8-8-1,1-2-2-2-1,1-4-4-4-1,1-8-8-8-1,1-16-1,1-16-16-1,1-16-16-16-1
sigmoid,0.362944,0.336091,0.329198,0.390901,0.299913,0.244456,0.401166,0.286769,0.263831,0.310708,0.144249,0.090355
relu,0.687836,0.469846,0.334244,0.632188,0.224055,0.061195,0.742767,0.194694,0.032094,0.240279,0.037066,0.009884
tanh,0.322971,0.107036,0.089056,0.294828,0.072586,0.045953,0.233853,0.104006,0.0249,0.070553,0.023181,0.008373
linear,0.850747,0.850109,0.850705,0.84985,0.850087,0.850191,0.851352,0.851481,0.851618,0.85151,0.850759,0.852168


### wygrywa tanh/relu 1-16-16-16-1

# testy dla steps_large

In [12]:
dfs_steps_large = {
    "train": steps_large_train,
    "test": steps_large_test
}

results_steps_large = train_model_gridsearch(
    dfs=dfs_steps_large,
    architectures = {
        "1-2-1": [1, 2, 1],
        "1-4-1": [1, 4, 1],
        "1-8-1": [1, 8, 1],
        "1-2-2-1": [1, 2, 2, 1],
        "1-4-4-1": [1, 4, 4, 1],
        "1-8-8-1": [1, 8, 8, 1],
        "1-2-2-2-1": [1, 2, 2, 2, 1],
        "1-4-4-4-1": [1, 4, 4, 4, 1],
        "1-8-8-8-1": [1, 8, 8, 8, 1],
        "1-16-1": [1, 16, 1],
        "1-16-16-1": [1, 16, 16, 1],
        "1-16-16-16-1": [1, 16, 16, 16, 1]
    },
    activations=["sigmoid", "relu", "tanh", "linear"],
    classification=False,
    output_activation="linear"
)




Final MSE: 0.07054156384255571
Final MSE: 0.07053649144961667
Final MSE: 0.0708278582844069
Final MSE: 0.0704165317829581
Final MSE: 0.07078960333156245
Final MSE: 0.0706733910801956
Final MSE: 0.07016648402117454
Final MSE: 0.0701904804955443
Final MSE: 0.07071001993280863
Final MSE: 0.07035351014325458
Final MSE: 0.07019310274983319
Final MSE: 0.06996560517194886
Final MSE: 0.06947177651141909
Final MSE: 0.07045389892102294
Final MSE: 0.07111766753744887
Final MSE: 0.0713355612277263
Final MSE: 0.07133280934690352
Final MSE: 0.07177906608256886
Final MSE: 0.07158525130160479
Final MSE: 0.07140377165460204
Final MSE: 0.07009496700372947
Final MSE: 0.0709853014044366
Final MSE: 0.07071996740665744
Final MSE: 0.0700707624721753
Final MSE: 0.07078367905210128
Final MSE: 0.07090060874849141
Final MSE: 0.0703901679831918
Final MSE: 0.07159112400352569
Final MSE: 0.0691881634382518
Final MSE: 0.07046235753601222
Final MSE: 0.07131300126228188
Final MSE: 0.07226295771880721
Final MSE: 0.0722

In [13]:
results_steps_large

Unnamed: 0,1-2-1,1-4-1,1-8-1,1-2-2-1,1-4-4-1,1-8-8-1,1-2-2-2-1,1-4-4-4-1,1-8-8-8-1,1-16-1,1-16-16-1,1-16-16-16-1
sigmoid,0.070622,0.070419,0.07024,0.071487,0.070531,0.070506,0.071855,0.071066,0.069351,0.070489,0.067454,0.066695
relu,0.256316,0.064602,0.05966,0.248542,0.057877,0.034825,0.247988,0.047343,0.016071,0.052061,0.01895,0.031914
tanh,0.068967,0.067603,0.056652,0.050659,0.032634,0.022228,0.058416,0.017589,0.01131,0.048256,0.016364,0.007747
linear,0.075988,0.077031,0.076969,0.076096,0.076498,0.076133,0.076271,0.076349,0.076076,0.079252,0.076467,0.076147


### Wygrywa tanh 1-16-16-16-1 a z relu wygrywa 1-8-8-8-1

# testy dla rings3

In [15]:
dfs_rings3 = {
    "train": rings3_train,
    "test": rings3_test
}

results_rings3 = train_model_gridsearch(
    dfs=dfs_rings3,
    architectures = {
        "1-2-1": [2, 2, 3],
        "1-4-1": [2, 4, 3],
        "1-8-1": [2, 8, 3],
        "1-2-2-1": [2, 2, 2, 3],
        "1-4-4-1": [2, 4, 4, 3],
        "1-8-8-1": [2, 8, 8, 3],
        "1-2-2-2-1": [2, 2, 2, 2, 3],
        "1-4-4-4-1": [2, 4, 4, 4, 3],
        "1-8-8-8-1": [2, 8, 8, 8, 3],
        "1-16-1": [2, 16, 3],
        "1-16-16-1": [2, 16, 16, 3],
        "1-16-16-16-1": [2, 16, 16, 16, 3]
    },
    activations=["sigmoid", "relu", "tanh", "linear"],
    classification=True,
    output_activation="softmax"
)




Final loss: 406.09206869109903
Final loss: 407.37963793088903
Final loss: 474.25656053100874
Final loss: 442.1089218392779
Final loss: 440.63290528733677
Final loss: 274.59638666096686
Final loss: 252.00631348013675
Final loss: 308.84281831150247
Final loss: 285.8755737453204
Final loss: 251.35206580002114
Final loss: 186.69372949456397
Final loss: 207.09257676726054
Final loss: 207.66984588509823
Final loss: 215.42715056897728
Final loss: 257.90821552819506
Final loss: 441.88026647700855
Final loss: 453.6254780919026
Final loss: 347.2187729106558
Final loss: 345.6536668805373
Final loss: 456.31154846750434
Final loss: 276.1384507484757
Final loss: 217.39366543616245
Final loss: 261.1835862922001
Final loss: 233.80000150864225
Final loss: 224.2675796281459
Final loss: 159.85611645727465
Final loss: 160.6562954827267
Final loss: 154.49164487906867
Final loss: 160.96366411384432
Final loss: 150.38744871611416
Final loss: 346.3850477985302
Final loss: 339.29613599911295
Final loss: 340.35

In [18]:
results_rings3


Unnamed: 0,1-2-1,1-4-1,1-8-1,1-2-2-1,1-4-4-1,1-8-8-1,1-2-2-2-1,1-4-4-4-1,1-8-8-8-1,1-16-1,1-16-16-1,1-16-16-16-1
sigmoid,434.094019,274.534632,214.958304,408.937947,242.556657,157.271034,366.185256,281.225961,152.437023,197.150648,135.781678,122.304926
relu,461.839516,309.249467,129.199696,483.004266,294.562206,96.877681,475.900488,274.816815,88.550042,120.38206,75.917755,89.760769
tanh,412.091376,252.001167,152.944248,380.445817,140.232991,94.284483,375.030191,152.573404,99.410995,120.949889,78.711084,62.700593
linear,495.281939,496.337756,499.010435,496.49652,496.670829,496.026157,497.374877,497.473402,496.098959,507.020005,563.965251,588.678004


### wygrywa dla tanh 1-16-16-16-1 oraz relu 1-16-16-1

# testy dla rings5

In [19]:
dfs_rings5 = {
    "train": rings5_train,
    "test": rings5_test
}

results_rings5 = train_model_gridsearch(
    dfs=dfs_rings5,
    architectures = {
        "1-2-1": [2, 2, 5],
        "1-4-1": [2, 4, 5],
        "1-8-1": [2, 8, 5],
        "1-2-2-1": [2, 2, 2, 5],
        "1-4-4-1": [2, 4, 4, 5],
        "1-8-8-1": [2, 8, 8, 5],
        "1-2-2-2-1": [2, 2, 2, 2, 5],
        "1-4-4-4-1": [2, 4, 4, 4, 5],
        "1-8-8-8-1": [2, 8, 8, 8, 5],
        "1-16-1": [2, 16, 5],
        "1-16-16-1": [2, 16, 16, 5],
        "1-16-16-16-1": [2, 16, 16, 16, 5]
    },
    activations=["sigmoid", "relu", "tanh", "linear"],
    classification=True,
    output_activation="softmax"
)

Final loss: 315.7033505544106
Final loss: 283.40613509083795
Final loss: 315.27524308755955
Final loss: 295.9780353648809
Final loss: 315.7340461683324
Final loss: 204.27724887617296
Final loss: 226.37601562649235
Final loss: 234.79117657064398
Final loss: 265.39352208571364
Final loss: 228.10369594109687
Final loss: 167.52076172377284
Final loss: 188.81624497039292
Final loss: 193.2108039495632
Final loss: 201.26922921702914
Final loss: 189.5424248445243
Final loss: 241.02109695786186
Final loss: 236.60338455131463
Final loss: 241.06174688363973
Final loss: 243.90952982142176
Final loss: 287.40533348331763
Final loss: 138.81862706414202
Final loss: 215.144287223376
Final loss: 208.52202813524696
Final loss: 153.68212838285976
Final loss: 160.00271374672752
Final loss: 131.70041187295556
Final loss: 130.75757578420772
Final loss: 144.2593254407688
Final loss: 162.08449364414298
Final loss: 116.13411874991466
Final loss: 238.63175740650678
Final loss: 224.01296242109456
Final loss: 232.

In [20]:
results_rings5

Unnamed: 0,1-2-1,1-4-1,1-8-1,1-2-2-1,1-4-4-1,1-8-8-1,1-2-2-2-1,1-4-4-4-1,1-8-8-8-1,1-16-1,1-16-16-1,1-16-16-16-1
sigmoid,305.219362,231.788332,188.071893,250.000218,175.233957,136.987185,233.701904,159.885098,88.264301,159.517785,108.775348,94.973864
relu,313.425608,161.817861,117.761897,294.271487,169.596281,82.411086,376.21527,168.313444,89.460471,107.009042,64.88863,64.168408
tanh,278.962491,202.265653,128.425424,248.064272,137.866118,77.632852,221.756815,123.473276,80.075343,97.234843,54.959985,47.702052
linear,343.246616,343.908917,344.119956,345.061628,344.220076,348.552187,344.364266,344.851678,348.490158,345.12759,378.834452,627.27246


### wygrywa 1-16-16-16-1 relu oraz 1-16-16-16-1 oraz tanh

## steps_large

### relu, tutaj nie wybieram tego kto wygral, bo wygralo 1-8-8-8-1, ale dopiero ta architektura pozwolila mi zejsc do mse 3, ktory byl wymagany z 2 tygodnie temu

In [21]:
np.random.seed(100)
random.seed(100)

In [22]:
neural_network_0_v0 = NeuralNetwork5(layer_sizes=[1,16,16,16,1],
                                  activation='relu',
                                  output_activation='linear',
                                  init_method='he',
                                  classification = False)



x_train_scaled_0_v0, y_train_scaled_0_v0,  x_test_scaled_0_v0, y_test_0_v0, scaling_params_0_v0 =  load_and_scale_data_regression(steps_large_train,
                                                                                                        steps_large_test, 'x','y')

neural_network_0_v0.train(x_train_scaled_0_v0, y_train_scaled_0_v0, batch_size = 200 ,epochs = 2500, learning_rate=0.05, verbose = 500, optimizer = 'basic')
neural_network_0_v0.train(x_train_scaled_0_v0, y_train_scaled_0_v0, batch_size = 200 ,epochs = 2500, learning_rate=0.025, verbose = 500, optimizer = 'basic')
neural_network_0_v0.train(x_train_scaled_0_v0, y_train_scaled_0_v0, batch_size = 200 ,epochs = 5000, learning_rate=0.01, verbose = 500, optimizer = 'basic')
neural_network_0_v0.train(x_train_scaled_0_v0, y_train_scaled_0_v0, batch_size = 200 ,epochs = 10000, learning_rate=0.005, verbose = 500, optimizer = 'basic')


Epoch 500/2500, MSE: 0.006950468386940014
Epoch 1000/2500, MSE: 0.005500858783549404
Epoch 1500/2500, MSE: 0.002480614805124578
Epoch 2000/2500, MSE: 0.0020219813453761716
Epoch 2500/2500, MSE: 0.0026900614851047507
Final MSE: 0.0026900614851047507
Epoch 500/2500, MSE: 0.0014969014620098819
Epoch 1000/2500, MSE: 0.0013382412632331985
Epoch 1500/2500, MSE: 0.0012503443972652034
Epoch 2000/2500, MSE: 0.0012348813244576183
Epoch 2500/2500, MSE: 0.0009551882185865726
Final MSE: 0.0009551882185865726
Epoch 500/5000, MSE: 0.0011343751239334253
Epoch 1000/5000, MSE: 0.0009882853046820049
Epoch 1500/5000, MSE: 0.0008540782481838378
Epoch 2000/5000, MSE: 0.0008693578016460321
Epoch 2500/5000, MSE: 0.0008105161713140659
Epoch 3000/5000, MSE: 0.0008789257132084857
Epoch 3500/5000, MSE: 0.00076255118916802
Epoch 4000/5000, MSE: 0.0008155291669283274
Epoch 4500/5000, MSE: 0.0007317409019276834
Epoch 5000/5000, MSE: 0.0007293142864070633
Final MSE: 0.0007293142864070633
Epoch 500/10000, MSE: 0.00069

np.float64(0.0005894091608278995)

In [23]:
y_predict_0_v0  = neural_network_0_v0.forward(x_test_scaled_0_v0)
y_test_mean_0_v0 = scaling_params_0_v0[2]
y_test_sd_0_v0 = scaling_params_0_v0[3]
y_predict_rescaled_0_v0 = y_predict_0_v0 * y_test_sd_0_v0 + y_test_mean_0_v0
neural_network_0_v0.compute_mse(y_test_0_v0, y_predict_rescaled_0_v0)

np.float64(3.010448105699494)

### tanh

In [24]:
neural_network_0_v1 = NeuralNetwork5(layer_sizes=[1,16,16,16,1],
                                     activation='tanh',
                                     output_activation='linear',
                                     init_method='xavier',
                                     classification = False)



x_train_scaled_0_v1, y_train_scaled_0_v1,  x_test_scaled_0_v1, y_test_0_v1, scaling_params_0_v1 =  load_and_scale_data_regression(steps_large_train,
                                                                                                                                  steps_large_test, 'x','y')

neural_network_0_v1.train(x_train_scaled_0_v1, y_train_scaled_0_v1, batch_size = 200 ,epochs = 2500, learning_rate=0.05, verbose = 500, optimizer = 'basic')
neural_network_0_v1.train(x_train_scaled_0_v1, y_train_scaled_0_v1, batch_size = 200 ,epochs = 2500, learning_rate=0.025, verbose = 500, optimizer = 'basic')
neural_network_0_v1.train(x_train_scaled_0_v1, y_train_scaled_0_v1, batch_size = 200 ,epochs = 5000, learning_rate=0.01, verbose = 500, optimizer = 'basic')
neural_network_0_v1.train(x_train_scaled_0_v1, y_train_scaled_0_v1, batch_size = 200 ,epochs = 10000, learning_rate=0.005, verbose = 500, optimizer = 'basic')


Epoch 500/2500, MSE: 0.06975479423087147
Epoch 1000/2500, MSE: 0.010678611074460474
Epoch 1500/2500, MSE: 0.005448168668574284
Epoch 2000/2500, MSE: 0.0036911901725424183
Epoch 2500/2500, MSE: 0.003024579325018334
Final MSE: 0.003024579325018334
Epoch 500/2500, MSE: 0.002779356175888831
Epoch 1000/2500, MSE: 0.002606385933725048
Epoch 1500/2500, MSE: 0.0024924211136914884
Epoch 2000/2500, MSE: 0.002317082948803508
Epoch 2500/2500, MSE: 0.0022116648952616403
Final MSE: 0.0022116648952616403
Epoch 500/5000, MSE: 0.0021378966022000613
Epoch 1000/5000, MSE: 0.0021133396550394768
Epoch 1500/5000, MSE: 0.0020621651726712236
Epoch 2000/5000, MSE: 0.0020300932127661894
Epoch 2500/5000, MSE: 0.0019790014126508867
Epoch 3000/5000, MSE: 0.0019434825334516334
Epoch 3500/5000, MSE: 0.001919893501669537
Epoch 4000/5000, MSE: 0.001876639062196283
Epoch 4500/5000, MSE: 0.0018435091088224366
Epoch 5000/5000, MSE: 0.0018282172081213216
Final MSE: 0.0018282172081213216
Epoch 500/10000, MSE: 0.00179804361

np.float64(0.0015413520691408883)

In [25]:
y_predict_0_v1  = neural_network_0_v1.forward(x_test_scaled_0_v1)
y_test_mean_0_v1 = scaling_params_0_v1[2]
y_test_sd_0_v1 = scaling_params_0_v1[3]
y_predict_rescaled_0_v1 = y_predict_0_v1 * y_test_sd_0_v1 + y_test_mean_0_v1
neural_network_0_v1.compute_mse(y_test_0_v1, y_predict_rescaled_0_v1)

np.float64(8.841041693390824)

## rings3

### relu

In [45]:
np.random.seed(101)
random.seed(101)

In [46]:
neural_network_1_v0 = NeuralNetwork5(layer_sizes=[2,16,16,3],
                                     activation='relu',
                                     output_activation='softmax',
                                     init_method='he',
                                     classification = True)



x_train_scaled_1_v0, y_train_1_v0, x_test_scaled_1_v0, y_test_1_v0 =  load_and_scale_data_for_classification(rings3_train, rings3_test, target_col='c')

neural_network_1_v0.train(x_train_scaled_1_v0, y_train_1_v0, batch_size = 200 ,epochs = 2500, learning_rate=0.05, verbose = 500, optimizer = 'basic')
neural_network_1_v0.train(x_train_scaled_1_v0, y_train_1_v0, batch_size = 200 ,epochs = 2500, learning_rate=0.025, verbose = 500, optimizer = 'basic')
neural_network_1_v0.train(x_train_scaled_1_v0, y_train_1_v0, batch_size = 200 ,epochs = 2500, learning_rate=0.01, verbose = 500, optimizer = 'basic')


Epoch 500/2500, Loss: 46.74319884215007
Epoch 1000/2500, Loss: 27.76033524375089
Epoch 1500/2500, Loss: 25.627574457082996
Epoch 2000/2500, Loss: 18.903026510877012
Epoch 2500/2500, Loss: 16.052849185817934
Final loss: 16.052849185817934
Epoch 500/2500, Loss: 15.370227417247728
Epoch 1000/2500, Loss: 15.338390075074066
Epoch 1500/2500, Loss: 6.348556332457179
Epoch 2000/2500, Loss: 6.799962852189053
Epoch 2500/2500, Loss: 5.593476023241237
Final loss: 5.593476023241237
Epoch 500/2500, Loss: 3.9994790495256147
Epoch 1000/2500, Loss: 2.468635284244512
Epoch 1500/2500, Loss: 3.062095100736784
Epoch 2000/2500, Loss: 1.922933116126045
Epoch 2500/2500, Loss: 2.026192736517818
Final loss: 2.026192736517818


np.float64(2.026192736517818)

In [47]:
y_pred_test_1_v0 = neural_network_1_v0.forward(x_test_scaled_1_v0)
f1_score(y_test_1_v0, y_pred_test_1_v0.argmax(axis=0), average='macro')

0.9713737958706292

### tanh

In [48]:
neural_network_1_v1 = NeuralNetwork5(layer_sizes=[2,16,16,16,3],
                                     activation='relu',
                                     output_activation='softmax',
                                     init_method='xavier',
                                     classification = True)



x_train_scaled_1_v1, y_train_1_v1, x_test_scaled_1_v1, y_test_1_v1 =  load_and_scale_data_for_classification(rings3_train, rings3_test, target_col='c')

neural_network_1_v1.train(x_train_scaled_1_v1, y_train_1_v1, batch_size = 200 ,epochs = 2500, learning_rate=0.05, verbose = 500, optimizer = 'basic')
neural_network_1_v1.train(x_train_scaled_1_v1, y_train_1_v1, batch_size = 200 ,epochs = 2500, learning_rate=0.025, verbose = 500, optimizer = 'basic')
neural_network_1_v1.train(x_train_scaled_1_v1, y_train_1_v1, batch_size = 200 ,epochs = 2500, learning_rate=0.01, verbose = 500, optimizer = 'basic')


Epoch 500/2500, Loss: 52.243729920606434
Epoch 1000/2500, Loss: 49.14085082015387
Epoch 1500/2500, Loss: 34.59913557053367
Epoch 2000/2500, Loss: 58.92111940709615
Epoch 2500/2500, Loss: 31.87478472671765
Final loss: 31.87478472671765
Epoch 500/2500, Loss: 12.875688813923661
Epoch 1000/2500, Loss: 11.592430692302644
Epoch 1500/2500, Loss: 8.845905400213033
Epoch 2000/2500, Loss: 8.776770953191752
Epoch 2500/2500, Loss: 11.59128001156909
Final loss: 11.59128001156909
Epoch 500/2500, Loss: 1.8038827316747004
Epoch 1000/2500, Loss: 2.7008441671961436
Epoch 1500/2500, Loss: 0.8420645265534071
Epoch 2000/2500, Loss: 1.384866972363933
Epoch 2500/2500, Loss: 1.1251673160716595
Final loss: 1.1251673160716595


np.float64(1.1251673160716595)

In [44]:
y_pred_test_1_v1 = neural_network_1_v1.forward(x_test_scaled_1_v1)
f1_score(y_test_1_v1, y_pred_test_1_v1.argmax(axis=0), average='macro')

0.9729209341656763

## rings5

## relu

In [31]:
np.random.seed(102)
random.seed(102)

In [32]:
neural_network_2_v0 = NeuralNetwork5(layer_sizes=[2,16,16,16,5],
                                     activation='relu',
                                     output_activation='softmax',
                                     init_method='he',
                                     classification = True)



x_train_scaled_2_v0, y_train_2_v0, x_test_scaled_2_v0, y_test_2_v0 =  load_and_scale_data_for_classification(rings5_train, rings5_test, target_col='c')

neural_network_2_v0.train(x_train_scaled_2_v0, y_train_2_v0, batch_size = 200 ,epochs = 2500, learning_rate=0.05, verbose = 500, optimizer = 'basic')
neural_network_2_v0.train(x_train_scaled_2_v0, y_train_2_v0, batch_size = 200 ,epochs = 2500, learning_rate=0.025, verbose = 500, optimizer = 'basic')
neural_network_2_v0.train(x_train_scaled_2_v0, y_train_2_v0, batch_size = 200 ,epochs = 2500, learning_rate=0.01, verbose = 500, optimizer = 'basic')


Epoch 500/2500, Loss: 38.00674763980892
Epoch 1000/2500, Loss: 44.52340957150055
Epoch 1500/2500, Loss: 27.58043587898033
Epoch 2000/2500, Loss: 19.75462898242037
Epoch 2500/2500, Loss: 14.741891705307765
Final loss: 14.741891705307765
Epoch 500/2500, Loss: 12.508024728714442
Epoch 1000/2500, Loss: 8.006576968194477
Epoch 1500/2500, Loss: 13.930123475502274
Epoch 2000/2500, Loss: 8.112924480658176
Epoch 2500/2500, Loss: 8.51772272650754
Final loss: 8.51772272650754
Epoch 500/2500, Loss: 3.751397325570602
Epoch 1000/2500, Loss: 3.2829430294041613
Epoch 1500/2500, Loss: 2.987349946867836
Epoch 2000/2500, Loss: 4.988641303932326
Epoch 2500/2500, Loss: 3.671043124811834
Final loss: 3.671043124811834


np.float64(3.671043124811834)

In [33]:
y_pred_test_2_v0 = neural_network_2_v0.forward(x_test_scaled_2_v0)
f1_score(y_test_2_v0, y_pred_test_2_v0.argmax(axis=0), average='macro')

0.9586758051369211

In [34]:
neural_network_2_v1 = NeuralNetwork5(layer_sizes=[2,16,16,16,5],
                                     activation='tanh',
                                     output_activation='softmax',
                                     init_method='xavier',
                                     classification = True)



x_train_scaled_2_v1, y_train_2_v1, x_test_scaled_2_v1, y_test_2_v1 =  load_and_scale_data_for_classification(rings5_train, rings5_test, target_col='c')

neural_network_2_v1.train(x_train_scaled_2_v1, y_train_2_v1, batch_size = 200 ,epochs = 2500, learning_rate=0.05, verbose = 500, optimizer = 'basic')
neural_network_2_v1.train(x_train_scaled_2_v1, y_train_2_v1, batch_size = 200 ,epochs = 2500, learning_rate=0.025, verbose = 500, optimizer = 'basic')
neural_network_2_v1.train(x_train_scaled_2_v1, y_train_2_v1, batch_size = 200 ,epochs = 2500, learning_rate=0.01, verbose = 500, optimizer = 'basic')


Epoch 500/2500, Loss: 79.76938291795325
Epoch 1000/2500, Loss: 31.79729696540621
Epoch 1500/2500, Loss: 42.13832780918155
Epoch 2000/2500, Loss: 26.27479503550581
Epoch 2500/2500, Loss: 45.10630956235056
Final loss: 45.10630956235056
Epoch 500/2500, Loss: 16.729179215714193
Epoch 1000/2500, Loss: 17.118789055915446
Epoch 1500/2500, Loss: 29.25387591008485
Epoch 2000/2500, Loss: 22.825226201249723
Epoch 2500/2500, Loss: 17.397139228669054
Final loss: 17.397139228669054
Epoch 500/2500, Loss: 8.058059100124122
Epoch 1000/2500, Loss: 8.421491037338244
Epoch 1500/2500, Loss: 12.13255380563918
Epoch 2000/2500, Loss: 8.122923308567758
Epoch 2500/2500, Loss: 6.713630255584003
Final loss: 6.713630255584003


np.float64(6.713630255584003)

In [35]:
y_pred_test_2_v1 = neural_network_2_v1.forward(x_test_scaled_2_v1)
f1_score(y_test_2_v1, y_pred_test_2_v1.argmax(axis=0), average='macro')

0.9486007653845526