In [32]:
#Metoda 1 ale ze standaryzacją
import pandas as pd
import numpy as np
from collections import Counter

# === 1) Load and prepare the data ===
df = pd.read_csv(r'train_data.csv', sep=';').dropna()

# Integer-encode every string (object) column; pd.factorize numbers the
# categories in order of first appearance.
for col in df.select_dtypes(include=['object']).columns:
    df[col], _ = pd.factorize(df[col])

# Subsampling: draw up to 10 000 random rows. DataFrame.sample falls back to
# NumPy's global RNG when random_state is not given, so the seed below fixes
# the draw. The size is capped at len(df) because sample() raises when asked
# for more rows than exist (replace=False).
np.random.seed(0)
df = df.sample(min(10000, len(df))).reset_index(drop=True)

X_full = df.drop('Stay', axis=1).values
y_full = df['Stay'].values

# The later cells (Nearest Centroid, Naive Bayes, PCA, Softmax, neural net)
# read the globals `X` and `y`; expose them here so the notebook also works
# when run top to bottom on a fresh kernel.
X, y = X_full, y_full

# === 2) Stratified 5-fold Cross-Validation ===
def make_folds(y, k=5, seed=0):
    """Split sample indices into ``k`` stratified folds.

    Args:
        y: 1-D array of class labels.
        k: number of folds.
        seed: value passed to ``np.random.seed`` (this reseeds NumPy's
            global generator as a side effect).

    Returns:
        A list of ``k`` index arrays. Each class is shuffled on its own and
        dealt round-robin, so every fold gets a near-equal share of each class.
    """
    np.random.seed(seed)
    buckets = [[] for _ in range(k)]
    for label in np.unique(y):
        members = np.nonzero(y == label)[0]
        np.random.shuffle(members)
        # Deal the shuffled indices of this class one by one across the folds.
        for pos in range(len(members)):
            buckets[pos % k].append(members[pos])
    return [np.array(bucket) for bucket in buckets]

# Build the stratified fold index arrays once; the evaluation helpers in this
# notebook read this module-level `folds` list.
folds = make_folds(y_full, k=5)

# === 3) Implementacja k-NN “od zera” ===
def make_knn(X_tr, y_tr, k, chunk_size=200):
    """Build a brute-force k-nearest-neighbours classifier (Euclidean distance).

    Args:
        X_tr: 2-D array of training features, one row per sample.
        y_tr: 1-D array of training labels aligned with ``X_tr``.
        k: number of neighbours that vote. When ``k`` is at least the number
            of training rows, every training row votes (previously this case
            raised ValueError inside ``np.argpartition``).
        chunk_size: number of query rows processed per distance computation;
            each chunk materialises a (chunk, n_train, n_features) array.

    Returns:
        ``predict(Xm)``: returns an array with the majority label among the
        neighbours of every row of ``Xm``. Ties go to the label that
        ``Counter`` encounters first.
    """
    n_train = len(X_tr)

    def predict(Xm):
        preds = np.empty(len(Xm), dtype=y_tr.dtype)
        for start in range(0, len(Xm), chunk_size):
            batch = Xm[start:start + chunk_size]
            # Pairwise distances between the chunk and all training rows.
            D = np.linalg.norm(batch[:, None, :] - X_tr[None, :, :], axis=2)
            if k < n_train:
                # Partial sort: the first k columns hold the k nearest rows.
                nn = np.argpartition(D, k, axis=1)[:, :k]
            else:
                # argpartition needs kth < n_train; with k >= n_train the
                # neighbourhood is simply the whole training set.
                nn = np.broadcast_to(np.arange(n_train), D.shape)
            for i, neigh in enumerate(nn):
                preds[start + i] = Counter(y_tr[neigh]).most_common(1)[0][0]
        return preds
    return predict

# === 4) Standaryzacja danych (ręczna, zrobiona osobno dla każdego folda) ===
def standardize(X_train, X_test):
    """Z-score both arrays using statistics of the training array only.

    Args:
        X_train: 2-D training features; its column means and standard
            deviations define the scaling.
        X_test: 2-D features scaled with the training statistics.

    Returns:
        Tuple ``(X_train_std, X_test_std)``.
    """
    mu = X_train.mean(axis=0)
    sigma = X_train.std(axis=0)
    # Constant columns have zero spread; divide them by 1 instead of 0.
    sigma[sigma == 0] = 1
    return (X_train - mu) / sigma, (X_test - mu) / sigma

# === 5) Ewaluacja k-NN z ręczną standaryzacją ===
def eval_k(k):
    """Cross-validate the k-NN classifier for one value of ``k``.

    Reads the module-level ``folds``, ``X_full`` and ``y_full``. For every
    fold the features are standardized with the training part's statistics,
    a k-NN classifier is built, and accuracy is measured on both parts.
    A one-line progress indicator is printed while running.

    Returns:
        Tuple ``(mean train accuracy, mean test accuracy)`` over the folds.
    """
    n_folds = len(folds)
    acc_train, acc_test = [], []
    for fold_no, held_out in enumerate(folds):
        done = fold_no + 1
        print(f'\r[k={k}] Fold {done}/{n_folds} ({done/n_folds*100:.1f}%)', end='', flush=True)
        rest = np.hstack([part for pos, part in enumerate(folds) if pos != fold_no])
        y_fit, y_val = y_full[rest], y_full[held_out]

        # Scale with the training statistics only, then fit and score.
        X_fit, X_val = standardize(X_full[rest], X_full[held_out])
        model = make_knn(X_fit, y_fit, k)
        acc_train.append((model(X_fit) == y_fit).mean())
        acc_test.append((model(X_val) == y_val).mean())
    print()
    return np.mean(acc_train), np.mean(acc_test)

# === 6) Try several values of k ===
# `results` maps k -> (mean train accuracy, mean test accuracy) as returned by eval_k.
k_values = [1, 3, 5, 7]
results = {}

for i, k in enumerate(k_values):
    # Overall progress across the k values (eval_k prints per-fold progress itself).
    print(f'Przetwarzanie k={k} ({(i+1)/len(k_values)*100:.1f}% wszystkich)')
    results[k] = eval_k(k)

# === 7) Print the results ===
print("\nk-NN (po standaryzacji) – 5-fold CV:")
for k, (tr, te) in results.items():
    print(f"  k = {k:2d} → train_acc = {tr:.3f}, test_acc = {te:.3f}")


Przetwarzanie k=1 (25.0% wszystkich)
[k=1] Fold 5/5 (100.0%)
Przetwarzanie k=3 (50.0% wszystkich)
[k=3] Fold 1/5 (20.0%)

KeyboardInterrupt: 

In [31]:
#Metoda 2 ale ze standaryzacją
import numpy as np

# === Funkcja do ręcznej standaryzacji danych ===
def standardize(X_train, X_test):
    """Scale features to zero mean / unit variance using training statistics.

    The column means and standard deviations come from ``X_train`` and are
    applied to both arrays. Columns with zero standard deviation are divided
    by 1 to avoid division by zero.

    Returns:
        Tuple ``(X_train_std, X_test_std)``.
    """
    center = X_train.mean(axis=0)
    scale = X_train.std(axis=0)
    scale[scale == 0] = 1  # avoid dividing by zero for constant columns
    scaled_train = (X_train - center) / scale
    scaled_test = (X_test - center) / scale
    return scaled_train, scaled_test

# === Funkcja klasyfikatora Nearest Centroid z wybraną metryką ===
def make_centroid(X_tr, y_tr, metric):
    """Build a nearest-centroid classifier for the chosen distance metric.

    Args:
        X_tr: 2-D array of training features.
        y_tr: 1-D array of training labels aligned with ``X_tr``.
        metric: ``'euclidean'``, ``'manhattan'`` or ``'chebyshev'``; any other
            value (e.g. ``'cosine'``) selects the cosine distance.

    Returns:
        ``predict(Xm)``: assigns each row of ``Xm`` the class whose centroid
        (per-class feature mean) is closest; ties go to the smallest label.
    """
    classes = np.unique(y_tr)
    cents = {c: X_tr[y_tr == c].mean(axis=0) for c in classes}

    def dist(a, b):
        if metric == 'euclidean':
            return np.linalg.norm(a - b)
        elif metric == 'manhattan':
            return np.sum(np.abs(a - b))
        elif metric == 'chebyshev':
            return np.max(np.abs(a - b))
        else:  # cosine
            denom = np.linalg.norm(a) * np.linalg.norm(b)
            if denom == 0:
                # A zero vector has no direction. Without this guard 0/0
                # yields NaN, and np.argmin picks the NaN entry as the
                # "closest" centroid. Treat it as orthogonal (distance 1).
                return 1.0
            return 1 - (a @ b) / denom

    def predict(Xm):
        preds = np.empty(len(Xm), dtype=y_tr.dtype)
        for i, x in enumerate(Xm):
            dists = [dist(x, cents[c]) for c in classes]
            preds[i] = classes[np.argmin(dists)]
        return preds

    return predict

# === Funkcja ewaluacji metryki z 5-fold CV i standaryzacją ===
def eval_metric_std(metric):
    """Cross-validate the nearest-centroid classifier for one metric.

    Reads the module-level ``folds``, ``X`` and ``y``. Each fold is held out
    in turn; features are standardized with the training part's statistics
    before the classifier is built.

    Returns:
        Tuple ``(mean train accuracy, mean test accuracy)`` over the folds.
    """
    acc_train, acc_test = [], []
    for held_out, te_idx in enumerate(folds):
        tr_idx = np.hstack([part for pos, part in enumerate(folds) if pos != held_out])
        y_fit, y_val = y[tr_idx], y[te_idx]

        # Manual standardization, fitted on the training part only.
        X_fit, X_val = standardize(X[tr_idx], X[te_idx])

        model = make_centroid(X_fit, y_fit, metric)
        acc_train.append((model(X_fit) == y_fit).mean())
        acc_test.append((model(X_val) == y_val).mean())
    return np.mean(acc_train), np.mean(acc_test)

# === Evaluate four distance metrics ===
# `results` maps metric name -> (mean train accuracy, mean test accuracy).
metrics = ['euclidean', 'manhattan', 'chebyshev', 'cosine']
results = {m: eval_metric_std(m) for m in metrics}

# === Print the results ===
print("Nearest Centroid (ze standaryzacją) – 5-fold CV:")
for m, (tr, te) in results.items():
    print(f"  {m:<10} → train_acc = {tr:.3f}, test_acc = {te:.3f}")


Nearest Centroid (ze standaryzacją) – 5-fold CV:
  euclidean  → train_acc = 0.232, test_acc = 0.221
  manhattan  → train_acc = 0.217, test_acc = 0.203
  chebyshev  → train_acc = 0.138, test_acc = 0.135
  cosine     → train_acc = 0.234, test_acc = 0.221


In [25]:
# === Metoda 3: Gaussian Naive Bayes “od zera” z 5-fold CV i 4 wartościami var_smoothing ===

import numpy as np
import math

def make_gnb(X_tr, y_tr, var_smooth):
    """Build a Gaussian Naive Bayes classifier.

    Args:
        X_tr: 2-D array of training features.
        y_tr: 1-D array of training labels aligned with ``X_tr``.
        var_smooth: constant added to every per-class feature variance.

    Returns:
        ``predict(Xm)``: returns, for each row of ``Xm``, the class with the
        highest log prior plus Gaussian log-likelihood.
    """
    labels = np.unique(y_tr)
    stats = {}
    log_priors = {}
    for label in labels:
        rows = X_tr[y_tr == label]
        stats[label] = (rows.mean(axis=0), rows.var(axis=0) + var_smooth)
        log_priors[label] = math.log(np.mean(y_tr == label))

    def log_posterior(x, label):
        # Unnormalised log posterior: log prior + sum of per-feature Gaussian log densities.
        mean, var = stats[label]
        log_lik = -0.5 * np.sum(np.log(2 * math.pi * var) + ((x - mean) ** 2) / var)
        return log_priors[label] + log_lik

    def predict(Xm):
        preds = np.empty(len(Xm), dtype=y_tr.dtype)
        for row, x in enumerate(Xm):
            scores = {label: log_posterior(x, label) for label in stats}
            preds[row] = max(scores, key=scores.get)
        return preds

    return predict

def eval_gnb(var_smooth):
    """Cross-validate Gaussian Naive Bayes for one ``var_smooth`` value.

    Reads the module-level ``folds``, ``X`` and ``y``; features are used
    as-is (no scaling).

    Returns:
        Tuple ``(mean train accuracy, mean test accuracy)`` over the folds.
    """
    acc_train, acc_test = [], []
    for held_out, te_idx in enumerate(folds):
        tr_idx = np.hstack([part for pos, part in enumerate(folds) if pos != held_out])
        X_fit, y_fit = X[tr_idx], y[tr_idx]
        X_val, y_val = X[te_idx], y[te_idx]

        model = make_gnb(X_fit, y_fit, var_smooth)
        acc_train.append((model(X_fit) == y_fit).mean())
        acc_test.append((model(X_val) == y_val).mean())
    return np.mean(acc_train), np.mean(acc_test)

# Evaluate four smoothing constants; `results` maps
# var_smoothing -> (mean train accuracy, mean test accuracy).
var_smooth_values = [1e-9, 1e-8, 1e-7, 1e-6]
results = {vs: eval_gnb(vs) for vs in var_smooth_values}

# Print the results.
print("Gaussian Naive Bayes – 5-fold CV dla różnych var_smoothing:")
for vs, (tr, te) in results.items():
    print(f"  var_smoothing = {vs:.0e} → train_acc = {tr:.3f}, test_acc = {te:.3f}")

# WNIOSKI
# var_smoothing = 1e-9 → train_acc = 0.369, test_acc = 0.359  
#   Minimalne wygładzenie; model jest stabilny, ale nadal osiąga niski wynik (~36%).
# var_smoothing = 1e-8 → train_acc = 0.369, test_acc = 0.359  
#   Dziesięciokrotnie większe wygładzenie nie zmienia accuracy – model jest już niewrażliwy na tak małe zmiany.
# var_smoothing = 1e-7 → train_acc = 0.369, test_acc = 0.359  
#   Kolejne zwiększenie wygładzenia również nie wpływa na wyniki – zakres parametrów za mały, by odcisnąć efekt.
# var_smoothing = 1e-6 → train_acc = 0.369, test_acc = 0.359  
#   Nawet przy największym testowanym wygładzeniu wyniki pozostają identyczne – var_smoothing nie ma znaczenia.

# Model Gaussian Naive Bayes jest całkowicie niewrażliwy na dobór parametru var_smoothing w badanym zakresie, co 
# wskazuje, że wariancje cech dominują nad drobnymi poprawkami wygładzenia. Osiągnięte ~36 % accuracy przewyższa 
# proste metody odległościowe, ale nadal pozostaje poniżej użytecznego poziomu. Aby poprawić wyniki, warto wdrożyć 
# zaawansowaną inżynierię cech, modele ensemble lub bardziej złożone klasyfikatory nieliniowe.


Gaussian Naive Bayes – 5-fold CV dla różnych var_smoothing:
  var_smoothing = 1e-09 → train_acc = 0.369, test_acc = 0.359
  var_smoothing = 1e-08 → train_acc = 0.369, test_acc = 0.359
  var_smoothing = 1e-07 → train_acc = 0.369, test_acc = 0.359
  var_smoothing = 1e-06 → train_acc = 0.369, test_acc = 0.359


In [23]:
# === Metoda 3 (zaawansowana): Gaussian Naive Bayes z PCA – wpływ liczby komponentów ===

import numpy as np
import math

# 1) Funkcje PCA “od zera”
def compute_pca(X, n_components):
    """Fit PCA via eigen-decomposition of the covariance matrix.

    Args:
        X: 2-D array, one row per sample.
        n_components: number of leading components to keep.

    Returns:
        Tuple ``(mu, comps)``: the column means of ``X`` and a matrix whose
        columns are the eigenvectors with the largest eigenvalues, in
        descending eigenvalue order.
    """
    mu = X.mean(axis=0)
    eigvals, eigvecs = np.linalg.eigh(np.cov(X - mu, rowvar=False))
    # eigh returns eigenvalues in ascending order; reverse to get the largest first.
    order = np.argsort(eigvals)[::-1]
    keep = order[:n_components]
    return mu, eigvecs[:, keep]

def transform_pca(X, mu, comps):
    """Project ``X`` onto the PCA components after subtracting ``mu``."""
    centered = X - mu
    return centered.dot(comps)

# 2) Gaussian NB z opcjonalnym PCA
def make_gnb_pca(X_tr, y_tr, n_comp, var_smooth=1e-9):
    """Build a Gaussian Naive Bayes classifier operating in PCA space.

    Args:
        X_tr: 2-D array of training features.
        y_tr: 1-D array of training labels aligned with ``X_tr``.
        n_comp: number of principal components kept.
        var_smooth: constant added to every per-class variance.

    Returns:
        ``predict(Xm)``: projects ``Xm`` with the PCA fitted on the training
        data and returns the class with the highest log posterior per row.
    """
    # PCA is fitted on the training data only.
    mu, comps = compute_pca(X_tr, n_comp)
    projected = transform_pca(X_tr, mu, comps)

    # Per-class mean / variance in the projected space, plus class priors.
    classes = np.unique(y_tr)
    stats = {}
    priors = {}
    for label in classes:
        members = projected[y_tr == label]
        stats[label] = (members.mean(axis=0), members.var(axis=0) + var_smooth)
        priors[label] = np.mean(y_tr == label)

    def log_posterior(z, label):
        # log prior + Gaussian log-likelihood
        m, v = stats[label]
        return math.log(priors[label]) - 0.5 * np.sum(
            np.log(2*math.pi*v) + (z-m)**2 / v
        )

    def predict(Xm):
        Z = transform_pca(Xm, mu, comps)
        preds = np.empty(len(Z), dtype=y_tr.dtype)
        for row, z in enumerate(Z):
            scores = {label: log_posterior(z, label) for label in stats}
            preds[row] = max(scores, key=scores.get)
        return preds

    return predict

# 3) Prepare the (train indices, test indices) pairs for cross-validation:
#    for each fold i, the training indices are all other folds concatenated.
#    Relies on the module-level `folds` list defined in another cell.
folds_cv = [
    (
        np.hstack([folds[j] for j in range(len(folds)) if j != i]),
        folds[i]
    )
    for i in range(len(folds))
]

# 4) Evaluate several numbers of PCA components (the last entry keeps all features).
#    Relies on the module-level `X` and `y` defined in another cell.
components = [5, 10, 15, X.shape[1]]
results = {}
for n_comp in components:
    train_accs, test_accs = [], []
    for tr_idx, te_idx in folds_cv:
        X_tr, y_tr = X[tr_idx], y[tr_idx]
        X_te, y_te = X[te_idx], y[te_idx]
        clf = make_gnb_pca(X_tr, y_tr, n_comp)
        train_accs.append((clf(X_tr) == y_tr).mean())
        test_accs .append((clf(X_te) == y_te).mean())
    # Store the fold-averaged (train, test) accuracy for this component count.
    results[n_comp] = (np.mean(train_accs), np.mean(test_accs))

# 5) Print the results
print("GNB + PCA – 5-fold CV dla różnych n_components:")
for n_comp, (tr, te) in results.items():
    print(f"  n_components = {n_comp:2d} → train_acc = {tr:.3f}, test_acc = {te:.3f}")

# WNIOSKI
# Przy bardzo redukowanej przestrzeni (5 komponentów) model traci dużo informacji, co skutkuje niskim test_acc ≈ 28.6%.
# Zwiększenie liczby komponentów do 10 poprawia zarówno train_acc, jak i test_acc (do ≈ 31.3%).
# Optymalny kompromis uzyskujemy przy 15 komponentach: train_acc ≈ 36.1%, test_acc ≈ 35.2%; użycie wszystkich 17 cech nie przynosi dalszej poprawy.
# W porównaniu do poprzedniej metody bez PCA (var_smoothing), która osiągała test_acc ≈ 36%, stosowanie PCA z 15 komponentami daje niemal równorzędne wyniki,
# ale pozwala redukować wymiarowość i potencjalnie przyspiesza obliczenia w dalszych krokach analizy.


GNB + PCA – 5-fold CV dla różnych n_components:
  n_components =  5 → train_acc = 0.288, test_acc = 0.286
  n_components = 10 → train_acc = 0.323, test_acc = 0.313
  n_components = 15 → train_acc = 0.361, test_acc = 0.352
  n_components = 17 → train_acc = 0.362, test_acc = 0.350


In [26]:
# === Metoda 4: Softmax Regression (wieloklasowa regresja logistyczna) “od zera” z 5-fold CV i 4 wartościami regularyzacji L2 ===

import numpy as np

# 1) Budowa klasyfikatora Softmax Regression
def make_logreg(X_tr, y_tr, reg, lr=0.1, epochs=200):
    """Train multinomial (softmax) logistic regression by batch gradient descent.

    Args:
        X_tr: 2-D array of training features.
        y_tr: 1-D array of training labels aligned with ``X_tr``.
        reg: L2 penalty coefficient applied to the weights (not the biases).
        lr: learning rate.
        epochs: number of full-batch gradient steps.

    Returns:
        ``predict(Xm)``: returns the label with the highest linear score for
        each row of ``Xm``.
    """
    n_samples, n_features = X_tr.shape
    classes = np.unique(y_tr)
    n_classes = len(classes)

    # Weights and biases start at zero.
    W = np.zeros((n_features, n_classes))
    b = np.zeros(n_classes)

    # One-hot targets: map each label to its position in `classes`.
    position = {label: pos for pos, label in enumerate(classes)}
    targets = np.eye(n_classes)[np.array([position[label] for label in y_tr])]

    for _ in range(epochs):
        logits = X_tr.dot(W) + b                                  # (n, C)
        # Subtract the row maximum before exponentiating for numerical stability.
        unnorm = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        probs = unnorm / unnorm.sum(axis=1, keepdims=True)        # (n, C)
        grad_W = (X_tr.T.dot(probs - targets)) / n_samples + reg * W   # (d, C)
        grad_b = (probs - targets).mean(axis=0)                   # (C,)
        W -= lr * grad_W
        b -= lr * grad_b

    def predict(Xm):
        best = np.argmax(Xm.dot(W) + b, axis=1)
        return classes[best]

    return predict

# 2) Funkcja ewaluacji dla danego reg w 5-fold CV
def eval_reg(reg):
    """Cross-validate softmax regression for one L2 coefficient.

    Reads the module-level ``folds``, ``X`` and ``y``; features are used
    as-is (no scaling).

    Returns:
        Tuple ``(mean train accuracy, mean test accuracy)`` over the folds.
    """
    acc_train, acc_test = [], []
    for held_out, te_idx in enumerate(folds):
        tr_idx = np.hstack([part for pos, part in enumerate(folds) if pos != held_out])
        X_fit, y_fit = X[tr_idx], y[tr_idx]
        X_val, y_val = X[te_idx], y[te_idx]
        model = make_logreg(X_fit, y_fit, reg)
        acc_train.append((model(X_fit) == y_fit).mean())
        acc_test.append((model(X_val) == y_val).mean())
    return np.mean(acc_train), np.mean(acc_test)

# 3) Evaluate four L2 regularization strengths;
#    `results` maps reg -> (mean train accuracy, mean test accuracy).
regs = [0.0, 0.01, 0.1, 1.0]
results = {r: eval_reg(r) for r in regs}

# 4) Print the results
print("Softmax Regression – 5-fold CV dla różnych L2 regularization:")
for r, (tr, te) in results.items():
    print(f"  reg = {r:.2f} → train_acc = {tr:.3f}, test_acc = {te:.3f}")

# WNIOSKI
# reg = 0.00 → train_acc = 0.207, test_acc = 0.207  
#   Brak regularyzacji daje najlepsze wyniki, model nie wykazuje nadmiernego overfittingu.
# reg = 0.01 → train_acc = 0.200, test_acc = 0.200  
#   Lekka regularyzacja obniża accuracy o ~0.7 p.p., sugerując niewielki wpływ.
# reg = 0.10 → train_acc = 0.173, test_acc = 0.172  
#   Silniejsza regularyzacja prowadzi do underfittingu i znacznego spadku dokładności.
# reg = 1.00 → train_acc = 0.198, test_acc = 0.198  
#   Bardzo duże L2 częściowo przywraca dopasowanie, lecz nadal nie dorównuje modelowi bez regularyzacji.

# Softmax regression osiąga jedynie około 20 % accuracy niezależnie od poziomu L2, co świadczy o tym, że 
# liniowe granice decyzyjne są zbyt proste dla tych danych. Brak regularyzacji daje najlepsze rezultaty, co 
# oznacza, że model nie overfituje znacząco, ale też nie potrafi efektywnie uogólniać. Aby uzyskać sensowną 
# poprawę wyników, warto sięgnąć po bardziej złożone klasyfikatory lub zaawansowaną inżynierię cech.


Softmax Regression – 5-fold CV dla różnych L2 regularization:
  reg = 0.00 → train_acc = 0.207, test_acc = 0.207
  reg = 0.01 → train_acc = 0.200, test_acc = 0.200
  reg = 0.10 → train_acc = 0.173, test_acc = 0.172
  reg = 1.00 → train_acc = 0.198, test_acc = 0.198


In [None]:
import numpy as np
from collections import Counter

# === 1) Podstawowe drzewo decyzyjne do klasyfikacji ===
class DecisionTree:
    """Classification tree grown greedily on Gini impurity.

    Internal nodes are tuples ``(feature_index, threshold, left, right)``;
    leaves are plain class labels. At every node a random subset of
    ``n_features`` columns (drawn with NumPy's global RNG) is searched.
    """

    def __init__(self, max_depth=10, min_samples_split=2, n_feats=None):
        """
        Args:
            max_depth: depth at which growth stops.
            min_samples_split: nodes with fewer samples become leaves.
            n_feats: columns examined per split; ``None`` means all columns.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_feats = n_feats
        self.tree = None

    def fit(self, X, y):
        """Grow the tree on features ``X`` (2-D) and integer labels ``y`` (1-D)."""
        self.n_classes = len(set(y))
        n_cols = X.shape[1]
        # Cap the subset size at the column count: drawing more columns than
        # exist without replacement makes np.random.choice raise ValueError.
        self.n_features = n_cols if self.n_feats is None else min(self.n_feats, n_cols)
        self.tree = self._grow(X, y, depth=0)

    def predict(self, X):
        """Return the predicted label for every row of ``X``."""
        return np.array([self._predict(x, self.tree) for x in X])

    def _gini(self, y):
        """Gini impurity of a label array (np.bincount needs non-negative ints)."""
        counts = np.bincount(y)
        probs = counts / len(y)
        return 1 - np.sum(probs ** 2)

    def _best_split(self, X, y, feat_idxs):
        """Return ``(feature, threshold)`` with the lowest weighted Gini.

        Every distinct value of every candidate column is tried as a
        ``<=`` threshold. Returns ``(None, None)`` when no threshold separates
        the samples into two non-empty groups with weighted Gini below 1.
        """
        best_gini = 1
        split_idx, split_thresh = None, None
        for feat in feat_idxs:
            col = X[:, feat]  # sliced once per feature, not once per threshold
            for thresh in np.unique(col):
                left = y[col <= thresh]
                right = y[col > thresh]
                if len(left) == 0 or len(right) == 0:
                    continue
                g = (len(left) * self._gini(left) + len(right) * self._gini(right)) / len(y)
                if g < best_gini:
                    best_gini = g
                    split_idx = feat
                    split_thresh = thresh
        return split_idx, split_thresh

    def _grow(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``."""
        # Stop at max depth, on too few samples, or when the node is pure.
        if (depth >= self.max_depth or len(y) < self.min_samples_split or len(set(y)) == 1):
            return Counter(y).most_common(1)[0][0]
        feat_idxs = np.random.choice(X.shape[1], self.n_features, replace=False)
        idx, thresh = self._best_split(X, y, feat_idxs)
        if idx is None:
            # No usable split among the sampled features: make a majority leaf.
            return Counter(y).most_common(1)[0][0]
        left_idx = X[:, idx] <= thresh
        right_idx = X[:, idx] > thresh
        left = self._grow(X[left_idx], y[left_idx], depth + 1)
        right = self._grow(X[right_idx], y[right_idx], depth + 1)
        return (idx, thresh, left, right)

    def _predict(self, x, node):
        """Walk from ``node`` down to a leaf for the single sample ``x``."""
        if not isinstance(node, tuple):
            return node
        idx, thresh, left, right = node
        if x[idx] <= thresh:
            return self._predict(x, left)
        else:
            return self._predict(x, right)

# === 2) Random Forest od zera ===
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers with majority voting."""

    def __init__(self, n_trees=100, max_depth=10, min_samples_split=2):
        """
        Args:
            n_trees: number of trees to grow.
            max_depth: depth limit passed to every tree.
            min_samples_split: minimum node size passed to every tree.
        """
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        """Grow ``n_trees`` trees, each on a bootstrap resample of ``(X, y)``.

        Every tree examines ``int(sqrt(n_columns))`` random columns per split.
        Randomness comes from NumPy's global RNG.
        """
        self.trees = []
        feats_per_split = int(np.sqrt(X.shape[1]))
        n_rows = len(X)
        for _ in range(self.n_trees):
            # Bootstrap: draw n_rows row indices with replacement.
            sample = np.random.choice(n_rows, n_rows, replace=True)
            member = DecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                n_feats=feats_per_split
            )
            member.fit(X[sample], y[sample])
            self.trees.append(member)

    def predict(self, X):
        """Return the majority vote of all trees for every row of ``X``."""
        votes = np.array([member.predict(X) for member in self.trees])
        winners = []
        for col in range(X.shape[0]):
            ballot = Counter(votes[:, col])
            winners.append(ballot.most_common(1)[0][0])
        return np.array(winners)

# === 3) 5-fold cross-validation i test n_estimators ===
def stratified_folds(y, k=5, seed=0):
    """Split sample indices into ``k`` stratified folds.

    Args:
        y: 1-D array of class labels.
        k: number of folds.
        seed: value passed to ``np.random.seed``. Defaults to 0, the value
            that was previously hard-coded. Note that this reseeds NumPy's
            global generator as a side effect.

    Returns:
        A list of ``k`` index arrays. Each class is shuffled on its own and
        dealt round-robin, so every fold gets a near-equal share of each class.
    """
    np.random.seed(seed)
    folds = [[] for _ in range(k)]
    for cls in np.unique(y):
        idx = np.where(y == cls)[0]
        np.random.shuffle(idx)
        for i, ix in enumerate(idx):
            folds[i % k].append(ix)
    return [np.array(f) for f in folds]

def crossval_rf(X, y, n_trees):
    """Run 5-fold stratified cross-validation of the random forest.

    Args:
        X: 2-D feature array.
        y: 1-D label array aligned with ``X``.
        n_trees: number of trees in every forest (depth limit is 10).

    Returns:
        Tuple ``(mean train accuracy, mean test accuracy)`` over the 5 folds.
    """
    folds = stratified_folds(y)
    acc_train, acc_test = [], []
    for held_out in range(5):
        val_idx = folds[held_out]
        fit_idx = np.hstack([folds[other] for other in range(5) if other != held_out])
        X_fit, y_fit = X[fit_idx], y[fit_idx]
        X_val, y_val = X[val_idx], y[val_idx]

        forest = RandomForest(n_trees=n_trees, max_depth=10)
        forest.fit(X_fit, y_fit)

        fit_preds = forest.predict(X_fit)
        val_preds = forest.predict(X_val)
        acc_train.append((fit_preds == y_fit).mean())
        acc_test.append((val_preds == y_val).mean())
    return np.mean(acc_train), np.mean(acc_test)

# === 4) Wczytaj dane i uruchom testy ===
import pandas as pd

# Load the data set. The plain relative path matches the first cell of the
# notebook; the former 'C:train_data.csv' was a Windows drive-relative path
# that does not resolve on other systems.
df = pd.read_csv('train_data.csv', sep=';').dropna()
# Integer-encode every string (object) column; pd.factorize numbers the
# categories in order of first appearance.
for col in df.select_dtypes(include='object').columns:
    df[col], _ = pd.factorize(df[col])
# DataFrame.sample falls back to NumPy's global RNG when random_state is not
# given, so this seed fixes the subsample. The size is capped at len(df)
# because sample() raises when asked for more rows than exist.
np.random.seed(0)
df = df.sample(min(10000, len(df))).reset_index(drop=True)
X = df.drop(columns='Stay').values
y = df['Stay'].values

# Evaluate several forest sizes; `results` maps
# n_trees -> (mean train accuracy, mean test accuracy) from crossval_rf.
n_trees_list = [10, 50, 100, 200]
results = {}
for n in n_trees_list:
    print(f'Testowanie {n} drzew...')
    train_acc, test_acc = crossval_rf(X, y, n)
    results[n] = (train_acc, test_acc)

# Print the results.
print("\nRandom Forest – 5-fold CV (implementacja ręczna):")
for n, (tr, te) in results.items():
    print(f"  n_estimators = {n:3d} → train_acc = {tr:.3f}, test_acc = {te:.3f}")


Testowanie 10 drzew...


In [37]:
import numpy as np

# Funkcje aktywacji
def relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``."""
    rectified = np.maximum(0, x)
    return rectified

def relu_deriv(x):
    """Derivative of ReLU: 1.0 where ``x > 0``, otherwise 0.0."""
    is_positive = x > 0
    return is_positive.astype(float)

def softmax(x):
    """Row-wise softmax of a 2-D array.

    The row maximum is subtracted before exponentiating so large scores do
    not overflow.
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=1, keepdims=True)

# Funkcja straty
def cross_entropy(P, y):
    """Mean negative log-probability assigned to the true classes.

    Args:
        P: 2-D array of predicted probabilities, one row per sample.
        y: 1-D array of integer class indices, one per row of ``P``.

    Returns:
        The average of ``-log(P[i, y[i]] + 1e-15)``; the small constant
        keeps the logarithm finite when a probability is exactly zero.
    """
    rows = np.arange(len(y))
    true_class_prob = P[rows, y]
    return np.mean(-np.log(true_class_prob + 1e-15))

# Przygotowanie danych (X, y i folds muszą być zdefiniowane wcześniej)

# Trening i predykcja
class TwoLayerNN:
    """Fully connected classifier with two hidden ReLU layers and softmax output.

    Trained with full-batch gradient descent on the cross-entropy loss.
    Weights are drawn from NumPy's global RNG, scaled by 0.01; biases start at zero.
    """

    def __init__(self, input_dim, hidden1, hidden2, output_dim, lr=0.01, epochs=200):
        """
        Args:
            input_dim: number of input features.
            hidden1: width of the first hidden layer.
            hidden2: width of the second hidden layer.
            output_dim: number of classes (labels must lie in 0..output_dim-1).
            lr: learning rate.
            epochs: number of full-batch gradient steps.
        """
        self.lr = lr
        self.epochs = epochs
        self.W1 = np.random.randn(input_dim, hidden1) * 0.01
        self.b1 = np.zeros((1, hidden1))
        self.W2 = np.random.randn(hidden1, hidden2) * 0.01
        self.b2 = np.zeros((1, hidden2))
        self.W3 = np.random.randn(hidden2, output_dim) * 0.01
        self.b3 = np.zeros((1, output_dim))

    def train(self, X, y):
        """Fit the network on features ``X`` (2-D) and integer labels ``y`` (1-D)."""
        # Size the one-hot targets by the output layer, not by max(y) + 1:
        # a training subset that lacks the highest label would otherwise give
        # targets narrower than the softmax output and crash on `P - y_onehot`.
        n_out = self.W3.shape[1]
        y_onehot = np.eye(n_out)[y]
        for _ in range(self.epochs):
            # Forward
            z1 = X @ self.W1 + self.b1
            a1 = relu(z1)
            z2 = a1 @ self.W2 + self.b2
            a2 = relu(z2)
            z3 = a2 @ self.W3 + self.b3
            P = softmax(z3)
            # Backward: gradient of the mean cross-entropy w.r.t. the logits
            dz3 = (P - y_onehot) / len(X)
            dW3 = a2.T @ dz3
            db3 = dz3.sum(axis=0, keepdims=True)

            dz2 = (dz3 @ self.W3.T) * relu_deriv(z2)
            dW2 = a1.T @ dz2
            db2 = dz2.sum(axis=0, keepdims=True)

            dz1 = (dz2 @ self.W2.T) * relu_deriv(z1)
            dW1 = X.T @ dz1
            db1 = dz1.sum(axis=0, keepdims=True)

            # Update
            self.W3 -= self.lr * dW3
            self.b3 -= self.lr * db3
            self.W2 -= self.lr * dW2
            self.b2 -= self.lr * db2
            self.W1 -= self.lr * dW1
            self.b1 -= self.lr * db1

    def predict(self, X):
        """Return the index of the highest-scoring class for every row of ``X``."""
        a1 = relu(X @ self.W1 + self.b1)
        a2 = relu(a1 @ self.W2 + self.b2)
        scores = a2 @ self.W3 + self.b3
        return np.argmax(scores, axis=1)

# Evaluation: cross-validate the network for several hidden-layer sizes.
# Relies on the module-level `X`, `y` and `folds` defined in other cells.
# `results` maps (hidden1, hidden2) -> (mean train accuracy, mean test accuracy).
# NOTE(review): the recorded output shows the same 0.280 for every size, and
# the features are passed in without scaling. That would be consistent with the
# network predicting a single class; confirm before interpreting these numbers.
hidden_sizes = [(64, 32), (128, 64), (256, 128), (512, 256)]
results = {}

for h1, h2 in hidden_sizes:
    train_acc, test_acc = [], []
    for i in range(len(folds)):
        # Fold i is held out; all remaining folds form the training set.
        tr = np.hstack([folds[j] for j in range(len(folds)) if j != i])
        te = folds[i]
        X_tr, y_tr = X[tr], y[tr]
        X_te, y_te = X[te], y[te]

        # The output size is taken from the full label array (max label + 1).
        model = TwoLayerNN(X.shape[1], h1, h2, np.max(y)+1)
        model.train(X_tr, y_tr)
        train_acc.append((model.predict(X_tr) == y_tr).mean())
        test_acc.append((model.predict(X_te) == y_te).mean())
    results[(h1, h2)] = (np.mean(train_acc), np.mean(test_acc))

# Results
for (h1, h2), (tr_acc, te_acc) in results.items():
    print(f"Hidden sizes ({h1}, {h2}): train_acc = {tr_acc:.3f}, test_acc = {te_acc:.3f}")


Hidden sizes (64, 32): train_acc = 0.280, test_acc = 0.280
Hidden sizes (128, 64): train_acc = 0.280, test_acc = 0.280
Hidden sizes (256, 128): train_acc = 0.280, test_acc = 0.280
Hidden sizes (512, 256): train_acc = 0.280, test_acc = 0.280
