In [1]:
import os
from tqdm import trange

from sklearn.datasets import fetch_openml
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import torch
import wandb
from pytorch_lightning import loggers

import utils

In [2]:
# Load the cached MNIST subsample if it exists; otherwise download MNIST, draw a
# random subset of N images, scale the pixels by their maximum value, compute the
# pairwise Euclidean distance matrix and cache all three arrays in one .npz file.
os.makedirs("./data", exist_ok=True)  # portable replacement for the `!mkdir` shell escape

filename = "./data/mnist.npz"
if os.path.isfile(filename):
    # Read the arrays by key (not by archive order) and close the file afterwards.
    with np.load(filename) as cached:
        digits, labels, dist_mat = cached["digits"], cached["labels"], cached["dist_mat"]
    N = len(digits)
else:
    N = 10000
    mnist = fetch_openml("mnist_784", version=1)

    # Fixed seed so that rebuilding the cache yields the same subsample.
    idx = np.random.default_rng(42).choice(len(mnist["data"]), size=N, replace=False)
    digits, labels = mnist["data"].values[idx], mnist["target"].astype(int).values[idx]

    # Out-of-place float division: an in-place `/=` raises when the pixel array
    # has an integer dtype (which depends on the fetch_openml parser in use).
    digits = digits.astype(float) / digits.max()

    dist_mat = np.zeros((N, N), dtype=float)
    for i in trange(N):
        # Distances from sample i to every earlier sample in one vectorised step,
        # written to both triangles so the matrix stays symmetric.
        dist_mat[i, :i] = dist_mat[:i, i] = np.linalg.norm(digits[:i] - digits[i], axis=1)

    np.savez(filename, digits=digits, labels=labels, dist_mat=dist_mat)

In [3]:
# 5-fold comparison of two embeddings fed to the same classifier:
#   * BMDS embedding learned from the pairwise distance matrix,
#   * PCA of the raw pixels with the same number of components (bmds.dim).
# Each list below receives one accuracy value per fold.
metrics = {
    "bmds_train_acc": [],
    "bmds_eval_acc": [],
    "bmds_eval_acc_bayes": [],
    "pca_train_acc": [],
    "pca_eval_acc": [],
}

k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
# KFold only needs the number of samples, so a plain index array is enough.
for idx_train, idx_eval in k_fold.split(np.arange(N)):
    digits_train, digits_eval = digits[idx_train], digits[idx_eval]
    labels_train, labels_eval = labels[idx_train], labels[idx_eval]
    # Both blocks are indexed by training samples along the rows:
    # train-vs-train distances and train-vs-eval distances.
    dist_mat_train = dist_mat[np.ix_(idx_train, idx_train)]
    dist_mat_eval = dist_mat[np.ix_(idx_train, idx_eval)]

    # ---- BMDS embedding -------------------------------------------------
    bmds = utils.SklearnBMDS(bmds_train_kwargs={"max_dim": 100, "threshold": 0.1})

    # Each W&B run is used as a context manager so it is finished even when
    # training raises or the cell is interrupted.
    with wandb.init(project="mnist bmds train"):
        logger = loggers.WandbLogger()
        x_train, std_train = bmds.fit_transform(dist_mat_train, max_epochs=2, logger=logger)

    with wandb.init(project="mnist bmds eval"):
        logger = loggers.WandbLogger()
        x_eval, std_eval = bmds.transform(dist_mat_eval, logger=logger)

    # ---- Classifier on the BMDS embedding -------------------------------
    clf = utils.SklearnMLPClassifier(labels_train.max() + 1)
    with wandb.init(project="mnist bmds clf"):
        logger = loggers.WandbLogger()
        pred_train = clf.fit_predict(x_train, labels_train, std_train, logger=logger)
        pred_eval = clf.predict(x_eval)
        pred_eval_bayes = clf.predict(x_eval, std_eval)

        # Compute every score once, using accuracy_score's (y_true, y_pred) order.
        bmds_train_acc = accuracy_score(labels_train, pred_train)
        bmds_eval_acc = accuracy_score(labels_eval, pred_eval)
        bmds_eval_acc_bayes = accuracy_score(labels_eval, pred_eval_bayes)

        # NOTE(review): results are recorded through log_hyperparams rather than
        # log_metrics; kept as-is - confirm that this is intended.
        logger.log_hyperparams({
            "train_acc": bmds_train_acc,
            "eval_acc": bmds_eval_acc,
            "eval_acc_bayes": bmds_eval_acc_bayes,
        })

    metrics["bmds_train_acc"].append(bmds_train_acc)
    metrics["bmds_eval_acc"].append(bmds_eval_acc)
    metrics["bmds_eval_acc_bayes"].append(bmds_eval_acc_bayes)

    # ---- PCA baseline with the same dimensionality ----------------------
    pca = PCA(n_components=bmds.dim)
    pca_train = pca.fit_transform(digits_train)
    pca_eval = pca.transform(digits_eval)

    clf = utils.SklearnMLPClassifier(labels_train.max() + 1)
    with wandb.init(project="mnist pca clf"):
        logger = loggers.WandbLogger()
        pred_train = clf.fit_predict(pca_train, labels_train, logger=logger)
        pred_eval = clf.predict(pca_eval)

        pca_train_acc = accuracy_score(labels_train, pred_train)
        pca_eval_acc = accuracy_score(labels_eval, pred_eval)

        logger.log_hyperparams({
            "train_acc": pca_train_acc,
            "eval_acc": pca_eval_acc,
        })

    metrics["pca_train_acc"].append(pca_train_acc)
    metrics["pca_eval_acc"].append(pca_eval_acc)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mantonii-belyshev[0m ([33mai-prentice[0m). Use [1m`wandb login --relogin`[0m to force relogin


  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type | Params
------------------------------
------------------------------
0         Trainable params
0         Non-trainable params
0         Total params
0.000     Total estimated model params size (MB)



Learning the optimal train embedding...


  rank_zero_warn(


Training: 0it [00:00, ?it/s]



VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
dim,██▇▇▆▆▆▅▅▄▄▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
frac,▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▄▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇▇███▇██
loss,█▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
reg,▆█▇▄▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
total_loss,█▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
dim,13.0
epoch,1.0
frac,0.20674
loss,0.01923
reg,0.00642
total_loss,0.02565
trainer/global_step,3999.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112896534096863, max=1.0…

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type | Params
------------------------------
------------------------------
0         Trainable params
0         Non-trainable params
0         Total params
0.000     Total estimated model params size (MB)



Learning the optimal train embedding...


  rank_zero_warn(


Training: 0it [00:00, ?it/s]



VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
dim,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
reg,▁███████████████████████████████████████
total_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
dim,13.0
epoch,499.0
loss,35.6328
reg,20.05953
total_loss,55.69233
trainer/global_step,49999.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112890635720558, max=1.0…

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type             | Params
---------------------------------------------
0 | layers  | ModuleList       | 666   
1 | loss_fn | CrossEntropyLoss | 0     
---------------------------------------------
666       Trainable params
0         Non-trainable params
666       Total params
0.003     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

In [None]:
# Tabulate the collected accuracies: one column per metric, one row per fold.
metrics_df = pd.DataFrame(data=metrics)
metrics_df

In [None]:
# Per-metric mean and sample standard deviation across the folds.
mean_values, std_values = metrics_df.mean(), metrics_df.std()

# Build the table column-wise, then transpose so the rows are 'Mean' / 'Std'
# and the columns are the metric names.
summary_df = pd.DataFrame({'Mean': mean_values, 'Std': std_values}).T
summary_df