In [3]:
import os

import numpy as np
import pandas as pd
import torch
import wandb
from netrd.distance import IpsenMikhailov
from pytorch_lightning import loggers
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tqdm import trange

import utils

In [14]:
# Load the PROTEINS graphs, their labels and the pairwise Ipsen-Mikhailov
# distance matrix. The result is cached in ./data/proteins.npz so the
# N * (N - 1) / 2 pairwise distance evaluations are not repeated on every run.
os.makedirs("./data", exist_ok=True)  # portable, idempotent replacement for `!mkdir data`

filename = "./data/proteins.npz"
if os.path.isfile(filename):
    # Read each array by key so the result does not depend on archive order,
    # and close the archive once the arrays are materialised.
    # NOTE(review): if utils.read_graph_dataset returns graph objects, the
    # saved `proteins` array is object-dtype and np.load would need
    # allow_pickle=True -- confirm before relying on the cache.
    with np.load(filename) as cache:
        proteins = cache["proteins"]
        labels = cache["labels"]
        dist_mat = cache["dist_mat"]
else:
    proteins, labels = utils.read_graph_dataset("PROTEINS")

    ipsen_mikhailov_distance = IpsenMikhailov()
    n_graphs = len(proteins)
    dist_mat = np.zeros((n_graphs, n_graphs), dtype=float)
    for i in trange(n_graphs):
        # Only the strict lower triangle is computed and then mirrored;
        # the diagonal keeps its initial value of zero.
        for j in range(i):
            dist_mat[i, j] = dist_mat[j, i] = ipsen_mikhailov_distance(proteins[i], proteins[j])

    # Save under the key "proteins"; the original passed the undefined name
    # `digits`, which raised NameError after the whole matrix was computed.
    np.savez(filename, proteins=proteins, labels=labels, dist_mat=dist_mat)

# Defined on both branches: later cells use N even when the cache was hit.
N = len(proteins)

FileNotFoundError: [Errno 2] No such file or directory: 'a'

In [ ]:
# 5-fold cross-validation: embed the pairwise graph distances with BMDS and
# classify with an MLP, then compare against a PCA + MLP baseline. Per-fold
# accuracies are collected in `metrics`, one list entry per fold.
metrics = {
    "bmds_train_acc": [],
    "bmds_eval_acc": [],
    "bmds_eval_acc_bayes": [],
    "pca_train_acc": [],
    "pca_eval_acc": [],
}

# Derive the sample count from the data itself: the loading cell only assigns
# N when the distance matrix is recomputed, not when it is read from the cache.
n_samples = len(labels)

k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for idx_train, idx_eval in k_fold.split(np.arange(n_samples)):
    proteins_train, proteins_eval = proteins[idx_train], proteins[idx_eval]
    labels_train, labels_eval = labels[idx_train], labels[idx_eval]
    # Train block is (train x train); eval block is (train x eval), i.e. the
    # distances from every training sample to every held-out sample.
    dist_mat_train = dist_mat[idx_train[:, None], idx_train]
    dist_mat_eval = dist_mat[idx_train[:, None], idx_eval]

    bmds = utils.SklearnBMDS(bmds_train_kwargs={"max_dim": 100, "threshold": 0.2})

    run = wandb.init(project="proteins bmds train")
    logger = loggers.WandbLogger()
    # NOTE(review): the original called `fit_tranform`; assumed to be a typo
    # for `fit_transform` -- confirm against utils.SklearnBMDS.
    x_train, std_train = bmds.fit_transform(dist_mat_train, max_epochs=3, logger=logger)
    run.finish()

    run = wandb.init(project="proteins bmds eval")
    logger = loggers.WandbLogger()
    x_eval, std_eval = bmds.transform(dist_mat_eval, logger=logger)
    run.finish()

    # labels_train.max() + 1 is passed as the classifier size; this presumes
    # labels are integers starting at 0 -- TODO confirm.
    clf = utils.SklearnMLPClassifier(labels_train.max() + 1)
    run = wandb.init(project="proteins bmds clf")
    logger = loggers.WandbLogger()
    pred_train = clf.fit_transform(x_train, labels_train, std_train, logger=logger)
    pred_eval = clf.predict(x_eval)  # was `prdedict` (AttributeError)
    pred_eval_bayes = clf.predict(x_eval, std_eval)

    # Compute each accuracy once and reuse it for logging and bookkeeping.
    bmds_train_acc = accuracy_score(labels_train, pred_train)
    bmds_eval_acc = accuracy_score(labels_eval, pred_eval)
    bmds_eval_acc_bayes = accuracy_score(labels_eval, pred_eval_bayes)
    logger.log_hyperparams({
        "train_acc": bmds_train_acc,
        "eval_acc": bmds_eval_acc,
        "eval_acc_bayes": bmds_eval_acc_bayes,
    })
    run.finish()

    metrics["bmds_train_acc"].append(bmds_train_acc)
    metrics["bmds_eval_acc"].append(bmds_eval_acc)
    metrics["bmds_eval_acc_bayes"].append(bmds_eval_acc_bayes)

    # PCA baseline with the same dimensionality that BMDS ended up with.
    # NOTE(review): the original referenced the undefined name `digits`
    # (left over from the MNIST notebook). PCA needs a 2-D numeric feature
    # matrix; verify that `proteins` provides one, or substitute a suitable
    # graph feature representation here.
    pca = PCA(n_components=bmds.dim)
    pca_train = pca.fit_transform(proteins_train)
    pca_eval = pca.transform(proteins_eval)

    clf = utils.SklearnMLPClassifier(labels_train.max() + 1)
    run = wandb.init(project="proteins pca clf")
    logger = loggers.WandbLogger()
    pred_train = clf.fit_transform(pca_train, labels_train, logger=logger)
    pred_eval = clf.predict(pca_eval)  # was `prdedict` (AttributeError)

    pca_train_acc = accuracy_score(labels_train, pred_train)
    pca_eval_acc = accuracy_score(labels_eval, pred_eval)
    logger.log_hyperparams({
        "train_acc": pca_train_acc,
        "eval_acc": pca_eval_acc,
    })
    run.finish()

    metrics["pca_train_acc"].append(pca_train_acc)
    metrics["pca_eval_acc"].append(pca_eval_acc)

In [ ]:
# One row per cross-validation fold, one column per recorded accuracy.
metrics_df = pd.DataFrame.from_dict(metrics)
metrics_df

In [ ]:
# Aggregate the per-fold accuracies: column-wise mean and sample standard
# deviation, stacked into a two-row table indexed by 'Mean' and 'Std'.
mean_values, std_values = metrics_df.mean(), metrics_df.std()

summary_df = pd.concat([mean_values, std_values], axis=1, keys=['Mean', 'Std']).T
summary_df