# Calculation with splits from paper

## Data loading

In [1]:
import sys
sys.path.insert(0, "../..")
import os
import numpy as np
import scipy.io

In [2]:
# Load the persistence diagrams stored in the MATLAB file
data_path = "./pd.mat"
data_mat = scipy.io.loadmat(data_path)
data = data_mat["pds"]

print(f"Data shape: {data.shape}")

# After transposing, every row holds all diagrams of one class, and the row
# index is used as the label of that class.
diagrams_by_class = data.T

# y - one integer label per diagram (the class index, repeated once for each
# diagram belonging to that class)
y = np.array([
    label
    for label, diagram_list in enumerate(diagrams_by_class)
    for _ in range(len(diagram_list))
])

# X - all diagrams flattened to 1-d, class after class, in the same order as y
X = diagrams_by_class.ravel()

Data shape: (50, 7)


In [3]:
# Load the predefined splits. Each row holds indexes of points in X;
# MATLAB indexes from 1, so we subtract 1 to get 0-based indexes.
synth_splits = scipy.io.loadmat('./presplited/synthetic_splits.mat')
test_set = synth_splits['testSets'] - 1
train_set = synth_splits['trainSets'] - 1

# One (train, test) tuple per split - the "iterable of splits" form accepted by
# scikit-learn's `cv` argument. A plain list of tuples is used instead of stacking
# everything into one ndarray, because stacking only works when the train and
# test index arrays happen to have the same length.
tt_pairs = list(zip(train_set, test_set))


## First step: Model hyperparameter optimization

In [4]:
import joblib
import pandas as pd
import sklearn

from preprocessing import *
from persistent_bow import *
from visualization import *
from fisher_vector_transformer import *
from experiments.utils import *

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC, LinearSVC
from sklearn.mixture import GaussianMixture

from gudhi.representations.kernel_methods import SlicedWassersteinKernel
from gudhi.representations.vector_methods import PersistenceImage
from gudhi.representations.metrics import BottleneckDistance


In [5]:
# Helper function: constructs the final pipeline and returns a grid search for it
def make_final_grid(estimator,
                    param_grid,
                    kernel="linear",
                    cv=tt_pairs,
                    **kwargs):
    """Build a ``Model -> Predictor`` pipeline and wrap it in a ``GridSearchCV``.

    Parameters
    ----------
    estimator :
        Object used as the "Model" step of the pipeline.
    param_grid : dict
        Candidate values for parameters of ``estimator``. Every key is
        prefixed with ``"Model__"`` so it addresses the "Model" step.
    kernel : str, default "linear"
        Kernel passed to the ``SVC`` used as the "Predictor" step.
    cv :
        Passed to ``GridSearchCV`` as ``cv``; defaults to the module-level
        ``tt_pairs`` splits.
    **kwargs :
        Forwarded unchanged to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        Unfitted search over ``param_grid`` plus ``Predictor__C`` in
        ``[0.1, 1, 10]``.
    """
    # Build a new dict so the caller's param_grid is not modified
    new_param_grid = {f"Model__{name}": values for name, values in param_grid.items()}
    new_param_grid["Predictor__C"] = [0.1, 1, 10]

    final_pipeline = Pipeline([
        ("Model", estimator),
        # max_iter has to be an integer: recent scikit-learn releases validate
        # the parameter and reject a float such as 1e6.
        ("Predictor", SVC(kernel=kernel, max_iter=1_000_000)),
    ])

    return GridSearchCV(final_pipeline, new_param_grid, cv=cv, **kwargs)

In [6]:
# Candidate values that are shared by several of the searches below
_WEIGHT_FUNCTIONS = [const, linear]
_KMEANS_CLUSTER_COUNTS = np.arange(10, 150, 15)
_GMM_COMPONENT_COUNTS = np.arange(10, 200, 15)
_GRID_SAMPLER_MAX_POINTS = [10, 50, 100, 200, 500]
_GRID_SAMPLER_SHAPES = [(5, 5), (10, 10), (15, 15), (20, 20), (40, 40)]

# PBoW grid search (KMeans + RandomPDSampler)
pbow_gridsearch = make_final_grid(
    estimator=PersistenceBow(
        KMeans(7, n_init=1, max_iter=100, random_state=42),
        sampler=RandomPDSampler(2500, random_state=42),
    ),
    param_grid={
        "cluster__n_clusters": _KMEANS_CLUSTER_COUNTS,
        "sampler__max_points": np.arange(1000, 10000, 2000),
        "sampler__weight_function": _WEIGHT_FUNCTIONS,
    },
    n_jobs=-1,
)

# SPBoW grid search (GaussianMixture + RandomPDSampler)
spbow_gridsearch = make_final_grid(
    estimator=StablePersistenceBow(
        GaussianMixture(covariance_type="diag", random_state=42),
        sampler=RandomPDSampler(2500, random_state=42),
    ),
    param_grid={
        "mixture__n_components": _GMM_COMPONENT_COUNTS,
        "sampler__max_points": np.arange(1000, 13000, 2000),
        "sampler__weight_function": _WEIGHT_FUNCTIONS,
    },
    n_jobs=-1,
)

# PBoW grid search with the grid sampler (KMeans + GridPDSampler)
pbow_grid_gridsearch = make_final_grid(
    estimator=PersistenceBow(
        KMeans(7, n_init=1, max_iter=100, random_state=42),
        sampler=GridPDSampler((10, 10), 100, random_state=42),
    ),
    param_grid={
        "cluster__n_clusters": _KMEANS_CLUSTER_COUNTS,
        "sampler__max_points": _GRID_SAMPLER_MAX_POINTS,
        "sampler__grid_shape": _GRID_SAMPLER_SHAPES,
        "sampler__weight_function": _WEIGHT_FUNCTIONS,
    },
    n_jobs=-1,
)

# SPBoW grid search with the grid sampler (GaussianMixture + GridPDSampler)
spbow_grid_gridsearch = make_final_grid(
    estimator=StablePersistenceBow(
        GaussianMixture(covariance_type="diag", random_state=42),
        sampler=GridPDSampler((10, 10), 100, random_state=42),
    ),
    param_grid={
        "mixture__n_components": _GMM_COMPONENT_COUNTS,
        "sampler__max_points": _GRID_SAMPLER_MAX_POINTS,
        "sampler__grid_shape": _GRID_SAMPLER_SHAPES,
        "sampler__weight_function": _WEIGHT_FUNCTIONS,
    },
    n_jobs=-1,
)

# Name -> search object. The names are also used as cache file names, so they
# must stay exactly as they are.
models_to_test = {
    "PBoW": pbow_gridsearch,
    "PBoW_Grid": pbow_grid_gridsearch,
    "SPboW": spbow_gridsearch,
    "SPBoW_Grid": spbow_grid_gridsearch,
}

In [7]:
# Hyperparameter optimization: reuse a previously saved search when `load`
# returns one, otherwise fit the search and save it.
for name, grid in models_to_test.items():
    print(name)
    grid_path = f"grid/{name}.dill"

    cached_grid = load(grid_path)
    if cached_grid:
        print("Loaded from file")
        # Replacing the value of an existing key is safe while iterating
        models_to_test[name] = cached_grid
        continue

    # Nothing cached: run the search with progress output and store it
    grid.verbose = 10
    grid.fit(X, y)
    save(grid, grid_path)

PBoW
Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 120 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   48.2s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   52.7s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   54.4s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:   56.0s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:   57.5s
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:   58.9s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 181 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 241 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 305 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 373 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 408 tasks      | elapsed: 

PBoW_Grid
Fitting 5 folds for each of 1500 candidates, totalling 7500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 120 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 181 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 241 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 305 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 373 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 408 tasks      | elapsed: 

SPboW
Fitting 5 folds for each of 468 candidates, totalling 2340 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 120 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 181 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 241 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 305 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 373 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 408 tasks      | elapsed: 

SPBoW_Grid
Fitting 5 folds for each of 1950 candidates, totalling 9750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 120 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 181 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 241 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 305 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 373 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 408 tasks      | elapsed: 

In [8]:
# Cross-validate the best estimator of every saved grid search, caching the results
grid_path = "./grid/"
cv_path = "./cv/"

# NOTE(review): if the grids were fitted with the default cv=tt_pairs, the
# hyperparameters were selected on the same test splits that are scored here,
# so these scores are optimistic - confirm this is intended.

# os.listdir returns entries in arbitrary order; sort for a reproducible run order
for filename in sorted(os.listdir(grid_path)):
    name = os.path.splitext(filename)[0]
    # Single cache location, used for both the lookup and the write below,
    # so a saved result is always the one that gets found on the next run.
    results_path = os.path.join(cv_path, f"{name}.dill")
    results = load(results_path)

    if not results:
        print("Computing", name)
        # The fitted search is only needed when there are no cached results
        grid = load(os.path.join(grid_path, filename))
        model = grid.best_estimator_
        results = cross_validate(model, X, y, cv=tt_pairs, n_jobs=-1, verbose=10)
        save(results, results_path)
    else:
        print(f"Loaded results: {name}")


Computing PBoW_Grid


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 120 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   11.9s remaining:    8.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 120 concurrent workers.


Computing SPboW


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 120 concurrent workers.


Computing SPBoW_Grid


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.7s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.7s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 120 concurrent workers.


Computing PBoW


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.9s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.9s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished


In [9]:
# Print the mean test score of every cross-validation result stored in ./cv
base_path = "./cv"
for filename in os.listdir(base_path):
    name, _ext = os.path.splitext(filename)
    result = load(os.path.join(base_path, filename))
    mean_test_score = result["test_score"].mean()
    print(name, "Test set score:\n", mean_test_score, "\n")


SPBoW_Grid Test set score:
 0.9800000000000001 

PBoW Test set score:
 0.9714285714285715 

SPboW Test set score:
 0.9771428571428572 

PBoW_Grid Test set score:
 0.9742857142857144 

