In [1]:
import numpy as np
import pandas as pd

import src
import src.data.datasets as datasets
import src.quality_measures as qm
from src.paths import processed_data_path
from src.models.train_model import available_algorithms

import json

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the project's `src` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
## Specify the (dataset, algorithm, quality measure) triples to use for training

In [4]:
def save_json(filename, obj):
    """Write ``obj`` to ``filename`` as pretty-printed JSON.

    The output uses a 2-space indent and sorted keys, so repeated dumps of
    the same object produce identical files.

    Parameters
    ----------
    filename : str or path-like
        Destination file. It is created, or overwritten if it already exists.
    obj : object
        Any JSON-serializable Python object.

    Raises
    ------
    TypeError
        If ``obj`` contains a value that ``json`` cannot serialize.
    ValueError
        If ``obj`` contains a circular reference.
    """
    # Serialize *before* opening the file: opening with 'w' truncates it, so a
    # serialization failure would otherwise destroy an existing file and leave
    # partial JSON behind.
    payload = json.dumps(obj, indent=2, sort_keys=True)
    with open(filename, 'w') as fw:
        fw.write(payload)

In [5]:
# Display the dataset names offered by src.data.datasets; the names used in the
# experiment lists are chosen from this output.
datasets.available_datasets()

['ball',
 'broken-swiss-roll',
 'coil-100',
 'coil-20',
 'difficult',
 'f-mnist',
 'frey-faces',
 'gaussian-blobs',
 'helix',
 'hiva',
 'lvq-pak',
 'mnist',
 'orl-faces',
 's-curve',
 'shuttle-statlog',
 'sphere',
 'swiss-roll',
 'twinpeaks',
 'unit-cube']

In [7]:
# Display the algorithm names known to src.models.train_model.
available_algorithms().keys()

dict_keys(['autoencoder', 'HLLE', 'Isomap', 'KernelPCA', 'LaplacianEigenmaps', 'LLE', 'LTSA', 'MDS', 'PCA', 'TSNE', 'UMAP'])

In [92]:
# Display the quality-measure (score) names known to src.quality_measures.
qm.available_quality_measures().keys()

dict_keys(['1nn-error', 'continuity', 'stress', 'strain', 'trustworthiness'])

In [93]:
def _int_grid(start, stop, step=1):
    """Return a fresh list of ints from ``start`` to ``stop`` inclusive."""
    return list(range(start, stop + 1, step))


def _grid_search_meta():
    """Return a fresh ('grid_search', params) meta entry for one algorithm."""
    return ('grid_search', {'verbose': 10, 'return_train_score': True})


# Synthetic datasets; the real-world ones are appended below to form the full list.
mph_synthetic_datasets = [
    'swiss-roll',
    'helix',
    'twinpeaks',
    'broken-swiss-roll',
    'difficult',
]

# Derived from the synthetic list so the two can never drift apart.
mph_datasets = mph_synthetic_datasets + [
    'coil-20',
    'orl-faces',
    'hiva',
]

# Each entry is (algorithm name, algorithm params, meta).
# `meta` is None for a single fit, or ('grid_search', params) in which case the
# algorithm params hold lists of candidate values.
# Leave out LaplacianEigenmaps until we figure out what to do with it failing...
mph_algs = [
    ('PCA', {'n_components': 2}, None),
    ('Isomap',
     {'n_components': [2], 'n_neighbors': _int_grid(5, 15)},
     _grid_search_meta()),
    ('KernelPCA', {'n_components': 2, 'kernel': 'poly', 'degree': 5}, None),
    # use modified lle as n_neighbors > n_components
    ('LLE',
     {'n_components': [2], 'n_neighbors': _int_grid(5, 15), 'method': ['modified']},
     _grid_search_meta()),
    # ('LaplacianEigenmaps',
    #  {'n_components': [2], 'n_neighbors': _int_grid(5, 15), 'eigen_solver': ['dense']},
    #  _grid_search_meta()),
    # NOTE(review): HLLE starts at 6 neighbours rather than 5 -- presumably the
    # Hessian variant's minimum-neighbour constraint for 2 components; confirm.
    ('HLLE',
     {'n_components': [2], 'n_neighbors': _int_grid(6, 15)},
     _grid_search_meta()),
    ('LTSA',
     {'n_components': [2], 'n_neighbors': _int_grid(5, 15)},
     _grid_search_meta()),
    ('MDS', {'n_components': 2}, None),
    ('TSNE',
     {'n_components': [2], 'perplexity': _int_grid(10, 100, 10)},
     _grid_search_meta()),
    ('UMAP',
     {'n_components': [2], 'n_neighbors': _int_grid(5, 15)},
     _grid_search_meta()),
]

# Each entry is (quality measure name, parameters for that measure).
mph_scores = [
    # ('1nn-error', {'metric': 'euclidean', 'greater_is_better': False}),
    ('continuity', {'n_neighbors': 12, 'metric': 'euclidean', 'greater_is_better': True}),
    ('stress', {'metric': 'euclidean', 'greater_is_better': False}),
    ('strain', {'metric': 'euclidean', 'greater_is_better': False}),
    ('trustworthiness', {'n_neighbors': 12, 'metric': 'euclidean', 'greater_is_better': True}),
]

In [96]:
# Build one training specification per (dataset, algorithm, quality measure)
# combination. Each specification is a plain dict so the full list can be
# written out as JSON.

# Dedicated, seeded generator: a Restart & Run All regenerates exactly the same
# random_state values instead of a different experiment spec on every run.
SPEC_SEED = 42
spec_rng = np.random.RandomState(SPEC_SEED)

training_dicts = []
for ds in mph_datasets:
    for alg, alg_params, meta in mph_algs:
        for score, score_params in mph_scores:
            # Copy so the shared template in mph_algs is never modified.
            new_alg_params = alg_params.copy()
            # Two random states per specification, converted to plain Python
            # ints because numpy integers are not JSON-serializable.
            # NOTE(review): algorithms without a grid-search meta (e.g. PCA)
            # also receive a two-element list here -- confirm the training
            # code expects a list for random_state in that case.
            new_alg_params['random_state'] = [
                int(x) for x in spec_rng.randint(2**16, size=2)
            ]
            # The LLE-family algorithms use the dense eigen solver on the
            # synthetic datasets.
            if alg in ('LLE', 'HLLE', 'LTSA') and ds in mph_synthetic_datasets:
                new_alg_params['eigen_solver'] = ['dense']
            d = {
                'dataset': ds,
                'algorithm': alg,
                'algorithm_params': new_alg_params,
                'score': score,
                # Per-entry copies so specifications do not alias one dict.
                'score_params': dict(score_params),
            }
            if meta is not None:
                d['meta'] = meta[0]
                d['meta_params'] = dict(meta[1])
            training_dicts.append(d)

In [97]:
# Inspect the generated training specifications.
# NOTE(review): this displays the entire list; a slice such as
# training_dicts[:4] would keep the cell output short.
training_dicts

[{'dataset': 'swiss-roll',
  'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': [51214, 51567]},
  'score': 'continuity',
  'score_params': {'n_neighbors': 12,
   'metric': 'euclidean',
   'greater_is_better': True}},
 {'dataset': 'swiss-roll',
  'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': [44351, 45069]},
  'score': 'stress',
  'score_params': {'metric': 'euclidean', 'greater_is_better': False}},
 {'dataset': 'swiss-roll',
  'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': [34775, 7949]},
  'score': 'strain',
  'score_params': {'metric': 'euclidean', 'greater_is_better': False}},
 {'dataset': 'swiss-roll',
  'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': [43592, 5239]},
  'score': 'trustworthiness',
  'score_params': {'n_neighbors': 12,
   'metric': 'euclidean',
   'greater_is_better': True}},
 {'dataset': 'swiss-roll',
  'algorithm': 'Isomap',
  'algorithm_params':

In [98]:
# List the current contents of the models directory before writing the new file.
!ls '../models/'

experiment_models.json	experiments.json  trained


In [99]:
# Write the training specifications to disk as JSON; the file is opened in
# write mode, so an existing file of this name is overwritten.
save_json('../models/experiment_models_mph.json', training_dicts)