In [194]:
import numpy as np
import pandas as pd

#import src.data.datasets as dataset
from src import quality_measures as qm
from src.paths import processed_data_path
from src.models.train_model import available_algorithms
from src.utils import save_json,
import json

In [196]:
from src import utils

In [202]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [225]:
def read_json(filename):
    with open(filename, 'r') as f:
        j = json.load(f)
    return j

## Specify dataset, algorithm and quality measure triples for training

In [106]:
datasets.available_datasets()

['ball',
 'broken-swiss-roll',
 'coil-100',
 'coil-20',
 'difficult',
 'f-mnist',
 'frey-faces',
 'gaussian-blobs',
 'helix',
 'hiva',
 'lvq-pak',
 'mnist',
 'orl-faces',
 's-curve',
 'shuttle-statlog',
 'sphere',
 'swiss-roll',
 'twinpeaks',
 'unit-cube']

In [107]:
available_algorithms().keys()

dict_keys(['autoencoder', 'HLLE', 'Isomap', 'KernelPCA', 'LaplacianEigenmaps', 'LLE', 'LTSA', 'MDS', 'PCA', 'TSNE', 'UMAP'])

In [108]:
qm.available_quality_measures().keys()

dict_keys(['1nn-error', 'continuity', 'stress', 'strain', 'trustworthiness'])

In [219]:
mph_synthetic_datasets = [
 'swiss-roll',
 'helix',
 'twinpeaks',
 'broken-swiss-roll',
 'difficult'
]

mph_datasets = [
 'swiss-roll',
 #'helix',
 #'twinpeaks',
 #'broken-swiss-roll',
 #'difficult',
 #'coil-20',
 #'orl-faces',
 #'hiva'
]

# Leave out LaplacianEigenmaps until we figure out what to do with it failing...

mph_algs = [
    ('PCA', {'n_components':2}, None),
    #('Isomap', {'n_components':[2], 'n_neighbors':[ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]}, ('grid_search', {'verbose':10, 'return_train_score':True})),
    #('KernelPCA', {'n_components':2, 'kernel':'poly', 'degree':5}, None),
    #('LLE', {'n_components':[2], 'n_neighbors':[ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15], 'method':['modified']}, ('grid_search', {'verbose':10, 'return_train_score':True})), #use modified lle as n_neighbors > n_components
    #('LaplacianEigenmaps', {'n_components':[2], 'n_neighbors':[ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15], 'eigen_solver':['dense']}, ('grid_search', {'verbose':10, 'return_train_score':True})),
    #('HLLE', {'n_components':[2], 'n_neighbors':[6,  7,  8,  9, 10, 11, 12, 13, 14, 15]}, ('grid_search', {'verbose':10, 'return_train_score':True})),
    #('LTSA', {'n_components':[2], 'n_neighbors':[5, 6,  7,  8,  9, 10, 11, 12, 13, 14, 15]}, ('grid_search', {'verbose':10, 'return_train_score':True})),
    #('MDS', {'n_components':2}, None),
    #('TSNE', {'n_components':[2], 'perplexity':[ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100]}, ('grid_search', {'verbose':10, 'return_train_score':True})),
    ('UMAP', {'n_components':[2], 'n_neighbors':[ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]}, ('grid_search', {'verbose':10, 'return_train_score':True})),
]

mph_scores = [
    ('1nn-error', {'metric':'euclidean'}),
    ('continuity', {'n_neighbors':12, 'metric':'euclidean'}),
    ('stress', {'metric':'euclidean'}),
    ('strain',  {'metric':'euclidean'}),
    ('trustworthiness', {'n_neighbors':12, 'metric':'euclidean', })
]

In [220]:
training_dicts = []
for ds in mph_datasets:
    for alg, alg_params, meta in mph_algs:
        for score, score_params in mph_scores:
            d = {}
            d['dataset'] = ds
            d['algorithm'] = alg
            new_alg_params = alg_params.copy()
            if meta is not None:
                d['meta'] = meta[0]
                d['meta_params'] = meta[1]
                if alg != 'Isomap':
                    rs = [int(x) for x in np.random.randint(2**16, size=2)]
                    new_alg_params['random_state'] = rs
            else:
                new_alg_params['random_state'] = int(np.random.randint(2**16))
            if (alg in ['LLE', 'HLLE', 'LTSA']) and (ds in mph_synthetic_datasets):
                new_alg_params['eigen_solver'] = ['dense']
            d['algorithm_params'] = new_alg_params
            d['score'] = score
            d['score_params'] = score_params

            training_dicts.append(d)

In [221]:
training_dicts

[{'dataset': 'swiss-roll',
  'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': 51129},
  'score': '1nn-error',
  'score_params': {'metric': 'euclidean'}},
 {'dataset': 'swiss-roll',
  'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': 5891},
  'score': 'continuity',
  'score_params': {'n_neighbors': 12, 'metric': 'euclidean'}},
 {'dataset': 'swiss-roll',
  'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': 21915},
  'score': 'stress',
  'score_params': {'metric': 'euclidean'}},
 {'dataset': 'swiss-roll',
  'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': 25159},
  'score': 'strain',
  'score_params': {'metric': 'euclidean'}},
 {'dataset': 'swiss-roll',
  'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': 26330},
  'score': 'trustworthiness',
  'score_params': {'n_neighbors': 12, 'metric': 'euclidean'}},
 {'dataset': 'swiss-roll',
  'algorithm': 'UMAP

In [222]:
!ls '../models/'

analysis		  experiment_models_mph.json  predict_list.json
experiment_models-2.json  experiments.json	      trained
experiment_models.json	  predictions


In [223]:
save_json('../models/experiment_models_mph.json', training_dicts)

In [186]:
#save_json('../models/experiment_models.json', training_dicts)

In [224]:
train_output_json = '../models/experiment_mph.json'

In [231]:
train_output = read_json(train_output_json)

### Make the prediction list

In [232]:
train_output.keys()

dict_keys(['PCA_swiss-roll_1nn-error_0', 'PCA_swiss-roll_continuity_0', 'PCA_swiss-roll_strain_0', 'PCA_swiss-roll_stress_0', 'PCA_swiss-roll_trustworthiness_0', 'UMAP_swiss-roll_1nn-error_0', 'UMAP_swiss-roll_continuity_0', 'UMAP_swiss-roll_strain_0', 'UMAP_swiss-roll_stress_0', 'UMAP_swiss-roll_trustworthiness_0'])

In [258]:
predict_list = []
for key in train_output.keys():
    _, ds_name, _, _ = key.split('_')
    d = {"dataset_name":ds_name,
         "model_name":key,
         "force":True
        }
    predict_list.append(d)

In [259]:
predict_list

[{'dataset_name': 'swiss-roll',
  'model_name': 'PCA_swiss-roll_1nn-error_0',
  'force': True},
 {'dataset_name': 'swiss-roll',
  'model_name': 'PCA_swiss-roll_continuity_0',
  'force': True},
 {'dataset_name': 'swiss-roll',
  'model_name': 'PCA_swiss-roll_strain_0',
  'force': True},
 {'dataset_name': 'swiss-roll',
  'model_name': 'PCA_swiss-roll_stress_0',
  'force': True},
 {'dataset_name': 'swiss-roll',
  'model_name': 'PCA_swiss-roll_trustworthiness_0',
  'force': True},
 {'dataset_name': 'swiss-roll',
  'model_name': 'UMAP_swiss-roll_1nn-error_0',
  'force': True},
 {'dataset_name': 'swiss-roll',
  'model_name': 'UMAP_swiss-roll_continuity_0',
  'force': True},
 {'dataset_name': 'swiss-roll',
  'model_name': 'UMAP_swiss-roll_strain_0',
  'force': True},
 {'dataset_name': 'swiss-roll',
  'model_name': 'UMAP_swiss-roll_stress_0',
  'force': True},
 {'dataset_name': 'swiss-roll',
  'model_name': 'UMAP_swiss-roll_trustworthiness_0',
  'force': True}]

In [240]:
save_json('../models/predict_list_mph.json', predict_list)

In [241]:
predict_output_json = '../models/prediction_mph.json'

## Now set up analysis

In [242]:
predict_output = read_json(predict_output_json)

In [244]:
key = list(predict_output.keys())[0]
predict_output[key]['dataset_name'] == key

True

In [248]:
predict_output[key]['experiment']['dataset_name']

'swiss-roll'

In [249]:
predict_output[key]['experiment']

{'data_hash': 'e8c9d51a7b8a83589043041b2eafb86c0cdc4e63',
 'dataset_name': 'swiss-roll',
 'dataset_opts': {},
 'duration': 0.0002238750457763672,
 'hash_type': 'sha1',
 'model_hash': '167f3fa9aa7a3df513bde8a632ffce8a9898f610',
 'model_metadata': {'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': 51129},
  'data_hash': 'e8c9d51a7b8a83589043041b2eafb86c0cdc4e63',
  'dataset': 'swiss-roll',
  'model_hash': '883e31b501d2010f4132daea1f40c736f260ef3e',
  'run_number': 0,
  'score': '1nn-error',
  'score_params': {'metric': 'euclidean'},
  'target_hash': 'f6384ac94eac334efc650fe1e85f85d221187168'},
 'model_name': 'PCA_swiss-roll_1nn-error_0',
 'run_number': 0,
 'start_time': 1536349011.4622514,
 'target_hash': 'f6384ac94eac334efc650fe1e85f85d221187168'}

In [246]:
predict_output[key]['experiment']['model_metadata']

{'algorithm': 'PCA',
 'algorithm_params': {'n_components': 2, 'random_state': 51129},
 'data_hash': 'e8c9d51a7b8a83589043041b2eafb86c0cdc4e63',
 'dataset': 'swiss-roll',
 'model_hash': '883e31b501d2010f4132daea1f40c736f260ef3e',
 'run_number': 0,
 'score': '1nn-error',
 'score_params': {'metric': 'euclidean'},
 'target_hash': 'f6384ac94eac334efc650fe1e85f85d221187168'}

In [260]:
analysis_list = []
for key in predict_output.keys():
    print(key)
    for score in mph_scores:
        #print(score)
        d = {"low_dataset_name":key,
             "high_dataset_name":predict_output[key]['experiment']['dataset_name'],
             "score_name":score[0],
             "score_params":score[1],
             "include_pointwise":True
            }
    analysis_list.append(d)

PCA_swiss-roll_1nn-error_0_exp_swiss-roll_0
PCA_swiss-roll_continuity_0_exp_swiss-roll_0
PCA_swiss-roll_strain_0_exp_swiss-roll_0
PCA_swiss-roll_stress_0_exp_swiss-roll_0
PCA_swiss-roll_trustworthiness_0_exp_swiss-roll_0
UMAP_swiss-roll_1nn-error_0_exp_swiss-roll_0
UMAP_swiss-roll_continuity_0_exp_swiss-roll_0
UMAP_swiss-roll_strain_0_exp_swiss-roll_0
UMAP_swiss-roll_stress_0_exp_swiss-roll_0
UMAP_swiss-roll_trustworthiness_0_exp_swiss-roll_0


In [261]:
analysis_list

[{'low_dataset_name': 'PCA_swiss-roll_1nn-error_0_exp_swiss-roll_0',
  'high_dataset_name': 'swiss-roll',
  'score_name': 'trustworthiness',
  'score_params': {'n_neighbors': 12, 'metric': 'euclidean'},
  'include_pointwise': True},
 {'low_dataset_name': 'PCA_swiss-roll_continuity_0_exp_swiss-roll_0',
  'high_dataset_name': 'swiss-roll',
  'score_name': 'trustworthiness',
  'score_params': {'n_neighbors': 12, 'metric': 'euclidean'},
  'include_pointwise': True},
 {'low_dataset_name': 'PCA_swiss-roll_strain_0_exp_swiss-roll_0',
  'high_dataset_name': 'swiss-roll',
  'score_name': 'trustworthiness',
  'score_params': {'n_neighbors': 12, 'metric': 'euclidean'},
  'include_pointwise': True},
 {'low_dataset_name': 'PCA_swiss-roll_stress_0_exp_swiss-roll_0',
  'high_dataset_name': 'swiss-roll',
  'score_name': 'trustworthiness',
  'score_params': {'n_neighbors': 12, 'metric': 'euclidean'},
  'include_pointwise': True},
 {'low_dataset_name': 'PCA_swiss-roll_trustworthiness_0_exp_swiss-roll_0'

In [262]:
save_json('../models/analysis_list_mph.json', analysis_list)