### Dimension Reduction Analysis

Reproducing results from MPH2009

Create a dataframe containing the results from running:
* All specified datasets
* Group of algorithms
* Choice of quality measures

Then produce tables and visualizations from those results.

In [12]:
import itertools
import pandas as pd
from functools import partial
import json

from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, Isomap, LocallyLinearEmbedding

from MulticoreTSNE import MulticoreTSNE as TSNE
from umap import UMAP

import src
import src.data.datasets as datasets
from src.visualization.plotting import two_dim_multiplot, embeddable_image
import src.quality_measures as qm
from src.paths import processed_data_path
from src.data.experiment import run_algorithm

import logging

# Timestamp layout (12-hour clock with AM/PM) and line layout
# "<LEVEL> <timestamp> - <message>" used for every log record.
DATE_FORMAT = "%m/%d/%Y %I:%M:%S %p"
LOG_FORMAT = "%(levelname)s %(asctime)s - %(message)s"

# Configure the root logger at INFO level and keep a handle to it for later cells.
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    datefmt=DATE_FORMAT,
)
logger = logging.getLogger()

AttributeError: module 'src' has no attribute 'data'

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import holoviews as hv
# Activate HoloViews with 'bokeh' listed first and 'matplotlib' as a second backend.
hv.extension('bokeh', 'matplotlib')
# Turn on the hover tool for every Curve, Scatter and Bars element.
%opts Curve Scatter Bars [tools=['hover']]
%matplotlib inline
# NOTE(review): sns.set() accepts its own `context` argument, so it may reset the
# 'poster' context chosen on the line just before it — confirm the intended order.
sns.set_context('poster')
sns.set(style='white', rc={'figure.figsize':(12,8)})

In [87]:
# Pick up edits to imported project modules without restarting the kernel.
# NOTE(review): this cell carries a later execution count than the import cell and
# its output says the extension was already loaded; for a clean Restart & Run All
# it probably belongs before the `src` imports — confirm.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Location to save data: create the directory (and any missing parents) when it
# does not exist yet; exist_ok=True makes the call a no-op when it already does.
# The previous os.makedirs() call raised NameError whenever the directory was
# missing, because `os` is not imported in any visible cell.
# NOTE(review): assumes processed_data_path is a pathlib.Path (it already exposes
# .exists()) — confirm in src.paths.
processed_data_path.mkdir(parents=True, exist_ok=True)

## Specify datasets/algorithms/scores to train on

See notebook 16 for the generation of the json.

Pick the "best algorithm" parameters for each dataset/score combination using grid search.

In [17]:
def read_json(filename):
    """Load and return the contents of a JSON file.

    Parameters
    ----------
    filename : str or path-like
        File to open for reading in text mode.

    Returns
    -------
    The decoded document, i.e. whatever ``json.load`` produces for the file.
    """
    with open(filename, mode='r') as handle:
        return json.load(handle)

In [18]:
train_input_json = '../models/experiment_models_mph.json'

In [21]:
read_json(train_input_json)

[{'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': 51129},
  'dataset': 'swiss-roll',
  'score': '1nn-error',
  'score_params': {'metric': 'euclidean'}},
 {'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': 5891},
  'dataset': 'swiss-roll',
  'score': 'continuity',
  'score_params': {'metric': 'euclidean', 'n_neighbors': 12}},
 {'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': 21915},
  'dataset': 'swiss-roll',
  'score': 'stress',
  'score_params': {'metric': 'euclidean'}},
 {'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': 25159},
  'dataset': 'swiss-roll',
  'score': 'strain',
  'score_params': {'metric': 'euclidean'}},
 {'algorithm': 'PCA',
  'algorithm_params': {'n_components': 2, 'random_state': 26330},
  'dataset': 'swiss-roll',
  'score': 'trustworthiness',
  'score_params': {'metric': 'euclidean', 'n_neighbors': 12}},
 {'algorithm': 'UMAP',
  'algorithm_params': {'

In [22]:
train_output_json = '../models/experiment_mph.json'

In [23]:
# Run the grid search through the project's command-line entry point.
# NOTE(review): both file names are hard-coded here and duplicate train_input_json /
# train_output_json; the input is given without the '../models/' prefix used in
# train_input_json — presumably train_model resolves bare names itself, confirm.
!python -m src.models.train_model experiment_models_mph.json -o ../models/experiment_mph.json

2018-09-07 14:59:19,155 - train_model - INFO - Building models from experiment_models_mph.json
Fitting 3 folds for each of 22 candidates, totalling 66 fits
[CV] n_components=2, n_neighbors=5, random_state=22786 ...............
[CV]  n_components=2, n_neighbors=5, random_state=22786, score=-0.09880239520958084, total=  10.3s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.3s remaining:    0.0s
[CV] n_components=2, n_neighbors=5, random_state=22786 ...............
[CV]  n_components=2, n_neighbors=5, random_state=22786, score=-0.1021021021021021, total=   1.4s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   11.7s remaining:    0.0s
[CV] n_components=2, n_neighbors=5, random_state=22786 ...............
[CV]  n_components=2, n_neighbors=5, random_state=22786, score=-0.10510510510510511, total=   1.6s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   13.3s remaining:    0.0s
[CV] n_components=2, n_neighbors=5, random_state=53671 ...............
[CV]  n_components=2, n_

[CV]  n_components=2, n_neighbors=12, random_state=22786, score=-0.06006006006006006, total=   1.9s
[CV] n_components=2, n_neighbors=12, random_state=22786 ..............
[CV]  n_components=2, n_neighbors=12, random_state=22786, score=-0.08708708708708708, total=   1.9s
[CV] n_components=2, n_neighbors=12, random_state=53671 ..............
[CV]  n_components=2, n_neighbors=12, random_state=53671, score=-0.07784431137724551, total=   1.9s
[CV] n_components=2, n_neighbors=12, random_state=53671 ..............
[CV]  n_components=2, n_neighbors=12, random_state=53671, score=-0.06606606606606606, total=   1.9s
[CV] n_components=2, n_neighbors=12, random_state=53671 ..............
[CV]  n_components=2, n_neighbors=12, random_state=53671, score=-0.07207207207207207, total=   1.9s
[CV] n_components=2, n_neighbors=13, random_state=22786 ..............
[CV]  n_components=2, n_neighbors=13, random_state=22786, score=-0.10479041916167664, total=   2.0s
[CV] n_components=2, n_neighbors=13, random_s

[CV]  n_components=2, n_neighbors=8, random_state=63107, score=0.912866027058403, total=   1.6s
[CV] n_components=2, n_neighbors=8, random_state=63107 ...............
[CV]  n_components=2, n_neighbors=8, random_state=63107, score=0.9288127555218175, total=   1.7s
[CV] n_components=2, n_neighbors=8, random_state=63107 ...............
[CV]  n_components=2, n_neighbors=8, random_state=63107, score=0.9300357591295587, total=   1.9s
[CV] n_components=2, n_neighbors=9, random_state=54598 ...............
[CV]  n_components=2, n_neighbors=9, random_state=54598, score=0.9194471595635987, total=   1.8s
[CV] n_components=2, n_neighbors=9, random_state=54598 ...............
[CV]  n_components=2, n_neighbors=9, random_state=54598, score=0.9388243569483633, total=   1.7s
[CV] n_components=2, n_neighbors=9, random_state=54598 ...............
[CV]  n_components=2, n_neighbors=9, random_state=54598, score=0.9328748462293772, total=   1.9s
[CV] n_components=2, n_neighbors=9, random_state=63107 .........

[CV]  n_components=2, n_neighbors=5, random_state=54599, score=-5586.271500612161, total=   1.4s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.5s remaining:    0.0s
[CV] n_components=2, n_neighbors=5, random_state=16529 ...............
[CV]  n_components=2, n_neighbors=5, random_state=16529, score=-5748.839146549209, total=   1.6s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.1s remaining:    0.0s
[CV] n_components=2, n_neighbors=5, random_state=16529 ...............
[CV]  n_components=2, n_neighbors=5, random_state=16529, score=-5548.43720521569, total=   1.5s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.5s remaining:    0.0s
[CV] n_components=2, n_neighbors=5, random_state=16529 ...............
[CV]  n_components=2, n_neighbors=5, random_state=16529, score=-5975.21747802798, total=   1.4s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    9.0s remaining:    0.0s
[CV] n_components=2, n_neighbors=6, random_state=54599 ...............
[CV]  n_co

[CV]  n_components=2, n_neighbors=13, random_state=54599, score=-6402.512468509119, total=   1.9s
[CV] n_components=2, n_neighbors=13, random_state=54599 ..............
[CV]  n_components=2, n_neighbors=13, random_state=54599, score=-6341.930776483831, total=   2.0s
[CV] n_components=2, n_neighbors=13, random_state=54599 ..............
[CV]  n_components=2, n_neighbors=13, random_state=54599, score=-6765.406247790729, total=   2.0s
[CV] n_components=2, n_neighbors=13, random_state=16529 ..............
[CV]  n_components=2, n_neighbors=13, random_state=16529, score=-6068.637982556086, total=   2.0s
[CV] n_components=2, n_neighbors=13, random_state=16529 ..............
[CV]  n_components=2, n_neighbors=13, random_state=16529, score=-6341.316510247381, total=   1.9s
[CV] n_components=2, n_neighbors=13, random_state=16529 ..............
[CV]  n_components=2, n_neighbors=13, random_state=16529, score=-6537.792594466299, total=   2.0s
[CV] n_components=2, n_neighbors=14, random_state=54599 .

[CV]  n_components=2, n_neighbors=9, random_state=31698, score=-3117.451594062258, total=   1.8s
[CV] n_components=2, n_neighbors=9, random_state=27608 ...............
[CV]  n_components=2, n_neighbors=9, random_state=27608, score=-3061.454121646449, total=   1.7s
[CV] n_components=2, n_neighbors=9, random_state=27608 ...............
[CV]  n_components=2, n_neighbors=9, random_state=27608, score=-3525.5849674604533, total=   1.8s
[CV] n_components=2, n_neighbors=9, random_state=27608 ...............
[CV]  n_components=2, n_neighbors=9, random_state=27608, score=-3591.2747407477004, total=   1.7s
[CV] n_components=2, n_neighbors=10, random_state=31698 ..............
[CV]  n_components=2, n_neighbors=10, random_state=31698, score=-2912.024958892541, total=   1.8s
[CV] n_components=2, n_neighbors=10, random_state=31698 ..............
[CV]  n_components=2, n_neighbors=10, random_state=31698, score=-3338.786382554442, total=   2.0s
[CV] n_components=2, n_neighbors=10, random_state=31698 ...

[CV]  n_components=2, n_neighbors=6, random_state=26095, score=0.9763136168234055, total=   1.4s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   11.3s remaining:    0.0s
[CV] n_components=2, n_neighbors=6, random_state=26095 ...............
[CV]  n_components=2, n_neighbors=6, random_state=26095, score=0.9744060435634363, total=   1.6s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   13.0s remaining:    0.0s
[CV] n_components=2, n_neighbors=6, random_state=26095 ...............
[CV]  n_components=2, n_neighbors=6, random_state=26095, score=0.9681621207853323, total=   1.4s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   14.6s remaining:    0.0s
[CV] n_components=2, n_neighbors=6, random_state=18231 ...............
[CV]  n_components=2, n_neighbors=6, random_state=18231, score=0.9772380753548371, total=   1.5s
[CV] n_components=2, n_neighbors=6, random_state=18231 ...............
[CV]  n_components=2, n_neighbors=6, random_state=18231, score=0.9699858841353277, total

[CV]  n_components=2, n_neighbors=14, random_state=26095, score=0.9838429322021567, total=   2.4s
[CV] n_components=2, n_neighbors=14, random_state=26095 ..............
[CV]  n_components=2, n_neighbors=14, random_state=26095, score=0.9863512160809458, total=   2.3s
[CV] n_components=2, n_neighbors=14, random_state=26095 ..............
[CV]  n_components=2, n_neighbors=14, random_state=26095, score=0.984026156522182, total=   2.2s
[CV] n_components=2, n_neighbors=14, random_state=18231 ..............
[CV]  n_components=2, n_neighbors=14, random_state=18231, score=0.9829524785611028, total=   2.2s
[CV] n_components=2, n_neighbors=14, random_state=18231 ..............
[CV]  n_components=2, n_neighbors=14, random_state=18231, score=0.9851823206354208, total=   2.4s
[CV] n_components=2, n_neighbors=14, random_state=18231 ..............
[CV]  n_components=2, n_neighbors=14, random_state=18231, score=0.9848481231629085, total=   2.3s
[CV] n_components=2, n_neighbors=15, random_state=26095 ..

## Look at the grid search results to see what they're like
...decide later what to do with this as part of the analysis

In [24]:
train_output = read_json(train_output_json)

In [26]:
train_output.keys()

dict_keys(['PCA_swiss-roll_1nn-error_0', 'PCA_swiss-roll_continuity_0', 'PCA_swiss-roll_strain_0', 'PCA_swiss-roll_stress_0', 'PCA_swiss-roll_trustworthiness_0', 'UMAP_swiss-roll_1nn-error_0', 'UMAP_swiss-roll_continuity_0', 'UMAP_swiss-roll_strain_0', 'UMAP_swiss-roll_stress_0', 'UMAP_swiss-roll_trustworthiness_0'])

## Run the dimension reductions on our data

In [33]:
predict_list_json = '../models/predict_list_mph.json'

In [34]:
read_json(predict_list_json)

[{'dataset_name': 'swiss-roll',
  'force': True,
  'model_name': 'PCA_swiss-roll_1nn-error_0'},
 {'dataset_name': 'swiss-roll',
  'force': True,
  'model_name': 'PCA_swiss-roll_continuity_0'},
 {'dataset_name': 'swiss-roll',
  'force': True,
  'model_name': 'PCA_swiss-roll_strain_0'},
 {'dataset_name': 'swiss-roll',
  'force': True,
  'model_name': 'PCA_swiss-roll_stress_0'},
 {'dataset_name': 'swiss-roll',
  'force': True,
  'model_name': 'PCA_swiss-roll_trustworthiness_0'},
 {'dataset_name': 'swiss-roll',
  'force': True,
  'model_name': 'UMAP_swiss-roll_1nn-error_0'},
 {'dataset_name': 'swiss-roll',
  'force': True,
  'model_name': 'UMAP_swiss-roll_continuity_0'},
 {'dataset_name': 'swiss-roll',
  'force': True,
  'model_name': 'UMAP_swiss-roll_strain_0'},
 {'dataset_name': 'swiss-roll',
  'force': True,
  'model_name': 'UMAP_swiss-roll_stress_0'},
 {'dataset_name': 'swiss-roll',
  'force': True,
  'model_name': 'UMAP_swiss-roll_trustworthiness_0'}]

In [54]:
# Apply the trained models to the datasets listed in the predict list.
# NOTE(review): the -o target here is 'prediction_mph.json' while the next cell reads
# '../models/prediction_mph.json', and the train step passed '../models/...' to -o;
# confirm how predict_model resolves relative output paths.
!python -m src.models.predict_model predict_list_mph.json -o prediction_mph.json

2018-09-07 15:36:51,449 - predict_model - INFO - Executing models from predict_list_mph.json
2018-09-07 15:36:51,462 - utils - INFO - PROCESS_TIME:PCA_swiss-roll_1nn-error_0_exp_swiss-roll_0    0.2 ms
2018-09-07 15:36:51,478 - utils - INFO - PROCESS_TIME:PCA_swiss-roll_continuity_0_exp_swiss-roll_0    0.2 ms
2018-09-07 15:36:51,493 - utils - INFO - PROCESS_TIME:PCA_swiss-roll_strain_0_exp_swiss-roll_0    0.1 ms
2018-09-07 15:36:51,519 - utils - INFO - PROCESS_TIME:PCA_swiss-roll_stress_0_exp_swiss-roll_0    0.1 ms
2018-09-07 15:36:51,534 - utils - INFO - PROCESS_TIME:PCA_swiss-roll_trustworthiness_0_exp_swiss-roll_0    0.1 ms
2018-09-07 15:36:51,563 - utils - INFO - PROCESS_TIME:UMAP_swiss-roll_1nn-error_0_exp_swiss-roll_0    0.4 ms
2018-09-07 15:36:51,585 - utils - INFO - PROCESS_TIME:UMAP_swiss-roll_continuity_0_exp_swiss-roll_0    0.3 ms
2018-09-07 15:36:51,605 - utils - INFO - PROCESS_TIME:UMAP_swiss-roll_strain_0_exp_swiss-roll_0    0.3 ms
2018-09-07 15:36:51,622 - utils - INFO - 

In [55]:
predict_output_json = '../models/prediction_mph.json'

In [57]:
predict_output = read_json(predict_output_json)

In [58]:
predict_output.keys()

dict_keys(['PCA_swiss-roll_1nn-error_0_exp_swiss-roll_0', 'PCA_swiss-roll_continuity_0_exp_swiss-roll_0', 'PCA_swiss-roll_strain_0_exp_swiss-roll_0', 'PCA_swiss-roll_stress_0_exp_swiss-roll_0', 'PCA_swiss-roll_trustworthiness_0_exp_swiss-roll_0', 'UMAP_swiss-roll_1nn-error_0_exp_swiss-roll_0', 'UMAP_swiss-roll_continuity_0_exp_swiss-roll_0', 'UMAP_swiss-roll_strain_0_exp_swiss-roll_0', 'UMAP_swiss-roll_stress_0_exp_swiss-roll_0', 'UMAP_swiss-roll_trustworthiness_0_exp_swiss-roll_0'])

In [68]:
analysis_list_json = '../models/analysis_list_mph.json'

In [70]:
analysis_list = read_json(analysis_list_json)

In [78]:
import inspect

In [90]:
from src.models.analyze import get_score

In [91]:
inspect.getfullargspec(get_score)

FullArgSpec(args=['input_path', 'score_params', 'pointwise'], varargs='low_dataset_name', varkw=None, defaults=(None, None, False), kwonlyargs=['high_dataset_name', 'score_name'], kwonlydefaults=None, annotations={})

In [99]:
def run_scores(**kwargs):
    """Compute the score described by one analysis-list entry.

    The entry's ``include_pointwise`` flag is translated into the ``pointwise``
    argument of ``get_score``; every other key is forwarded unchanged as a
    keyword argument.

    Parameters
    ----------
    **kwargs
        Keyword arguments for ``get_score`` plus an optional
        ``include_pointwise`` flag. When the flag is absent it defaults to
        ``False`` (the default ``get_score`` itself uses for ``pointwise``)
        instead of raising ``KeyError``.

    Returns
    -------
    Whatever ``get_score`` returns for the given arguments.
    """
    # Pop rather than read: get_score has no `include_pointwise` parameter and
    # accepts no arbitrary keyword arguments, so the key must not be forwarded.
    pointwise = kwargs.pop('include_pointwise', False)
    return get_score(pointwise=pointwise, **kwargs)


In [95]:
## To go into main for analyze.py

In [100]:
# decide which kind of output file to finish with here
# NOTE: `score` is rebound on every pass, so once the loop ends it only holds the
# result for the last entry of analysis_list[:2]; earlier results are discarded.
for exp in analysis_list[:2]:
    score = run_scores(**exp)

In [101]:
score

array([6.90269995e-05, 1.02988623e-04, 2.42570895e-04, 2.55561216e-04,
       2.71692987e-06, 2.01477331e-04, 1.65053490e-04, 3.90643573e-04,
       7.21684497e-05, 3.38767193e-04, 7.37816268e-05, 3.56597045e-06,
       6.98760401e-05, 1.43402955e-04, 1.65308202e-04, 9.40736967e-05,
       2.35184242e-05, 1.78298523e-06, 6.87722873e-06, 1.63864833e-05,
       6.30837154e-05, 2.01647139e-04, 1.96977416e-05, 4.80302258e-04,
       2.78824928e-04, 1.56647988e-04, 6.75836305e-05, 2.42316183e-04,
       2.24995755e-05, 7.90456784e-05, 2.90626592e-04, 2.80862625e-04,
       3.42927492e-04, 1.54949907e-04, 3.38342673e-04, 1.66921379e-04,
       1.12922398e-04, 1.15384615e-04, 1.17082697e-04, 1.06979114e-05,
       5.78196638e-05, 4.07284768e-04, 2.17354390e-04, 7.75174053e-05,
       4.19426049e-05, 8.77907964e-05, 6.27440992e-05, 4.13822381e-04,
       9.76396672e-05, 2.15656308e-05, 1.37035150e-04, 1.27356088e-06,
       1.13941246e-04, 2.51316013e-05, 2.83155035e-04, 7.25080659e-05,
      

## Warnings

# Choose quality scores

In [15]:
# Do all the quality scores. Eventually add caching into the qm module itself.

In [16]:
# Pairwise distances of each dataset's `.data`, keyed by dataset name.
# A dataset whose computation fails is reported and left out of the dict.
# NOTE(review): ds_names, datasets_dict and metric are not defined in any visible
# cell, so this cell relies on state created elsewhere — confirm for Run All.
high_distances = {}
for dataset_name in ds_names:
    dataset = datasets_dict[dataset_name]
    try:
        high_distances[dataset_name] = pairwise_distances(dataset.data, metric=metric)
    except Exception as e:
        # Include the exception so the reason for the failure is not lost.
        print(f"Cannot complete {dataset_name}: {e!r}")

In [17]:
# make this a double loop instead...to match

In [18]:
%%time
low_distances_dict = {}
for ds_name in ds_names:
    low_distances_dict[ds_name] = {}
    for alg_name in algorithms.keys():
        try:
            _, low_distances, _ = qm.pairwise_distance_differences(high_distances=high_distances[ds_name],
                                                                   low_data=low_data[ds_name][alg_name].data,
                                                                   metric=metric)
            low_distances_dict[ds_name][alg_name] = low_distances
        except:
            print(f"Cannot complete {ds_name}, {alg_name}")
            pass

CPU times: user 15.6 s, sys: 13.4 s, total: 29 s
Wall time: 19.5 s


In [19]:
## This next step is slow-ish. Do some profiling.

In [20]:
def scores_function(high_distances, low_distances_dict, ds_names, algorithms, scores):
    """Evaluate every score for every dataset/algorithm combination.

    Parameters
    ----------
    high_distances : mapping
        Indexed by dataset name; the value is passed to the score functions
        as ``high_distances``.
    low_distances_dict : mapping
        Indexed by dataset name, then algorithm name; the value is passed to
        the score functions as ``low_distances``.
    ds_names : iterable
        Dataset names to process.
    algorithms : mapping
        Only its keys (the algorithm names) are used.
    scores : mapping
        Score name -> indexable pair of callables. Element 0 produces the value
        stored as the point score, element 1 the value stored as the total score.

    Returns
    -------
    tuple
        ``(point_scores_dict, total_scores_dict)``, each nested as
        ``[ds_name][alg_name][score_name]``.

    Notes
    -----
    The ``'1nn Error'`` score is called with ``data``/``classes`` taken from the
    module-level ``low_data`` mapping rather than from an argument; progress is
    reported through the module-level ``logger``.
    """
    point_scores_dict = {}
    total_scores_dict = {}
    for ds_name in ds_names:
        logger.info(ds_name)
        ds_totals, ds_points = {}, {}
        total_scores_dict[ds_name] = ds_totals
        point_scores_dict[ds_name] = ds_points
        for alg_name in algorithms.keys():
            logger.info(alg_name)
            alg_totals, alg_points = {}, {}
            ds_totals[alg_name] = alg_totals
            ds_points[alg_name] = alg_points
            for score_name, score_fcns in scores.items():
                # Both callables of a score receive the same keyword arguments.
                if score_name == '1nn Error':
                    embedded = low_data[ds_name][alg_name]
                    score_args = dict(data=embedded.data, classes=embedded.target)
                else:
                    score_args = dict(high_distances=high_distances[ds_name],
                                      low_distances=low_distances_dict[ds_name][alg_name])
                point_score = score_fcns[0](**score_args)
                total_score = score_fcns[1](**score_args)
                alg_totals[score_name] = total_score
                alg_points[score_name] = point_score
    return point_scores_dict, total_scores_dict

In [21]:
point_scores_dict, total_scores_dict = scores_function(high_distances, low_distances_dict, ds_names, algorithms, scores)

INFO 08/22/2018 02:43:24 PM - broken-swiss-roll
INFO 08/22/2018 02:43:24 PM - PCA
INFO 08/22/2018 02:43:25 PM - KernelPCA
INFO 08/22/2018 02:43:27 PM - LLE
INFO 08/22/2018 02:43:29 PM - HLLE
INFO 08/22/2018 02:43:30 PM - LaplacianEigenmaps
INFO 08/22/2018 02:43:32 PM - tSNE
INFO 08/22/2018 02:43:33 PM - Isomap
INFO 08/22/2018 02:43:35 PM - UMAP
INFO 08/22/2018 02:43:36 PM - coil-20
INFO 08/22/2018 02:43:36 PM - PCA
INFO 08/22/2018 02:43:39 PM - KernelPCA
INFO 08/22/2018 02:43:42 PM - LLE
INFO 08/22/2018 02:43:45 PM - HLLE
INFO 08/22/2018 02:43:48 PM - LaplacianEigenmaps
INFO 08/22/2018 02:43:51 PM - tSNE
INFO 08/22/2018 02:43:55 PM - Isomap
INFO 08/22/2018 02:43:57 PM - UMAP
INFO 08/22/2018 02:44:00 PM - difficult
INFO 08/22/2018 02:44:00 PM - PCA
INFO 08/22/2018 02:46:26 PM - KernelPCA
INFO 08/22/2018 02:48:39 PM - LLE
INFO 08/22/2018 02:50:54 PM - HLLE
INFO 08/22/2018 02:53:02 PM - LaplacianEigenmaps
INFO 08/22/2018 02:55:06 PM - tSNE
INFO 08/22/2018 02:57:11 PM - Isomap
INFO 08/22/2

In [22]:
# Note: tried doing this straight to a df once, and it was a nightmare

In [23]:
# Sanity check: print the number of pointwise values stored for each score.
# NOTE: ds_name and alg_name are not set in this cell; they hold whatever values an
# earlier top-level loop left behind, so only that one dataset/algorithm pair is inspected.
for k, v in point_scores_dict[ds_name][alg_name].items():
    print(k, len(v))

Strain 3000
Stress 3000
Trustworthiness 3000
Continuity 3000
1nn Error 3000


In [24]:
ds_name

'twinpeaks'

In [25]:
alg_name

'UMAP'

In [26]:
# One frame of pointwise scores per dataset: the per-algorithm frames stacked
# on top of each other, each tagged with its algorithm name and row number.
df_dict = {}
for ds_name in ds_names:
    # Seed with an empty frame carrying the expected columns, so a dataset with
    # no algorithms still yields a frame that has them.
    frames = [pd.DataFrame(columns=list(scores.keys())+['algorithm', 'item'])]
    for alg_name in algorithms.keys():
        df = pd.DataFrame(point_scores_dict[ds_name][alg_name])
        df['algorithm'] = alg_name
        df['item'] = df.index.astype(int)
        frames.append(df)
    # Concatenate once per dataset instead of re-copying the growing frame on
    # every pass of the inner loop.
    ds_df = pd.concat(frames, sort=True)
    df_dict[ds_name] = ds_df

In [27]:
# merge it in with the dimension reduction data

In [28]:
# For each dataset, stack the 2-D coordinates of every embedding in low_data and
# join them with the pointwise scores in df_dict on (item, algorithm).
ds_df_dict = {}
for ds_name in ds_names:
    # Seed frame fixes the column set even when low_data[ds_name] is empty.
    frames = [pd.DataFrame(columns=['x', 'y', 'target', 'rotation', 'filename', 'algorithm', 'item'])]
    for name, v in low_data[ds_name].items():
        i_df = pd.DataFrame()
        for i, axis in enumerate(['x', 'y']):
            i_df[axis] = v['data'][:, i]
        i_df['target'] = v['target']
        # float('nan') replaces pd.np.NAN: the pd.np alias is gone in pandas 2.0.
        i_df['rotation'] = v['metadata'].get('rotation', float('nan'))
        i_df['filename'] = v['metadata'].get('filename', float('nan'))
        i_df['algorithm'] = name
        i_df['item'] = i_df.index
        frames.append(i_df)
    # A single concat replaces repeated DataFrame.append (removed in pandas 2.0).
    df = pd.concat(frames, ignore_index=True, sort=False)
    # Cast both merge keys to int so the join on 'item' compares like with like.
    df['item'] = df['item'].astype(int)
    df_dict[ds_name]['item'] = df_dict[ds_name]['item'].astype(int)
    df = df.merge(df_dict[ds_name], on=['item', 'algorithm'])
    ds_df_dict[ds_name] = df

In [29]:
## TODO: make visualizations out of ds_df_dict

## Create summary results df

In [30]:
# Display settings for the summary tables below. The full option keys are spelled
# out because set_option pattern-matches abbreviated names and raises when a
# pattern such as 'precision' matches more than one option.
pd.set_option('display.max_colwidth', 60)
pd.set_option('display.precision', 4)

In [31]:
# Summary table: one row per (score, dataset) and one column per algorithm.
# The per-dataset frames are collected first and concatenated once, rather than
# re-copying the growing frame on every iteration.
total_df_parts = [pd.DataFrame(columns=['dataset'])]
for ds_name in ds_names:
    df = pd.DataFrame(total_scores_dict[ds_name])
    df['dataset'] = ds_name
    total_df_parts.append(df)
total_df = pd.concat(total_df_parts, sort=True)
# Put 'dataset' first, followed by the algorithms in their declared order.
total_df = total_df[['dataset'] + list(algorithms.keys())]

In [32]:
total_df[total_df.dataset == ds_name]

Unnamed: 0,dataset,PCA,KernelPCA,LLE,HLLE,LaplacianEigenmaps,tSNE,Isomap,UMAP
1nn Error,twinpeaks,0.33,0.2853,0.165,0.294,0.29,0.1347,0.1357,0.133
Continuity,twinpeaks,0.9962,0.9957,0.9986,0.4781,0.96,0.9985,0.9993,0.998
Strain,twinpeaks,3.6744,26.7609,1.0062,1.0224,1.0594,15199.723,6.2759,1057.8249
Stress,twinpeaks,546.1991,2623.393,1823.247,1928.228,1610.0,110817.2979,482.7217,27924.8568
Trustworthiness,twinpeaks,0.9403,0.9223,0.9963,0.4684,0.966,0.9993,0.9989,0.999


In [33]:
total_df.loc[total_df.index == 'Strain']

Unnamed: 0,dataset,PCA,KernelPCA,LLE,HLLE,LaplacianEigenmaps,tSNE,Isomap,UMAP
Strain,broken-swiss-roll,3.8374,22.173,1.0085,1.0084,1.0981,7256.7574,6.4529,1417.9896
Strain,coil-20,3.9468,53346000000000.0,1.0,1.0,1.0,1.0,101.9587,1.0
Strain,difficult,3.9431,5.8089,1.0001,1.0002,1.0033,8169.5365,13.8614,976.1528
Strain,hiva,3.0774,1.0032,1.0,1.0,1.0,137.7492,48.4155,472.6095
Strain,orl-faces,3.8403,208910000000000.0,1.0,1.0,1.0,1.0001,47.8683,1.0
Strain,swiss-roll,3.7607,21.748,1.0059,1.0096,1.1085,7606.4062,29.7879,1175.2658
Strain,twinpeaks,3.6744,26.761,1.0062,1.0224,1.0594,15199.723,6.2759,1057.8249


In [34]:
total_df.loc[total_df.index == 'Stress']

Unnamed: 0,dataset,PCA,KernelPCA,LLE,HLLE,LaplacianEigenmaps,tSNE,Isomap,UMAP
Stress,broken-swiss-roll,177.91,834.3,641.08,641.16,565.64,27062.0,316.64,11504.0
Stress,coil-20,8012800.0,35270000000000.0,17242000.0,17242000.0,17242000.0,17191000.0,40542000.0,17222000.0
Stress,difficult,2780.8,2627.6,9197.9,9212.5,8899.9,348400.0,7447.3,115400.0
Stress,hiva,39089.0,55011.0,56057.0,56057.0,55952.0,95970.0,38867.0,111990.0
Stress,orl-faces,1172300.0,8887000000000.0,2262400.0,2262500.0,2262400.0,2253600.0,2772700.0,2260300.0
Stress,swiss-roll,193.88,840.72,651.39,648.31,541.79,28602.0,1163.8,10370.0
Stress,twinpeaks,546.2,2623.4,1823.2,1928.2,1610.0,110820.0,482.72,27925.0


In [35]:
total_df.loc[total_df.index == 'Trustworthiness']

Unnamed: 0,dataset,PCA,KernelPCA,LLE,HLLE,LaplacianEigenmaps,tSNE,Isomap,UMAP
Trustworthiness,broken-swiss-roll,0.9118,0.9017,0.7422,0.744,0.7324,0.9993,0.8287,0.9985
Trustworthiness,coil-20,0.8902,0.8941,0.8438,0.5622,0.6508,0.9954,0.9186,0.9926
Trustworthiness,difficult,0.8911,0.8751,0.6411,0.756,0.7907,0.9974,0.879,0.9807
Trustworthiness,hiva,0.6774,0.6763,0.6078,0.4818,0.6065,0.9227,0.6939,0.8988
Trustworthiness,orl-faces,0.86,0.8547,0.8839,0.7051,0.799,0.9676,0.8654,0.9709
Trustworthiness,swiss-roll,0.8629,0.8677,0.9255,0.9943,0.8611,0.9994,0.9996,0.9985
Trustworthiness,twinpeaks,0.9403,0.9223,0.9963,0.4684,0.966,0.9993,0.9989,0.999


In [36]:
total_df.loc[total_df.index == 'Continuity']

Unnamed: 0,dataset,PCA,KernelPCA,LLE,HLLE,LaplacianEigenmaps,tSNE,Isomap,UMAP
Continuity,broken-swiss-roll,0.9925,0.992,0.8723,0.8592,0.8715,0.9984,0.9781,0.9954
Continuity,coil-20,0.9741,0.9536,0.9529,0.6988,0.8046,0.9929,0.986,0.9947
Continuity,difficult,0.9885,0.9866,0.8408,0.94,0.8352,0.9849,0.9853,0.9904
Continuity,hiva,0.8635,0.8623,0.7851,0.4956,0.4735,0.919,0.9061,0.9262
Continuity,orl-faces,0.9315,0.8894,0.9311,0.8307,0.7942,0.9521,0.9463,0.9597
Continuity,swiss-roll,0.9912,0.991,0.986,0.9946,0.9615,0.9965,0.9995,0.9962
Continuity,twinpeaks,0.9962,0.9957,0.9986,0.4781,0.96,0.9985,0.9993,0.998


In [37]:
total_df.loc[total_df.index == '1nn Error']

Unnamed: 0,dataset,PCA,KernelPCA,LLE,HLLE,LaplacianEigenmaps,tSNE,Isomap,UMAP
1nn Error,broken-swiss-roll,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1nn Error,coil-20,0.3326,0.3604,0.3007,0.1882,0.0993,0.0069,0.2062,0.1035
1nn Error,difficult,0.455,0.452,0.5039,0.5059,0.49,0.1493,0.45,0.294
1nn Error,hiva,0.0653,0.064,0.0637,0.0635,0.0632,0.0442,0.0536,0.0541
1nn Error,orl-faces,0.61,0.6225,0.4,0.7125,0.58,0.015,0.48,0.0625
1nn Error,swiss-roll,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1nn Error,twinpeaks,0.33,0.2853,0.165,0.294,0.29,0.1347,0.1357,0.133


## Visualizations

In [38]:
%output backend = 'bokeh'

In [39]:
# Dataset shown in the plots below.
ds_name = 'difficult'

# Column that drives the point colours: 'target', 'Trustworthiness' or 'Continuity'.
color_index = 'Trustworthiness'

# 'spectral' when colouring by target, 'inferno_r' for any other column.
cmap = 'spectral' if color_index == 'target' else 'inferno_r'

In [40]:
%%opts Scatter [color_index=color_index colorbar=True colorbar_position='right'](cmap=cmap)
%%opts Scatter [width=450 height=350 labelled=[False, False]]

df = ds_df_dict[ds_name]
macro = hv.Dataset(df, ['x', 'y'])
scatter = []

#put them on the same color scale
c_min, c_max = min(df[color_index]), max(df[color_index])
for name in algorithms.keys():
    subset = macro.select(algorithm=name)
    new_scatter = subset.to(hv.Scatter, kdims=['x', 'y'], vdims=[color_index, 'rotation', 'item'], group=f'{ds_name}')
    #new_scatter = subset.to(hv.Scatter, kdims=['x', 'y'], vdims=[color_index], group=f'{ds_name}')
    new_scatter = new_scatter.relabel(f'{name}')

    # leave some padding around the edges
    alg_gb = df.groupby('algorithm')
    x_min, y_min = alg_gb.min().loc[name][['x', 'y']]
    x_max, y_max = alg_gb.max().loc[name][['x', 'y']]
    x_range_diff = (x_max - x_min)/10
    y_range_diff = (y_max - y_min)/10

    if color_index == 'Trustworthiness':
        new_scatter = new_scatter.redim.range(x=(x_min - x_range_diff, x_max + x_range_diff),
                                              y=(y_min - y_range_diff, y_max + y_range_diff),
                                              Trustworthiness=(c_min, c_max))
    elif color_index == 'Continuity':
        new_scatter = new_scatter.redim.range(x=(x_min - x_range_diff, x_max + x_range_diff),
                                              y=(y_min - y_range_diff, y_max + y_range_diff),
                                              Continuity=(c_min, c_max))
    elif color_index == 'target':
        new_scatter = new_scatter.redim.range(x=(x_min - x_range_diff, x_max + x_range_diff),
                                              y=(y_min - y_range_diff, y_max + y_range_diff))
    else:
        logger.warning("Unidentified color_index:{color_index}")
    scatter.append(new_scatter)

In [41]:
# Print which column colours the points, then combine the per-algorithm scatters
# with `+=` and arrange the result in two columns.
# Requires `scatter` to be non-empty, since scatter[0] is indexed directly.
print(color_index)
p = scatter[0]
for f in scatter[1:]:
    p += f
p.cols(2)

Trustworthiness


In [42]:
total_df[(total_df.dataset == ds_name) & ((total_df.index == 'Continuity') | (total_df.index == 'Trustworthiness'))]

Unnamed: 0,dataset,PCA,KernelPCA,LLE,HLLE,LaplacianEigenmaps,tSNE,Isomap,UMAP
Continuity,difficult,0.9885,0.9866,0.8408,0.94,0.8352,0.9849,0.9853,0.9904
Trustworthiness,difficult,0.8911,0.8751,0.6411,0.756,0.7907,0.9974,0.879,0.9807
