In [20]:
import numpy as np
import pandas as pd

import src
import src.data.datasets as datasets
import src.quality_measures as qm
from src.paths import processed_data_path

In [4]:
## Specify dataset, algorithm and quality measure triples for training

In [6]:
datasets.available_datasets()

['ball',
 'broken-swiss-roll',
 'coil-100',
 'coil-20',
 'difficult',
 'f-mnist',
 'frey-faces',
 'gaussian-blobs',
 'helix',
 'hiva',
 'lvq-pak',
 'mnist',
 'orl-faces',
 's-curve',
 'shuttle-statlog',
 'sphere',
 'swiss-roll',
 'twinpeaks',
 'unit-cube']

In [9]:
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, Isomap, LocallyLinearEmbedding

from MulticoreTSNE import MulticoreTSNE as TSNE
from umap import UMAP

DR_ALGORITHMS = {
    "autoencoder": None,
    "isomap": Isomap,
    "MDS": MDS,
    "PCA": PCA,
    "t-SNE":TSNE,
    "UMAP": UMAP,
}

def available_algorithms():
    """Valid Algorithms for dimension reduction applications
    
    This function simply returns the list of known dimension reduction
    algorithms.
    
    It exists to allow for a description of the mapping for
    each of the valid strings.

    The valid quality metrics, and the function they map to, are:

    ============     ====================================
    Algorithm        Function
    ============     ====================================
    autoencoder
    isomap
    MDS
    PCA
    t-SNE
    UMAP
    ============     ====================================
    
    
    """
    return DR_ALGORITHMS

In [10]:
available_algorithms()

{'autoencoder': None,
 'isomap': sklearn.manifold.isomap.Isomap,
 'MDS': sklearn.manifold.mds.MDS,
 'PCA': sklearn.decomposition.pca.PCA,
 't-SNE': MulticoreTSNE.MulticoreTSNE,
 'UMAP': umap.umap_.UMAP}

In [11]:
qm.available_quality_measures()

{'1nn-error': <function src.quality_measures.generalized_1nn_error>,
 'continuity': <function src.quality_measures.continuity>,
 'stress': <function src.quality_measures.stress>,
 'strain': <function src.quality_measures.strain>,
 'trustworthiness': <function src.quality_measures.trustworthiness>}

In [None]:
UMAP()

In [97]:
training_dicts = [
    {
        'dataset':'coil-20',
        'algorithm':'UMAP',
        'score':'trustworthiness',
        'meta':'grid_search',
        'algorithm_params':{'n_components':[2], 'n_neighbors':np.arange(2, 100, 20)},
        #'algorithm_params':{'n_neighbors':np.arange(2, 100, 3)},
        'score_params':{'n_neighbors':12, 'metric':'euclidean'},
        'meta_params':{'verbose':10}
    }
    
]

In [98]:
# Check for valid parameters

In [99]:
for td in training_dicts:
    assert td['dataset'] in datasets.available_datasets()
    assert td['algorithm'] in available_algorithms()
    assert td['score'] in qm.available_quality_measures()

In [100]:
from collections import defaultdict

In [101]:
metadata_dict = {}
for td in training_dicts:
    run_number = td.get('run_number', 0)
    id_base = f"{td['algorithm']}_{td['dataset']}_{td['score']}_{run_number}"
    if id_base in metadata_dict:
        raise Exception("{id_base} already exists. Give a run_number to avoid collisions.")
    else:
        metadata_dict[id_base] = td
        

In [102]:
metadata_dict

{'UMAP_coil-20_trustworthiness_0': {'dataset': 'coil-20',
  'algorithm': 'UMAP',
  'score': 'trustworthiness',
  'meta': 'grid_search',
  'algorithm_params': {'n_components': [2],
   'n_neighbors': array([ 2, 22, 42, 62, 82])},
  'score_params': {'n_neighbors': 12, 'metric': 'euclidean'},
  'meta_params': {'verbose': 10}}}

In [103]:
training_dicts

[{'dataset': 'coil-20',
  'algorithm': 'UMAP',
  'score': 'trustworthiness',
  'meta': 'grid_search',
  'algorithm_params': {'n_components': [2],
   'n_neighbors': array([ 2, 22, 42, 62, 82])},
  'score_params': {'n_neighbors': 12, 'metric': 'euclidean'},
  'meta_params': {'verbose': 10}}]

In [104]:
from sklearn.model_selection import GridSearchCV

In [105]:
meta_estimators = {
    'grid_search':GridSearchCV
}

In [106]:
td

{'dataset': 'coil-20',
 'algorithm': 'UMAP',
 'score': 'trustworthiness',
 'meta': 'grid_search',
 'algorithm_params': {'n_components': [2],
  'n_neighbors': array([ 2, 22, 42, 62, 82])},
 'score_params': {'n_neighbors': 12, 'metric': 'euclidean'},
 'meta_params': {'verbose': 10}}

In [107]:
import joblib

In [108]:
cv_results

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
mean_fit_time,1.63644,2.37816,2.98482,3.6155,3.73691,3.92655,4.11091,4.27164,4.46732,4.94375
std_fit_time,0.0272363,0.0122962,0.0504601,0.161521,0.0737798,0.0657729,0.0568165,0.0698887,0.0473142,0.362012
mean_score_time,0.575108,0.771139,0.800064,0.922732,0.997608,0.996977,0.981751,0.990957,1.00635,1.07782
std_score_time,0.0320018,0.0703172,0.0522966,0.0405968,0.244573,0.231804,0.200115,0.243115,0.187318,0.263265
param_n_components,2,2,2,2,2,2,2,2,2,2
param_n_neighbors,2,12,22,32,42,52,62,72,82,92
params,"{'n_components': 2, 'n_neighbors': 2}","{'n_components': 2, 'n_neighbors': 12}","{'n_components': 2, 'n_neighbors': 22}","{'n_components': 2, 'n_neighbors': 32}","{'n_components': 2, 'n_neighbors': 42}","{'n_components': 2, 'n_neighbors': 52}","{'n_components': 2, 'n_neighbors': 62}","{'n_components': 2, 'n_neighbors': 72}","{'n_components': 2, 'n_neighbors': 82}","{'n_components': 2, 'n_neighbors': 92}"
split0_test_score,0.800097,0.884682,0.860198,0.838047,0.84612,0.836835,0.833799,0.827952,0.804516,0.820189
split1_test_score,0.737786,0.910609,0.915338,0.903391,0.897949,0.898784,0.870386,0.877482,0.857264,0.85815
split2_test_score,0.766183,0.914963,0.896483,0.874721,0.877271,0.87973,0.872562,0.874201,0.863742,0.863275


In [109]:
import json

In [110]:
def save_json(filename, obj):
    with open(filename, 'w') as fw:
        json.dump(obj, fw, indent=2, sort_keys=True)

In [124]:
def normalize_numpy_dict(d):
    ret = d.copy()
    for k, v in ret.items():
        if isinstance(v, np.generic):
            ret[k] = np.asscalar(v)
    return ret

In [122]:
grid_search.best_params_

{'n_components': 2, 'n_neighbors': 22}

In [123]:
[type(y) for y in something(grid_search.best_params_).values()]

[int, int]

In [131]:
cv_results.index.name = 'grid_search_results'

In [132]:
cv_results

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
results,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
mean_fit_time,1.63644,2.37816,2.98482,3.6155,3.73691,3.92655,4.11091,4.27164,4.46732,4.94375
std_fit_time,0.0272363,0.0122962,0.0504601,0.161521,0.0737798,0.0657729,0.0568165,0.0698887,0.0473142,0.362012
mean_score_time,0.575108,0.771139,0.800064,0.922732,0.997608,0.996977,0.981751,0.990957,1.00635,1.07782
std_score_time,0.0320018,0.0703172,0.0522966,0.0405968,0.244573,0.231804,0.200115,0.243115,0.187318,0.263265
param_n_components,2,2,2,2,2,2,2,2,2,2
param_n_neighbors,2,12,22,32,42,52,62,72,82,92
params,"{'n_components': 2, 'n_neighbors': 2}","{'n_components': 2, 'n_neighbors': 12}","{'n_components': 2, 'n_neighbors': 22}","{'n_components': 2, 'n_neighbors': 32}","{'n_components': 2, 'n_neighbors': 42}","{'n_components': 2, 'n_neighbors': 52}","{'n_components': 2, 'n_neighbors': 62}","{'n_components': 2, 'n_neighbors': 72}","{'n_components': 2, 'n_neighbors': 82}","{'n_components': 2, 'n_neighbors': 92}"
split0_test_score,0.800097,0.884682,0.860198,0.838047,0.84612,0.836835,0.833799,0.827952,0.804516,0.820189
split1_test_score,0.737786,0.910609,0.915338,0.903391,0.897949,0.898784,0.870386,0.877482,0.857264,0.85815
split2_test_score,0.766183,0.914963,0.896483,0.874721,0.877271,0.87973,0.872562,0.874201,0.863742,0.863275


In [133]:
# Grid search for best parameters and models
results =[]
for k, td in metadata_dict.items():
        meta = td.get('meta', None)
        if meta == 'grid_search':
            ds = datasets.load_dataset(td['dataset'])
            alg = available_algorithms()[td['algorithm']]()
            score = qm.make_hi_lo_scorer(qm.available_quality_measures()[td['score']], **td['score_params'])
            grid_search = meta_estimators[td['meta']](alg, td['algorithm_params'], scoring=score, **td['meta_params'])
            grid_search.fit(ds.data)#, y=ds.target)

            #save off the results from the grid search
            print(k) # metadata id
            metadata = td.copy()
            metadata['algorithm_params'] = normalize_numpy_dict(grid_search.best_params_) # save td off as k.metadata
            save_json(f"{k}.metadata", metadata)
            best_est = result.best_estimator_ # save this off  as k.model
            joblib.dump(best_est, f"{k}.model")
            cv_results = pd.DataFrame(result.cv_results_).T # save this off as k.csv
            cv_results.index.name = 'grid_search_results'
            cv_results.to_csv(f"{k}-gridsearch.csv")

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] n_components=2, n_neighbors=2 ...................................


  n_components


[CV]  n_components=2, n_neighbors=2, score=0.7723715691585412, total=   2.3s
[CV] n_components=2, n_neighbors=2 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.2s remaining:    0.0s
  n_components


[CV]  n_components=2, n_neighbors=2, score=0.7384096244131455, total=   2.2s
[CV] n_components=2, n_neighbors=2 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.2s remaining:    0.0s
  n_components


[CV]  n_components=2, n_neighbors=2, score=0.7557869116407852, total=   2.4s
[CV] n_components=2, n_neighbors=22 ..................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    9.4s remaining:    0.0s
  n_components


[CV]  n_components=2, n_neighbors=22, score=0.8423415492957748, total=   3.9s
[CV] n_components=2, n_neighbors=22 ..................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   14.0s remaining:    0.0s
  n_components


[CV]  n_components=2, n_neighbors=22, score=0.912723079932587, total=   3.8s
[CV] n_components=2, n_neighbors=22 ..................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   18.6s remaining:    0.0s
  n_components


[CV]  n_components=2, n_neighbors=22, score=0.8989921903214158, total=   3.8s
[CV] n_components=2, n_neighbors=42 ..................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   23.1s remaining:    0.0s
  n_components


[CV]  n_components=2, n_neighbors=42, score=0.8591598200312989, total=   4.5s
[CV] n_components=2, n_neighbors=42 ..................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   28.4s remaining:    0.0s


[CV]  n_components=2, n_neighbors=42, score=0.9008712531599854, total=   4.7s
[CV] n_components=2, n_neighbors=42 ..................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   33.9s remaining:    0.0s
  n_components


[CV]  n_components=2, n_neighbors=42, score=0.8783589894065247, total=   4.8s
[CV] n_components=2, n_neighbors=62 ..................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   39.5s remaining:    0.0s


[CV]  n_components=2, n_neighbors=62, score=0.8260822950523654, total=   4.9s
[CV] n_components=2, n_neighbors=62 ..................................
[CV]  n_components=2, n_neighbors=62, score=0.8822243288792585, total=   4.9s
[CV] n_components=2, n_neighbors=62 ..................................
[CV]  n_components=2, n_neighbors=62, score=0.8815219844709281, total=   5.2s
[CV] n_components=2, n_neighbors=82 ..................................
[CV]  n_components=2, n_neighbors=82, score=0.809519832671241, total=   5.5s
[CV] n_components=2, n_neighbors=82 ..................................
[CV]  n_components=2, n_neighbors=82, score=0.8651066118935838, total=   5.5s
[CV] n_components=2, n_neighbors=82 ..................................
[CV]  n_components=2, n_neighbors=82, score=0.8654692578548212, total=   5.8s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  1.3min finished
  n_components


UMAP_coil-20_trustworthiness_0




In [134]:
!ls -la

total 148272
drwxrwx---  3 ava00125 wheel            4096 Aug 25 20:03 .
drwxrwx--- 14 ava00125 wheel            4096 Aug 23 17:14 ..
-rw-rw----  1 ava00125 domain users    45902 Aug 22 16:28 00-fetch-data.ipynb
-rw-rw----  1 ava00125 domain users  1817543 Jun 30 21:02 01-acw-initial-exploration.ipynb
-rwxrwx---  1 ava00125 domain users   197347 Jul  3 17:07 02-jch-initial-exploration.ipynb
-rw-rw----  1 ava00125 domain users  2061481 Aug 22 16:04 03-kw-synthetic-datasets.ipynb
-rw-rw----  1 ava00125 domain users  1565457 Jul  6 12:47 04-acw-analysis-first-run-toy-mnist.ipynb
-rw-rw----  1 ava00125 domain users 17296073 Aug 22 17:48 05-acw-analysis-first-run-COIL-20.ipynb
-rw-rw----  1 ava00125 domain users  1146968 Jul  4 12:05 06-kw-comparing-algorithms.ipynb
-rw-rw----  1 ava00125 domain users    51732 Jul 12 11:19 08-kw-add-new-datasets.ipynb
-rw-rw----  1 ava00125 domain users 15135975 Aug 22 13:49 09-acw-new-quality-measures.ipynb
-rw-rw----  1 ava00125 domain users  

In [135]:
!head UMAP_coil-20_trustworthiness_0-gridsearch.csv

grid_search_results,0,1,2,3,4,5,6,7,8,9
mean_fit_time,1.6364404360453289,2.3781584898630777,2.9848212401072183,3.6154975096384683,3.736905574798584,3.9265499909718833,4.110905488332112,4.271638790766398,4.467320919036865,4.943752845128377
std_fit_time,0.02723626550435345,0.01229615387105236,0.05046007123215775,0.16152051023454786,0.07377983608944652,0.06577293432561963,0.05681649582266725,0.06988873003901228,0.04731416257190435,0.36201242403389494
mean_score_time,0.5751083691914877,0.7711386680603027,0.800063689549764,0.9227323532104492,0.9976077079772949,0.9969770908355713,0.9817514419555664,0.9909574190775553,1.0063544909159343,1.0778177579243977
std_score_time,0.03200181285231984,0.07031716108939111,0.05229658819025741,0.0405968236954654,0.24457321836993368,0.2318042629165074,0.20011472033788966,0.2431147737679454,0.18731757772903912,0.2632653292258201
param_n_components,2,2,2,2,2,2,2,2,2,2
param_n_neighbors,2,12,22,32,42,52,62,72,82,92
params,"{'n_components': 2, 'n_neighbor

In [136]:
t = pd.read_csv('UMAP_coil-20_trustworthiness_0-gridsearch.csv')

In [137]:
t

Unnamed: 0,grid_search_results,0,1,2,3,4,5,6,7,8,9
0,mean_fit_time,1.6364404360453289,2.3781584898630777,2.9848212401072183,3.6154975096384683,3.736905574798584,3.9265499909718833,4.110905488332112,4.271638790766398,4.467320919036865,4.943752845128377
1,std_fit_time,0.02723626550435345,0.01229615387105236,0.05046007123215775,0.16152051023454786,0.07377983608944652,0.06577293432561963,0.05681649582266725,0.06988873003901228,0.04731416257190435,0.36201242403389494
2,mean_score_time,0.5751083691914877,0.7711386680603027,0.800063689549764,0.9227323532104492,0.9976077079772949,0.9969770908355713,0.9817514419555664,0.9909574190775553,1.0063544909159343,1.0778177579243977
3,std_score_time,0.03200181285231984,0.07031716108939111,0.05229658819025741,0.0405968236954654,0.24457321836993368,0.2318042629165074,0.20011472033788966,0.2431147737679454,0.18731757772903912,0.2632653292258201
4,param_n_components,2,2,2,2,2,2,2,2,2,2
5,param_n_neighbors,2,12,22,32,42,52,62,72,82,92
6,params,"{'n_components': 2, 'n_neighbors': 2}","{'n_components': 2, 'n_neighbors': 12}","{'n_components': 2, 'n_neighbors': 22}","{'n_components': 2, 'n_neighbors': 32}","{'n_components': 2, 'n_neighbors': 42}","{'n_components': 2, 'n_neighbors': 52}","{'n_components': 2, 'n_neighbors': 62}","{'n_components': 2, 'n_neighbors': 72}","{'n_components': 2, 'n_neighbors': 82}","{'n_components': 2, 'n_neighbors': 92}"
7,split0_test_score,0.800097432887926,0.8846819700252797,0.860197724810401,0.838046978451908,0.8461203653545203,0.8368352744673169,0.8337994312026002,0.827951576983267,0.8045157698326713,0.8201889219934994
8,split1_test_score,0.7377855272661609,0.910608899121223,0.9153375917900566,0.9033913416395811,0.8979486427109666,0.8987841579390875,0.8703864210906465,0.8774824696039485,0.857263828698688,0.8581501294089322
9,split2_test_score,0.7661828879258455,0.914962907788612,0.8964830113157578,0.8747212441314555,0.8772706753340557,0.879729821235103,0.8725615444805586,0.8742005988925003,0.863741799085109,0.8632753250270856
