In [174]:
from sklearn.decomposition import PCA
#from sklearn.manifold import MDS, Isomap
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from src.data.datasets import load_dataset
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.utils.estimator_checks import check_estimator
from umap import UMAP

from src import quality_measures as qm

import numpy as np
import pandas as pd

from src.data import datasets

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project's `src` package) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [3]:
import logging

# Root-logger configuration used by every cell below.
# Records render as "<LEVEL> <timestamp> - <message>", with the timestamp in
# month/day/year 12-hour form.
DATE_FORMAT = "%m/%d/%Y %I:%M:%S %p"
LOG_FORMAT = "%(levelname)s %(asctime)s - %(message)s"

logging.basicConfig(level=logging.INFO, datefmt=DATE_FORMAT, format=LOG_FORMAT)
logger = logging.getLogger()  # no name given -> the root logger

In [4]:
# Shared experiment settings, consumed by the `models` / `quality_measures`
# definitions further down.

# Seed passed to UMAP through `models` so embeddings are repeatable.
random_state = 6502
# Distance metric passed to UMAP through `models`.
metric = "euclidean"
# Target dimensionality (`n_components`) of every embedding in `models`.
low_dimension = 2
# `n_neighbors` used by the trustworthiness and continuity measures.
score_neighbors = 12

In [5]:
# Datasets to run the experiments on. Only names are listed for now.
# TODO: make this a {dataset name: parameters} dict, like `models` below.
ds_names = [
    'broken-swiss-roll',
    'difficult',
    'helix',
    'swiss-roll',
    'twinpeaks',
]

# Fail early, and name the offending entries, if a dataset is not registered.
# The registry is queried once instead of once per name.
_known_datasets = datasets.available_datasets()
_missing_datasets = [ds_name for ds_name in ds_names if ds_name not in _known_datasets]
assert not _missing_datasets, f"Unknown dataset name(s): {_missing_datasets}"


In [7]:
# Estimator classes, keyed by the short name used throughout the notebook.
# ('tSNE': TSNE is currently disabled.)
algorithms = {'PCA': PCA, 'UMAP': UMAP}

# Constructor options for each estimator, keyed by the same short names.
models = {
    'PCA': dict(n_components=low_dimension, svd_solver="arpack"),
    # 'tSNE': dict(n_components=low_dimension, random_state=random_state),
    'UMAP': dict(n_components=low_dimension, metric=metric, random_state=random_state),
}

# Every configured model must refer to a known algorithm.
assert models.keys() <= algorithms.keys()

# Measures offered by `src.quality_measures`; indexed by name further down.
all_quality_measures = qm.available_quality_measures()

# Measures to evaluate, mapped to their keyword options (None = no options).
quality_measures = {
    'strain': None,
    'stress': None,
    'trustworthiness': dict(n_neighbors=score_neighbors),
    'continuity': dict(n_neighbors=score_neighbors),
    '1nn-error': None,
}

# Every requested measure must be one the library actually provides.
assert all(name in all_quality_measures for name in quality_measures)

# Queue of experiment descriptions; starts with a single UMAP entry that has
# no dataset assigned yet.
model_list = [
    dict(
        meta_est=None,
        estimator="UMAP",
        model_opts=models['UMAP'],
        dataset=None,
        run_no=0,
    ),
]


In [175]:
# Fix one algorithm, one dataset and one quality measure, then grid-search the
# algorithm's hyperparameters against that measure.

# Key into `algorithms` selecting the estimator class to tune.
model = 'UMAP'
# Name handed to `load_dataset`.
dataset = 'helix'
# Key into `all_quality_measures`.
score = 'trustworthiness'
# Passed to `make_scorer`: True means a larger measure value is better.
greater_is_better = True
# NOTE(review): same value as `random_state` in the settings cell above;
# consider reusing that name so the two seeds cannot drift apart.
random_seed=6502

In [49]:
# Wrap the selected quality measure with sklearn's generic scorer factory and
# display the result.
# NOTE(review): the grid search below is configured with `trust`, not with
# this `scoring` object -- confirm whether this cell is still needed.
scoring = make_scorer(
    all_quality_measures[score],
    greater_is_better=greater_is_better,
)
scoring

make_scorer(generalized_1nn_error, greater_is_better=False)

In [140]:
# Load the dataset selected above; its `.data` attribute is what the grid
# search is later fitted on.
ds = load_dataset(dataset)
from functools import wraps

In [168]:
def make_hi_lo_scorer(func, greater_is_better=True, **kwargs):
    """Adapt a high/low-dimensional quality measure into an sklearn-style scorer.

    The returned callable has the signature ``scorer(estimator, X, y=None, **kw)``.
    It embeds ``X`` with ``estimator.transform`` and evaluates
    ``func(high_data=X, low_data=<embedding>, ...)``.

    Parameters
    ----------
    func : callable
        Quality measure accepting ``high_data`` and ``low_data`` keyword
        arguments.
    greater_is_better : boolean, default=True
        True if `func` is a score (higher is better). False if it is a loss
        (lower is better), in which case the result is multiplied by -1.
    **kwargs
        Default keyword arguments forwarded to `func` on every call. Keywords
        supplied when the scorer itself is called take precedence.

    Returns
    -------
    callable
        The scoring function described above; ``y`` is accepted but unused.
    """
    # Resolve the orientation once, when the scorer is built.
    multiplier = {True: 1, False: -1}[bool(greater_is_better)]

    def wrapped_func(estimator, X, y=None, **wrap_kw):
        embedding = estimator.transform(X)
        # Call-time keywords override the defaults captured at build time.
        call_kwargs = dict(kwargs)
        call_kwargs.update(wrap_kw)
        raw_value = func(high_data=X, low_data=embedding, **call_kwargs)
        return multiplier * raw_value

    return wrapped_func
   

In [169]:
def scorer(estimator, X, y=None, metric='euclidean', n_neighbors=12):
    """Score an embedding as one minus its summed point-wise untrustworthiness.

    Hand-written scorer with the sklearn ``scorer(estimator, X, y)`` call
    signature.

    Parameters
    ----------
    estimator : object
        Fitted estimator; only its ``transform`` method is used.
    X : array-like
        High-dimensional data. Must expose ``.shape``, which is logged.
    y : ignored
        Present only for signature compatibility.
    metric : str, default='euclidean'
        Forwarded to ``qm.point_untrustworthiness``.
    n_neighbors : int, default=12
        Forwarded to ``qm.point_untrustworthiness``.

    Returns
    -------
    ``1 - np.sum(...)`` over the values returned by
    ``qm.point_untrustworthiness``.
    """
    logger.info(f"scoring X:{X.shape}")

    embedding = estimator.transform(X)
    measure_args = dict(
        high_data=X,
        low_data=embedding,
        metric=metric,
        n_neighbors=n_neighbors,
    )
    pointwise = qm.point_untrustworthiness(**measure_args)
    total_untrustworthiness = np.sum(pointwise)
    return 1 - total_untrustworthiness
# Trustworthiness scorer (12 neighbours, Euclidean distances); this is the
# scoring callable handed to the grid search below.
trust = make_hi_lo_scorer(qm.trustworthiness, metric='euclidean', n_neighbors=12)

In [170]:
dr_stage = "dr"
# Pipeline variant, kept for reference; its parameters would be addressed as
# f'{dr_stage}__<param>':
#alg = Pipeline([("hd", trust), (dr_stage, algorithms[model]())])
#alg.get_params(deep=False)
#param_grid = {f'{dr_stage}__learning_rate': np.arange(0.5, 2., 0.5)}

# Variant in use: grid-search the bare estimator's learning rate.
alg = algorithms[model](random_state=random_seed)
# Round the grid so the candidates are exactly 0.5, 0.8, 1.1, 1.4, 1.7; a raw
# float-step arange carries noise such as 1.4000000000000001 into cv_results_.
# NOTE(review): the recorded cv_results_ show identical test scores for all
# five learning rates -- verify that this parameter actually reaches the fit.
param_grid = {'learning_rate': np.round(np.arange(0.5, 2., 0.3), 1)}


In [171]:
# Grid search over `param_grid`, scored with the trustworthiness scorer.
# verbose=1 prints only the "Fitting k folds for each of n candidates" summary.
grid_search = GridSearchCV(alg, param_grid, scoring=trust, verbose=1)

In [172]:
# Unsupervised fit: no targets are supplied, so `y` keeps its default of None.
gs = grid_search.fit(ds.data)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else

In [138]:
# Show the estimator configured with the best-scoring parameter combination.
grid_search.best_estimator_

UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=0.5, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.1, n_components=2, n_epochs=None,
   n_neighbors=15, negative_sample_rate=5, random_state=6502,
   repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
   target_metric='categorical', target_metric_kwds=None,
   target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
   transform_seed=42, verbose=False)

In [139]:
import pandas as pd
# Tabulate the cross-validation results, transposed so that each candidate is
# a column and each statistic a row.
pd.DataFrame(grid_search.cv_results_).transpose()



Unnamed: 0,0,1,2,3,4
mean_fit_time,1.63251,1.58881,1.73833,1.59135,1.6106
std_fit_time,0.0307917,0.0176061,0.118575,0.0189069,0.0735372
mean_score_time,0.258376,0.260564,0.252258,0.245802,0.280321
std_score_time,0.0224662,0.0210867,0.0110594,0.00884721,0.024209
param_learning_rate,0.5,0.8,1.1,1.4,1.7
params,{'learning_rate': 0.5},{'learning_rate': 0.8},{'learning_rate': 1.1},{'learning_rate': 1.4000000000000001},{'learning_rate': 1.7000000000000002}
split0_test_score,0.980327,0.980327,0.980327,0.980327,0.980327
split1_test_score,0.982438,0.982438,0.982438,0.982438,0.982438
split2_test_score,0.979565,0.979565,0.979565,0.979565,0.979565
mean_test_score,0.980776,0.980776,0.980776,0.980776,0.980776


In [None]:
def apply_est_to_datasets(estimator_name, dataset_list, **kwargs):
    """Queue one `model_list` entry per dataset for the given estimator.

    Each appended entry has the same keys as the entries `model_list` is
    initialised with: "meta_est", "estimator", "model_opts", "dataset" and
    "run_no".

    Parameters
    ----------
    estimator_name : str
        Key into the module-level `models` dict; stored as "estimator".
    dataset_list : iterable of str
        Dataset names; one entry is appended for each, stored as "dataset".
    **kwargs
        NOTE(review): treated as overrides merged on top of
        ``models[estimator_name]`` -- confirm this is the intended use.

    Raises
    ------
    KeyError
        If `estimator_name` has no entry in `models`.

    Returns
    -------
    None
        `model_list` is extended in place.
    """
    base_opts = models[estimator_name]
    for ds_name in dataset_list:
        model_list.append({
            "meta_est": None,
            "estimator": estimator_name,
            # Fresh dict per entry so later edits to one run cannot leak into
            # another run or back into `models`.
            "model_opts": {**base_opts, **kwargs},
            "dataset": ds_name,
            "run_no": 0,
        })