In [None]:
from sklearn.decomposition import PCA
#from sklearn.manifold import MDS, Isomap
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from src.data.datasets import load_dataset
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.utils.estimator_checks import check_estimator
from umap import UMAP

from src import quality_measures as qm

import numpy as np
import pandas as pd

from src.data import datasets

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging

# Layout of each log record and of its timestamp.
LOG_FORMAT = "%(levelname)s %(asctime)s - %(message)s"
DATE_FORMAT = "%m/%d/%Y %I:%M:%S %p"

# Configure the root logger at INFO level with the formats above.
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    datefmt=DATE_FORMAT,
)

# No name is given, so this is the root logger.
logger = logging.getLogger()

In [None]:
# Shared settings reused by the model and quality-measure definitions below.
low_dimension = 2       # passed as n_components to the models
metric = "euclidean"    # passed as the UMAP metric
score_neighbors = 12    # passed as n_neighbors to trustworthiness / continuity
random_state = 6502     # passed as random_state to UMAP

In [None]:
# Names of the datasets to evaluate.
# TODO: this should probably be dataset names and parameters in a dict, like
# the model definitions below.
ds_names = [
    'broken-swiss-roll',
    'difficult',
    'helix',
    'swiss-roll',
    'twinpeaks',
]

# Look the available names up once (instead of once per entry) and report
# exactly which entries are unknown, so a typo is easy to spot.
_unknown_ds_names = sorted(set(ds_names) - set(datasets.available_datasets()))
assert not _unknown_ds_names, f"Unknown dataset name(s): {_unknown_ds_names}"


In [None]:
# Estimator classes, keyed by the short name used throughout this notebook.
algorithms = dict(
    PCA=PCA,
    # tSNE=TSNE,  (disabled)
    UMAP=UMAP,
)

# Constructor keyword arguments for each algorithm, keyed by the same names.
models = dict(
    PCA=dict(n_components=low_dimension, svd_solver="arpack"),
    # tSNE=dict(n_components=low_dimension, random_state=random_state),  (disabled)
    UMAP=dict(n_components=low_dimension, metric=metric, random_state=random_state),
)

# Every configured model must have a matching entry in `algorithms`.
assert set(models).issubset(algorithms)

all_quality_measures = qm.available_quality_measures()

# Quality measure name -> keyword options for that measure (None when no
# options are supplied). Each entry gets its own options dict.
quality_measures = dict([
    ('strain', None),
    ('stress', None),
    ('trustworthiness', dict(n_neighbors=score_neighbors)),
    ('continuity', dict(n_neighbors=score_neighbors)),
    ('1nn-error', None),
])

# Every requested measure must be one that `qm` reports as available.
assert not [name for name in quality_measures if name not in all_quality_measures]

# Run specifications; currently a single UMAP entry that reuses the options
# from `models`.
model_list = [
    dict(
        meta_est=None,
        estimator="UMAP",
        model_opts=models['UMAP'],
        dataset=None,
        run_no=0,
    ),
]


In [None]:
# Fix an algorithm, dataset and quality measure for the grid search.
model = 'UMAP'
dataset = 'helix'
score = 'trustworthiness'
greater_is_better = True
# Reuse the notebook-wide seed (6502) instead of repeating the literal, so the
# two values cannot silently drift apart.
random_seed = random_state

In [None]:
# Load the dataset selected above; its `.data` attribute is what the grid
# search is fitted on later.
ds = load_dataset(dataset)

## Run a grid search with trustworthiness as the score

In [None]:
# Scorer passed to GridSearchCV below. The neighbour count and metric come
# from the shared config values (12 and "euclidean") rather than being
# repeated as literals here.
trust = qm.make_hi_lo_scorer(qm.trustworthiness, n_neighbors=score_neighbors, metric=metric)

In [None]:
dr_stage = "dr"
# Disabled alternative: wrap the estimator in a Pipeline and address its
# parameters through the `dr_stage` prefix.
#alg = Pipeline([("hd", trust), (dr_stage, algorithms[model]())])
#alg.get_params(deep=False)
#param_grid = {f'{dr_stage}__learning_rate': np.arange(0.5, 2., 0.5)}

# Active configuration: tune the bare estimator directly.
alg = algorithms[model](random_state=random_seed)
# Candidate learning rates: approximately 0.5, 0.9, 1.3 and 1.7.
param_grid = {'learning_rate': np.arange(0.5, 2., 0.4)}


In [None]:
# Grid search over `param_grid`, scored with the `trust` scorer.
# Verbosity is set once: 1 is the level that was actually in effect when
# verbose=1000 was passed and then overwritten with True.
grid_search = GridSearchCV(alg, param_grid, scoring=trust, verbose=1)

In [None]:
# Run the search on the dataset's data; no target values are supplied.
gs = grid_search.fit(ds.data, y=None)

In [None]:
# Display the estimator selected by the grid search.
grid_search.best_estimator_

In [None]:
# Show the cross-validation results, transposed for display.
# pandas is already imported as `pd` in the imports cell at the top of the
# notebook, so it is not re-imported here.
pd.DataFrame(grid_search.cv_results_).T