In [10]:
import numpy as np
import os
import glob
import pandas as pd
from sklearn.manifold import Isomap,\
                            LocallyLinearEmbedding,\
                            SpectralEmbedding,\
                            TSNE,\
                            locally_linear_embedding
from umap import UMAP
from tqdm import tqdm
from collections import defaultdict
from sklearn.model_selection import ParameterGrid

from joblib import Parallel, delayed
import matplotlib.pyplot as plt
from utils import transform, calculate_Q_mae, KNN_MAE

import warnings
warnings.simplefilter("ignore")

# Load data

In [2]:
# Intrinsic dimensionality for every dataset label. The .npy file holds a pickled
# dict, hence allow_pickle=True and .item(); only load files from a trusted source.
intrinsic_dims = np.load('./intrinsic_dims_pca_strict.npy', allow_pickle=True).item()

# Original (processed) abundance tables, keyed by '<dataset>_<tax>'.
# NOTE(review): 'o', 'f', 'g' presumably stand for order / family / genus - confirm.
data_orig = {}
for dataset_name in tqdm(['AGP', 'HMP']):
    for tax in ['o', 'f', 'g']:
        label = f'{dataset_name}_{tax}'
        dataframe = pd.read_csv(f'./data_processed/{label}.csv', sep=',')
        # 'Unnamed: 0' is dropped so that it does not end up among the feature columns.
        data_orig[label] = dataframe.drop('Unnamed: 0', axis=1).values

# PCA projections of the same datasets, stored as ';'-separated text files.
data_pca = {}
for dataset_name in tqdm(['AGP', 'HMP']):
    for tax in ['o', 'f', 'g']:
        label = f'{dataset_name}_{tax}'
        data_pca[label] = np.genfromtxt(f'./results/pca/{label}', delimiter=';')

# Output locations: final embeddings and grid-search results.
embeddings_root = 'results/embeddings'
embeddings_params_root = 'results/embeddings_params/'

os.makedirs(embeddings_root, exist_ok=True)
os.makedirs(embeddings_params_root, exist_ok=True)

In [3]:
# # uncomment to use synthetic data

# intrinsic_dims = np.load(f'./intrinsic_dims_pca_strict_synthetic.npy', allow_pickle=True).item()

# data_orig = {}
# for path in glob.glob('data_processed/synthetic/*'):
#     dataframe = pd.read_csv(path, index_col=0)
#     label = path.split('/')[-1].split('.')[0]
#     data_orig[label] = dataframe.values

# data_pca = {}
# for path in tqdm(glob.glob('./results/pca/synthetic/*')):
#     label = path.split('/')[-1]
#     data_pca[label] = np.genfromtxt(path, delimiter = ';')

    
# embeddings_root = 'results/embeddings_synthetic'
# embeddings_params_root = 'results/embeddings_params_synthetic/'

# os.makedirs(embeddings_root, exist_ok=True)
# os.makedirs(embeddings_params_root, exist_ok=True)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:02<00:00,  3.05it/s]


# Hyperparameters Grid Search

In [4]:
def knn_mae_scorer(*args):
    """Forward ``args`` to ``KNN_MAE`` with the fixed settings used for scoring."""
    return KNN_MAE(*args, averaging='median', weights='distance', n_neighbors=4)


def scorer(*args):
    """Forward ``args`` to ``calculate_Q_mae`` with ``knn_mae_scorer`` as its MAE scorer."""
    return calculate_Q_mae(*args, mae_scorer=knn_mae_scorer)

In [7]:
# Search space for every manifold-learning method: the estimator class plus the
# grid of constructor arguments that ParameterGrid expands into configurations.
mf_models = {
    'isomap': {
        'class': Isomap,
        'params_grid': {
            'n_neighbors': [5, 10, 15, 25],
            'p': [1, 2],
            'metric': ['minkowski'],
        },
    },
    'spectral': {
        'class': SpectralEmbedding,
        'params_grid': {
            'affinity': ['nearest_neighbors', 'rbf'],
            'n_neighbors': [5, 10, 15, 25, 50],
            'gamma': [1e-4, 1e-3, 1e-2, None, 1e-1],
            'random_state': [42],
        },
    },
    'lle': {
        'class': LocallyLinearEmbedding,
        'params_grid': {
            'method': ['ltsa', 'modified', 'standard'],
            'reg': [0.0001, 0.001, 0.01],
            'n_neighbors': [10, 15, 25, 30, 50],
            'random_state': [42],
            'eigen_solver': ['dense'],
        },
    },
    'umap': {
        'class': UMAP,
        'params_grid': {
            'n_neighbors': [5, 10, 15, 25, 50],
            'min_dist': [0.1, 0.2, 0.3, 0.4, 0.5],
            'random_state': [42],
            'metric': ['euclidean', 'manhattan'],
            'learning_rate': [0.1, 0.5, 1],
            'n_epochs': [2000],
            'n_jobs': [1],
        },
    },
    'tsne': {
        'class': TSNE,
        # NOTE(review): scikit-learn documents `angle` as used only by
        # method='barnes_hut'; with method='exact' the three values presumably
        # give identical fits and triple the search cost - confirm before pruning.
        # NOTE(review): recent scikit-learn releases rename `n_iter` to
        # `max_iter` - check against the installed version.
        'params_grid': {
            'perplexity': [5, 10, 15, 25, 30, 35],
            'random_state': [42],
            'early_exaggeration': [1, 5, 10, 15, 20, 25, 30, 35],
            'init': ['pca'],
            'metric': ['euclidean', 'manhattan'],
            'method': ['exact'],
            'n_jobs': [1],
            'angle': [0.3, 0.6, 0.7],
            'n_iter': [2000],
        },
    },
}


# Grid search: for every method, evaluate each configuration on each dataset and
# save one results file per method under `embeddings_params_root`.
for mf_type, model in mf_models.items():
    results = defaultdict(dict)
    model_class = model['class']

    for label, X in tqdm(data_pca.items(), desc=mf_type):

        dim = intrinsic_dims[label]
        print(label, dim, mf_type)

        # Work on a per-dataset copy of the grid. Adjusting the shared grid in
        # place would (a) carry the adjustment made for one dataset over to all
        # following datasets and (b) make every saved 'params' entry point at the
        # same dict, so stored grids would no longer match the stored scores.
        model_param_grid = {name: list(values)
                            for name, values in model['params_grid'].items()}

        # to avoid errors in lle: raise every n_neighbors to at least dim + 1.
        # Clamping can map several grid values to the same number, so duplicates
        # are dropped (order preserved) to avoid refitting identical configurations.
        if mf_type == 'lle':
            clamped = [max(k, dim + 1) for k in model_param_grid['n_neighbors']]
            model_param_grid['n_neighbors'] = list(dict.fromkeys(clamped))

        # One `transform` call per configuration, run in parallel on all cores.
        # scores[k] corresponds to list(ParameterGrid(model_param_grid))[k].
        scores = Parallel(n_jobs=-1)(
            delayed(transform)(model_class, X, dim, model_params, scorer)
            for model_params in ParameterGrid(model_param_grid)
        )

        results[label][mf_type] = {'scores': scores, 'params': model_param_grid}

    path = os.path.join(embeddings_params_root, f'mf_learning_metrics_results_pca_{mf_type}')
    np.save(path, results)

0it [00:00, ?it/s]

clust2_dim53 27 lle


  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
1it [00:58, 58.93s/it]

clust3_dim96 36 lle


  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
2it [01:53, 56.28s/it]

clust2_dim96 38 lle


  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
3it [02:47, 55.32s/it]

clust3_dim53 26 lle


4it [03:38, 53.62s/it]

clust4_dim180 48 lle


  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
5it [04:40, 56.77s/it]

clust2_dim180 53 lle


  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
  alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
6it [06:04, 65.73s/it]

clust4_dim53 25 lle


7it [07:30, 72.62s/it]

clust4_dim96 34 lle


8it [08:32, 69.18s/it]

clust3_dim180 51 lle


9it [09:32, 63.56s/it]


# Visualization

In [5]:
# Gather the saved grid-search results. Each file whose name contains
# 'metrics_results_pca' is loaded and keyed by the last '_'-separated token of
# the part of its name that precedes the first '.'.
all_metrics = {}
for metrics_results_name in os.listdir(embeddings_params_root):
    if 'metrics_results_pca' not in metrics_results_name:
        continue
    method_key = metrics_results_name.split('.')[0].split('_')[-1]
    metrics_path = os.path.join(embeddings_params_root, metrics_results_name)
    all_metrics[method_key] = np.load(metrics_path, allow_pickle=True).item()

In [13]:
# One figure per method, one panel per dataset: every grid configuration is a
# point at (Q_loc, Q_glob), coloured by the third score column.
for mf_type, mf_metrics in all_metrics.items():
    n_panels = len(mf_metrics)
    if n_panels == 0:
        # Nothing to draw; plt.subplots cannot create a zero-column grid.
        continue

    # squeeze=False keeps `axes` two-dimensional, so indexing also works when a
    # method has results for a single dataset only.
    fig, axes = plt.subplots(nrows=1, ncols=n_panels,
                             figsize=(n_panels * 5, 5), dpi=200, squeeze=False)

    for ax, (label, mf_label_metrics) in zip(axes[0], mf_metrics.items()):

        scores = np.array(mf_label_metrics[mf_type]['scores'])
        dataset_name = label.split('_')[0]
        tax_name = label.split('_')[-1]

        sc = ax.scatter(scores[:, 0], scores[:, 1], c=scores[:, 2])

        ax.set_title(f'MF algorithm: {mf_type} \n Dataset: {dataset_name}, Tax: {tax_name.capitalize()}')
        ax.set_xlabel('Q_loc')
        ax.set_ylabel('Q_glob')
        # Each panel has its own colour scale, so each gets its own colorbar;
        # a single bar built from the last scatter would mislabel the others.
        fig.colorbar(sc, ax=ax, orientation='vertical', label='MAE')

    # Applied per figure; a single call after the loop only affects the last one.
    fig.tight_layout()

plt.show()

# Save best params

In [7]:
# For every dataset label and method, pick the grid configuration with the
# lowest MAE and remember its parameters together with its three scores.
best_params = defaultdict(dict)

for mf_type, mf_metrics in all_metrics.items():
    for label, mf_label_metrics in mf_metrics.items():

        scores = np.array(mf_label_metrics[mf_type]['scores'])
        # Re-expand the stored grid; entry k belongs to scores[k].
        params = list(ParameterGrid(mf_label_metrics[mf_type]['params']))

        if len(params) != len(scores):
            raise ValueError(
                f'{label}/{mf_type}: {len(scores)} scores but {len(params)} '
                f'parameter combinations - stored grid does not match the scores'
            )

        # Column layout: Q_loc, Q_glob, ..., MAE (MAE is the last column).
        qloc = scores[:, 0]
        qglob = scores[:, 1]
        maes = np.asarray(scores[:, -1], dtype=float)

        # Plain argmin returns the position of a NaN if one is present, which
        # would select a failed configuration as "best"; ignore NaNs instead.
        # If every score is NaN, fall back to index 0 (what argmin would return).
        if np.isnan(maes).all():
            mae_argmin = 0
        else:
            mae_argmin = int(np.nanargmin(maes))

        best_params[label][mf_type] = {
            # Copy, so later in-place edits of the chosen parameters cannot
            # leak back into the expanded grid.
            'best_params_mae': dict(params[mae_argmin]),
            'mae': maes[mae_argmin],
            'Q_loc': qloc[mae_argmin],
            'Q_glob': qglob[mae_argmin],
        }


In [8]:
# Method name (as used in the result file names and in `best_params`) -> estimator class.
methods_dict = dict(
    tsne=TSNE,
    lle=LocallyLinearEmbedding,
    umap=UMAP,
    isomap=Isomap,
    spectral=SpectralEmbedding,
)

In [16]:
PERCENTILE = 95                # per-sample errors at or above this percentile are dropped each round
DATA_PERCENT_THRESHOLD = 0.9   # never keep fewer than this fraction of the original samples
MAX_TRIM_ITERATIONS = 20       # upper bound on the number of trimming rounds


def _trim_outliers(X_full, Z_full, percentile, min_fraction, max_iterations):
    """Iteratively drop the samples with the largest per-sample KNN_MAE score.

    Each round scores the currently kept samples with
    ``KNN_MAE(..., averaging=None, weights='distance')`` and keeps those strictly
    below the given percentile of the scores. Trimming stops before a round that
    would leave fewer than ``min_fraction`` of the original samples, or after
    ``max_iterations`` rounds.

    Returns the kept rows of ``X_full``, the kept rows of ``Z_full`` and the
    indexes of the kept rows in the original arrays.
    """
    n_total = X_full.shape[0]
    X_kept = X_full.copy()
    Z_kept = Z_full.copy()
    kept_indexes = np.arange(n_total)

    for _ in range(max_iterations):
        scoring_list = KNN_MAE(X_kept, Z_kept, averaging=None, weights='distance')
        mask = scoring_list < np.percentile(scoring_list, percentile)
        if mask.sum() / n_total < min_fraction:
            break
        X_kept = X_kept[mask]
        Z_kept = Z_kept[mask]
        kept_indexes = kept_indexes[mask]

    return X_kept, Z_kept, kept_indexes


# For every dataset and method: load the cached embedding if it exists, otherwise
# fit the method with its best parameters, trim outliers, score and cache it.
for label, best_params_ in tqdm(best_params.items()):

    X = data_pca[label]
    X_orig = data_orig[label]
    intdim = intrinsic_dims[label]

    for method_name, params_dict in best_params_.items():

        embedding_path = os.path.join(embeddings_root, f'{label}_{method_name}')

        if os.path.isfile(embedding_path + '.npy'):
            print(embedding_path, f'Loading {method_name} for {label}...')
            d = np.load(embedding_path + '.npy', allow_pickle=True).item()

        else:
            print(embedding_path, f'Calculating {method_name} for {label}...')

            d = {}

            d['method_name'] = method_name
            # Build a new dict instead of editing the one held in `best_params`;
            # n_jobs=-1 is set for the final fit and is part of the saved parameters.
            d['parameters'] = {**params_dict['best_params_mae'], 'n_jobs': -1}
            d['label'] = label
            d['Q_loc'] = params_dict['Q_loc']
            d['Q_glob'] = params_dict['Q_glob']
            d['intrinsic_dim'] = intdim

            print('Learning...')
            Z = methods_dict[method_name](n_components=intdim,
                                          **d['parameters']).fit_transform(X)

            d['Z'] = Z

            # NOTE(review): unlike `knn_mae_scorer`, this call does not pass
            # n_neighbors, so the "initial" and "after removing outliers" MAE
            # may use different neighbourhood sizes - confirm KNN_MAE's default.
            d['knn_mae_loo_orig'] = KNN_MAE(X_orig, Z, averaging='median', weights='distance')

            N = X_orig.shape[0]
            X_, Z_, inliers_indexes = _trim_outliers(X_orig, Z,
                                                     PERCENTILE,
                                                     DATA_PERCENT_THRESHOLD,
                                                     MAX_TRIM_ITERATIONS)

            # setdiff1d yields a sorted integer array, also when nothing was removed.
            outliers_indexes = np.setdiff1d(np.arange(N), inliers_indexes)
            Q_loc_, Q_glob_, mae_ = scorer(X_, Z_)

            d['inliers_indexes_mae'] = inliers_indexes
            d['outliers_indexes_mae'] = outliers_indexes
            d['X_'] = X_
            d['Z_'] = Z_ # final embedding
            d['Q_loc_'] = Q_loc_
            d['Q_glob_'] = Q_glob_
            d['knn_mae_loo_orig_'] = mae_ # final mae

            np.save(embedding_path, d)

        # Both branches report from `d`, so cached and fresh results print alike.
        mae, Q_loc, Q_glob = d['knn_mae_loo_orig'], d['Q_loc'], d['Q_glob']
        mae_, Q_loc_, Q_glob_ = d['knn_mae_loo_orig_'], d['Q_loc_'], d['Q_glob_']

        print(f'Initial metrics: mae={mae}, Q_loc={Q_loc}, Q_glob={Q_glob}')
        print(f'After removing outliers: mae={mae_}, Q_loc={Q_loc_}, Q_glob={Q_glob_}')
        print('-----------------------------------------------------------','\n')
