In [1]:
import sklearn.datasets
import sklearn.manifold
import time

import fmsne

In [2]:
# Largest neighborhood size to consider when employing the
# 'red_rnx_auc' function for the reduced quality assessment in the
# demo.
Kup = 10000
Kup

10000

In [3]:
X_hds, label = sklearn.datasets.make_blobs(n_samples=11000, n_features=12, centers=22, cluster_std=1.0, center_box=(-10.0, 10.0), shuffle=True, random_state=3)

In [4]:
X_hds

array([[-2.17835312,  0.83513348,  2.44378383, ..., -4.05643124,
        -7.70324025,  3.86181136],
       [ 2.95942852,  0.73188722,  6.31845425, ...,  4.60871667,
         4.38290148,  7.3494298 ],
       [ 1.55727311,  3.4814885 , -5.08383881, ..., -0.93708656,
        -8.93866356, -0.11697329],
       ...,
       [ 0.7868025 ,  6.93857284, -5.42869924, ..., -6.266406  ,
         3.95897172, -6.41283556],
       [ 2.05716344,  6.41917298,  3.25338142, ..., -4.7592105 ,
         7.82157572,  6.29655331],
       [ 5.55539457,  1.03317579, -0.5089204 , ..., -0.76421238,
         3.25146932,  1.005046  ]])

In [10]:
label

array([19,  7,  0, ..., 11, 12, 21])

In [5]:
N_samp, M_HD = X_hds.shape
print("Number of data samples: {N_samp}".format(N_samp=N_samp))
print("HDS dimension: {M_HD}".format(M_HD=M_HD))

Number of data samples: 11000
HDS dimension: 12


In [6]:
# Targeted dimension of the LD embeddings
dim_LDS = 2
print("Targeted LDS dimension: {dim_LDS}".format(dim_LDS=dim_LDS))

Targeted LDS dimension: 2


In [25]:
big_data = (N_samp >= 10000)
if big_data:
    print('The data set is big in terms of its number of samples.')
    print('Multi-scale SNE, multi-scale t-SNE and t-SNE are hence not applied; only their fast versions are employed (fast multi-scale SNE, fast multi-scale t-SNE and Barnes-Hut t-SNE).')
    print('The reduced DR quality is evaluated; it means that the R_{NX}(K) curve is computed only for K=1 to Kup={Kup}, and that the AUC refers to the area under this reduced curve, with a log scale for K, instead of the full one for K=1 to N-2={v}, with N being the number of data samples.'.format(Kup=Kup, v=N_samp-2, NX='{NX}'))
else:
    print('The data set is moderate in terms of its number of samples.')
    print('Multi-scale SNE, multi-scale t-SNE and t-SNE are hence applied, as well as their fast versions (fast multi-scale SNE, fast multi-scale t-SNE and Barnes-Hut t-SNE).')
    print('The DR quality is completely evaluated; the R_{NX}(K) curve is computed for K=1 to N-2={v}, with N being the number of data samples, and the AUC refers to the area under this curve with a log scale for K.'.format(v=N_samp-2, NX='{NX}'))

The data set is big in terms of its number of samples.
Multi-scale SNE, multi-scale t-SNE and t-SNE are hence not applied; only their fast versions are employed (fast multi-scale SNE, fast multi-scale t-SNE and Barnes-Hut t-SNE).
The reduced DR quality is evaluated; it means that the R_{NX}(K) curve is computed only for K=1 to Kup=10000, and that the AUC refers to the area under this reduced curve, with a log scale for K, instead of the full one for K=1 to N-2=10998, with N being the number of data samples.


In [12]:
y1 = fmsne.fmstsne(X_hds=X_hds, rand_state=fmsne.np.random.RandomState(2))

In [13]:
y2 = fmsne.fmssne(X_hds=X_hds, rand_state=fmsne.np.random.RandomState(2))

In [14]:
y1
y2

array([[-12.07205687, -33.37838333],
       [ 21.43619895,  23.89537509],
       [ 21.75063098,  -1.804262  ],
       ...,
       [ 41.63248488,  38.06032428],
       [ 16.81745447, -54.48726426],
       [ 39.07990616,   9.95451969]])

In [15]:
rnx = [] ## list storing rnx results 
Ys = []  ## list of LD arrays
Ys.append(y1) ## add LDs
Ys.append(y2)

In [16]:
rnx = fmsne.eval_dr_quality_from_data_list(X_hds, Ys)

In [17]:
rnx

[(array([ 5.58414398e-01,  4.89316231e-01,  4.37937605e-01, ...,
         -2.42515185e-04, -1.81867782e-04, -9.09256232e-05]),
  0.4351182960581622),
 (array([0.53095736, 0.44312601, 0.40647448, ..., 0.02803793, 0.03864337,
         0.01991089]),
  0.4344524385413029)]