### Tests
Use of agglomerative clustering with the HR diagram.

Here we test metrics for identifying good clustering solutions.

In [1]:
import sys, os
sys.path.append('../../src')

from numba import jit

import matplotlib.pyplot as plt
from pylab import rcParams
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
import pickle

from math import ceil
import math
import gaia_utils as gu
from sklearn import cluster
from sklearn.neighbors import kneighbors_graph
from astroML.correlation import two_point
from astroML.correlation import bootstrap_two_point_angular

%matplotlib inline

## Directories
# NOTE(review): absolute, machine-specific path; the notebook only runs
# unchanged on a machine with this layout.
rootdir = "/home/stephane/Science/GAIA"
wdir    = "%s/products"%(rootdir)
datadir = "%s/master/notebooks/data"%(rootdir)

# The working directory is changed for the whole kernel session, so every
# relative path opened afterwards (e.g. 'dataQran.pickle') resolves against wdir.
os.chdir(wdir)
rcParams['figure.figsize'] = 9, 6
###################################

# NOTE(review): clustername is "NGC 1647" while the VOTable finally selected
# below is the Ruprecht 1 one -- confirm this mismatch is intended.
clustername = "NGC 1647"
# voname = 'NGC 752-1.0deg.vot'
# voname = 'NGC 2682-3.0deg.vot'
# The next assignment is immediately overridden by the one after it;
# only the last value of voname is effective.
voname = 'NGC 1647-2.0deg.vot'
voname = "Ruprecht 1-2.0deg.vot"
# Field radius; the metrics cell passes 0.9 * RADIUS as MAXRADIUS
# (presumably degrees, matching the "2.0deg" in the file name -- TODO confirm).
RADIUS   = 2.0
# Number of clusters requested from k-means, Ward and spectral clustering.
kCluster = 8
votable_disk = False
# Cluster distance used in the last cell to convert 1 degree to parsecs
# (presumably in pc -- TODO confirm).
distclust = 572.0
# Eight feature weights assigned to source.weight. The scan functions build
# them as [angle, angle, distance, velocity, velocity, mag, mag, mag].
# The first list is overridden by the second; only the last one is effective.
WEIGHT = [3.,3.,11.,5.,5., 2., 2., 2.]
WEIGHT = [4.87863010104081, 4.87863010104081, 4.306272782136562, 2.5786331381796077, 2.5786331381796077, 1.4117964989460319, 1.4117964989460319, 1.4117964989460319]

## DBSCAN parameters (passed as eps / min_samples to cluster.DBSCAN)
eps = 1.5
min_samples = 20
## Ward: number of neighbours used to build the kneighbors_graph connectivity
neighbors = 30

Created TAP+ (v1.0.1) - Connection:
	Host: gea.esac.esa.int
	Use HTTPS: False
	Port: 80
	SSL Port: 443


In [2]:
## plot2D and plot3D


def plot2d(df, labels, ilab, cmap = "gist_stern" ,color = False):
    """Scatter-plot the rows of ``df`` carrying label ``ilab`` on a 2x2 grid.

    Panels (``df`` column indices in parentheses):
      * top-left     : column 0 vs column 1, axes labelled "l" / "b"
      * bottom-left  : column 0 vs column 2, axes labelled "l" / "d (pc)"
      * top-right    : column 3 vs column 4, axes labelled "Vdra" / "Vdec"
      * bottom-right : column 6 vs column 5, axes labelled "G-R" / "G"

    Parameters
    ----------
    df : 2-D array indexed as ``df[rows, column]``.
    labels : array of cluster labels, one per row of ``df``.
    ilab : the label whose members are plotted.
    cmap : colormap name used wherever points are coloured by column 2.
    color : if True the top-left panel is coloured by column 2, otherwise it
        is drawn in black. The other three panels are always coloured.

    Side effect: sets the global ``rcParams['figure.figsize']`` to 14 x 14.
    """
    # Global default figure size is changed on purpose of keeping the
    # original behaviour (it stays changed after the call).
    rcParams['figure.figsize'] = 14, 14
    fig, grid = plt.subplots(2, 2)

    # Row selection is computed once and reused for every column.
    picked = np.where(labels == ilab)

    def col(j):
        return df[picked, j]

    shade = col(2)

    # --- top-left: sky position ---
    sky = grid[0,0]
    if color:
        sky.scatter(col(0), col(1), s = 0.5, c = shade, cmap = cmap)
    else:
        sky.scatter(col(0), col(1), s = 0.5, c = "k")
    sky.set_xlabel("l")
    sky.set_ylabel("b")

    # --- bottom-left: longitude against distance ---
    depth = grid[1,0]
    depth.scatter(col(0), shade, s = 0.5, c = shade, cmap = cmap)
    depth.set_xlabel("l")
    depth.set_ylabel("d (pc)")

    # --- top-right: velocity plane ---
    motion = grid[0,1]
    motion.scatter(col(3), col(4), s = 0.5, c = shade, cmap = cmap)
    motion.set_xlabel("Vdra")
    motion.set_ylabel("Vdec")

    # --- bottom-right: colour-magnitude diagram (y axis inverted via limits) ---
    cmd = grid[1,1]
    cmd.scatter(col(6), col(5), s = 0.5, c = shade, cmap = cmap)
    cmd.set_xlabel("G-R")
    cmd.set_ylabel("G")
    cmd.set_xlim(-1.,1.5)
    cmd.set_ylim(27.,10)

    plt.show()
    

In [3]:
## astrometric conversion
## 
def convert_to_cartesian(lgal, bga, dist, offCenter = (0., 0.)):
    """Convert galactic longitude/latitude (degrees) and distance to Cartesian.

    The offset ``offCenter`` is subtracted from the angles first, then the
    standard spherical-to-Cartesian transformation is applied:

        x = d * cos(b) * cos(l)
        y = d * cos(b) * sin(l)
        z = d * sin(b)

    Parameters
    ----------
    lgal : sequence of galactic longitudes, in degrees.
    bga : sequence of galactic latitudes, in degrees.
    dist : sequence of distances (the result is in the same unit; the
        original code tagged it as parsecs).
    offCenter : pair ``(l_offset, b_offset)`` in degrees subtracted from
        ``lgal`` and ``bga`` before the conversion.

    Returns
    -------
    (xx, yy, zz) : three 1-D numpy arrays with one entry per input star.

    Diagnostic min/max values of the shifted angles and of the resulting
    coordinates are printed, as before.
    """
    # The previous body read an undefined name `bgal` (the parameter is `bga`)
    # and relied on `coord` / `u`, which this notebook never imports. The
    # conversion is now done directly with numpy, for all stars at once.
    lgal = np.atleast_1d(np.asarray(lgal, dtype=float))
    bgal = np.atleast_1d(np.asarray(bga, dtype=float))
    dist = np.atleast_1d(np.asarray(dist, dtype=float))

    lgalOff = lgal - offCenter[0]
    bgalOff = bgal - offCenter[1]

    print(offCenter[0])
    print(offCenter[1])
    print(min(lgalOff))
    print(max(lgalOff))
    print(min(bgalOff))
    print(max(bgalOff))

    lrad = np.radians(lgalOff)
    brad = np.radians(bgalOff)
    cosb = np.cos(brad)

    xx = dist * cosb * np.cos(lrad)
    yy = dist * cosb * np.sin(lrad)
    zz = dist * np.sin(brad)

    print("## XX")
    print("min, max: %f , %f"%(min(xx),max(xx)))
    print("## YY")
    print("min, max: %f , %f"%(min(yy),max(yy)))
    print("## ZZ")
    print("min, max: %f , %f"%(min(zz),max(zz)))

    return(xx,yy,zz)

In [4]:
## Read the data and do the conversion


# Build the project-level source object for the configured cluster name.
source = gu.source(clustername)
# Feature weights; set before the conversion/normalisation calls below
# (presumably consumed by them -- gu internals are not visible here).
source.weight = WEIGHT
# Alternative to reading a local file: query the archive directly (disabled).
#source.query(RADIUS, errtol = 0.2, dump = True)
source.read_votable(voname)
# Filter/convert the catalogue; the captured output of this run reports
# 33497 stars selected out of 43294.
source.convert_filter_data(mag_range = [0., 40])
# Normalisation used for clustering; later cells cluster on source.dfnorm
# and evaluate the metrics on source.df.
source.normalization_normal()
# Alternative min-max normalisation (disabled).
#source.normalization_minmax()

## Ruprecht 1-2.0deg.vot read...
## Total stars: 43294
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..


()

### Metrics

Metrics to quantify the quality of a clustering solution.

In [5]:
def metric1(df, labels, APERTURE = 0.2 , MAXRADIUS = 1. , NBOOTSTRAP =20 ):
    """Density-contrast quality metric, assuming the cluster sits at the field centre.

    For every label ``0 .. max(labels)`` the number of member points inside a
    circle of radius ``APERTURE`` around the centre (the mean of columns 0
    and 1 of ``df``) is compared with the number found inside ``NBOOTSTRAP``
    circles of the same radius placed at random angles, at a distance drawn
    uniformly in ``[APERTURE, MAXRADIUS - APERTURE)`` from the centre.
    An empty comparison circle is counted as holding one point.

    Returns a dict with three parallel lists:
      'label' : the label index,
      'Q'     : mean of the central/outer density ratios,
      'Q_err' : standard deviation of those ratios.
    """
    centre_x = np.mean(df[:,0])
    centre_y = np.mean(df[:,1])
    aper_sq = APERTURE*APERTURE

    def count_inside(points, x0, y0):
        # Number of points whose squared distance to (x0, y0) is below APERTURE^2.
        dx = points[:,0] - x0
        dy = points[:,1] - y0
        return int(np.count_nonzero(dx*dx + dy*dy < aper_sq))

    result = {'label': [], 'Q': [], 'Q_err': []}

    for lab in range(max(labels)+1):
        members = df[np.where(labels == lab)[0], :]
        core_density = count_inside(members, centre_x, centre_y) / aper_sq

        # Angles are drawn before radii: keep this order so that seeded runs
        # consume the random stream exactly as before.
        thetas  = np.random.uniform(0., 2*math.pi, NBOOTSTRAP)
        offsets = np.random.uniform(APERTURE,MAXRADIUS-APERTURE, NBOOTSTRAP)

        ratios = np.zeros(NBOOTSTRAP)
        for k, (theta, rho) in enumerate(zip(thetas, offsets)):
            x_k = centre_x + rho*math.cos(theta)
            y_k = centre_y + rho*math.sin(theta)
            # Floor of one point avoids a division by zero for empty circles.
            field_density = max(1, count_inside(members, x_k, y_k)) / aper_sq
            ratios[k] = core_density / field_density

        result['label'].append(lab)
        result['Q'].append(np.mean(ratios))
        result['Q_err'].append(np.std(ratios))

    return(result)
                          
    
def metric2(df, labels, APERTURE = 0.2 , MAXRADIUS = 1. , NBOOTSTRAP = 50 , SIGCLIP = 0.):
    """Density-contrast metric that only keeps the better-populated comparison circles.

    As in the simpler metric, the cluster is assumed to sit at the field
    centre (mean of columns 0 and 1 of ``df``) and, for every label
    ``0 .. max(labels)``, the member count inside the central circle of radius
    ``APERTURE`` is compared with counts in ``NBOOTSTRAP`` randomly placed
    circles. Here each outer count gets a random offset drawn in
    ``[1, 1.1)``, and only the circles whose standardised count
    ``(count - mean) / std`` exceeds ``SIGCLIP`` enter the ratio, which
    discards under-populated regions ("holes") of the surrounding field.

    Returns a dict with three parallel lists:
      'label' : the label index,
      'Q'     : mean of the central/outer density ratios over the kept circles,
      'Q_err' : standard deviation of those ratios.
    """
    jitter_width = 0.1
    centre_x = np.mean(df[:,0])
    centre_y = np.mean(df[:,1])
    aper_sq = APERTURE*APERTURE

    def count_inside(points, x0, y0):
        # Number of points whose squared distance to (x0, y0) is below APERTURE^2.
        dx = points[:,0] - x0
        dy = points[:,1] - y0
        return int(np.count_nonzero(dx*dx + dy*dy < aper_sq))

    result = {'label': [], 'Q': [], 'Q_err': []}

    for lab in range(max(labels)+1):
        members = df[np.where(labels == lab)[0], :]
        core_density = count_inside(members, centre_x, centre_y) / aper_sq

        # Draw order (angles, radii, then one jitter per circle) is kept so
        # that seeded runs consume the random stream exactly as before.
        thetas  = np.random.uniform(0., 2*math.pi, NBOOTSTRAP)
        offsets = np.random.uniform(APERTURE,MAXRADIUS-APERTURE, NBOOTSTRAP)

        field_counts = np.zeros(NBOOTSTRAP)
        for k in range(NBOOTSTRAP):
            x_k = centre_x + offsets[k]*math.cos(thetas[k])
            y_k = centre_y + offsets[k]*math.sin(thetas[k])
            # The offset keeps every count >= 1 and makes the counts distinct.
            field_counts[k] = count_inside(members, x_k, y_k) + np.random.uniform(1., 1.+ jitter_width)

        zscore = (field_counts - np.mean(field_counts)) / np.std(field_counts)
        kept = field_counts[zscore > SIGCLIP]

        ratios = core_density / (kept / aper_sq)

        result['label'].append(lab)
        result['Q'].append(np.mean(ratios))
        result['Q_err'].append(np.std(ratios))

    return(result)

In [None]:
def iter_parameters(angmin,angmax,dmin,dmax,vmin,vmax,magmin,magmax, kmin, kmax ,ntrial, von = "test.vot", radius = 1):
    """Grid-scan the feature weights and the number of clusters for k-means.

    Each of the four weight groups (angle, distance, velocity, magnitude) is
    sampled on ``ntrial`` evenly spaced values between its min and max, and
    the number of clusters runs over ``range(kmin, kmax)`` (``kmax`` itself
    is excluded). For every combination the catalogue read from ``von`` is
    re-filtered and re-normalised with the weight vector
    ``[a, a, d, v, v, m, m, m]``, clustered with k-means and scored with
    ``metric1`` (``APERTURE = 0.2``, ``MAXRADIUS = radius``, 10 bootstraps).

    Returns ``{'kmeans': {'weight': [...], 'metric': [...]}}`` with one weight
    vector and one ``metric1`` result per combination.

    The total number of fits is ``ntrial**4 * (kmax - kmin)``, so this can be
    very slow.
    """
    from itertools import product

    s = gu.source(clustername)
    s.read_votable(von)

    angle    = np.linspace(angmin,angmax,ntrial)
    distance = np.linspace(dmin,dmax,ntrial)
    vel      = np.linspace(vmin, vmax,ntrial)
    mag      = np.linspace(magmin, magmax,ntrial)
    kclus    = range(kmin,kmax)

    metric = {'kmeans': {'weight': [], 'metric': []}}

    # product() iterates with the last argument varying fastest, i.e. the same
    # order as the nested loops angle > velocity > magnitude > distance > k.
    for a, v, m, d, k in product(angle, vel, mag, distance, kclus):
        WEIGHT = [a,a,d,v,v,m,m,m]
        s.weight = WEIGHT
        s.convert_filter_data(mag_range = [0., 40])
        s.normalization_normal()

        print(WEIGHT)

        # Use the scanned k: the global kCluster was used here before, which
        # made the loop over the number of clusters a no-op.
        kmeans = cluster.KMeans(n_clusters= k, max_iter = 2000, n_init = 50)
        kmeans.fit(s.dfnorm)
        labels_k = kmeans.labels_
        qk = metric1(s.df, labels_k, APERTURE = 0.2 , MAXRADIUS = radius , NBOOTSTRAP =10 )
        metric['kmeans']['weight'].append(WEIGHT)
        metric['kmeans']['metric'].append(qk)

    return(metric)


def random_weighting(angmin,angmax,dmin,dmax,vmin,vmax,magmin,magmax, kmin, kmax , von = "test.vot", radius = 1, NBOOTSTRAP = 100, SCAN = None):
    """Randomly sample feature weights and cluster counts, and score four clusterers.

    ``NBOOTSTRAP`` trials are drawn: the angle, distance, velocity and
    magnitude weights uniformly in their ``[min, max)`` ranges, and the number
    of clusters as an integer in ``[kmin, kmax)``. For each trial the
    catalogue read from ``von`` is re-filtered and re-normalised with the
    weight vector ``[a, a, d, v, v, m, m, m]``, then clustered with k-means,
    Ward (kNN-constrained) and DBSCAN, and every solution is scored with
    ``metric2`` (aperture 0.5, ``MAXRADIUS = 0.9 * radius``, 50 bootstraps).

    NOTE: spectral clustering is currently disabled; the 'spectral' entry
    re-scores the Ward labels (the run prints "# !!! Spectral = Ward").

    Parameters
    ----------
    von : name of the VOTable to read.
    radius : field radius, scaled by 0.9 to give the metric's MAXRADIUS.
    NBOOTSTRAP : number of random trials.
    SCAN : result of a previous call to extend in place, or None to start
        a new one.

    Returns
    -------
    dict keyed by 'kmeans', 'ward', 'spectral', 'dbscan'; each value holds the
    parallel lists 'weight' (weight vector per trial) and 'metric'
    (``metric2`` result per trial). When DBSCAN finds no cluster, a
    placeholder result with Q = 0 is stored for that trial.
    """
    # Re-seed from OS entropy so that successive scans draw different samples.
    np.random.seed()

    s = gu.source(clustername)
    s.read_votable(von)

    aper = 0.5

    if SCAN is None:
        metric = {name: {'weight': [], 'metric': []}
                  for name in ('kmeans', 'ward', 'spectral', 'dbscan')}
    else:
        metric = SCAN

    def _quality(labels):
        # Score one labelling on the (un-normalised) data of this trial.
        return metric2(s.df, labels, APERTURE = aper , MAXRADIUS = 0.9 * radius , NBOOTSTRAP =50 )

    def _record(name, weight, quality, banner):
        # Store one trial for one method and report its best Q.
        metric[name]['weight'].append(weight)
        metric[name]['metric'].append(quality)
        print("## Best Q: %3.1f"%(max(quality['Q'])))
        print(banner)

    angle     = np.random.uniform(angmin, angmax, NBOOTSTRAP)
    distance  = np.random.uniform(dmin, dmax, NBOOTSTRAP)
    velocity  = np.random.uniform(vmin, vmax, NBOOTSTRAP)
    magnitude = np.random.uniform(magmin, magmax, NBOOTSTRAP)
    ncluster  = np.random.randint(kmin, kmax, NBOOTSTRAP)

    for i in range(NBOOTSTRAP):
        WEIGHT = [angle[i],angle[i],distance[i],velocity[i],velocity[i], magnitude[i],magnitude[i], magnitude[i]]
        nclust = int(ncluster[i])

        s.weight = WEIGHT
        s.convert_filter_data(mag_range = [0., 40])
        s.normalization_normal()

        print(WEIGHT)
        print(i)

        # kmeans
        kmeans = cluster.KMeans(n_clusters= nclust, max_iter = 2000, n_init = 50)
        kmeans.fit(s.dfnorm)
        _record('kmeans', WEIGHT, _quality(kmeans.labels_), "# k-means done")

        # ward
        # The neighbour graph must be built on the same re-weighted data that
        # is clustered (s.dfnorm), not on the notebook-level `source` object.
        connectivity = kneighbors_graph(s.dfnorm, n_neighbors= neighbors, include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        ward = cluster.AgglomerativeClustering(n_clusters= nclust, linkage='ward', connectivity=connectivity)
        ward.fit(s.dfnorm)
        _record('ward', WEIGHT, _quality(ward.labels_), "# Ward done")

        # Spectral -- disabled: the Ward labels are scored again instead.
        # To re-enable, fit
        #   cluster.SpectralClustering(n_clusters = nclust, eigen_solver='arpack', affinity="nearest_neighbors")
        # on s.dfnorm and use its labels_ here.
        print("# !!! Spectral = Ward")
        labels_s = ward.labels_
        _record('spectral', WEIGHT, _quality(labels_s), "# Spectral done")

        # DBSCAN
        dbscan = cluster.DBSCAN(eps = eps, min_samples = min_samples)
        dbscan.fit(s.dfnorm)
        labels_d = dbscan.labels_
        unique_labels = set(labels_d)
        print(unique_labels)
        # Label -1 marks noise points and is not a cluster.
        n_clusters_ = len(unique_labels) - (1 if -1 in labels_d else 0)
        if n_clusters_ > 0:
            qd = _quality(labels_d)
            # `lab` (not `i`) so the trial index of the outer loop is not shadowed.
            for lab in range(max(labels_d)+1):
                members = np.where(labels_d == lab)
                print("# Label %5d : %5d  Dist: %6.1f (%5.1f)"%(lab,len(labels_d[members]), np.median(s.df[members,2]), np.std(s.df[members,2]) ))
            print("##")
        else:
            # No cluster found: store a neutral placeholder so that every
            # trial has an entry.
            qd = {}
            qd['Q'] = [0., 0.]
            qd['Q_err'] = [0.,0.]
            qd['label'] = [0,1]

        _record('dbscan', WEIGHT, qd, "# DBSCAN done")

    return(metric)
    

### Clustering

In [None]:
## Run the weight scan and save the result.
## This can take a very long time: every trial fits several clusterers
## on the full star sample.
## To continue a previous scan, load the earlier result (disabled below)
## and pass it as SCAN instead of None.
## NOTE(review): pickle.load executes arbitrary code -- only load files
## produced by this notebook.
#with open('dataQran.pickle', 'rb') as f:
#    previousMetric = pickle.load(f)

# Alternative: exhaustive grid scan (disabled).
# q = iter_parameters(2.,5.,7.,12.,2.,4.,1.,3., 7,8 ,2, von = voname, radius = 2.)
# Positional arguments are the (min, max) ranges for the angle, distance,
# velocity and magnitude weights, followed by kmin, kmax.
q = random_weighting(1.,7.,3.,15.,1.,7.,1.,5., 5 ,12 , von = voname, radius = 2., NBOOTSTRAP = 250, 
                     SCAN = None)

# Overwrites any existing 'dataQran.pickle' in the current working directory.
with open('dataQran.pickle', 'wb') as f:
    pickle.dump(q, f, pickle.HIGHEST_PROTOCOL)

## Ruprecht 1-2.0deg.vot read...
## Total stars: 43294
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[1.3584658001091998, 1.3584658001091998, 4.059501375813087, 3.070450872236557, 3.070450872236557, 2.872387771335158, 2.872387771335158, 2.872387771335158]
0
## Best Q: 1.2
# k-means done
## Best Q: 1.1
# Ward done
# !!! Spectral = Ward
## Best Q: 1.1
# Spectral done
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, -1}
# Label     0 :   662  Dist: 1670.9 (161.8)
# Label     1 :    23  Dist:  935.7 ( 47.3)
# Label     2 :    41  Dist: 1883.1 ( 63.9)
# Label     3 :    20  Dist:  585.8 ( 79.3)
# Label     4 :   107  Dist: 1141.8 (107.2)
# Label     5 :    20  Dist: 1332.7 ( 65.2)
# Label     6 :    13  Dist: 1582.7 ( 62.2)
# Label     7 :    18  Dist:  889.1 ( 51.6)
# Label     8 :    49  Dist: 1593.0 ( 87.3)
# Label     9 :   151  Dist: 1383.0 (117.7)
# Label    10 :    82  Dist:  772.2 ( 69.9)
# Label    11 :   

## Best Q: 0.9
# k-means done
## Best Q: 1.1
# Ward done
# !!! Spectral = Ward
## Best Q: 1.0
# Spectral done
{-1}
## Best Q: 0.0
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[2.251653900641867, 2.251653900641867, 13.229670402248505, 5.614270201905201, 5.614270201905201, 1.697984829185613, 1.697984829185613, 1.697984829185613]
18
## Best Q: 1.2
# k-means done
## Best Q: 1.2
# Ward done
# !!! Spectral = Ward
## Best Q: 1.2
# Spectral done
{-1}
## Best Q: 0.0
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[4.819859461158827, 4.819859461158827, 3.9270248601201723, 3.2982386446135976, 3.2982386446135976, 1.7981825865867225, 1.7981825865867225, 1.7981825865867225]
19
## Best Q: 1.1
# k-means done
## Best Q: 1.2
# Ward done
# !!! Spectral = Ward
## Best Q: 1.1
# Spectral done
{0, -1}
# Label     0 :    40  Dist: 1576.9 ( 71.6)
##
## Best Q: 5.1
# DBSCAN do

{-1}
## Best Q: 0.0
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[1.6912817890545315, 1.6912817890545315, 14.468300555727867, 1.2153762081363297, 1.2153762081363297, 1.1497681593369231, 1.1497681593369231, 1.1497681593369231]
33
## Best Q: 1.2
# k-means done
## Best Q: 1.0
# Ward done
# !!! Spectral = Ward
## Best Q: 1.0
# Spectral done
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1}
# Label     0 : 17113  Dist: 1298.2 (373.8)
# Label     1 :    20  Dist:  280.0 ( 20.6)
# Label     2 :    17  Dist:  538.8 ( 14.0)
# Label     3 :   104  Dist:  535.4 ( 32.1)
# Label     4 :    82  Dist:  366.1 ( 23.3)
# Label     5 :    33  Dist:  433.7 ( 22.8)
# Label     6 :    32  Dist:  453.9 ( 20.0)
# Label     7 :    17  Dist:  626.9 ( 14.8)
# Label     8 :   128  Dist:  349.2 ( 44.5)
# Label     9 :    15  Dist:  334.7 ( 17.1)
# Label    10 :    31  Dist:  448.2 ( 20.3)
# Label    11 :    19  Dist:  442.3 ( 18.9)
# Label 

## Best Q: 1.0
# k-means done
## Best Q: 0.9
# Ward done
# !!! Spectral = Ward
## Best Q: 0.9
# Spectral done
{0, 1, 2, -1}
# Label     0 :    20  Dist: 1166.2 ( 87.3)
# Label     1 :    76  Dist: 1593.8 (130.0)
# Label     2 :    20  Dist: 1548.3 ( 78.9)
##
## Best Q: 4.9
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[2.598868887041249, 2.598868887041249, 6.377622320393149, 5.654411366582378, 5.654411366582378, 4.506946458509315, 4.506946458509315, 4.506946458509315]
52
## Best Q: 1.1
# k-means done
## Best Q: 1.1
# Ward done
# !!! Spectral = Ward
## Best Q: 1.1
# Spectral done
{-1}
## Best Q: 0.0
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[3.4169359242388753, 3.4169359242388753, 3.641665906046138, 2.501545961827619, 2.501545961827619, 1.7073108219319524, 1.7073108219319524, 1.7073108219319524]
53
## Best Q: 1.3
# k-means done
## Best Q: 0.9
# Wa

## Best Q: 0.6
# k-means done
## Best Q: 1.1
# Ward done
# !!! Spectral = Ward
## Best Q: 1.1
# Spectral done
{0, -1}
# Label     0 :    29  Dist: 1548.2 ( 40.6)
##
## Best Q: 4.8
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[1.9203379024304121, 1.9203379024304121, 3.019639620909471, 1.4771248817511555, 1.4771248817511555, 2.34521182274744, 2.34521182274744, 2.34521182274744]
69
## Best Q: 1.2
# k-means done
## Best Q: 1.2
# Ward done
# !!! Spectral = Ward
## Best Q: 1.0
# Spectral done
{0, 1, 2, 3, 4, 5, 6, 7, 8, -1}
# Label     0 : 17064  Dist: 1274.9 (399.1)
# Label     1 :    20  Dist:  494.1 ( 79.5)
# Label     2 :    46  Dist:  534.9 ( 97.6)
# Label     3 :    21  Dist:  537.9 ( 82.0)
# Label     4 :    32  Dist:  367.9 ( 61.8)
# Label     5 :    20  Dist:  351.3 ( 88.4)
# Label     6 :    30  Dist:  492.7 ( 96.2)
# Label     7 :    16  Dist:  784.2 ( 95.6)
# Label     8 :    21  Dist:  454.7 ( 71.1)
##
## Best

## Best Q: 1.2
# k-means done
## Best Q: 1.3
# Ward done
# !!! Spectral = Ward
## Best Q: 1.0
# Spectral done
{-1}
## Best Q: 0.0
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[6.04916190778706, 6.04916190778706, 14.184718561432769, 4.2555870576770145, 4.2555870576770145, 2.5111329771069117, 2.5111329771069117, 2.5111329771069117]
85
## Best Q: 0.7
# k-means done
## Best Q: 1.2
# Ward done
# !!! Spectral = Ward
## Best Q: 1.1
# Spectral done
{-1}
## Best Q: 0.0
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[3.5054735028324924, 3.5054735028324924, 9.98822842379144, 3.5459484815654427, 3.5459484815654427, 3.0446836504101453, 3.0446836504101453, 3.0446836504101453]
86
## Best Q: 1.0
# k-means done
## Best Q: 1.0
# Ward done
# !!! Spectral = Ward
## Best Q: 1.0
# Spectral done
{-1}
## Best Q: 0.0
# DBSCAN done
## Conversion done...
## Stars selected: 334

## Best Q: 1.1
# k-means done
## Best Q: 1.3
# Ward done
# !!! Spectral = Ward
## Best Q: 1.3
# Spectral done
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, -1}
# Label     0 :    98  Dist: 1798.2 ( 28.7)
# Label     1 :    46  Dist: 1130.0 ( 25.5)
# Label     2 :    20  Dist: 1153.4 ( 19.5)
# Label     3 :    20  Dist: 1579.3 ( 13.0)
# Label     4 :    20  Dist: 1504.1 ( 17.2)
# Label     5 :   219  Dist: 1526.6 ( 80.3)
# Label     6 :    22  Dist: 1608.9 ( 17.5)
# Label     7 :    20  Dist:  598.1 ( 15.1)
# Label     8 :    48  Dist:  882.3 ( 22.4)
# Label     9 :    19  Dist:  846.2 ( 22.5)
# Label    10 :    70  Dist: 1646.8 ( 25.9)
# Label    11 :    52  Dist: 1361.5 ( 25.1)
# Label    12 :    35  Dist: 1354.8 ( 18.1)
# Label    13 :    15  Dist: 1581.3 ( 14.7)
# Label    14 :    20  Dist: 1459.5 ( 17.8)
# Label    15 :    20  Dist: 1941.6 ( 16.3)
# Label    16 :    22  Dist: 1789.5 ( 24.4)
# Label    17 :    22  Dist: 1860.8 ( 20.1)
# Label    18 :    25  Dist

## Best Q: 1.0
# k-means done
## Best Q: 1.0
# Ward done
# !!! Spectral = Ward
## Best Q: 1.0
# Spectral done
{0, 1, -1}
# Label     0 :    20  Dist: 1778.3 ( 49.3)
# Label     1 :    40  Dist: 1613.7 ( 60.3)
##
## Best Q: 0.0
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[1.3149194569499072, 1.3149194569499072, 8.231864623658847, 5.153459602305474, 5.153459602305474, 4.494791890461496, 4.494791890461496, 4.494791890461496]
119
## Best Q: 1.1
# k-means done
## Best Q: 1.0
# Ward done
# !!! Spectral = Ward
## Best Q: 1.0
# Spectral done
{-1}
## Best Q: 0.0
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[6.280362642987604, 6.280362642987604, 14.269049190771193, 4.398210626681156, 4.398210626681156, 3.890710634148034, 3.890710634148034, 3.890710634148034]
120
## Best Q: 0.9
# k-means done
## Best Q: 1.1
# Ward done
# !!! Spectral = Ward
## Best Q: 1.1
# 

## Best Q: 0.9
# k-means done
## Best Q: 1.0
# Ward done
# !!! Spectral = Ward
## Best Q: 1.1
# Spectral done
{-1}
## Best Q: 0.0
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[4.55679149862854, 4.55679149862854, 14.392025474121464, 4.4341562074249135, 4.4341562074249135, 1.6865102249795543, 1.6865102249795543, 1.6865102249795543]
135
## Best Q: 1.1
# k-means done
## Best Q: 1.1
# Ward done
# !!! Spectral = Ward
## Best Q: 1.0
# Spectral done
{-1}
## Best Q: 0.0
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[1.1084041895523637, 1.1084041895523637, 3.467362693393828, 4.234052093095108, 4.234052093095108, 4.634076782174589, 4.634076782174589, 4.634076782174589]
136
## Best Q: 1.2
# k-means done
## Best Q: 1.2
# Ward done
# !!! Spectral = Ward
## Best Q: 1.2
# Spectral done
{-1}
## Best Q: 0.0
# DBSCAN done
## Conversion done...
## Stars selected: 33497

## Best Q: 0.7
# k-means done
## Best Q: 1.2
# Ward done
# !!! Spectral = Ward
## Best Q: 1.3
# Spectral done
{0, -1}
# Label     0 :    49  Dist: 1578.7 ( 79.7)
##
## Best Q: 5.6
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[2.002401451964121, 2.002401451964121, 12.695862363052306, 2.868195954619253, 2.868195954619253, 3.5471372576779236, 3.5471372576779236, 3.5471372576779236]
155
## Best Q: 1.0
# k-means done
## Best Q: 1.0
# Ward done
# !!! Spectral = Ward
## Best Q: 1.0
# Spectral done
{-1}
## Best Q: 0.0
# DBSCAN done
## Conversion done...
## Stars selected: 33497
## Normalization Normal-Gauss done on filtered data..
[2.4133761251235226, 2.4133761251235226, 12.993219271462312, 3.7927781517594807, 3.7927781517594807, 1.2607550975508346, 1.2607550975508346, 1.2607550975508346]
156
## Best Q: 1.1
# k-means done
## Best Q: 1.1
# Ward done
# !!! Spectral = Ward
## Best Q: 1.1
# Spectral done
{0, -1}
# Label     0 : 

In [None]:
def print_label_summary(labels, nlabels):
    """Print, for labels 0 .. nlabels-1, the member count and the median and
    standard deviation of column 2 of ``source.df`` (the column plotted as
    "d (pc)" elsewhere in this notebook), followed by a closing "##" line."""
    for lab in range(nlabels):
        members = np.where(labels == lab)
        print("# Label %5d : %5d  Dist: %6.1f (%5.1f)"%(lab,len(labels[members]), np.median(source.df[members,2]), np.std(source.df[members,2])))
    print("##")


print("## k-means...")

# KMeans for each normalisation
kmeans = cluster.KMeans(n_clusters= kCluster, max_iter = 2000, n_init = 50)
kmeans.fit(source.dfnorm)
labels_k = kmeans.labels_
print_label_summary(labels_k, kCluster)

###########
print("## Ward... ")
# connectivity matrix for structured Ward

connectivity = kneighbors_graph(source.dfnorm, n_neighbors= neighbors, include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)

ward = cluster.AgglomerativeClustering(n_clusters= kCluster, linkage='ward', connectivity=connectivity)
ward.fit(source.dfnorm)
labels_w = ward.labels_
print_label_summary(labels_w, kCluster)

#############
print("## Spectral...")
spectral = cluster.SpectralClustering(n_clusters = kCluster, eigen_solver='arpack', affinity="nearest_neighbors")
spectral.fit(source.dfnorm)
labels_s = spectral.labels_
print_label_summary(labels_s, kCluster)


#############
print("## DBSCAN...")
# min_samples must be passed by keyword: it is keyword-only in current
# scikit-learn releases, so the positional form raises a TypeError there.
dbscan = cluster.DBSCAN(eps = eps, min_samples = min_samples)
dbscan.fit(source.dfnorm)
labels_d = dbscan.labels_
unique_labels = set(labels_d)
print(unique_labels)
# Label -1 (noise) is skipped: only labels 0 .. max are summarised.
print_label_summary(labels_d, max(labels_d)+1)


In [None]:
## Metrics of the solutions
# Fixed seed so that the bootstrap apertures, and hence the plotted Q values,
# are reproducible. The four calls must stay in this order: they share the
# global random stream.
np.random.seed(0)
qk = metric2(source.df, labels_k , APERTURE = 0.5 , MAXRADIUS = 0.9 * RADIUS, SIGCLIP = 0.0)
qw = metric2(source.df, labels_w , APERTURE = 0.5 , MAXRADIUS = 0.9 * RADIUS, SIGCLIP = 0.0)
qs = metric2(source.df, labels_s , APERTURE = 0.5 , MAXRADIUS = 0.9 * RADIUS, SIGCLIP = 0.0)
qd = metric2(source.df, labels_d , APERTURE = 0.5 , MAXRADIUS = 0.9 * RADIUS, SIGCLIP = 0.0)

# The `nonposy` keyword was removed from Matplotlib (replaced by
# `nonpositive`). Non-positive values are clipped by default, so the plain
# call works on both old and new versions.
plt.yscale("log")
plt.xlim([-1,kCluster+1])
plt.errorbar(qk['label'],qk['Q'], qk['Q_err'], label='k-means',fmt='.k', ecolor='gray', lw=1, capsize=5)
plt.errorbar(qw['label'],qw['Q'], qw['Q_err'], label='Ward', fmt='*r', ecolor='gray', lw=1, capsize=5)
plt.errorbar(qs['label'],qs['Q'], qs['Q_err'], label='Spectral', fmt='Db', ecolor='gray', lw=1, capsize=5)
plt.errorbar(qd['label'],qd['Q'], qd['Q_err'], label='DBSCAN', fmt='og', ecolor='gray', lw=1, capsize=5)
plt.legend(loc='upper right', shadow=True)
plt.xlabel("Label")
plt.ylabel("Q")
plt.show()

In [None]:
## Separation distance: physical length subtended by 1 degree at distclust.
# 3600 is the number of arcseconds in one degree. The factors 150e6 and
# 3.1e13 are presumably the astronomical unit and the parsec in km (rounded),
# so that 1 arcsec at 1 pc spans 1 AU -- TODO confirm the intended constants.
angl2pc = 3600. * 150e6 * distclust / 3.1e13
print("## Angular distance (1deg) : %3.1f pc"%(angl2pc))
# Show the members of DBSCAN label 0.
plot2d(source.df, labels_d,0, cmap = "hsv")