### Tests
Use of the agglomerative clustering with HR diagram.

Here we test the 2D two-point correlation function as a way to detect clusters within the classes.

In [10]:
import sys, os
sys.path.append('../../src')


import matplotlib.pyplot as plt
from pylab import rcParams
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd

from math import ceil
import math
import gaia_utils as gu
from sklearn import cluster
from sklearn.neighbors import kneighbors_graph
from astroML.correlation import two_point
from astroML.correlation import bootstrap_two_point_angular

%matplotlib inline

## directory
# NOTE(review): absolute, user-specific path; the notebook only runs on a
# machine where this directory exists. Consider making it configurable.
rootdir = "/home/stephane/Science/GAIA"
wdir    = "%s/products"%(rootdir)
# datadir is defined here but not referenced in the visible cells.
datadir = "%s/master/notebooks/data"%(rootdir)

# Every relative path used below (e.g. the VOTable name) is resolved from wdir.
os.chdir(wdir)
# Default figure size; plot2d and plot_corr_label overwrite it with 14 x 14.
rcParams['figure.figsize'] = 10, 10
###################################

# Name handed to gu.source() and VOTable file handed to source.read_votable().
clustername = "NGC 1039"
# voname = 'NGC 752-1.0deg.vot'
# voname = 'NGC 2682-3.0deg.vot'
voname = 'NGC 1039-3.0deg.vot'
# Upper limit of the separation range used for the correlation function
# (presumably the query radius in degrees, as in the VOTable name -- confirm).
RADIUS   = 3.0
# Number of clusters requested from k-means, Ward and spectral clustering.
kCluster = 10
# votable_disk and distclust are not referenced in the visible cells.
votable_disk = False
distclust = 510.0
# Assigned to source.weight before the data are converted and normalised.
WEIGHT = [3.,3.,11.,4.,4., 3., 3., 3.]

## DBSCAN parameters (passed to cluster.DBSCAN)
eps = 0.05
min_samples = 30
## Ward: number of neighbours for the kneighbors_graph connectivity matrix
neighbors = 30

In [11]:
## plot2D and plot3D


def plot2d(df, labels, ilab, cmap = "gist_stern" ,color = False):
    """Show a 2x2 grid of scatter plots for the rows of df labelled ilab.

    Panels (x column vs y column of df, with the axis titles used):
      [0,0] 0 vs 1 ("l", "b")        [0,1] 3 vs 4 ("Vdra", "Vdec")
      [1,0] 0 vs 2 ("l", "d (pc)")   [1,1] 6 vs 5 ("G-R", "G")

    Points are coloured by column 2 through cmap, except in the first panel,
    which is drawn in black unless color is True.
    Side effect: sets the global rcParams figure size to 14 x 14.
    """
    rcParams['figure.figsize'] = 14, 14
    fig, panels = plt.subplots(2, 2)

    def column(k):
        # Column k of df restricted to the rows whose label equals ilab.
        return df[np.where(labels == ilab), k]

    # Sky-position panel: the only one whose colouring is optional.
    sky = panels[0,0]
    if color:
        sky.scatter(column(0), column(1), s = 0.5, c = column(2), cmap=cmap)
    else:
        sky.scatter(column(0), column(1), s = 0.5, c = "k")
    sky.set_xlabel("l")
    sky.set_ylabel("b")

    # Remaining panels share the same drawing recipe.
    layout = (
        ((1,0), 0, 2, "l", "d (pc)"),
        ((0,1), 3, 4, "Vdra", "Vdec"),
        ((1,1), 6, 5, "G-R", "G"),
    )
    for position, xcol, ycol, xname, yname in layout:
        ax = panels[position]
        ax.scatter(column(xcol), column(ycol), s = 0.5, c = column(2), cmap=cmap)
        ax.set_xlabel(xname)
        ax.set_ylabel(yname)

    # Fixed window for the last panel; the y axis runs from 27 down to 10.
    panels[1,1].set_xlim(-1.,1.5)
    panels[1,1].set_ylim(27.,10)

    plt.show()
    

In [12]:
## astrometric conversion
## 
def convert_to_cartesian(lgal, bga, dist, offCenter = [0., 0.]):
    """Convert galactic coordinates and distance to Cartesian coordinates.

    Parameters
    ----------
    lgal : array-like
        Galactic longitude l, in degrees.
    bga : array-like
        Galactic latitude b, in degrees.
    dist : array-like
        Distance, in pc.
    offCenter : sequence of two floats, optional
        Offset (l, b), in degrees, subtracted from lgal and bga before the
        conversion.

    Returns
    -------
    (xx, yy, zz) : tuple of numpy arrays
        Cartesian coordinates in the same unit as dist, with
        x = d cos(b) cos(l), y = d cos(b) sin(l), z = d sin(b).

    Notes
    -----
    The previous version read an undefined name ``bgal`` instead of the
    ``bga`` parameter and built one astropy SkyCoord per star through
    ``coord``/``u``, which are not imported in the visible cells. The
    spherical-to-Cartesian formula is applied directly here. Unlike SkyCoord,
    no range validation is done on the latitude or the distance.
    """
    lgal = np.atleast_1d(np.asarray(lgal, dtype=float))
    bgal = np.atleast_1d(np.asarray(bga, dtype=float))
    dist = np.atleast_1d(np.asarray(dist, dtype=float))

    lgalOff = lgal - offCenter[0]
    bgalOff = bgal - offCenter[1]

    lrad = np.radians(lgalOff)
    brad = np.radians(bgalOff)

    xx = dist * np.cos(brad) * np.cos(lrad)
    yy = dist * np.cos(brad) * np.sin(lrad)
    zz = dist * np.sin(brad)

    print(offCenter[0])
    print(offCenter[1])

    # min/max are undefined on empty input, so the diagnostics are skipped then.
    if xx.size > 0:
        print(np.min(lgalOff))
        print(np.max(lgalOff))
        print(np.min(bgalOff))
        print(np.max(bgalOff))

        print("## XX")
        print("min, max: %f , %f"%(np.min(xx),np.max(xx)))
        print("## YY")
        print("min, max: %f , %f"%(np.min(yy),np.max(yy)))
        print("## ZZ")
        print("min, max: %f , %f"%(np.min(zz),np.max(zz)))

    return(xx,yy,zz)

In [13]:
## Read the data and do the conversion
# The calls below must run in this order: later cells read source.df and
# source.dfnorm, which are presumably filled in by the conversion and
# normalisation steps -- confirm against gaia_utils.


source = gu.source(clustername)
# Weights are set before the conversion/normalisation calls
# (presumably one weight per feature column -- confirm in gaia_utils).
source.weight = WEIGHT
#source.query(RADIUS, errtol = 0.2, dump = True)
# Load the catalogue from the VOTable file (path relative to wdir).
source.read_votable(voname)
# mag_range presumably bounds the magnitudes kept by the filter -- confirm.
source.convert_filter_data(mag_range = [0., 40])
#source.normalization_normal()
# The recorded cell output reports "Normalization minmax done on filtered data".
source.normalization_minmax()

## NGC 1039-3.0deg.vot read...
## Total stars: 59908
## Conversion done...
## Stars selected: 52114
## Normalization minmax done on filtered data..


()

### Correlation function

In [None]:
def pt_corr(X, xrange = [0.1, RADIUS], nbins = 20):
    """Compute the two-point correlation function of X on log-spaced bins.

    Parameters
    ----------
    X : array
        Point coordinates, passed unchanged to astroML's two_point
        (presumably shaped (n_points, n_dims) -- confirm).
    xrange : sequence of two floats
        Smallest and largest separation covered by the bins (both > 0).
    nbins : int
        Number of bin edges, so the result has nbins - 1 bins.

    Returns
    -------
    (bin_centers, corr) : arithmetic centres of the bins and the correlation
        value returned by two_point for each bin.
    """
    rlogmin = math.log10(xrange[0])
    rlogmax = math.log10(xrange[1])

    bins = np.logspace(rlogmin, rlogmax, nbins)
    print(bins)
    corr = two_point(X, bins)

    # The message applies when every bin is within 0.02 of zero. The previous
    # test was negated and fired in the opposite case.
    if np.allclose(corr, 0, atol=0.02):
        print("## Low correlation function")

    bin_centers = 0.5 * (bins[1:] + bins[:-1])

    # Was `bin_center`, an undefined name that raised NameError.
    return(bin_centers, corr)


def angular_twoptcorr(x1, x2, xrange = [0.1, 1.], nbins = 20, Nbootstraps=10,  method='landy-szalay', rseed=0):
    """Bootstrapped angular two-point correlation on log-spaced bins.

    x1 and x2 are forwarded as the two positional coordinate arguments of
    astroML's bootstrap_two_point_angular (presumably longitude-like and
    latitude-like angles in degrees -- confirm). xrange gives the first and
    last bin edge, and nbins the number of edges.

    Returns (bin_centers, corr, corr_err); the individual bootstrap
    realisations are discarded.
    """
    # Seeds numpy's global generator, so this also affects later random draws.
    np.random.seed(rseed)

    log_lo = math.log10(xrange[0])
    log_hi = math.log10(xrange[1])
    edges = np.logspace(log_lo, log_hi, nbins)

    corr, corr_err, _ = bootstrap_two_point_angular(
        x1, x2,
        bins=edges,
        method=method,
        Nbootstraps=Nbootstraps,
    )

    centers = 0.5 * (edges[1:] + edges[:-1])
    return(centers, corr, corr_err)
    
def _set_loglog_clipped(ax):
    "Put both axes of ax on a log scale, clipping non-positive values."
    try:
        # Spelling accepted by matplotlib >= 3.3.
        ax.set_xscale("log", nonpositive='clip')
        ax.set_yscale("log", nonpositive='clip')
    except (TypeError, ValueError):
        # Older matplotlib only accepts the per-axis keywords.
        ax.set_xscale("log", nonposx='clip')
        ax.set_yscale("log", nonposy='clip')


def plot_corr_label(X, labels, xrange, nbins = 20, errorbar = True):
    """Plot the two-point correlation function of every cluster label.

    Parameters
    ----------
    X : 2-D numpy array
        One row per star; columns 0 and 1 are the two coordinates used.
    labels : array-like of int
        Cluster label of each row of X. Labels 0..max(labels) are plotted;
        negative labels (e.g. DBSCAN noise, -1) are ignored.
    xrange : sequence of two floats
        Separation range forwarded to pt_corr / angular_twoptcorr.
    nbins : int
        Number of bin edges forwarded to the correlation functions.
    errorbar : bool
        True: bootstrapped angular correlation with error bars on log-log
        axes. False: plain two-point correlation on a semilog-x axis.

    Returns
    -------
    list of (max correlation, label) tuples, sorted by decreasing correlation.
    Empty when there is no non-negative label.
    """
    labels = np.asarray(labels)

    # Labels start at 0, so the number of clusters is max + 1. Using max alone
    # silently dropped the last cluster.
    nclust = int(labels.max()) + 1 if labels.size else 0
    if nclust <= 0:
        return([])

    rcParams['figure.figsize'] = 14, 14
    ncol = 3
    nrow = int(ceil(nclust / float(ncol)))

    # subplots takes (nrows, ncols). squeeze=False keeps axarr 2-D even when
    # there is a single row.
    f, axarr = plt.subplots(nrow, ncol, squeeze=False)

    corrmax = []
    corrlab = []

    for i in range(nclust):
        ilabel = np.where(labels == i)[0]
        row, col = divmod(i, ncol)
        ax = axarr[row,col]

        # A label without members has no correlation function to show.
        if ilabel.size == 0:
            ax.set_visible(False)
            continue

        # Fancy indexing returns a copy, so centring does not modify X.
        Xcenter = X[ilabel,:]
        Xcenter[:,0] = Xcenter[:,0] - np.mean(Xcenter[:,0])
        Xcenter[:,1] = Xcenter[:,1] - np.mean(Xcenter[:,1])

        if not errorbar:
            bin_centers, corr = pt_corr(Xcenter, xrange = xrange, nbins = nbins)
            # Was plotted against `bins`, a name not defined in this function.
            ax.semilogx(bin_centers, corr, "b-")
            ax.set_xlabel("l(degree)")
            ax.set_ylabel("corr")
        else:
            bin_centers, corr, corr_err = angular_twoptcorr(Xcenter[:,0], Xcenter[:,1] , xrange = xrange, nbins = nbins)
            ax.errorbar(bin_centers, corr, corr_err,fmt='.k', ecolor='gray', lw=1)
            ax.set_xlabel(r'$\theta\ (deg)$')
            ax.set_ylabel(r'$\hat{w}(\theta)$')
            _set_loglog_clipped(ax)

        txt = "Label: %d"%(i)
        ax.text(0.1,1.02, txt, size=12, ha="left", transform=ax.transAxes)

        corrmax.append(max(corr))
        corrlab.append(i)

    # Hide the unused panels of the last row.
    for ax in axarr.ravel()[nclust:]:
        ax.set_visible(False)

    arrcorr = list(zip(corrmax,corrlab))
    arrcorr.sort(reverse=True)
    plt.show()

    return(arrcorr)

### Clustering

In [None]:
def _print_label_stats(labels, nlabels):
    "Print, for labels 0..nlabels-1, the member count and the median (std) of column 2 of source.df."
    for i in range(nlabels):
        members = np.where(labels == i)
        dist = source.df[members,2]
        print("# Label %5d : %5d  Dist: %6.1f (%5.1f)"%(i,len(labels[members]), np.median(dist), np.std(dist)))


print("## k-means...")

# KMeans for each normalisation
kmeans = cluster.KMeans(n_clusters= kCluster, max_iter = 2000, n_init = 50)
kmeans.fit(source.dfnorm)
labels_k = kmeans.labels_
_print_label_stats(labels_k, kCluster)
print("##")

###########
print("## Ward... ")
# connectivity matrix for structured Ward

connectivity = kneighbors_graph(source.dfnorm, n_neighbors= neighbors, include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)

ward = cluster.AgglomerativeClustering(n_clusters= kCluster, linkage='ward', connectivity=connectivity)
ward.fit(source.dfnorm)
labels_w = ward.labels_
_print_label_stats(labels_w, kCluster)
print("##")

#############
print("## Spectral...")
spectral = cluster.SpectralClustering(n_clusters = kCluster, eigen_solver='arpack', affinity="nearest_neighbors")
spectral.fit(source.dfnorm)
labels_s = spectral.labels_
_print_label_stats(labels_s, kCluster)
print("##")


#############
print("## DBSCAN...")
# Keyword arguments: recent scikit-learn releases reject positional
# eps/min_samples.
dbscan = cluster.DBSCAN(eps=eps, min_samples=min_samples)
dbscan.fit(source.dfnorm)
# Was `spectral.labels_`, so the DBSCAN result was never actually used.
labels_d = dbscan.labels_
# DBSCAN numbers its clusters 0..max and marks noise with -1, so the cluster
# count is max + 1 (0 when every point is noise).
nlabels_d = int(labels_d.max()) + 1 if labels_d.size else 0
_print_label_stats(labels_d, nlabels_d)
print("##")


## k-means...
# Label     0 :  4322  Dist: 1471.2 (109.8)
# Label     1 :  5797  Dist:  513.1 ( 99.7)
# Label     2 :  5286  Dist:  846.9 (102.7)
# Label     3 :  5560  Dist: 1814.6 ( 97.6)
# Label     4 :  5953  Dist:  270.3 ( 85.2)
# Label     5 :  5306  Dist:  824.3 (101.5)
# Label     6 :  5387  Dist: 1146.8 (101.0)
# Label     7 :  4288  Dist: 1482.5 (108.9)
# Label     8 :  5549  Dist:  535.1 ( 99.2)
# Label     9 :  4666  Dist: 1143.5 (109.5)
##
## Ward... 


In [None]:
## 2 pt-correlation
# Columns 0 and 1 of source.df are the coordinates handed to the correlation
# code; labs selects which clustering result is analysed.
icorr = [0,1]
labs = labels_d
corrlab = plot_corr_label(
    source.df[:,icorr],
    labs,
    xrange = [0.1,RADIUS],
    nbins = 20,
    errorbar = True,
)

print("## 2 pt-correlation:")
# corrlab holds (max correlation, label) pairs, strongest first.
for correlation in corrlab:
    corr_peak, corr_label = correlation
    label_rows = np.where(labs == corr_label)
    print("## Label %d, Corr: %4.3f, Dist: %3.1f"%(corr_label, corr_peak, np.median(source.df[label_rows,2])))


In [None]:
# Inspect label 4 of the spectral clustering result.
plot2d(source.df, labels_s, ilab = 4, cmap = "nipy_spectral")