### Tests

First tests of DBSCAN clustering on a list of open clusters (OCs).

In [None]:
import sys, os
sys.path.append('../../src')

import astropy.coordinates as coord

from astropy.io.votable import parse
from astropy.table import Table
from astropy import units as u

import matplotlib.pyplot as plt
from pylab import rcParams
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd

from math import ceil , pi , cos, sin
import gaia_utils as gu
from sklearn.cluster import KMeans

%matplotlib inline

## directory
# Root of the local GAIA working tree; the other paths are derived from it.
rootdir = "/home/stephane/Science/GAIA"
wdir    = rootdir + "/products"
datadir = rootdir + "/gaia-shock/notebooks/data"

# All relative output paths (e.g. figures) are resolved against wdir.
os.chdir(wdir)

#### Input Parameters########################################################
# Input cluster list and the output list that is appended to by the main loop.
filelist = "/".join([datadir, "BrowseTargets.18292.1530479692.gaia.selected.txt"])
fileoutGaia = "/".join([wdir, "BrowseTargets.18292.1530479692.gaia.dbscan.txt"])

# Uniform weights (eight entries) assigned to source.weight in the main loop.
WEIGHT = np.ones(8)

In [None]:
from astroquery.gaia import Gaia

# tables = Gaia.load_tables(only_names=True)

#for table in (tables):
#    print (table.get_qualified_name())
    

In [None]:
## read the cluster list from HEASARC
def read_cluster_list(filelist):
    """Load a pipe-separated ('|') table into a pandas DataFrame.

    filelist : path (or file-like object) passed straight to ``pd.read_csv``;
               its first line provides the column names.

    Returns the resulting DataFrame.
    """
    return pd.read_csv(filelist, sep='|')

######
def init_SCgaia(filegaia, filelist):
    """Create ``filegaia`` with the header line of ``filelist`` if it is missing.

    filegaia : path of the output file to initialise.
    filelist : path of the input list whose first line is copied.

    An already existing ``filegaia`` is left untouched.
    """
    if os.path.exists(filegaia):
        return

    # Only the first line (the column header) is carried over.
    with open(filelist, "r") as src:
        first_line = src.readline()

    with open(filegaia, "w") as dst:
        dst.write(first_line)
    
#######
## write in fileoutGaia the selected cluster for GAIA
def write_SCgaia(filegaia, row):
    """Append ``row`` to ``filegaia`` as a single '|'-separated line.

    filegaia : path of the output file (opened in append mode).
    row      : one record, e.g. a pandas Series yielded by ``iterrows``.

    Neither a header nor an index column is written.
    """
    # A Series becomes a one-column frame; transposing turns it into one row.
    single_row = pd.DataFrame(row).transpose()
    single_row.to_csv(filegaia, sep="|", mode="a", header=False, index=False)
    
    
######       
## find the last SC found.
def find_lastSC(filegaia,filelist, dfsc):
    """Locate the last cluster already written to ``filegaia``.

    filegaia : path of the output list; created (header only) via
               ``init_SCgaia`` when it does not exist yet.
    filelist : path of the input list, used only to initialise ``filegaia``.
    dfsc     : DataFrame of all clusters, with a ``name`` column.

    Returns ``(last_cluster, index)`` where ``index`` is the label in
    ``dfsc.index`` of the first row whose name contains ``last_cluster``.
    When nothing has been written yet, returns ``("No cluster", 0)``.

    Raises IndexError when the last recorded name is not found in ``dfsc``.
    """
    if not os.path.exists(filegaia):
        init_SCgaia(filegaia, filelist)
        return ("No cluster", 0)

    print(filegaia)
    dfcurrent = read_cluster_list(filegaia)

    # The file may hold only the header (initialised, nothing processed yet).
    if dfcurrent.empty:
        return ("No cluster", 0)

    last_cluster = dfcurrent['name'].iloc[-1]

    # regex=False: cluster names are literal text; characters such as
    # '[', '+' or '(' must not be interpreted as a regular expression.
    matches = dfsc.index[dfsc['name'].str.contains(last_cluster, regex=False)]
    if len(matches) == 0:
        raise IndexError("last cluster %r from %s not found in the cluster list"
                         % (last_cluster, filegaia))

    return (last_cluster, matches[0])

In [None]:
## plot results dbscans..

def plot2d(df, distsc,result, labels, ilab, cmap = "gist_stern" , figname = "test.dbscan.png", color = False):
    """Draw a 2x2 scatter figure for the rows of ``df`` labelled ``ilab``.

    df      : 2-D array indexed as ``df[rows, column]``; columns 0-6 are used.
    distsc  : distance printed as "Obs. dist" in the annotation.
    result  : mapping with the keys 'nstars', 'pos', 'vel', 'pos_std' and
              'vel_std'; the entry at position ``ilab + 1`` is displayed.
    labels  : array of labels, compared element-wise with ``ilab``.
    ilab    : label whose members are plotted.
    cmap    : colormap for the panels coloured by column 2.
    figname : file the figure is saved to.
    color   : when true, the top-left panel is coloured by column 2 as well.

    The figure is saved to ``figname`` and then shown.
    """
    rcParams['figure.figsize'] = 14, 14
    _fig, axarr = plt.subplots(2, 2)

    # Row selection is the same for every panel, so compute it once.
    members = np.where(labels == ilab)

    def column(j):
        return df[members, j]

    top_left, top_right = axarr[0,0], axarr[0,1]
    bottom_left, bottom_right = axarr[1,0], axarr[1,1]

    # Columns 0 vs 1
    if color:
        top_left.scatter(column(0), column(1), s = 1.0, c = column(2), cmap = cmap)
    else:
        top_left.scatter(column(0), column(1), s = 1.0, c = "k")
    top_left.set_xlabel("X")
    top_left.set_ylabel("Y")

    # Columns 1 vs 2
    bottom_left.scatter(column(1), column(2), s = 1.0, c = "k")
    bottom_left.set_xlabel("Y")
    bottom_left.set_ylabel("Z")

    # Columns 3 vs 4, coloured by column 2
    top_right.scatter(column(3), column(4), s = 1.0, c = column(2), cmap = cmap)
    top_right.set_xlabel("Vdra")
    top_right.set_ylabel("Vdec")

    # Columns 6 vs 5, coloured by column 2; the y axis is inverted (27 -> 10)
    bottom_right.scatter(column(6), column(5), s = 1.0, c = column(2), cmap = cmap)
    bottom_right.set_xlabel("G-R")
    bottom_right.set_ylabel("G")
    bottom_right.set_xlim(-0.5,2.0)
    bottom_right.set_ylim(27.,10)

    ### properties
    # The summary of label ``ilab`` sits at position ilab + 1 in ``result``.
    k = ilab + 1
    nstars = result['nstars'][k]
    posx, posy, posz = (result['pos'][k][i] for i in range(3))
    velra, veldec = (result['vel'][k][i] for i in range(2))
    stdx, stdy, stdz = (result['pos_std'][k][i] for i in range(3))
    stdvelra, stdveldec = (result['vel_std'][k][i] for i in range(2))

    # (vertical position in axes coordinates, text), listed bottom to top
    annotations = [
        (0.05, "Obs. dist: %3.1f pc"%(distsc)),
        (0.10, "N: %d"%(nstars)),
        (0.2, "X,Y,Z: %3.1f , %3.1f , %3.1f"%(posx,posy,posz)),
        (0.25, "Disp X,Y,Z: %3.1f , %3.1f , %3.1f"%(stdx,stdy,stdz)),
        (0.3, "Vel: %3.1f , %3.1f"%(velra, veldec)),
        (0.35, "Disp Vel: %3.1f , %3.1f"%(stdvelra, stdveldec)),
        (0.4, "Ell.: %3.1f , %3.1f"%(stdz / stdy, stdvelra / stdveldec)),
    ]
    for ypos, txt in annotations:
        bottom_right.text(0.05, ypos, txt, horizontalalignment='left',
                          verticalalignment='center', transform=bottom_right.transAxes)

    plt.savefig(figname)

    plt.show()
  

def print_result(result, ilab, df=None, labels=None):
    """Print position/velocity statistics of the cluster labelled ``ilab``.

    result : mapping with the keys 'pos', 'vel', 'pos_std' and 'vel_std';
             the entry at position ``ilab + 1`` is used (same convention as
             ``plot2d``). Only read when ``df``/``labels`` are not given.
    ilab   : label of the cluster to report.
    df     : optional 2-D array; columns 0-2 are taken as X, Y, Z and
             columns 3-4 as the RA/Dec velocities.
    labels : optional array of labels, one per row of ``df``.

    When both ``df`` and ``labels`` are supplied, medians and standard
    deviations are recomputed from the member rows; otherwise the values
    stored in ``result`` are printed.
    """
    if df is not None and labels is not None:
        # Recompute the statistics from the rows carrying the label.
        members = np.asarray(df)[np.asarray(labels) == ilab]
        posx, posy, posz, velra, veldec = (np.median(members[:, j]) for j in range(5))
        stdx, stdy, stdz, stdvelra, stdveldec = (np.std(members[:, j]) for j in range(5))
    else:
        # Fall back on the per-label summary stored in ``result``.
        k = ilab + 1
        posx, posy, posz = (result['pos'][k][i] for i in range(3))
        velra, veldec = (result['vel'][k][i] for i in range(2))
        stdx, stdy, stdz = (result['pos_std'][k][i] for i in range(3))
        stdvelra, stdveldec = (result['vel_std'][k][i] for i in range(2))

    print("## Physical properties of the label %d"%(ilab))
    print("## Pos. mean (X, Y, Z): %4.2f , %4.2f , %4.2f"%(posx, posy, posz))
    print("## Disp. (X, Y, Z): %4.2f , %4.2f , %4.2f"%(stdx, stdy, stdz))
    print("##")
    print("## Vel. mean (RA,Dec): %4.2f , %4.2f"%(velra, veldec))
    print("## Vel. disp. (RA,Dec): %4.2f , %4.2f"%(stdvelra, stdveldec))
    print("##")
    print("## Ellipticities:")
    print("## Spatial (Y/X) (Z/X) (Z/Y) : (%4.2f) (%4.2f) (%4.2f)"%(stdy / stdx, stdz/stdx, stdz/stdy))
    print("## Vel. (RA/Dec) : %4.2f"%(stdvelra / stdveldec))

In [None]:
def average_density_hyperspace(df, nbootstrap=5000):
    """Estimate typical point separations from a random subsample of ``df``.

    df         : 2-D array, one point per row.
    nbootstrap : number of rows drawn at random, with replacement.

    Returns ``(distmin, distmean, distmax)``: the minimum, mean and maximum
    of the non-zero pairwise distances (``scipy`` ``pdist``, default metric)
    between the drawn rows.

    Raises ValueError when every pairwise distance is zero.
    """
    # Imported locally: the file's import cell does not provide pdist.
    from scipy.spatial.distance import pdist

    nstars = df.shape[0]

    npts = df[np.random.randint(0, nstars, nbootstrap), :]
    dist = pdist(npts)

    # Rows drawn more than once give zero distances; drop them once, up front.
    nonzero = dist[np.nonzero(dist)]
    if nonzero.size == 0:
        raise ValueError("no non-zero pairwise distance in the bootstrap sample")

    return(np.min(nonzero), np.mean(nonzero), np.max(nonzero))

In [None]:
def iter_dbscan(s, neps, min_samples=15):
    """Scan DBSCAN over a range of eps and keep the run with the largest cluster.

    s           : object providing ``dfcartnorm`` and
                  ``dbscan_(eps, min_samples, cartesian=True)``; the latter
                  must return ``(labels, result)`` where ``result`` has the
                  keys 'label' and 'nstars'.
    neps        : number of eps values tried.
    min_samples : min_samples value handed to ``s.dbscan_``.

    The eps range goes from ``distmin * sqrt(min_samples)`` to 1.5 times that
    value, ``distmin`` being the smallest separation returned by
    ``average_density_hyperspace(s.dfcartnorm)``.

    Returns ``(label_best, ilab_best, cluster_best, result_best,
    epsilon_best, min_sample_best)``; ``ilab_best`` stays -1 when no
    cluster was found.
    """
    nstar_max = 0
    cluster_best = {}
    result_best = {}
    label_best = []
    ilab_best = -1

    epsilon_best = 0
    min_sample_best = 0

    ###############
    # Use the object passed in, not a notebook-level global.
    res = average_density_hyperspace(s.dfcartnorm)
    print(res)
    epsmin = res[0] * np.sqrt(min_samples)
    epsmax = epsmin * 1.5

    epsilon = np.linspace(epsmin, epsmax, neps)

    for e in epsilon:
        print('.' , end="")
        labels_d, result = s.dbscan_(e, min_samples, cartesian = True)
        n_clusters_ = len(result['label']) - 1

        if n_clusters_ <= 0:
            continue

        for nstar, lab in zip(result['nstars'], result['label']):
            # NOTE(review): -1 is used by the caller as "no cluster" and
            # plot2d reads result[...][ilab + 1], which suggests the first
            # entry is the noise label; it must not win the selection.
            # Confirm against gaia_utils.source.dbscan_.
            if lab == -1:
                continue
            if nstar > nstar_max:
                cluster_best = result
                label_best = labels_d
                ilab_best = lab
                result_best = result
                epsilon_best = e
                min_sample_best = min_samples
                nstar_max = nstar

    print("\n## DBSCAN done!")
    print("## Eps: %f - min sample: %d"%(epsilon_best, min_sample_best))

    return(label_best, ilab_best, cluster_best, result_best, epsilon_best, min_sample_best)

In [None]:
###############################################################
######  Main loop
###############################################################
# For every cluster not yet processed: query GAIA around its position, run
# the DBSCAN scan, report/plot the best cluster, then record the cluster in
# fileoutGaia so that a later run can resume from there.

df_cluster = read_cluster_list(filelist)
print(df_cluster.index)
print(df_cluster.columns)

lastSC , lastrow = find_lastSC(fileoutGaia, filelist, df_cluster)

print(lastSC)

for index, row in df_cluster.iloc[lastrow:].iterrows():
    print("#####################")
    clustername = row['name'].strip()
    print("## Cluster: %s"%(clustername))
    print("## Distance: %3.1f pc"%(row['distance']))
    
    print("##")
    print("## Starting DBSCAN optimization....")
    print("##")
    
    # 'ra' is split into three space-separated fields (h m s) and 'dec'
    # into at least two (d m) to build the SkyCoord strings.
    rasplit = row['ra'].split(' ')
    decsplit = row['dec'].split(' ')
    racluster = "%sh%sm%ss"%(rasplit[0],rasplit[1],rasplit[2])
    deccluster = "%sd%sm"%(decsplit[0],decsplit[1])
    c = coord.SkyCoord(racluster, deccluster, frame='icrs')
    radius = float(row['cluster_radius'])
    # Query radius: 3.5 x the catalogued radius, but never below 2.0.
    RADIUS = max(2.0, radius * 3.5)
    aper = min(0.5, radius * 0.8)
    
    print("## Query radius: %3.1f"%(RADIUS))
    
    source = gu.source(clustername)
    source.weight = WEIGHT
    
    try:
        voname = source.query(RADIUS,  coordCluster = [c.ra.deg, c.dec.deg], errtol = 0.2, dump = True)
    # source.read_votable(voname)
    except Exception as err:
        # Catch Exception only: a bare except would also swallow
        # KeyboardInterrupt/SystemExit and make the loop impossible to stop.
        print("## GAIA query failed...")
        print("## %s"%(err,))
        findCluster = False
    else:
        findCluster = True
        
    if findCluster:
        source.convert_filter_data(mag_range = [0., 50])
        source.normalization_normal()
        source.add_cartesian()

        block = [[0,1,2],[3,4],[5,6,7]]
        weightblock = [3.0, 3.0, 2.0]
        source.normalization_PerBlock(block, weightblock, cartesian = False ,norm = "averagestep")
        source.normalization_PerBlock(block, weightblock, cartesian = True , norm = "averagestep", density = True)


        labels_ , ilab_ , cluster_ , result_, eps_, min_sample_  = iter_dbscan(source , 10)
    
        # ilab_ == -1 means that no cluster was selected by iter_dbscan.
        if ilab_ != -1:
            print_result(result_, ilab_)
            figname = "%s-%3.1fdeg.dbscanBest.png"%(clustername,RADIUS)
            plot2d(source.df, row['distance'], result_, labels_ , ilab_, cmap = "hsv" , figname = figname)


    # The cluster is recorded whether or not the query/clustering succeeded.
    write_SCgaia(fileoutGaia,row)
    
print("## Main loop done...")
