## Cycle extraction

Testing cycle extraction by iterating over cycles of cluster extraction followed by subtraction of the extracted stars.

In [1]:
using  PyCall
using  Statistics , StatsBase
using  DataFrames, Printf , CSV , Query

import PyPlot
sns= pyimport("seaborn")

## Directory layout, everything rooted at the GAIA_ROOT environment variable
rootdir = ENV["GAIA_ROOT"]
wdir = string(rootdir, "/products")
votdir = string(wdir, "/votable.2020")
plotdir = string(wdir, "/test")
sclist = string(rootdir, "/e2e_products/sc-list-2020.csv")

## Make the local GaiaClustering sources loadable before importing the package
push!(LOAD_PATH, string(rootdir, "/master/src"))
using GaiaClustering

## Work from the products directory
cd(wdir)

In [2]:
"""
    _getdata(filevot)

Read the VOTable at path `filevot` with `read_votable`, filter it with
`filter_data` using the range `[0., 500]` (presumably a distance range, going by
the log message), and derive the cartesian data set with `add_cartesian`.

Returns the tuple `(df, dfcart)`: the filtered data and its cartesian counterpart.
"""
function _getdata(filevot)
    println("## Data filtered in distance ..")

    filtered = filter_data(read_votable(filevot), [0., 500])
    return (filtered, add_cartesian(filtered))
end

"""
    _mcmc_params()

Return the `GaiaClustering.abcfull` parameter object used by the ABC/MCMC
DBSCAN optimisation, built from the hard-coded values below. The values are
passed positionally, so their order must match the `abcfull` constructor.
"""
function _mcmc_params()
    ## minimum Q, minimum number of stars, forced minimum number of stars
    qmin, nstarsmin, nstarsforced = 2.7, 40, 30

    ## eps (mean, dispersion), then min_nei, min_cl and ncoredisp
    eps_mu, eps_sigma = 2.5, 1.5
    nei_min = 10
    clus_min = 15
    coredisp = 10

    ## weights as (mean, dispersion) pairs: w3d, wvel, whrd
    w3d_mu, w3d_sigma = 6.0, 4.0
    wvel_mu, wvel_sigma = 6.0, 4.0
    whrd_mu, whrd_sigma = 2.0, 1.5

    ## MCMC parameters
    nburn, nsteps = 200, 1500

    return GaiaClustering.abcfull(
        qmin, nstarsmin, nstarsforced,
        eps_mu, eps_sigma, nei_min, clus_min, coredisp,
        w3d_mu, w3d_sigma, wvel_mu, wvel_sigma, whrd_mu, whrd_sigma,
        nburn, nsteps,
    )
end

_mcmc_params (generic function with 1 method)

In [3]:
"""
    _remove_stars!(df, dfcart, ilab)

Build new `GaiaClustering.Df` objects equal to `df` and `dfcart` with the
columns listed in `ilab` removed from their `data`, `raw` and `err` arrays, and
return them as the tuple `(df, dfcart)`.

The arguments themselves are NOT modified: assigning to `df`/`dfcart` inside a
function only rebinds the local names. Callers must therefore capture the
result, e.g. `df, dfcart = _remove_stars!(df, dfcart, ilab)`.

# Arguments
- `df`, `dfcart`: objects with `data`, `raw` and `err` fields indexed as
  `[:, column]`.
- `ilab`: collection of column indices to drop.
"""
function _remove_stars!(df, dfcart, ilab)
    println("## removed..")

    # Compute the surviving column indices once per object instead of once
    # per array.
    keep = setdiff(axes(df.data, 2), ilab)
    keepcart = setdiff(axes(dfcart.data, 2), ilab)

    dfdata = df.data[:, keep]
    dfraw = df.raw[:, keep]
    dferr = df.err[:, keep]

    dfcartdata = dfcart.data[:, keepcart]
    dfcartraw = dfcart.raw[:, keepcart]
    dfcarterr = dfcart.err[:, keepcart]

    s = size(dfdata)
    println("new df....")
    println(s)

    newdf = GaiaClustering.Df(s[2], dfdata, dfraw, dferr)
    newdfcart = GaiaClustering.Df(size(dfcartdata, 2), dfcartdata, dfcartraw, dfcarterr)

    # Return both reduced objects explicitly; the original only rebound its
    # locals, so the reduced data never reached the caller.
    return (newdf, newdfcart)
end

_remove_stars! (generic function with 1 method)

In [4]:
"""
    _extraction_mcmc(mc, name=votname)

Print and return the mean and standard deviation of the chains stored in `mc`.

# Arguments
- `mc`: object with the fields `eps`, `mne`, `mcl`, `qc`, `qn`, `w3d`, `wvel`
  and `whrd`, each accepted by `mean` and `std`.
- `name`: value stored in the `votname` column of the result. When omitted it
  falls back to the global variable `votname` (looked up at call time), which
  is what this function always read implicitly; pass it explicitly to avoid
  depending on that global.

# Returns
A one-row `DataFrame` with the columns `votname`, `epsm`, `epsd`, `mneim`,
`mneid`, `mclm`, `mcld`, `qcm`, `qcd`, `qnm`, `qnd`, `w3dm`, `w3dd`, `wvelm`,
`wveld`, `whrdm`, `whrdd` (`…m` = mean, `…d` = standard deviation).
"""
function _extraction_mcmc(mc, name=votname)
    epsm, epsd = mean(mc.eps), std(mc.eps)
    mneim, mneid = mean(mc.mne), std(mc.mne)
    mclm, mcld = mean(mc.mcl), std(mc.mcl)
    qcm, qcd = mean(mc.qc), std(mc.qc)
    qnm, qnd = mean(mc.qn), std(mc.qn)
    w3dm, w3dd = mean(mc.w3d), std(mc.w3d)
    wvelm, wveld = mean(mc.wvel), std(mc.wvel)
    whrdm, whrdd = mean(mc.whrd), std(mc.whrd)

    println("## DBSCAN/MCMC stats:")
    # The label previously contained a mis-encoded "ϵ" (shown as "Ïµ").
    println("### ϵ : ",epsm," +/- ", epsd)
    println("### min_nei  : ", mneim," +/- ", mneid)
    println("### min_clus  : ", mclm,"+/- ", mcld)
    println("### W3d  : ", w3dm,"+/- ", w3dd)
    println("### Wvel  : ", wvelm,"+/- ", wveld)
    println("### Whrd  : ", whrdm,"+/- ", whrdd)
    println("### Qn  : ",qnm," +/- ", qnd)
    println("### Qc  : ",qcm," +/- ", qcd)
    println("##")

    res = DataFrame(votname=name, epsm=epsm, epsd=epsd, mneim=mneim, mneid=mneid,
            mclm=mclm, mcld=mcld,
            qcm=qcm, qcd=qcd, qnm=qnm, qnd=qnd,
            w3dm=w3dm, w3dd=w3dd, wvelm=wvelm, wveld=wveld, whrdm=whrdm, whrdd=whrdd)
    return res
end

## Return a new `GaiaClustering.Df` equal to `d` without the columns listed in `idx`.
function _without_columns(d, idx)
    keep = setdiff(axes(d.data, 2), idx)
    return GaiaClustering.Df(length(keep), d.data[:, keep], d.raw[:, keep], d.err[:, keep])
end

"""
    _cycle_extraction(votname)

Run repeated extraction/subtraction cycles on the VOTable `votname` (a file
name inside `votdir`).

Each cycle:
1. optimises the DBSCAN parameters with `abc_mcmc_dbscan_full2` and plots the
   chain with `plot_dbscanfull_mcmc`;
2. clusters the data with the mean parameters reported by `_extraction_mcmc`;
3. picks a solution with `find_cluster_label2`;
4. removes the stars of ALL clusters found from `df` and `dfcart`, so the next
   cycle works on the remaining stars only.

The loop stops when the selected solution has fewer than `maxstarstop` stars or
after `cyclemax` cycles.
"""
function _cycle_extraction(votname)
    df, dfcart= _getdata(votdir*"/"*votname)
    params= _mcmc_params()

    cyclerun= true
    cycle= 1
    cyclemax= 3

    maxstarstop= 50          #condition to stop cycling

    while cyclerun
        println("##\n## Starting cycle $cycle ...")
        ## extraction one cycle.. MCMC optimization
        mc = abc_mcmc_dbscan_full2(dfcart, params)
        plot_dbscanfull_mcmc(plotdir, votname, mc , false)

        ## get the cluster and plot it
        println("## Extracting the cluster using DBSCAN/WEIGHTING with:")
        res2= _extraction_mcmc(mc)
        eps= res2.epsm[1]
        # trunc(x + 0.5): round the (non-negative) chain means to the nearest integer
        min_nei= trunc(Int,res2.mneim[1] + 0.5)
        min_cl= trunc(Int,res2.mclm[1] + 0.5)
        w3d= res2.w3dm[1]
        wvel= res2.wvelm[1]
        whrd= res2.whrdm[1]

        mres = GaiaClustering.modelfull(eps,min_nei,min_cl,w3d,wvel,whrd)
        dfcartnorm = getDfcartnorm(dfcart, mres)
        labels = clusters(dfcartnorm.data ,eps  , 20, min_nei, min_cl)

        labelmax , nmax = find_cluster_label2(labels, df, dfcart)
        println("### Label solution: $labelmax")
        println("### N stars: $nmax")
        # Only consumed by the (currently disabled) cluster plot below.
        scproperties0 = get_properties_SC(labels[labelmax] , df, dfcart)
        # plot_cluster2(plotdir, "$votname.$cycle", labels[labelmax], scproperties0,  dfcart , false)

        println("### subtracting ALL solutions from Df...")
        # Gather the star indices of every cluster found in this cycle.
        solidx = reduce(vcat, labels; init=Int[])
        println(length(solidx))
        # Rebind df/dfcart here: a callee cannot replace the caller's bindings,
        # so the subtraction has to be assigned in this scope to take effect.
        df = _without_columns(df, solidx)
        dfcart = _without_columns(dfcart, solidx)
        println(size(df.data))
        println(size(dfcart.data))

        if nmax < maxstarstop || cycle == cyclemax
            println("## Cycle stopped at $cycle \n")
            println("## Check the code!!!")
            cyclerun= false
        end
        cycle += 1
    end
end

_cycle_extraction (generic function with 1 method)

In [5]:
"""
    main(votname)

Entry point: run the cycle extraction on the VOTable named `votname` and return
whatever `_cycle_extraction` returns.
"""
main(votname) = _cycle_extraction(votname)

main (generic function with 1 method)

In [None]:
## Run the cycle extraction on one VOTable (file name inside `votdir`).
## NOTE: `votname` must remain a global binding: `_extraction_mcmc` reads the
## global `votname` when it builds its result table.
votname= "NGC 869-3.0deg.vot"
main(votname)

## Data filtered in distance ..
## Votable /home/stephane/Science/GAIA/products/votable.2020/NGC 869-3.0deg.vot read
## Filtering done ...
## Stars selected: 15192
## Cartesian transformation done ...
##
## Starting cycle 1 ...
## ABC/MCMC for DBSCAN FULL (parameters+weighting)...
## ABC/MCMC v2
### Minimum Q : 2.7
### Minimum nstars : 40
### Maximum nstars : 5000
### Maximum iterations: 750000
### Checking the minQ and minStars conditions...
### Minimum good solutions 10
### Minimum Q : 2.7
### Minimum nstars : 40
### init done ...
### mini stats...
