## Cycle extraction

Testing cyclic cluster extraction: each cycle extracts the best cluster and then subtracts its stars from the data before the next cycle runs.

In [1]:
using  PyCall
using  Statistics , StatsBase
using  DataFrames, Printf , CSV , Query
using  Dates, Glob

import PyPlot
sns= pyimport("seaborn")

## directory
# All paths derive from the GAIA_ROOT environment variable; indexing ENV throws
# a KeyError when the variable is not set.
rootdir = ENV["GAIA_ROOT"]
wdir    = "$rootdir/products"
# Directory the votables are read from (see _cycle_extraction and the driver cell).
votdir  = "$wdir/votable.2019"
# Directory handed to the plotting helpers in _cycle_extraction.
plotdir = "$wdir/test"
# Directory handed to export_df in _cycle_extraction.
ocdir   = "$wdir/octest"
# NOTE(review): sclist is not referenced anywhere in the visible part of this notebook.
sclist  = "$rootdir/e2e_products/sc-list-2020.csv"

# The package is loaded from the source tree, so LOAD_PATH must be extended
# before the `using` statement.
push!(LOAD_PATH,"$rootdir/master/src")
using GaiaClustering

# Relative paths used later (the metafile and the debug CSV) resolve against wdir.
cd(wdir)


In [2]:
"""
    _getdata(filevot)

Read the votable `filevot`, pass it through `filter_data` with the bounds
`[0., 2000]`, and return the tuple `(df, dfcart)` where `dfcart` is the result
of `add_cartesian(df)`.
"""
function _getdata(filevot)
    println("## Data filtered in distance ..")

    filtered = filter_data(read_votable(filevot), [0., 2000])

    return (filtered, add_cartesian(filtered))
end

_getdata (generic function with 1 method)

In [3]:
"""
    _remove_stars(df, dfcart, idx)

Build new `GaiaClustering.Df` objects from `df` and `dfcart` with the columns
listed in `idx` dropped from their `data`, `raw` and `err` arrays.

# Arguments
- `df`, `dfcart`: objects exposing `data`, `raw` and `err` matrices with one
  column per star. All six matrices are assumed to have the same number of
  columns (the original code already used a single star count for both outputs).
- `idx`: column indices of the stars to remove.

# Returns
The tuple `(dfnew, dfcartnew)`.

The printed count is `length(idx)`; it equals the number of columns actually
dropped only when `idx` holds distinct, in-range indices.
"""
function _remove_stars(df, dfcart, idx)
    # The surviving columns are the same for every array, so compute them once
    # instead of repeating the setdiff for each of the six slices.
    keep  = setdiff(1:size(df.data, 2), idx)
    nkeep = length(keep)

    dfnew     = GaiaClustering.Df(nkeep, df.data[:, keep], df.raw[:, keep], df.err[:, keep])
    dfcartnew = GaiaClustering.Df(nkeep, dfcart.data[:, keep], dfcart.raw[:, keep],
                                  dfcart.err[:, keep])

    nrem = length(idx)
    println("### $nrem stars removed")
    return (dfnew, dfcartnew)
end

"""
    _extraction_mcmc(mc, votname)

Summarise the MCMC chains stored in `mc` (`eps`, `mne`, `mcl`, `qc`, `qn`,
`w3d`, `wvel`, `whrd`): print the median and standard deviation of each chain
and return the same figures in a `DataFrame` together with `votname`.
Callers read the first element of each column.
"""
function _extraction_mcmc(mc, votname)
    # (median, standard deviation) of a single chain
    medstd(chain) = (median(chain), std(chain))

    epsm,  epsd  = medstd(mc.eps)
    mneim, mneid = medstd(mc.mne)
    mclm,  mcld  = medstd(mc.mcl)
    qcm,   qcd   = medstd(mc.qc)
    qnm,   qnd   = medstd(mc.qn)
    w3dm,  w3dd  = medstd(mc.w3d)
    wvelm, wveld = medstd(mc.wvel)
    whrdm, whrdd = medstd(mc.whrd)

    @printf("## DBSCAN/MCMC stats: \n")
    @printf("### ϵ : %3.3f +/- %3.3f \n", epsm, epsd)
    @printf("### min_nei  : %3.1f +/- %3.3f \n", mneim, mneid)
    @printf("### min_clus  : %3.1f +/- %3.3f \n", mclm, mcld)
    @printf("### W3d  : %3.3f +/- %3.3f \n", w3dm, w3dd)
    @printf("### Wvel  : %3.3f +/- %3.3f \n" , wvelm, wveld)
    @printf("### Whrd  : %3.3f +/- %3.3f \n", whrdm, whrdd)
    @printf("### Qn  : %3.3f +/- %3.3f \n",qnm, qnd)
    @printf("### Qc  : %3.3f +/- %3.3f \n",qcm, qcd)
    @printf("##")

    # Column order is kept as-is: it defines the layout seen by downstream readers.
    return DataFrame(;
        votname = votname,
        epsm = epsm, epsd = epsd,
        mneim = mneim, mneid = mneid,
        mclm = mclm, mcld = mcld,
        qcm = qcm, qcd = qcd,
        qnm = qnm, qnd = qnd,
        w3dm = w3dm, w3dd = w3dd,
        wvelm = wvelm, wveld = wveld,
        whrdm = whrdm, whrdd = whrdd,
    )
end

"""
    _score_cycle(qc, qn, nchain, iter)

Return the cycle score `log10(qc * qn * nchain / iter)`.
"""
_score_cycle(qc, qn, nchain, iter) = log10(qc * qn * nchain / iter)

_score_cycle (generic function with 1 method)

In [4]:
"""
    _plot_check(dfcart; xlim=[-50,50], ylim=[100,450])

Scatter-plot row 2 of `dfcart.data` (labelled "Y (pc)", horizontal axis) against
row 1 (labelled "X (pc)", vertical axis) as a visual check of the star
subtraction, then show the figure.

# Keywords
- `xlim`: limits of the horizontal axis; defaults to the previously hard-coded
  `[-50,50]`.
- `ylim`: limits of the vertical axis; defaults to the previously hard-coded
  `[100,450]`.

Returns whatever `PyPlot.plt.show()` returns.
"""
function _plot_check(dfcart; xlim=[-50,50], ylim=[100,450])
    println("## check plot subtraction ...")

    # Only the first two rows are drawn, so they are sliced directly rather than
    # going through an intermediate DataFrame with an unused Z column.
    x = dfcart.data[1, :]
    y = dfcart.data[2, :]

    PyPlot.plt.figure(figsize=(9.0,8.0))
    PyPlot.plt.subplot(1, 1, 1, ylim=ylim, xlim=xlim)
    PyPlot.plt.scatter(y, x, s = 0.1)
    PyPlot.plt.xlabel("Y (pc)")
    PyPlot.plt.ylabel("X (pc)")
    PyPlot.plt.grid(true)

    return PyPlot.plt.show()
end

_plot_check (generic function with 1 method)

In [5]:
"""
    _updt!(fileres, df)

Append the rows of `df` to the `;`-delimited CSV file `fileres`. When the file
does not exist yet (or exists but is empty) it is written from scratch with a
header line. Always returns `true`.

Rows are appended as-is: the existing file is not read back, so its columns are
not checked against those of `df`.
"""
function _updt!(fileres, df)
    existed = isfile(fileres)

    # Append in place instead of reading the whole log, concatenating and
    # rewriting it: this keeps each call cheap and avoids failures caused by
    # column types being re-inferred from the CSV text. An existing but empty
    # file has no header yet, so it is treated like a new file.
    CSV.write(fileres, df; delim=';', append=existed && filesize(fileres) > 0)

    existed || println("## $fileres created...")
    return true
end



_updt! (generic function with 1 method)

In [6]:
## Main cycle driver.
## filedebug is the CSV log that receives one row of quality/timing figures per cycle.
##
# Steps performed for the votable `votname` (looked up inside `votdir`):
#   1. load the data with _getdata;
#   2. run abc_mcmc_dbscan_full2 on the current Cartesian data;
#   3. if the run is usable, cluster with the median MCMC parameters, export and
#      plot the selected cluster, then remove its stars from df/dfcart;
#   4. evaluate the stop conditions, log the cycle to `filedebug`, and repeat.
# `m` supplies the thresholds read below (cyclemax, minstarstop, ...).
# The function has no explicit return value.
function _cycle_extraction(votname, filedebug, m::GaiaClustering.meta)
    println("############### extraction #########")
    df, dfcart= _getdata(votdir*"/"*votname)
    
    cyclerun= true
    cycle= 1
    
    cyclemax= m.cyclemax
    # NOTE(review): minstarselection is read here but never used in this function.
    minstarselection=   m.minstarselection    # minimum of stars to select solution in a cycle...????
    minstarstop=   m.minstarstop         # condition to stop cycling
    minchainreached=  m.minchainreached      # minimum chain to analyze solution
    qcmin=  m.qcmin                # more condition on Qc to stop cycling after the first
    wratiomin=  m.wratiomin          # minimum ratio btwn w3d and wvel (otherwise not an OC)
    
    #if isfile(filedebug)
    #    println("## $filedebug removed...")
    #    rm(filedebug)
    #end
    
    println("##")
    println("##")
    while cyclerun
        tstart= now()
        println("##\n## starting cycle $cycle ...")
        ## extraction one cycle.. MCMC optimization
        mc , iter, flag= abc_mcmc_dbscan_full2(dfcart, m)
        println("## Flag: $flag")
        nchain= length(mc.qc)
        println("## $iter iterations performed...")
        println("## $nchain chains")
        
        # A cycle is analysed when the optimizer reports flag -1 (presumably "ran to
        # completion" -- confirm against abc_mcmc_dbscan_full2) or when enough chains
        # were collected anyway.
        if flag== -1 || nchain > minchainreached
            println("## optimization completed..")
            println("## analyzing solutions...")
            plot_dbscanfull_mcmc(plotdir, "$votname.$cycle", mc , false)

            ## get the cluster and plot it
            println("## extracting the cluster using DBSCAN/WEIGHTING with:")
            res2= _extraction_mcmc(mc, votname)
            eps= res2.epsm[1]
            # The median neighbour/cluster sizes are rounded half-up to integers.
            min_nei= trunc(Int,res2.mneim[1] + 0.5)
            min_cl= trunc(Int,res2.mclm[1] + 0.5)
            w3d= res2.w3dm[1]
            wvel= res2.wvelm[1]
            whrd= res2.whrdm[1]

            mres = GaiaClustering.modelfull(eps,min_nei,min_cl,w3d,wvel,whrd)
            dfcartnorm = getDfcartnorm(dfcart, mres)
            # NOTE(review): the literal 20 is an undocumented argument of `clusters`
            # -- confirm its meaning in GaiaClustering.
            labels = clusters(dfcartnorm.data ,eps  , 20, min_nei, min_cl)
            labelmax , nmax, qc = find_cluster_label2(labels, df, dfcart)
            println("## label $labelmax written to oc...")
            export_df("$votname.$cycle", ocdir, df , dfcart, labels , labelmax)
            scproperties0 = get_properties_SC(labels[labelmax] , df, dfcart)
            # NOTE(review): scproperties2 is computed but not used afterwards.
            scproperties2 = get_properties_SC2(labels[labelmax] , df, dfcart)
        
            plot_cluster2(plotdir, "$votname.$cycle", labels[labelmax], scproperties0,  dfcart , false)
        
            println("###")
            println("### Label solution: $labelmax")
            println("### N stars: $nmax")
            println("### Qc: $qc")
            println("###")
        
             _plot_check(dfcart)
        
            # The star count nmax of the selected cluster is passed as the `qn` argument.
            k= _score_cycle(qc, nmax, nchain, iter)
            @printf("## score cycle %d: %3.3f \n",cycle, k)
                
            println("###")
            println("### subtracting BEST solution from Df...")
            # From here on df/dfcart no longer contain the stars of the selected cluster.
            dfnew, dfcartnew= _remove_stars(df, dfcart, labels[labelmax])
            df= dfnew
            dfcart= dfcartnew
        
            ######################### STOP conditions
            # Every condition is tested independently, so several messages can be
            # printed for the same cycle. The cluster of this cycle has already
            # been exported and subtracted when a stop condition fires.
            if nmax < minstarstop
                println("## extraction stopped at cycle $cycle")
                println("## nmax too low...")
                cyclerun= false
            end
            if cycle == cyclemax
                println("## extraction stopped at cycle $cycle")
                println("## cyclemax reached...")
                cyclerun= false
            end
            if qc < qcmin
                println("## extraction stopped at cycle $cycle")
                println("## Qc too low...")
                cyclerun= false
            end
            # The ratio is checked in both directions.
            if w3d/wvel < wratiomin || wvel/w3d < wratiomin
                println("## extraction stopped at cycle $cycle")
                println("## weight ratio too low...")
                cyclerun= false
            end
            if flag != -1
                println("## extraction stopped at cycle $cycle")
                println("## chain iteration not performed completely")
                cyclerun= false
            end
            ### 
            
            tend= now()
            # Elapsed wall time converted from milliseconds to seconds.
            duration= Dates.value(tend-tstart) / (1000*1)
            # NOTE(review): df was already replaced by the subtracted data above, so
            # nstar is the number of stars remaining AFTER removal, not the number
            # the optimisation of this cycle ran on -- confirm this is intended for
            # the per-star timings and the logged nstar.
            nstar= size(df.raw)[2]
            timeperiterstar= duration / (iter*nstar)
            timeperchainstar= duration / (nchain*nstar)
            @printf("## Time: \n")
            @printf("## duration per cycle %3.3f sec \n", duration)
            @printf("## duration per iteration*star %3.3e sec \n", timeperiterstar)
            @printf("## duration per chain*star %3.3e sec \n", timeperchainstar)
            @printf("##\n")
        
            ## log the results of performances
            dfout= DataFrame(votname=votname, cycle=cycle, nstar=nstar, qc=qc, nmax=nmax, nchain=nchain, iter=iter,
            scorecycle=k, duration=duration, timeperiterstar=timeperiterstar ,
            timeperchainstar= timeperchainstar )
            _updt!(filedebug, dfout)
            cycle += 1
        else
            # Unusable optimisation result: nothing is exported or logged for this cycle.
            println("## nothing found, stopped...")
            cyclerun= false
        end
    end
end

_cycle_extraction (generic function with 1 method)

In [7]:
"""
    main(votlist, metafile, debugfile="cycle_extraction.debug.csv")

Read the parameters from `metafile` with `read_params`, then run
`_cycle_extraction` on every entry of `votlist`, logging each cycle to
`debugfile`. Returns `nothing`.
"""
function main(votlist, metafile, debugfile= "cycle_extraction.debug.csv")
    params = read_params(metafile)

    foreach(votlist) do votname
        println("## Analyzing $votname")
        _cycle_extraction(votname, debugfile, params)
    end

    return nothing
end

main (generic function with 2 methods)

In [None]:
# votname= "NGC 869-3.0deg.vot"
metafile= "test.ext"

# The votables are listed by bare file name because _cycle_extraction prepends
# votdir itself. The do-block form of cd restores the previous working directory
# even when glob throws, so a failure cannot leave the session inside votdir.
votlist= cd(votdir) do
    glob("NGC 2*vot")
end
# Relative paths (metafile, debug CSV) are resolved against wdir.
cd(wdir)

main(votlist, metafile)

## All parameters set to default...
Any["minQc", "=", 2.7]
Any["minQn", "=", 40]
Any["maxQn", "=", 5000]
Any["forcedminstars", "=", 30]
Any["mingoodsolution", "=", 10]
Any["niterqminq", "=", 500]
Any["nburnout", "=", 1000]
Any["nchain", "=", 6000]
Any["maxiter", "=", 15000]
Any["cyclemax", "=", 4]
Any["minstarselection", "=", 50]
Any["minstarstop", "=", 50]
Any["minchainreached", "=", 100]
Any["qcmin", "=", 1.5]
Any["wratiomin", "=", 0.2]
Any["epsmean", "=", 2.5]
Any["epsdisp", "=", 1.5]
Any["min_nei", "=", 10]
Any["min_cl", "=", 15]
Any["ncoredisp", "=", 10]
Any["w3dmean", "=", 6.0]
Any["w3ddisp", "=", 4.0]
Any["wvelmean", "=", 6.0]
Any["wveldisp", "=", 4.0]
Any["whrdmean", "=", 2.0]
Any["whrddisp", "=", 1.5]
Any["aperture2d", "=", 1.5]
Any["maxaperture2d", "=", 15.0]
Any["aperture3d", "=", 3.0]
Any["maxaperture3d", "=", 20]
Any["aperturev", "=", 3.0]
Any["maxaperturev", "=", 20.0]
Any["nboot", "=", 50.0]
Any["labels", "=", "Qc"]
## Parameters read from test.ext
## Analyzing NGC 2026-

### Qc : 3.179 
### Qn : 517.820 
### chain:1000
### burnout done...
### chain:1000
### chain:2000
### chain:3000
### chain:4000
### chain:5000
### chain:6000
## ABC/MCMC FULL done
##
## Flag: -1
## 10303 iterations performed...
## 6003 chains
## optimization completed..
## Analyzing solutions...
## Extracting the cluster using DBSCAN/WEIGHTING with:
### testing mcmc results with median instead of mean...
## DBSCAN/MCMC stats: 
### ϵ : 2.761 +/- 0.675 
### min_nei  : 7.0 +/- 4.717 
### min_clus  : 18.0 +/- 6.516 
### W3d  : 5.998 +/- 2.882 
### Wvel  : 6.835 +/- 3.084 
### Whrd  : 2.076 +/- 1.119 
### Qn  : 541.000 +/- 160.830 
### Qc  : 2.888 +/- 0.124 
#### Selecting best cluster based on Qc..
## Qc: Any[1.0000998108009058, 1.2806755704360178, 1.0645214390274629, 2.8862804680987786, 1.673394688175832]
## Label 4 written to oc...
### /home/stephane/Science/cluster/GAIA/products/octest/NGC 2112-3.1deg.1.oc.csv created 
### Cluster plot is centered in Y,Z...
### Label solution: 4
### N 

#### MinQ not reached yet... testing with 2.565
#### MinQ not reached yet... testing with 2.43675
#### MinQ not reached yet... testing with 2.3149124999999997
#### MinQ not reached yet... testing with 2.1991668749999995
#### MinQ not reached yet... testing with 2.0892085312499993
#### MinQ not reached yet... testing with 1.9847481046874993
#### MinQ not reached yet... testing with 1.8855106994531243
#### MinQ not reached yet... testing with 1.791235164480468
#### MinQ not reached yet... testing with 1.7016734062564445
#### MinQ not reached yet... testing with 1.616589735943622
#### MinQ not reached yet... testing with 1.5357602491464408
### Minimum Qc: 1.5357602491464408
### Minimum Qn: 19
### Minimum Qn forced to : 30
### init done ...
### mini stats...
### Qc : 1.633 
### Qn : 153.700 
### chain:1000
### burnout done...
### chain:1000
### chain:2000
### chain:3000
### chain:4000
### chain:5000
### Maximum iteration reached, current solution returned...
## Flag: 3
## 15001 iterations 

#### MinQ not reached yet... testing with 2.3149124999999997
#### MinQ not reached yet... testing with 2.1991668749999995
#### MinQ not reached yet... testing with 2.0892085312499993
#### MinQ not reached yet... testing with 1.9847481046874993
#### MinQ not reached yet... testing with 1.8855106994531243
#### MinQ not reached yet... testing with 1.791235164480468
#### MinQ not reached yet... testing with 1.7016734062564445
#### MinQ not reached yet... testing with 1.616589735943622
### Minimum Qc: 1.616589735943622
### Minimum Qn: 20
### Minimum Qn forced to : 30
### init done ...
### mini stats...
### Qc : 1.639 
### Qn : 2106.130 
### chain:1000
### burnout done...
### chain:1000
### chain:2000
### chain:3000
### chain:4000
### chain:5000
### chain:6000
## ABC/MCMC FULL done
##
## Flag: -1
## 12277 iterations performed...
## 6003 chains
## optimization completed..
## Analyzing solutions...
## Extracting the cluster using DBSCAN/WEIGHTING with:
### testing mcmc results with median inst

#### Selecting best cluster based on Qc..
## Qc: Any[2.6032046982022146, 1.434057899516957, 1.5247876628582704, 1.502029565541117, 1.4892948948141997, 1.9873502614949774, 1.134054285457392, 1.3311645074639842, 1.2535564260531384, 1.3249681985697244, 1.2007557128349307, 1.184143419849834, 1.1495908530961076, 1.4088612392285311, 1.2615685428991026, 1.6316022034098414, 1.2527423349869644, 1.2360706332289013, 1.2941483833453755, 0.9972385722429278, 1.1622878232559555, 1.1029479059316933, 1.4443600107301153, 1.5651397944047496, 1.3116241042112098, 1.2055539321890771, 0.9031966702895594, 1.2384785586565006, 1.0814329967767469, 1.4263635286672482, 1.0617663480038797, 1.1285303731265592, 0.9426273372042495, 0.9814577296005653, 1.360984136114452, 1.2532012475365564, 0.9596060713860532, 1.1373065938947928, 0.9964358864636114, 1.929271024383058, 1.1465717952317067, 1.5675563475046237, 0.920397699606166, 1.0062234477004774]
## Label 1 written to oc...
### /home/stephane/Science/cluster/GAIA/produc

### Minimum Qc: 1.0188547268453296
### Minimum Qn: 11
### Minimum Qn forced to : 30
### init done ...
### mini stats...
### Qc : 1.188 
### Qn : 167.340 
### chain:1000
### burnout done...
### chain:1000
### chain:2000
### chain:3000
### chain:4000
### chain:5000
### chain:6000
## ABC/MCMC FULL done
##
## Flag: -1
## 11332 iterations performed...
## 6003 chains
## optimization completed..
## Analyzing solutions...
## Extracting the cluster using DBSCAN/WEIGHTING with:
### testing mcmc results with median instead of mean...
## DBSCAN/MCMC stats: 
### ϵ : 4.874 +/- 1.408 
### min_nei  : 5.0 +/- 4.078 
### min_clus  : 17.0 +/- 6.580 
### W3d  : 1.047 +/- 6.193 
### Wvel  : 4.993 +/- 3.588 
### Whrd  : 3.028 +/- 1.385 
### Qn  : 358.000 +/- 619.319 
### Qc  : 1.282 +/- 0.236 
#### Selecting best cluster based on Qc..
## Qc: Any[1.066426546122563, 1.0462717741259315, 0.34537025598027593, 0.552899087808869, 0.4401114628091891, 0.510647575354621, 0.36092203075178797]
## Label 1 written to oc.

#### MinQ not reached yet... testing with 2.0892085312499993
#### MinQ not reached yet... testing with 1.9847481046874993
### Minimum Qc: 1.9847481046874993
### Minimum Qn: 28
### Minimum Qn forced to : 30
### Maximum iteration reached, no solutions...
## Flag: 2
## 15001 iterations performed...
## 0 chains
##
## Starting cycle 1 ...
## ABC/MCMC for DBSCAN FULL (parameters+weighting)...
## ABC/MCMC v2
### Chains  : 6000
### Burn-in : 1000
### Minimum Qc : 2.7
### Minimum Qn : 40
### Maximum Qn : 5000
### Maximum iterations: 15000
#### Checking the minQc and minQn conditions...
#### Minimum good solutions 10
#### Number of iterations: 500, maxiter: 15000
#### MinQ not reached yet... testing with 2.565
#### MinQ not reached yet... testing with 2.43675
#### MinQ not reached yet... testing with 2.3149124999999997
#### MinQ not reached yet... testing with 2.1991668749999995
#### MinQ not reached yet... testing with 2.0892085312499993
#### MinQ not reached yet... testing with 1.9847481046874