## Testing Raw plot

In [1]:
## Testing an optimization on a small set of files
## It includes the optimization by cycle:detection+subtraction
##

## Main loop extracting the DBSCAN parameters end-to-end (e2e).
## The list of votables is taken directly from the votable directory.

using  PyCall
using DataFrames, Query
using CSV, Glob, Dates
using Statistics, Random
using Printf
using MultivariateStats, StatsBase

import PyPlot

## Root of the GAIA tree, read from the environment variable GAIA_ROOT
## (indexing ENV raises a KeyError when the variable is not set).
rootdir =  ENV["GAIA_ROOT"]

## The project sources must be on LOAD_PATH before GaiaClustering can be loaded.
push!(LOAD_PATH,"$rootdir/run/src")
using GaiaClustering

## Directories used by the cells below:
##   wdir    : working directory (the driver cells cd back into it)
##   votdir  : input votables
##   plotdir : output plots   (assigned to m.plotdir in main)
##   ocdir   : output clusters (assigned to m.ocdir in main)
wdir    = "$rootdir/products"
votdir  = "$rootdir/e2e_products/votable.edr3.2021"
plotdir = "$rootdir/products/plot-TEST-slow"
ocdir   = "$rootdir/products/oc-TEST-slow"


## Maximum number of votables passed to main() in this test
MAX_VOTABLE = 30


┌ Info: Precompiling GaiaClustering [top-level]
└ @ Base loading.jl:1317


30

In [2]:
"""
    readlist_votable(filelist::String)

Read the delimited file `filelist` and return its content as a `DataFrame`.

The `DataFrame` sink is passed explicitly: `CSV.read` requires a sink argument,
and this matches the call already used by `_read_blacklist`.
"""
function readlist_votable(filelist::String)
    vot = CSV.read(filelist, DataFrame)
    return(vot)
end

"""
    _getdata(filevot)

Load the votable `filevot` and return the tuple `(df, dfcart, dfcartnorm)`:
the output of `filter_data`, the output of `add_cartesian` applied to it, and
the first value returned by `normalization_PerBlock` on the latter.

The per-block normalization uses the blocks `[1,2,3]`, `[4,5]`, `[6,7,8]` with
weights `4.0`, `5.0`, `1.0` and the `"identity"` norm.
"""
function _getdata(filevot)
    raw    = read_votable(filevot)
    # NOTE(review): [0,2100] is presumably a distance range — confirm against filter_data
    df     = filter_data(raw, [0,2100])
    dfcart = add_cartesian(df)

    blocks  = [[1,2,3],[4,5], [6,7,8]]
    weights = [4.0,5.0,1.0]

    # the second returned value (the scale) is not needed here
    dfcartnorm, _ = normalization_PerBlock(dfcart, blocks, weights, "identity", false)
    return(df, dfcart , dfcartnorm)
end


"""
    _updt_votcompleted(fileres, votname, cycletot=1, flag=0, onlycheck=true)

Check or update the ";"-delimited bookkeeping file `fileres`, which records the
votables already analyzed (columns `votname`, `cycle`, `flag`).

- `onlycheck == true`: return `(rows, true)` with the rows of `fileres` whose
  `votname` equals `votname`, or `(0, false)` when the file does not exist or
  has no such row.
- `onlycheck == false`: append the row `(votname, cycletot, flag)` to `fileres`
  (creating the file if needed) and return `(table, true)` with the table that
  was written.
"""
function _updt_votcompleted(fileres, votname , cycletot=1, flag= 0 , onlycheck=true)
    exists = isfile(fileres)

    ## CHECK only
    if onlycheck
        exists || return(0, false)

        done = CSV.File(fileres, delim=";") |> DataFrame
        (votname in done.votname) || return(0, false)

        rows = @from i in done begin
            @where i.votname == votname
            @select i
            @collect DataFrame
        end
        return(rows, true)
    end

    ## UPDATE
    entry = DataFrame(votname=votname, cycle=cycletot, flag=flag)
    if !exists
        CSV.write(fileres,entry,delim=';')
        println("## $fileres created...")
        return(entry,true)
    end

    done = CSV.File(fileres, delim=";") |> DataFrame
    append!(done,entry)
    CSV.write(fileres,done,delim=';')
    return(done,true)
end
"""
    _read_blacklist(blackname)

Return the `votname` column of the ";"-delimited file `blackname`, or `[""]`
when that file does not exist.
"""
function _read_blacklist(blackname)
    # no blacklist file: return a list that matches no real votable name
    isfile(blackname) || return([""])

    table = CSV.read(blackname, DataFrame, delim=";")
    return(table.votname)
end


_read_blacklist (generic function with 1 method)

In [3]:
"""
    _compute_PC(df, dfcart, labels, labelmax)

Run a PCA on the stars selected by `labels[labelmax]` and return `(Yt, pcres)`.

An 8-row matrix (one column per selected star) is assembled from
`dfcart.data[1:3, :]`, `df.data[4:5, :]`, `df.raw[10, :]` and the two
differences `raw[10] - raw[11]` and `raw[12] - raw[10]`. Each row is z-scored
before the PCA is fitted.

# Returns
- `Yt`: projection of the z-scored data on the principal components.
- `pcres`: cumulative explained-variance ratio (in percent) of the first three
  components, or `[100.0, 100.0, 100.0]` when the fit kept fewer than three.
"""
function _compute_PC(df::GaiaClustering.Df, dfcart::GaiaClustering.Df, labels, labelmax)
        print("### Computing principal components... \n")
        idx= labels[labelmax]
        data= zeros(8,length(idx))

        # NOTE(review): raw rows 10/11/12 are named gbar/rp/bp after the original
        # locals — confirm against the GaiaClustering.Df layout.
        gbar= df.raw[10,idx]
        rp= df.raw[11,idx]
        bp= df.raw[12,idx]

        data[1,:]= dfcart.data[1,idx]
        data[2,:]= dfcart.data[2,idx]
        data[3,:]= dfcart.data[3,idx]
        data[4,:]= df.data[4,idx]
        data[5,:]= df.data[5,idx]
        data[6,:]= gbar
        data[7,:]= gbar .- rp
        data[8,:]= bp .- gbar

        # dims=2 is stated explicitly: each row (feature) is standardized across
        # the stars. Relying on the implicit default is deprecated in StatsBase.
        dt= StatsBase.fit(ZScoreTransform, data, dims=2)
        d2= StatsBase.transform(dt, data)
        M = fit(PCA, d2)
        Yt = MultivariateStats.transform(M, d2)

        totvar= tvar(M)
        pvs= principalvars(M)
        ratioac= accumulate(+, pvs ./ totvar)

        if length(ratioac) >= 3
            pcres= 100 .* ratioac[1:3]
        else
            # same element type as the regular branch (Float64)
            pcres= [100.0,100.0,100.0]
        end

        return(Yt, pcres)
end


_compute_PC (generic function with 1 method)

In [4]:
"""
    _plot_rawdata(plotdir, voname, indx, sc, df, pc, ijump=100,
                  showplot=true, extra=0, cmap="gist_stern")

Draw a 3x3 summary figure of the field and of the selected stars, then save it
as `plotdir/voname.raw.png`.

Each scatter panel shows every `ijump`-th star of `df` as small background
points and the stars indexed by `indx` in red on top. The panels use rows of
`df.data` (and row 13 of `df.raw`); the last panel plots the first two rows
of `pc`.

# Arguments
- `plotdir`, `voname`: output directory and base name of the PNG file.
- `indx`: column indices of the selected stars.
- `sc`: its fields `offdeg`, `edgratg` and `edgratm` are printed in the text panel.
- `df`: source of the plotted columns.
- `pc`: matrix whose rows 1 and 2 are plotted as PC1/PC2.
- `ijump`: stride used to thin the background stars.
- `showplot`: when true, `PyPlot.plt.show()` is called after saving.
- `extra`: either `0` (no text panel) or a table with the columns `votname`,
  `cycle`, `qc`, `score_cycle`, `pc1`, `pc2`, `pc3`.
- `cmap`: currently unused; kept for interface compatibility.
"""
function _plot_rawdata(plotdir, voname, indx, sc::GaiaClustering.SCproperties2, df::GaiaClustering.Df, pc, ijump=100,
    showplot = true , extra= 0, cmap = "gist_stern")
    patch= pyimport("matplotlib.patches")
    plt= PyPlot.plt

    nstar= size(df.data, 2)
    iter= 1:ijump:nstar

    ## One scatter panel: thinned field in the background, selected stars in red.
    function panel(pos, xfield, yfield, xsel, ysel, xlab, ylab; axison=false)
        plt.subplot(3, 3, pos)
        if axison plt.axis("on") end
        plt.scatter(xfield, yfield , s = 0.1 )
        plt.scatter(xsel, ysel , s = 1, c="r", alpha=0.5 )
        plt.xlabel(xlab)
        plt.ylabel(ylab)
        plt.grid(true)
    end

    plt.rcParams["font.size"]= 25
    plt.figure(figsize=(13.0,12.0))

    ## panels are created in the same order as before: 1, 2, 4, 3
    panel(1, df.data[2,iter], df.data[3,iter], df.data[2,indx], df.data[3,indx], "Y (pc)", "Z (pc)")
    panel(2, df.data[1,iter], df.data[3,iter], df.data[1,indx], df.data[3,indx], "X (pc)", "Z (pc)")
    panel(4, df.data[2,iter], df.data[1,iter], df.data[2,indx], df.data[1,indx], "Y (pc)", "X (pc)")
    panel(3, df.data[1,iter], df.raw[13,iter], df.data[1,indx], df.raw[13,indx], "X(pc)", "Vrad (km/s)")

    ## text to display
    axt= plt.subplot(3, 3, 5)
    plt.axis("off")

    if extra != 0
        text =[]
        v1= "$(extra.votname[1])"
        txt = "Votable : $v1" ; push!(text,txt)
        v = nstar ; txt = "N stars in the field  : $v" ; push!(text,txt)
        v1= "$(extra.cycle[1])"
        txt = "Cycle : $v1" ; push!(text,txt)
        v1= @sprintf("%3.3f",extra.qc[1])
        txt = "Qc : $v1" ; push!(text,txt)
        v1= @sprintf("%3.3f",extra.score_cycle[1])
        txt = "Score : $v1" ; push!(text,txt)
        v1= @sprintf("%3.3f",sc.offdeg)
        txt = "Offset : $v1 (degree)" ; push!(text,txt)
        v1= @sprintf("%3.3f",sc.edgratg)
        txt = "Edge ratio(g) : $v1 " ; push!(text,txt)
        v1= @sprintf("%3.3f",sc.edgratm)
        txt = "Edge ratio(m) : $v1 " ; push!(text,txt)
        txt= @sprintf("PC1: %3.1f , PC2: %3.1f , PC3: %3.1f", extra.pc1[1], extra.pc2[1], extra.pc3[1])
        push!(text,txt)

        show_text(-0.01,-0.1, text , 1.1 )
        ## shaded box behind the text, drawn outside the (hidden) axes limits
        rec= patch.Rectangle((-0.07, -0.15), 1.2, 1.15, color="skyblue", alpha= 0.4, clip_on=false)
        axt.add_artist(rec)
    end

    ## row 6 is plotted with its sign flipped on the vertical axis
    panel(7, df.data[7,iter], -df.data[6,iter], df.data[7,indx], -df.data[6,indx], "G-Rp", "G"; axison=true)
    panel(8, df.data[4,iter], df.data[5,iter], df.data[4,indx], df.data[5,indx], "Vl (km/s)", "Vb (km/s)")

    ## principal components of the selected stars only
    plt.subplot(3, 3, 9 )
    plt.scatter(pc[1,:], pc[2,:] , s = 1, c="r" )
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.grid(true)

    figname = plotdir*"/"*voname*".raw.png"
    plt.savefig(figname)

    if showplot plt.show() end
end

_plot_rawdata (generic function with 5 methods)

In [5]:
## Iterative cluster extraction ("detection + subtraction") on one votable.
##
## Each cycle runs abc_mcmc_dbscan_full2 on dfcart. When the optimization is accepted
## (FLAGmcmc == -1 or more than m.minchainreached chains), the cluster is extracted with
## the resulting parameters, exported, plotted, and its stars are removed from df/dfcart
## before the next cycle. Otherwise the loop stops.
##
## Arguments:
##   df, dfcart : data sets of the field; both are replaced by the star-subtracted
##                versions at the end of every accepted cycle (local rebinding only)
##   m          : parameters; the fields read here are votname, plotdir, ocdir,
##                minchainreached, minstarstop, cyclemax, qcminstop, wratiominstop
##
## Returns (ncycle, FLAG):
##   ncycle : number of accepted cycles (cycle - 1)
##   FLAG   : 0 when the last optimization was rejected, otherwise the bitmask of the
##            stop conditions met in the last accepted cycle:
##              bit 0 (1)  nmax < m.minstarstop
##              bit 1 (2)  cycle == m.cyclemax
##              bit 2 (4)  qc < m.qcminstop
##              bit 3 (8)  w3d/wvel or wvel/w3d below m.wratiominstop
##              bit 4 (16) FLAGmcmc == 3 with more than m.minchainreached chains
##
## Side effects: prints progress, writes plots and cluster files through the project
## helpers, and calls save_cycle when at least one cycle was accepted.
function _cycle_extraction(df::GaiaClustering.Df, dfcart::GaiaClustering.Df, m::GaiaClustering.meta)
    
        println("############### cycle_extraction #########")

        votname= m.votname
        cyclerun= true ; cycle= 1 ; FLAG= 0

        ## per-cycle results, accumulated and handed to save_cycle at the end
        sclist= [] ; mcmclist= [] ; perflist= [] ; chainlist= []
    
        println("##")
        while cyclerun
            FLAG= -1
            tstart= now()
            println("###############")
            println("## starting cycle $cycle ...")
            @printf("## starting time: %s \n",tstart)
            ## extraction one cycle.. MCMC optimization
            mc , iter, FLAGmcmc= abc_mcmc_dbscan_full2(dfcart, m)
            println("## ABC/MCMC flag: $FLAGmcmc")
            nchain= length(mc.qc)
            println("## $iter iterations performed...")
            println("## $nchain chains")

            ## accept the optimization when it finished (-1) or produced enough chains
            if FLAGmcmc== -1 || nchain > m.minchainreached
                println("## optimization completed..")
                println("## analyzing solutions...")
                plot_dbscanfull_mcmc(m.plotdir, "$votname.$cycle", mc , false)

                ## get the cluster and plot it
                println("## extracting the cluster using DBSCAN/WEIGHTING with:")
                res= extraction_mcmc(mc, m.votname)
                eps= res.epsm[1]
                ## round the two integer DBSCAN parameters to the nearest integer
                min_nei= trunc(Int,res.mneim[1] + 0.5)
                min_cl= trunc(Int,res.mclm[1] + 0.5)
                w3d= res.w3dm[1]
                wvel= res.wvelm[1]
                whrd= res.whrdm[1]

                mres = GaiaClustering.modelfull(eps,min_nei,min_cl,w3d,wvel,whrd)
                dfcartnorm = getDfcartnorm(dfcart, mres)
                ## NOTE(review): the meaning of the literal 20 is defined by clusters() — confirm
                labels = clusters(dfcartnorm.data ,eps  , 20, min_nei, min_cl)
                labelmax , nmax, qc = find_cluster_label2(labels, df, dfcart, m)
                println("## label $labelmax written to oc...")
                export_df("$votname.$cycle", m.ocdir, df , dfcart , labels , labelmax)
            
                ## Principal components
                pc, pcres= _compute_PC(df, dfcart, labels, labelmax)

                ## edgeratio1/edgeratio2 are not used below
                edgeratio1, edgeratio2= edge_ratio(dfcart, labels[labelmax])
                scproperties = get_properties_SC2(labels[labelmax] , df, dfcart)
                scdf= convertStruct2Df(scproperties)
                insertcols!(scdf, 1, :votname => votname)
                s=size(scdf)
                insertcols!(scdf, 2, :cycle => cycle)
                ## each insertion at position 3 shifts the previous one right,
                ## so the final column order is pc1, pc2, pc3
                insertcols!(scdf, 3, :pc3 => pcres[3])
                insertcols!(scdf, 3, :pc2 => pcres[2]) 
                insertcols!(scdf, 3, :pc1 => pcres[1])
            
                insertcols!(res, 2,  :cycle => cycle)
                push!(sclist, scdf)
                push!(mcmclist, res)

                ## create DF chain
                dfchain= create_DFchain(mc, votname, cycle)
                push!(chainlist,dfchain)

                println("###")
                println("### label solution: $labelmax")
                @printf("### PC1: %3.1f , PC2: %3.1f , PC3: %3.1f \n", pcres[1], pcres[2], pcres[3])
                println("### Offdeg: $(scproperties.offdeg)")
                println("### Edge ratio: $(scproperties.edgratm)")
                println("### N stars: $nmax")
                println("### Qc: $qc")
                println("###")

                k= score_cycle(qc, nmax, nchain, iter)
                @printf("## score cycle %d: %3.3f \n",cycle, k)

                ## one-row table with the values shown in the text panel of the plots
                extraplot= DataFrame(cycle=cycle, score_cycle=k, qc=qc, votname=votname, pc1=pcres[1],pc2=pcres[2], pc3=pcres[3])

                plot_cluster2(m.plotdir, "$votname.$cycle", labels[labelmax], scproperties,
                    dfcart , false, extraplot)
                
                jump= 50  # how many stars to jump in the plot
                ## dfcart is passed as the data set to plot
                _plot_rawdata(m.plotdir, "$votname.$cycle", labels[labelmax], scproperties,
                    dfcart , pc, jump, true, extraplot)

                println("###")
                println("### subtracting BEST solution from Df...")
                ## from here on df/dfcart no longer contain the extracted stars
                dfnew, dfcartnew= remove_stars(df, dfcart, labels[labelmax])
                df= dfnew
                dfcart= dfcartnew

                ########################### STOP conditions #########
                ## every condition is tested, so several bits can be set at once
                FLAG= 0
                if nmax < m.minstarstop
                    FLAG= FLAG | (1<<0)
                    println("### extraction stopped at cycle $cycle")
                    println("### nmax too low...")
                    cyclerun= false
                end
                if cycle == m.cyclemax
                    FLAG= FLAG | (1<<1)
                    println("### extraction stopped at cycle $cycle")
                    println("### cyclemax reached...")
                    cyclerun= false
                end
                if qc < m.qcminstop
                    FLAG= FLAG | (1<<2)
                    println("### extraction stopped at cycle $cycle")
                    println("### Qc too low...")
                    cyclerun= false
                end
                if w3d/wvel < m.wratiominstop || wvel/w3d < m.wratiominstop
                    FLAG= FLAG | (1<<3)
                    println("### extraction stopped at cycle $cycle")
                    println("### weight ratio too low...")
                    cyclerun= false
                end
                if FLAGmcmc == 3 && nchain > m.minchainreached
                    FLAG= FLAG | (1<<4)
                    println("## extraction stopped at cycle $cycle")
                    println("## chain iteration not performed completely but sufficient to keep...")
                    cyclerun= false
                end
                #####################################################
                ## Time
                tend= now()
                ## tend-tstart is in milliseconds; duration is in seconds
                duration= Dates.value(tend-tstart) / (1000*1)
                ## star count AFTER the subtraction above
                nstar= size(df.raw)[2]
                timeperiterstar= duration / (iter*nstar)
                timeperchainstar= duration / (nchain*nstar)
                @printf("## \n")
                @printf("## Time: \n")
                @printf("## duration per cycle %3.3f sec \n", duration)
                @printf("## duration per iteration*star %3.3e sec \n", timeperiterstar)
                @printf("## duration per chain*star %3.3e sec \n", timeperchainstar)
                @printf("##\n")

                ## log the results of performances
                dfout= DataFrame(votname=votname, cycle=cycle, nstar=nstar, qc=qc, nmax=nmax, nchain=nchain, iter=iter,
                scorecycle=k, duration=duration, timeperiterstar=timeperiterstar ,
                timeperchainstar= timeperchainstar )
                push!(perflist, dfout)

                ## cycle is incremented even when a stop condition ended the loop,
                ## hence the cycle-1 in the return value
                cycle += 1
            else
                println("## nothing found, stopped...")
                FLAG= 0
                cyclerun= false
            end
        end
        ## save only when at least one cycle was accepted
        if cycle >= 2
            save_cycle(sclist, mcmclist, perflist, chainlist, m)
        end
        return(cycle-1, FLAG)
    
end

_cycle_extraction (generic function with 1 method)

In [6]:
"""
    main(filelist, metafile)

Run the cycle extraction on every votable name in `filelist`.

Parameters are read from `metafile` with `read_params`; the global `plotdir`
and `ocdir` are stored in them, and votables are loaded from the global
`votdir`. A votable is skipped when it is already listed in the bookkeeping
file `<m.prefile>.done.csv` or in `blacklist-test.csv`. After each processed
votable, the number of cycles and the stop flag are appended to the
bookkeeping file, which is what allows an interrupted run to resume.

The mean duration used for the ETA is computed over the votables actually
processed in this run. Dividing by the loop index would also count the skipped
ones and underestimate the ETA on a resumed run.
"""
function main(filelist, metafile)

        wd= pwd() ; nfile= size(filelist)[1] ; totalTime= 0.
        ndone= 0   # votables actually processed in this run (skipped ones excluded)
        println("## Starting main loop using optimization with cycles...")
        println("## It can be very long but will be resumed to the last reduced file.")
        println("## Working directory: $wd")
        println("## $nfile files to analyze...")

        m= read_params(metafile, false)
        m.plotdir= plotdir
        m.ocdir= ocdir
        fileres= "$(m.prefile).done.csv"

        # read a possible votname blacklist
        blackname= "blacklist-test.csv"
        blacklist= _read_blacklist(blackname)
        println("## blacklist read...")
        for i in 1:nfile
            votname = filelist[i]
            res, votfound= _updt_votcompleted(fileres, votname, 0, 0, true)

            ## test blacklist
            if votname in blacklist
                println("##")
                println("## $votname in blacklist...")
                println("##")
                votfound= true
            end


            if !votfound
                tstart= now()
                println("###########################")
                println("## Starting with $votname")
                println("## Starting at $tstart")

                df , dfcart , dfcartnorm = _getdata(votdir*"/"*votname)
                m.votname= votname
                cycle, flag= _cycle_extraction(df, dfcart, m)
                res,  votfound= _updt_votcompleted(fileres, votname , cycle, flag, false)

                tend= now()
                println("## Ending at $tend")
                println("## number of cycle: $cycle , flag:$flag ")
                println("##########################")
                println("##")

                ## milliseconds -> hours
                duration= Dates.value(tend-tstart) / (1000*3600)
                totalTime += duration
                ndone += 1
                meanTime= totalTime / ndone
                ## hours -> days for the remaining entries of the list
                ETA= meanTime * (nfile-i) / 24
                nleft= nfile-i
                ETAstr= @sprintf("%3.3f", ETA) ; durationstr= @sprintf("%3.3f", duration)
                @printf("## %s \n",specialstr("Duration: $durationstr hours","YELLOW"))
                @printf("## %s \n",specialstr("ETA: $ETAstr days","YELLOW"))
                @printf("## %s \n",specialstr("Votable done: $votname","YELLOW"))
                @printf("## %s \n",specialstr("Files analyzed: $i","YELLOW"))
                @printf("## %s \n",specialstr("Files to go: $nleft","YELLOW"))
                println("##\n##")

            end
        end

    println("## Main loop done.")
end

main (generic function with 1 method)

In [None]:
header_extract()

## list the votables by name (relative to votdir), then return to the working directory
cd(votdir)
votlist= glob("NGC*.vot")
cd(wdir)

# rng = MersenneTwister()
# shuffle!(rng, votlist)

## never request more votables than were found (avoids a BoundsError on small directories)
nvot= min(MAX_VOTABLE, length(votlist))
main(votlist[1:nvot],"configAll.ext.test")
PyPlot.plt.show()

#
#
# >>>>>>>>>> Extracting Stellar Clusters from GAIA data <<<<<<<<<<
# Version: 1.2.0  (https://github.com/bosscha/gaia-shock)
#
#
## Starting main loop using optimization with cycles...
## It can be very long but will be resumed to the last reduced file.
## Working directory: /home/stephane/Science/GAIA/products
## 30 files to analyze...
## All parameters set to default...
## Parameters read from configAll.ext.test
## blacklist read...
###########################
## Starting with NGC 1027-2.3deg.vot
## Starting at 2021-06-04T17:42:10.674
## Votable /home/stephane/Science/GAIA/e2e_products/votable.edr3.2021/NGC 1027-2.3deg.vot read
## Filtering done ...
## Stars selected: 77534
## Cartesian transformation done ...
## Normalization identity done...
### [1pc,1pc,1pc,1km/s,1km/s,1mag,1mag,1mag] equivalent to [0.39801487608399566, 0.39801487608399566, 0.39801487608399566, 0.4975185951049946, 0.4975185951049946, 0.09950371902099892, 0.09950371902099892, 0.09950371902099892]
##
###########

## Votable /home/stephane/Science/GAIA/e2e_products/votable.edr3.2021/NGC 110-2.0deg.vot read
## Filtering done ...
## Stars selected: 50966
## Cartesian transformation done ...
## Normalization identity done...
### [1pc,1pc,1pc,1km/s,1km/s,1mag,1mag,1mag] equivalent to [0.39801487608399566, 0.39801487608399566, 0.39801487608399566, 0.4975185951049946, 0.4975185951049946, 0.09950371902099892, 0.09950371902099892, 0.09950371902099892]
##
############### cycle_extraction #########
##
###############
## starting cycle 1 ...
## starting time: 2021-06-05T07:14:58.992 
## ABC/MCMC for DBSCAN FULL (parameters+weighting)...
## ABC/MCMC v2
### Burn-in : 500
### Chains  : 5000
### Minimum Qc : 2.6
### Minimum Qn : 40
### Maximum Qn : 5000
### Maximum iterations: 50000
###
#### Checking the minQc and minQn conditions...
#### Minimum good solutions: 10
#### Number of iterations: 500, maxiter: 15000
#### MinQ not reached yet... testing with 2.470
#### MinQ not reached yet... testing with 2.346
####

## Votable /home/stephane/Science/GAIA/e2e_products/votable.edr3.2021/NGC 133-2.0deg.vot read
## Filtering done ...
## Stars selected: 67282
## Cartesian transformation done ...
## Normalization identity done...
### [1pc,1pc,1pc,1km/s,1km/s,1mag,1mag,1mag] equivalent to [0.39801487608399566, 0.39801487608399566, 0.39801487608399566, 0.4975185951049946, 0.4975185951049946, 0.09950371902099892, 0.09950371902099892, 0.09950371902099892]
##
############### cycle_extraction #########
##
###############
## starting cycle 1 ...
## starting time: 2021-06-05T18:44:47.522 
## ABC/MCMC for DBSCAN FULL (parameters+weighting)...
## ABC/MCMC v2
### Burn-in : 500
### Chains  : 5000
### Minimum Qc : 2.6
### Minimum Qn : 40
### Maximum Qn : 5000
### Maximum iterations: 50000
###
#### Checking the minQc and minQn conditions...
#### Minimum good solutions: 10
#### Number of iterations: 500, maxiter: 15000
### Minimum Qc: 2.6
### Minimum Qn: 40
### init done ...
### mini stats...
### Qc : 2.416 
### Qn : 

## extracting the cluster using DBSCAN/WEIGHTING with:
## DBSCAN/MCMC stats: 
### ϵ : 1.660 +/- 0.534 
### min_nei  : 7.0 +/- 4.555 
### min_clus  : 17.0 +/- 6.457 
### W3d  : 6.156 +/- 3.167 
### Wvel  : 6.983 +/- 3.261 
### Whrd  : 2.118 +/- 1.147 
### Qn  : 529.000 +/- 401.769 
### Qc  : 3.127 +/- 0.319 
#### Selecting best cluster based on Qc..
## Qc: Any[3.4388059741289183, 2.153046206326567, 1.7859309421529024, 1.71068452966632, 1.9316129822462895, 1.4736017591270418, 1.414781655459137, 2.623763103355442]
## label 1 written to oc...
### /home/stephane/Science/GAIA/products/oc-TEST-slow/NGC 1333-5.7deg.1.oc.csv created 
### Computing principal components... 
###
### label solution: 1
### PC1: 33.4 , PC2: 55.1 , PC3: 71.9 
### Offdeg: 3.2062530609334603
### Edge ratio: 0.6223383431842363
### N stars: 411
### Qc: 3.4388059741289183
###
## score cycle 1: 2.989 
### Cluster plot is centered in Y,Z...
###
### subtracting BEST solution from Df...
### 411 stars removed
## 
## Time: 
## d

### Minimum Qc: 2.6
### Minimum Qn: 40
### init done ...
### mini stats...
### Qc : 3.018 
### Qn : 331.950 
### burn-in done...
### chain:1000
### chain:2000
### chain:3000
### chain:4000
### chain:5000
## ABC/MCMC FULL done
##
## ABC/MCMC flag: -1
## 7007 iterations performed...
## 5003 chains
## optimization completed..
## analyzing solutions...
## extracting the cluster using DBSCAN/WEIGHTING with:
## DBSCAN/MCMC stats: 
### ϵ : 2.365 +/- 0.655 
### min_nei  : 7.0 +/- 4.310 
### min_clus  : 16.0 +/- 6.318 
### W3d  : 6.753 +/- 2.901 
### Wvel  : 8.659 +/- 3.250 
### Whrd  : 2.234 +/- 1.187 
### Qn  : 309.000 +/- 53.359 
### Qc  : 3.005 +/- 0.196 
#### Selecting best cluster based on Qc..
## Qc: Any[2.862896118851445, 1.3315439320060554]
## label 1 written to oc...
### /home/stephane/Science/GAIA/products/oc-TEST-slow/NGC 1342-2.8deg.1.oc.csv created 
### Computing principal components... 
###
### label solution: 1
### PC1: 34.7 , PC2: 51.3 , PC3: 64.8 
### Offdeg: 0.184928390820546

#### MinQ not reached yet... testing with 2.470
#### MinQ not reached yet... testing with 2.346
#### MinQ not reached yet... testing with 2.229
### Minimum Qc: 2.2291749999999997
### Minimum Qn: 34
### init done ...
### mini stats...
### Qc : 2.203 
### Qn : 8000.190 
### burn-in done...
### iteration: 50000 (ETA:34.472 h)
### Maximum iteration reached, current solution returned...
## ABC/MCMC flag: 3
## 50001 iterations performed...
## 611 chains
## optimization completed..
## analyzing solutions...
## extracting the cluster using DBSCAN/WEIGHTING with:
## DBSCAN/MCMC stats: 
### ϵ : 1.195 +/- 0.307 
### min_nei  : 10.0 +/- 5.203 
### min_clus  : 23.0 +/- 5.195 
### W3d  : 0.360 +/- 0.339 
### Wvel  : 9.220 +/- 2.500 
### Whrd  : 2.021 +/- 1.276 
### Qn  : 416.000 +/- 332.836 
### Qc  : 2.293 +/- 0.083 
#### Selecting best cluster based on Qc..
## Qc: Any[1.833066636100633, 1.5523194623668182, 2.042523045277927, 2.2464597415034757, 1.79804266539552, 1.2460469179235907, 1.5355005278102

### chain:1000
### chain:2000
### chain:3000
### chain:4000
### chain:5000
## ABC/MCMC FULL done
##
## ABC/MCMC flag: -1
## 17261 iterations performed...
## 5003 chains
## optimization completed..
## analyzing solutions...
## extracting the cluster using DBSCAN/WEIGHTING with:
## DBSCAN/MCMC stats: 
### ϵ : 2.578 +/- 0.575 
### min_nei  : 6.0 +/- 3.954 
### min_clus  : 17.0 +/- 6.693 
### W3d  : 4.795 +/- 2.490 
### Wvel  : 8.417 +/- 3.067 
### Whrd  : 2.167 +/- 1.178 
### Qn  : 206.000 +/- 64.834 
### Qc  : 2.719 +/- 0.104 
#### Selecting best cluster based on Qc..
## Qc: Any[2.6913237141489748]
## label 1 written to oc...
### /home/stephane/Science/GAIA/products/oc-TEST-slow/NGC 1502-3.0deg.1.oc.csv created 
### Computing principal components... 
###
### label solution: 1
### PC1: 36.1 , PC2: 55.8 , PC3: 70.0 
### Offdeg: 0.17099564673951237
### Edge ratio: 0.33815630999822754
### N stars: 197
### Qc: 2.6913237141489748
###
## score cycle 1: 2.187 
### Cluster plot is centered in Y,Z

### Minimum Qc: 2.6
### Minimum Qn: 40
### init done ...
### mini stats...
### Qc : 1.935 
### Qn : 228.490 
### burn-in done...
### chain:1000
### chain:2000
### chain:3000
### chain:4000
### chain:5000
## ABC/MCMC FULL done
##
## ABC/MCMC flag: -1
## 20956 iterations performed...
## 5003 chains
## optimization completed..
## analyzing solutions...
## extracting the cluster using DBSCAN/WEIGHTING with:
## DBSCAN/MCMC stats: 
### ϵ : 2.369 +/- 0.532 
### min_nei  : 6.0 +/- 4.032 
### min_clus  : 17.0 +/- 6.173 
### W3d  : 4.156 +/- 2.641 
### Wvel  : 8.972 +/- 2.610 
### Whrd  : 2.190 +/- 1.176 
### Qn  : 210.000 +/- 47.351 
### Qc  : 2.703 +/- 0.097 
#### Selecting best cluster based on Qc..
## Qc: Any[1.3958134088186633, 1.2496900186921878, 1.2872060381988086, 2.402457721327348, 1.1356600173257907, 1.8835439909578364, 1.5964106874393138, 1.375217383100223]
## label 4 written to oc...
### /home/stephane/Science/GAIA/products/oc-TEST-slow/NGC 1513-2.0deg.2.oc.csv created 
### Computing

## extracting the cluster using DBSCAN/WEIGHTING with:
## DBSCAN/MCMC stats: 
### ϵ : 2.540 +/- 0.559 
### min_nei  : 7.0 +/- 4.634 
### min_clus  : 19.0 +/- 6.567 
### W3d  : 5.497 +/- 3.128 
### Wvel  : 7.723 +/- 3.232 
### Whrd  : 2.085 +/- 1.157 
### Qn  : 264.000 +/- 87.486 
### Qc  : 2.830 +/- 0.148 
#### Selecting best cluster based on Qc..
## Qc: Any[0.986378979227762, 1.8060595210549026, 2.607433151873669, 1.3450794597786722, 1.1305999828573081, 2.4337262072012056, 1.478596453624146, 1.1757561625731896]
## label 3 written to oc...
### /home/stephane/Science/GAIA/products/oc-TEST-slow/NGC 1528-4.5deg.1.oc.csv created 
### Computing principal components... 
###
### label solution: 3
### PC1: 36.4 , PC2: 54.7 , PC3: 68.0 
### Offdeg: 0.19429509809006995
### Edge ratio: 0.17513764023974948
### N stars: 311
### Qc: 2.607433151873669
###
## score cycle 1: 2.647 
### Cluster plot is centered in Y,Z...
###
### subtracting BEST solution from Df...
### 311 stars removed
## 
## Time: 
##

###
### subtracting BEST solution from Df...
### 311 stars removed
## 
## Time: 
## duration per cycle 1822.055 sec 
## duration per iteration*star 4.188e-06 sec 
## duration per chain*star 7.914e-06 sec 
##
###############
## starting cycle 2 ...
## starting time: 2021-06-11T02:04:32.507 
## ABC/MCMC for DBSCAN FULL (parameters+weighting)...
## ABC/MCMC v2
### Burn-in : 500
### Chains  : 5000
### Minimum Qc : 2.6
### Minimum Qn : 40
### Maximum Qn : 5000
### Maximum iterations: 50000
###
#### Checking the minQc and minQn conditions...
#### Minimum good solutions: 10
#### Number of iterations: 500, maxiter: 15000
### Minimum Qc: 2.6
### Minimum Qn: 40
### init done ...
### mini stats...
### Qc : 2.467 
### Qn : 122.030 
### burn-in done...
### chain:1000
### chain:2000
### chain:3000
### chain:4000
### chain:5000
## ABC/MCMC FULL done
##
## ABC/MCMC flag: -1
## 25716 iterations performed...
## 5003 chains
## optimization completed..
## analyzing solutions...
## extracting the cluster u

#### MinQ not reached yet... testing with 2.470
#### MinQ not reached yet... testing with 2.346
### Minimum Qc: 2.3465
### Minimum Qn: 36
### init done ...
### mini stats...
### Qc : 2.089 
### Qn : 1582.870 
### burn-in done...
### chain:1000
### chain:2000
### chain:3000
### chain:4000
### chain:5000
## ABC/MCMC FULL done
##
## ABC/MCMC flag: -1
## 43586 iterations performed...
## 5003 chains
## optimization completed..
## analyzing solutions...
## extracting the cluster using DBSCAN/WEIGHTING with:
## DBSCAN/MCMC stats: 
### ϵ : 2.137 +/- 0.514 
### min_nei  : 7.0 +/- 4.235 
### min_clus  : 18.0 +/- 6.209 
### W3d  : 3.393 +/- 2.315 
### Wvel  : 10.482 +/- 3.349 
### Whrd  : 2.490 +/- 1.265 
### Qn  : 189.000 +/- 178.079 
### Qc  : 2.429 +/- 0.086 
#### Selecting best cluster based on Qc..
## Qc: Any[1.4988816508513485, 1.645474329009398, 1.9572708561211698, 1.790955346943435, 2.335934500000261, 0.905910127132628]
## label 5 written to oc...
### /home/stephane/Science/GAIA/products/

#### MinQ not reached yet... testing with 2.470
### Minimum Qc: 2.4699999999999998
### Minimum Qn: 38
### init done ...
### mini stats...
### Qc : 1.704 
### Qn : 45.730 
### burn-in done...
### chain:1000
### chain:2000
### chain:3000
### chain:4000
### chain:5000
## ABC/MCMC FULL done
##
## ABC/MCMC flag: -1
## 28522 iterations performed...
## 5003 chains
## optimization completed..
## analyzing solutions...
## extracting the cluster using DBSCAN/WEIGHTING with:
## DBSCAN/MCMC stats: 
### ϵ : 2.070 +/- 0.507 
### min_nei  : 6.0 +/- 4.348 
### min_clus  : 19.0 +/- 6.271 
### W3d  : 2.987 +/- 1.908 
### Wvel  : 9.615 +/- 3.202 
### Whrd  : 2.154 +/- 1.145 
### Qn  : 170.000 +/- 123.541 
### Qc  : 2.547 +/- 0.079 
#### Selecting best cluster based on Qc..
## Qc: Any[1.2967890006405347, 1.3340733558201057, 1.7068681871866145, 1.139668492452186, 1.7618547613189548, 0.9226191581138292, 2.416555360579723, 0.8189689878814956, 2.2040798365561027, 0.8257870857268264, 1.0620775027477254, 1.5607

###
### subtracting BEST solution from Df...
### 235 stars removed
## 
## Time: 
## duration per cycle 6373.391 sec 
## duration per iteration*star 5.231e-06 sec 
## duration per chain*star 9.784e-06 sec 
##
###############
## starting cycle 2 ...
## starting time: 2021-06-14T17:49:36.400 
## ABC/MCMC for DBSCAN FULL (parameters+weighting)...
## ABC/MCMC v2
### Burn-in : 500
### Chains  : 5000
### Minimum Qc : 2.6
### Minimum Qn : 40
### Maximum Qn : 5000
### Maximum iterations: 50000
###
#### Checking the minQc and minQn conditions...
#### Minimum good solutions: 10
#### Number of iterations: 500, maxiter: 15000
#### MinQ not reached yet... testing with 2.470
#### MinQ not reached yet... testing with 2.346
#### MinQ not reached yet... testing with 2.229
### Minimum Qc: 2.2291749999999997
### Minimum Qn: 34
### init done ...
### mini stats...
### Qc : 2.078 
### Qn : 64.530 
### burn-in done...
### chain:1000
### chain:2000
### chain:3000
### chain:4000
### iteration: 50000 (ETA:1.265 

###
### subtracting BEST solution from Df...
### 319 stars removed
## 
## Time: 
## duration per cycle 14317.410 sec 
## duration per iteration*star 6.160e-06 sec 
## duration per chain*star 2.460e-05 sec 
##
###############
## starting cycle 2 ...
## starting time: 2021-06-15T10:18:54.229 
## ABC/MCMC for DBSCAN FULL (parameters+weighting)...
## ABC/MCMC v2
### Burn-in : 500
### Chains  : 5000
### Minimum Qc : 2.6
### Minimum Qn : 40
### Maximum Qn : 5000
### Maximum iterations: 50000
###
#### Checking the minQc and minQn conditions...
#### Minimum good solutions: 10
#### Number of iterations: 500, maxiter: 15000
#### MinQ not reached yet... testing with 2.470
#### MinQ not reached yet... testing with 2.346
### Minimum Qc: 2.3465
### Minimum Qn: 36
### init done ...
### mini stats...
### Qc : 2.117 
### Qn : 18871.850 
### burn-in done...
### chain:1000
### chain:2000
### chain:3000
### iteration: 50000 (ETA:4.928 h)
### Maximum iteration reached, current solution returned...
## ABC/M

#### Selecting best cluster based on Qc..
## Qc: Any[1.3192833435350044, 1.3081208838437774, 1.7499202143436756, 2.821970874367427, 1.048426993798903]
## label 4 written to oc...
### /home/stephane/Science/GAIA/products/oc-TEST-slow/NGC 1746-6.0deg.2.oc.csv created 
### Computing principal components... 
###
### label solution: 4
### PC1: 36.8 , PC2: 60.3 , PC3: 76.1 
### Offdeg: 0.3119537671885763
### Edge ratio: 0.13719710685070952
### N stars: 191
### Qc: 2.821970874367427
###
## score cycle 2: 2.350 
### Cluster plot is centered in Y,Z...
###
### subtracting BEST solution from Df...
### 191 stars removed
## 
## Time: 
## duration per cycle 24310.547 sec 
## duration per iteration*star 7.184e-06 sec 
## duration per chain*star 1.731e-05 sec 
##
###############
## starting cycle 3 ...
## starting time: 2021-06-16T14:00:05.477 
## ABC/MCMC for DBSCAN FULL (parameters+weighting)...
## ABC/MCMC v2
### Burn-in : 500
### Chains  : 5000
### Minimum Qc : 2.6
### Minimum Qn : 40
### Maximum 

# Testing PCA to discard bad extractions

The idea is to compute a PCA on each extracted OC and to filter out bad extractions using the PC1 or PC2 variance ratio.

In [None]:
"""
    compute_PCA(oclist)

For every file name in `oclist` (resolved against the global `ocdir`), read the
cluster table, run a PCA on eight z-scored features and show a PC1/PC2 scatter
plot annotated with the cumulative explained-variance ratios.

The features are the columns `X`, `Y`, `Z`, `vl`, `vb`, `gbar` and the two
differences `gbar - rp` and `bp - gbar`. When the fit keeps fewer than three
components, the annotation falls back to 100 for the three ratios, as in
`_compute_PC`.
"""
function compute_PCA(oclist)

    for oc in oclist
        print("## $oc \n")

        df= CSV.File(joinpath(ocdir,oc)) |> DataFrame
        nstar= size(df, 1)
        data= zeros(8,nstar)
        data[1,:]= df.X
        data[2,:]= df.Y
        data[3,:]= df.Z
        data[4,:]= df.vl
        data[5,:]= df.vb
        data[6,:]= df.gbar
        data[7,:]= df.gbar .- df.rp
        data[8,:]= df.bp .- df.gbar

        # dims=2 is stated explicitly: each row (feature) is standardized across
        # the stars. Relying on the implicit default is deprecated in StatsBase.
        dt= StatsBase.fit(ZScoreTransform, data, dims=2)
        d2= StatsBase.transform(dt, data)
        M = fit(PCA, d2)

        println(M)
        totvar= tvar(M)
        pvs= principalvars(M)
        ratioac= accumulate(+, pvs ./ totvar)
        println(ratioac)
        # guard against a fit that kept fewer than three components
        pcres= length(ratioac) >= 3 ? 100 .* ratioac[1:3] : [100.0, 100.0, 100.0]
        restxt= @sprintf(" PC1: %3.1f , PC2: %3.1f , PC3: %3.1f", pcres[1], pcres[2], pcres[3])

        Yt = MultivariateStats.transform(M, d2)
        pc1= Yt[1,:]
        pc2= Yt[2,:]

        dct= Dict("color"=> "black", "fontsize"=> 11)
        PyPlot.plt.figure(figsize=(7.0,5.0))

        PyPlot.plt.title(oc)

        PyPlot.plt.subplot(1, 1, 1)
        PyPlot.plt.text(-2,-2,restxt, dct)
        PyPlot.plt.xlabel("PC1")
        # the vertical axis shows PC2 (this was a second xlabel call overwriting "PC1")
        PyPlot.plt.ylabel("PC2")
        PyPlot.plt.scatter(pc1, pc2 , s = 1, c="b", alpha=0.5 )

        PyPlot.plt.show()
    end

end


In [None]:
## Maximum number of randomly chosen cluster files to inspect
MAX_OC= 5

## list the cluster files by name (relative to ocdir), then return to the working directory
cd(ocdir)
oclist= glob("*.csv")
cd(wdir)

rng = MersenneTwister()
shuffle!(rng, oclist)

## never request more files than were found (avoids a BoundsError on small directories)
noc= min(MAX_OC, length(oclist))
compute_PCA(oclist[1:noc])
