## Cycle extraction

Tests the cycle extraction: each cycle extracts the best cluster solution and subtracts its stars from the data, and the process is iterated until a stop condition is met.

In [None]:
using  PyCall
using  Statistics , StatsBase, Random
using  DataFrames, Printf , CSV , Query
using  Dates, Glob

import PyPlot
sns= pyimport("seaborn")

## directory
## Every path below is derived from the GAIA_ROOT environment variable;
## ENV["GAIA_ROOT"] throws a KeyError when that variable is not set.
rootdir = ENV["GAIA_ROOT"]
wdir    = "$rootdir/products"                           # working directory (see cd below)
votdir  = "$rootdir/e2e_products/votable.edr3.2021"     # directory the *.vot inputs are read from
plotdir = "$wdir/test"                                  # handed to the plotting helpers
ocdir   = "$wdir/octest"                                # handed to export_df
sclist  = "$rootdir/e2e_products/sc-list-2020.csv"      # not referenced in the visible cells

## The GaiaClustering sources have to be on LOAD_PATH before the module is loaded.
push!(LOAD_PATH,"$rootdir/master/src")
using GaiaClustering 

## From here on relative paths (e.g. the default debug CSV name used by main)
## are resolved against wdir.
cd(wdir)

In [None]:
## Load one VOTable and prepare it for the clustering.
##
## `filevot` is handed to `read_votable`; the result is passed through `filter_data`
## with the bounds `[0., 2000]`, and `add_cartesian` is applied to the filtered set.
##
## Returns the tuple `(df, dfcart)`: the filtered data and its `add_cartesian`
## counterpart.
function _getdata(filevot)
    println("## Data filtered in distance ..")

    filtered  = filter_data(read_votable(filevot), [0., 2000])
    cartesian = add_cartesian(filtered)

    return (filtered, cartesian)
end

In [None]:
## Remove the columns listed in `idx` from both data containers.
##
## `df` and `dfcart` must expose the matrix fields `data`, `raw` and `err`; one
## column is dropped from each of them for every index found in `idx`.
##
## Returns `(dfnew, dfcartnew)`, two new `GaiaClustering.Df` objects built from the
## remaining columns. The number of removed indices is printed.
function _remove_stars(df, dfcart, idx)
    # keep every column whose index does not appear in idx
    dropcols(A) = A[:, setdiff(1:end, idx)]

    kept_data = dropcols(df.data)
    kept_raw  = dropcols(df.raw)
    kept_err  = dropcols(df.err)

    cart_data = dropcols(dfcart.data)
    cart_raw  = dropcols(dfcart.raw)
    cart_err  = dropcols(dfcart.err)

    # both containers are tagged with the column count of the trimmed df.data
    nkept = size(kept_data, 2)
    trimmed     = GaiaClustering.Df(nkept, kept_data, kept_raw, kept_err)
    trimmedcart = GaiaClustering.Df(nkept, cart_data, cart_raw, cart_err)

    println("### $(length(idx)) stars removed")
    return (trimmed, trimmedcart)
end

## Summarise the chains of an MCMC run.
##
## `mc` must expose the chains `eps`, `mne`, `mcl`, `qc`, `qn`, `w3d`, `wvel` and
## `whrd`. For each chain the median (suffix `m`) and the standard deviation
## (suffix `d`) are computed and printed.
##
## Returns a one-row DataFrame holding `votname` followed by all the summary values.
function _extraction_mcmc(mc, votname)
    # (median, standard deviation) of one chain
    center_spread(chain) = (median(chain), std(chain))

    epsm,  epsd  = center_spread(mc.eps)
    mneim, mneid = center_spread(mc.mne)
    mclm,  mcld  = center_spread(mc.mcl)
    qcm,   qcd   = center_spread(mc.qc)
    qnm,   qnd   = center_spread(mc.qn)
    w3dm,  w3dd  = center_spread(mc.w3d)
    wvelm, wveld = center_spread(mc.wvel)
    whrdm, whrdd = center_spread(mc.whrd)

    @printf("## DBSCAN/MCMC stats: \n")
    @printf("### ϵ : %3.3f +/- %3.3f \n", epsm, epsd)
    @printf("### min_nei  : %3.1f +/- %3.3f \n", mneim, mneid)
    @printf("### min_clus  : %3.1f +/- %3.3f \n", mclm, mcld)
    @printf("### W3d  : %3.3f +/- %3.3f \n", w3dm, w3dd)
    @printf("### Wvel  : %3.3f +/- %3.3f \n" , wvelm, wveld)
    @printf("### Whrd  : %3.3f +/- %3.3f \n", whrdm, whrdd)
    @printf("### Qn  : %3.3f +/- %3.3f \n",qnm, qnd)
    @printf("### Qc  : %3.3f +/- %3.3f \n",qcm, qcd)
    @printf("##")

    return DataFrame(
        votname = votname,
        epsm = epsm, epsd = epsd,
        mneim = mneim, mneid = mneid,
        mclm = mclm, mcld = mcld,
        qcm = qcm, qcd = qcd,
        qnm = qnm, qnd = qnd,
        w3dm = w3dm, w3dd = w3dd,
        wvelm = wvelm, wveld = wveld,
        whrdm = whrdm, whrdd = whrdd,
    )
end

## Score of one extraction cycle: log10(qc * qn * nchain / iter).
_score_cycle(qc, qn, nchain, iter) = log10(qc * qn * nchain / iter)

In [None]:
## Compute a ratio of the cluster to the edge of the data.
##
## `dfcart.data` is read row-wise (rows 1, 2 and 3) and `ind` selects the columns of
## the cluster members. Two quantities are printed:
##   - ratio   : sqrt((xg^2 + yg^2) / max(rtot2)), where xg and yg are the medians of
##               rows 2 and 3 over the members and rtot2 is the squared row-2/row-3
##               radius of every star whose row-1 value lies in the members' range
##   - ratio_2 : experimental variant ("test"), the largest member radius divided by
##               the radius scaled to the median row-1 value of the members
##
## Only `ratio` is returned.
function _edge_ratio(dfcart::GaiaClustering.Df, ind)
    coords = dfcart.data

    # cluster members
    cx = coords[1, ind]
    cy = coords[2, ind]
    cz = coords[3, ind]
    r2 = cy .* cy .+ cz .* cz
    minX = minimum(cx)
    maxX = maximum(cx)
    xg = median(cy)
    yg = median(cz)
    dg = median(cx)

    # all stars whose row-1 value lies within [minX, maxX]
    inrange = (coords[1, :] .<= maxX) .& (coords[1, :] .>= minX)
    sy = coords[2, inrange]
    sz = coords[3, inrange]
    rtot2 = sy .* sy .+ sz .* sz

    ratio = sqrt((xg^2 + yg^2) / maximum(rtot2))

    ## test
    alpha = maximum(sqrt.(rtot2)) / maxX
    rg = alpha * dg
    ratio_2 = sqrt(maximum(r2)) / rg

    println("## Edge ratio: $ratio")
    println("## Edge ratio_2: $(ratio_2)")
    return ratio
end

In [None]:
## Control plot of the data left after a subtraction.
##
## Rows 1, 2 and 3 of `dfcart.data` are labelled X, Y and Z; Y is drawn on the
## horizontal axis (limited to [-50, 50]) and X on the vertical axis. The figure is
## shown immediately.
function _plot_check(dfcart)
    cart = DataFrame(X = dfcart.data[1, :], Y = dfcart.data[2, :], Z = dfcart.data[3, :])

    println("## check plot subtraction ...")

    plt = PyPlot.plt
    plt.figure(figsize = (9.0, 8.0))
    plt.subplot(1, 1, 1, xlim = [-50, 50])
    plt.scatter(cart.Y, cart.X, s = 0.1)
    plt.xlabel("Y (pc)")
    plt.ylabel("X (pc)")
    plt.grid(true)

    plt.show()
end

In [None]:
## Add the rows of `df` to the ';'-delimited CSV file `fileres`.
##
## Arguments:
##   fileres : path of the CSV file to create or extend
##   df      : table (DataFrame) holding the rows to add
##
## When `fileres` does not exist (or is empty) it is written with a header line and
## a "created" message is printed. Otherwise the rows are appended in place, without
## re-reading and rewriting the rows already stored, so the cost of one update does
## not grow with the size of the log. Appended rows are written positionally: `df`
## must list its columns in the same order as the existing file.
##
## Returns `true` in every case.
function _updt!(fileres, df)
    # an empty file carries no header yet, so it is treated like a missing one
    hasrows = isfile(fileres) && filesize(fileres) > 0

    CSV.write(fileres, df; delim=';', append=hasrows)
    hasrows || println("## $fileres created...")

    return true
end

In [None]:
## Main cycle method: repeatedly extract the best cluster of one VOTable and subtract
## its stars from the data, until one of the stop conditions is met.
##
## Arguments:
##   votname   : file name of the VOTable, looked up in the global `votdir`
##   filedebug : CSV file receiving one performance/timing row per completed cycle
##               (written through `_updt!`)
##   m         : GaiaClustering.meta parameters; the fields read here are cyclemax,
##               minstarselection, minstarstop, minchainreached, qcminstop and
##               wratiominstop
##
## Relies on the globals `votdir`, `plotdir` and `ocdir`. There is no explicit
## return statement.
##
function _cycle_extraction(votname, filedebug, m::GaiaClustering.meta)
    println("############### extraction #########")
    df, dfcart= _getdata(votdir*"/"*votname)
    
    cyclerun= true
    cycle= 1
    
    cyclemax= m.cyclemax
    minstarselection=   m.minstarselection    # minimum of stars to select a solution in a cycle (read here but not used below)
    minstarstop=   m.minstarstop         # condition to stop cycling
    minchainreached=  m.minchainreached      # minimum chain to analyze solution
    qcmin=  m.qcminstop                # more condition on Qc to stop cycling after the first
    wratiomin=  m.wratiominstop          # minimum ratio btwn w3d and wvel (otherwise not an OC)
    
    
    # NOTE(review): scdf and mcmcdf are never used after this point.
    scdf= []
    mcmcdf= []
    
    println("##")
    while cyclerun
        tstart= now()
        println("#####################")
        println("##\n## starting cycle $cycle ...")
        @printf("## %s \n", Dates.now())
        ## extraction one cycle.. MCMC optimization
        mc , iter, flag= abc_mcmc_dbscan_full2(dfcart, m)
        println("## Flag: $flag")
        nchain= length(mc.qc)
        println("## $iter iterations performed...")
        println("## $nchain chains")
        
        # The solution is analyzed when flag is -1 or when enough chains were collected.
        # NOTE(review): flag == -1 presumably means the optimization ran to completion;
        # confirm against abc_mcmc_dbscan_full2.
        if flag== -1 || nchain > minchainreached
            println("## optimization completed..")
            println("## analyzing solutions...")
            plot_dbscanfull_mcmc(plotdir, "$votname.$cycle", mc , false)

            ## get the cluster and plot it
            println("## extracting the cluster using DBSCAN/WEIGHTING with:")
            res2= _extraction_mcmc(mc, votname)
            # the chain medians are used as the DBSCAN/weighting parameters;
            # trunc(Int, x + 0.5) rounds a non-negative median to the nearest integer
            eps= res2.epsm[1]
            min_nei= trunc(Int,res2.mneim[1] + 0.5)
            min_cl= trunc(Int,res2.mclm[1] + 0.5)
            w3d= res2.w3dm[1]
            wvel= res2.wvelm[1]
            whrd= res2.whrdm[1]

            mres = GaiaClustering.modelfull(eps,min_nei,min_cl,w3d,wvel,whrd)
            dfcartnorm = getDfcartnorm(dfcart, mres)
            # NOTE(review): the literal 20 is passed as third argument of clusters();
            # its meaning is not visible from this file.
            labels = clusters(dfcartnorm.data ,eps  , 20, min_nei, min_cl)
            labelmax , nmax, qc = find_cluster_label2(labels, df, dfcart, m)
            println("## label $labelmax written to oc...")
            export_df("$votname.$cycle", ocdir, df , dfcart, labels , labelmax)
            scproperties = get_properties_SC2(labels[labelmax] , df, dfcart)
        
            # the returned ratio is discarded; only the values printed by _edge_ratio are used
            _edge_ratio(dfcart, labels[labelmax])
            plot_cluster2(plotdir, "$votname.$cycle", labels[labelmax], scproperties,  dfcart , false)
        
            println("###")
            println("### Label solution: $labelmax")
            println("### N stars: $nmax")
            println("### Qc: $qc")
            println("###")
        
            # plotted before the subtraction below, i.e. with the stars of this cycle still included
            _plot_check(dfcart)
        
            k= _score_cycle(qc, nmax, nchain, iter)
            @printf("## score cycle %d: %3.3f \n",cycle, k)
                
            println("###")
            println("### subtracting BEST solution from Df...")
            dfnew, dfcartnew= _remove_stars(df, dfcart, labels[labelmax])
            df= dfnew
            dfcart= dfcartnew
        
            ######################### STOP conditions
            # Each condition is tested independently, so several messages may be
            # printed for the same cycle; any of them ends the loop.
            if nmax < minstarstop
                println("## extraction stopped at cycle $cycle")
                println("## nmax too low...")
                cyclerun= false
            end
            if cycle == cyclemax
                println("## extraction stopped at cycle $cycle")
                println("## cyclemax reached...")
                cyclerun= false
            end
            if qc < qcmin
                println("## extraction stopped at cycle $cycle")
                println("## Qc too low...")
                cyclerun= false
            end
            # the ratio is tested in both directions (w3d/wvel and wvel/w3d)
            if w3d/wvel < wratiomin || wvel/w3d < wratiomin
                println("## extraction stopped at cycle $cycle")
                println("## weight ratio too low...")
                cyclerun= false
            end
            if flag == 3 && nchain > m.minchainreached
                println("## extraction stopped at cycle $cycle")
                println("## chain iteration not performed completely but sufficient to keep...")
                cyclerun= false
            end
            ### 
            
            tend= now()
            # tend-tstart is a period in milliseconds; dividing by 1000 gives seconds
            duration= Dates.value(tend-tstart) / (1000*1)
            # NOTE(review): df was already replaced by the subtracted set above, so nstar
            # counts the stars left AFTER this cycle's removal - confirm that this is the
            # intended normalisation for the per-star timings.
            nstar= size(df.raw)[2]
            timeperiterstar= duration / (iter*nstar)
            timeperchainstar= duration / (nchain*nstar)
            @printf("## Time: \n")
            @printf("## duration per cycle %3.3f sec \n", duration)
            @printf("## duration per iteration*star %3.3e sec \n", timeperiterstar)
            @printf("## duration per chain*star %3.3e sec \n", timeperchainstar)
            @printf("##\n")
        
            ## log the results of performances
            dfout= DataFrame(votname=votname, cycle=cycle, nstar=nstar, qc=qc, nmax=nmax, nchain=nchain, iter=iter,
            scorecycle=k, duration=duration, timeperiterstar=timeperiterstar ,
            timeperchainstar= timeperchainstar )
            _updt!(filedebug, dfout)
            cycle += 1
        else
            # no usable solution: stop without logging anything for this cycle
            println("## nothing found, stopped...")
            cyclerun= false
        end
    end
end

In [None]:
## Run the cycle extraction on every VOTable of a list.
##
## Arguments:
##   votlist   : iterable of VOTable file names, each handed to `_cycle_extraction`
##   metafile  : parameter file, read once with `read_params`
##   debugfile : CSV file receiving the per-cycle performance rows
##               (default "cycle_extraction.debug.csv")
function main(votlist, metafile, debugfile= "cycle_extraction.debug.csv")
    params = read_params(metafile)

    for vot in votlist
        println("## Analyzing $vot")
        _cycle_extraction(vot, debugfile, params)

        ## Alternative kept for reference (package-level cycle_extraction):
        ## df, dfcart= _getdata(votdir*"/"*votname)
        ## m.votname= votname
        ## m.plotdir= plotdir
        ## m.ocdir= ocdir
        ## cycle, flag= cycle_extraction(df, dfcart, m)
        ## println("## number of cycle: $cycle , flag:$flag ")
        ## println("##########################")
        ## println("##")
    end
end

In [None]:
# votname= "NGC 869-3.0deg.vot"
## Parameter file handed to read_params by main(); the name is relative, so it is
## resolved against the current directory at call time (wdir after the cd below).
metafile= "configAll.ext.test"
## The pattern is globbed from inside votdir; _cycle_extraction prepends votdir to
## each name itself, so the entries are presumably bare file names - TODO confirm.
cd(votdir)
votlist= glob("NGC*vot")
cd(wdir)

## The files are processed in random order. The generator is created without an
## explicit seed, so the order is not expected to repeat from one run to the next.
rng = MersenneTwister()
shuffle!(rng, votlist)
main(votlist, metafile)