# Introduction

This notebook explores several density estimation methods on the same dataset to understand possible systematics and limitations of each one.

Some methods included are
- Histograms (the classic)
- Rolling histograms (continuously moving bins)
- Kernel Density Estimation (KDE)

In [None]:
import Pkg
Pkg.activate("/Users/daniel/Arya.jl/")


In [None]:
using FITSIO
using CairoMakie

import NaNMath as nm
using Measurements

In [None]:
using Revise

using Arya
import LilGuys as lguys


In [None]:
import KernelDensity as KD

# Load Sample

In [None]:
# Path to the FITS table holding the sample (relative to the working directory).
samplename = "./sculptor/fiducial_sample.fits"

# Read the second HDU (`f[2]`) into a DataFrame. The do-block form closes the
# file even when the conversion throws, so the handle is never leaked.
members = FITS(samplename, "r") do f
    DataFrame(f[2])
end

In [None]:
# Structural parameters and analysis options for the sample.
# `ecc` only enters below through b = sqrt(1 - ecc).
ecc = 0.33
# Position angle; passed to calc_r_ell as PA-90 (presumably degrees — confirm).
PA = 94
# Forwarded to lguys.calc_centre2D.
centre_method="mean"
# NOTE(review): `mass_column` and `normalize` are not referenced anywhere in the
# visible part of the notebook.
mass_column = :probability
normalize = true
# Unit weight for every row of `members`.
weights = ones(size(members, 1));

In [None]:
# Centre of the sample, computed by LilGuys from the coordinates and weights.
ra0, dec0 = lguys.calc_centre2D(members.ra, members.dec, weights, centre_method)

In [None]:
# Tangent-plane coordinates relative to (ra0, dec0).
xi, eta = lguys.to_tangent(members.ra, members.dec, ra0, dec0)

In [None]:
# Largest circular radius in the sample (same units as xi/eta).
r_max = sqrt(maximum(xi .^ 2 .+ eta .^ 2))

In [None]:
# Axis scalings chosen so that a * b == 1.
b = sqrt(1-ecc)
a = 1/b

r_ell = lguys.calc_r_ell(xi, eta, a, b, PA-90)

# Keep only points with elliptical radius below r_max * b.
# NOTE(review): `weights`, `xi` and `eta` are not filtered with the same mask, so
# they may be longer than `r_ell` from here on — confirm this is intended.
r_ell_max = r_max .* b
r_ell = r_ell[r_ell .< r_ell_max]


r_ell .*= 60; # to arcmin

In [None]:
# log10 of the elliptical radius; the 1D estimators below operate on this.
x = log10.(r_ell);

In [None]:
"""
Given a PDF sampled at points x with errors, returns the 2D density estimate
"""
function Σ_from_pdf(x, pdf, err)
    # density from the error-free method
    density = Σ_from_pdf(x, pdf)

    # propagate the error by keeping the relative uncertainty (err ./ pdf) fixed
    scaled = density .* err
    density_err = scaled ./ pdf
    return density, density_err
end

In [None]:
"""
    Σ_from_pdf(x, pdf)

Convert a PDF sampled at the points `x` into a 2D density. `x` is treated as
log10 of the radius: the mass per sample (`pdf .* Arya.gradient(x)`) is divided
by `2π r dr` with `r = 10 .^ x`. The summed mass is printed (presumably as a
normalisation check).
"""
function Σ_from_pdf(x, pdf)
    radius = 10 .^ x

    # mass carried by each sample point
    mass_per_point = pdf .* Arya.gradient(x)
    println(sum(mass_per_point))

    # area of the annulus associated with each sample point
    annulus_area = 2π .* radius .* Arya.gradient(radius)
    return mass_per_point ./ annulus_area
end

In [None]:
"""
Given a raw histogram (mass / bin, not density)
and its bins (x) and errors
calculates the 2D density profile
"""
function Σ_from_hist(bins, hist, err)
    # bin edges are given in log10 radius
    r = 10 .^ bins

    # area of the annulus covered by each bin: π (r_outer^2 - r_inner^2)
    dA = π .* diff(r .^ 2)

    # total mass in the histogram, printed as a quick check
    println(sum(hist))
    Σ = hist ./ dA
    # Scale the errors by the same area factor as the masses. Going through the
    # relative error (err ./ hist) would give NaN for empty bins (0 / 0).
    Σ_err = err ./ dA
    return Σ, Σ_err
end

In [None]:
"""
    plot_Σ_hist!(bins, pdf, err; kwargs...)

Plot log10 of the surface density derived from a raw histogram (via
`Σ_from_hist`) at the bin midpoints, with asymmetric error bars in log space.
Bins whose density is non-finite or not larger than its error are dropped.
Extra keyword arguments are forwarded to `errscatter!`.
"""
function plot_Σ_hist!(bins, pdf, err; kwargs...)
    Σ, Σ_err = Σ_from_hist(bins, pdf, err)

    keep = isfinite.(Σ) .&& (Σ .> Σ_err)
    Σ_keep = Σ[keep]
    Σ_err_keep = Σ_err[keep]

    log_Σ = log10.(Σ_keep)
    log_lo = log10.(Σ_keep .- Σ_err_keep)
    log_hi = log10.(Σ_keep .+ Σ_err_keep)

    # (lower, upper) error-bar lengths for each plotted point
    log_err = collect(zip(log_Σ .- log_lo, log_hi .- log_Σ))
    return errscatter!(midpoints(bins)[keep], log_Σ, yerr=log_err; kwargs...)
end
    

In [None]:
"""
    plot_Σ!(log_r, pdf, err; kwargs...)

Plot log10 of the surface density derived from a sampled PDF (via
`Σ_from_pdf`) against `log_r`, with asymmetric error bars in log space.
Points are dropped when the density is non-finite or does not exceed
`nextfloat(error, 5)`. Extra keyword arguments are forwarded to `errscatter!`.
"""
function plot_Σ!(log_r, pdf, err; kwargs...)
    Σ, Σ_err = Σ_from_pdf(log_r, pdf, err)

    # the nextfloat margin keeps Σ - Σ_err strictly positive before the log
    keep = isfinite.(Σ) .&& (Σ .> nextfloat.(Σ_err, 5))
    Σ_keep = Σ[keep]
    Σ_err_keep = Σ_err[keep]

    log_Σ = log10.(Σ_keep)
    log_lo = log10.(Σ_keep .- Σ_err_keep)
    log_hi = log10.(Σ_keep .+ Σ_err_keep)

    # (lower, upper) error-bar lengths for each plotted point
    log_err = collect(zip(log_Σ .- log_lo, log_hi .- log_Σ))
    return errscatter!(log_r[keep], log_Σ, yerr=log_err; kwargs...)
end
    

In [None]:
"""
    plot_Σ!(log_r, pdf; kwargs...)

Draw log10 of the surface density derived from `pdf` (via `Σ_from_pdf`) as a
line against `log_r`, skipping non-finite densities. Extra keyword arguments
are forwarded to `lines!`.
"""
function plot_Σ!(log_r, pdf; kwargs...)
    Σ = Σ_from_pdf(log_r, pdf)
    finite = isfinite.(Σ)
    log_Σ = log10.(Σ[finite])
    return lines!(log_r[finite], log_Σ; kwargs...)
end
    


# Histogram methods

In [None]:
# Density-normalised histogram of log radius (no explicit bins given).
h = Arya.histogram(x, normalization=:pdf)

In [None]:
# Same data normalised to probability mass per bin, with Poisson errors.
h_raw = Arya.histogram(x, normalization=:probabilitymass, errors=:poisson)

In [None]:

# Compare the two routes to Σ: annulus areas from the raw histogram versus the
# gradient-based conversion of the PDF sampled at the bin midpoints.
fig, ax = FigAxis(
    
)

plot_Σ_hist!(h_raw.bins, h_raw.values, h_raw.err, color=COLORS[1], alpha=0.9)

plot_Σ!(midpoints(h.bins), h.values, h.err, alpha=0.5, color=COLORS[2])

fig

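Note that the largest error bars come from bins containing only one or two counts, whose relative Poisson errors are $\pm 1/\sqrt{1}$ and $\pm 1/\sqrt{2}$; on a log scale this stretches out the lower ($-$) half of the $\pm$ interval.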

In [None]:
# Histogram of the linear radii with `weights=nothing`.
# NOTE(review): this is built from linear r_ell without explicit bins, not from the
# log-radius histograms used elsewhere — confirm before combining it with `h`.
counts = Arya.histogram(r_ell, weights=nothing)

# KDE and rolling histograms

In [None]:
# Density-normalised histogram of log radius; its bins set the scale below.
h = Arya.histogram(x, normalization=:pdf)

In [None]:
# Width of the first histogram bin, used as the reference smoothing scale.
dx = diff(h.bins)[1]

In [None]:
# KDE; the second argument (presumably the bandwidth) is dx/2.
kd = Arya.calc_kde(x, dx/2)

In [None]:
# Rolling histogram; the second argument is dx/3.
hr = Arya.rolling_histogram(x, dx/3, normalization=:pdf)

In [None]:
# Keyword form of calc_kde, labelled "AKDE" in the plots below.
η = 50
k = 50
akde = Arya.calc_kde(x, η=η, k=k)

In [None]:
# Shared x-axis label for the 1D plots.
xlabel = L"\log r / \textrm{arcmin}"

In [None]:
# Linear-scale comparison of the four estimators.
fig, ax = FigAxis(
    xlabel=xlabel,
    ylabel="density",
    limits=(-1.3, 1.9, nothing, nothing),
    )


lines!(h, label="histogram")
lines!(hr.x, hr.values, label="rolling histogram")
lines!(kd.x, kd.values, label="KDE")

lines!(akde.x, akde.values, label="AKDE")


axislegend(position=:lt)
fig

In [None]:
# Same comparison on a log10 y-axis, with the histogram drawn as error bars.
fig, ax = FigAxis(
    xlabel=xlabel,
    ylabel="density",
    limits=(-1.3, 1.9, nothing, nothing),
    yscale=log10
    )


errscatter!(midpoints(h.bins), h.values, yerr=h.err, label="histogram", color=:black)
lines!(hr.x, hr.values, label="rolling histogram")
lines!(kd.x, kd.values, label="KDE")

lines!(akde.x, akde.values, label="AKDE")


axislegend(position=:lt)
fig

In [None]:
# Surface density Σ(r) from each 1D estimator; the plot helpers take log10 of Σ.
fig, ax = FigAxis(
    ylabel=L"\Sigma \ / \ \textrm{fraction arcmin^{-2}}",
    xlabel=xlabel,
    limits=(-1, 2, -8, -1)
)

# `hu` is not defined anywhere in the visible notebook; the raw
# (probability-mass, Poisson-error) histogram that Σ_from_hist expects is `h_raw`.
plot_Σ_hist!(h_raw.bins, h_raw.values, h_raw.err, label="histogram", color=:black)

plot_Σ!(hr.x, hr.values, label="rolling")

plot_Σ!(kd.x, kd.values, label="KDE")
plot_Σ!(akde.x, akde.values, label="AKDE")

axislegend(position=:lb)

fig

In [None]:
let
    fig, ax = FigAxis(
        xlabel="log r",
        ylabel="log distance to nearest neighbor"
    )

    # AKDE bandwidth divided by η, plotted at each sample's position.
    scatter!(x, log10.(akde.bandwidth ./ η), label="kde knn=$k")

    # Spacing between consecutive samples. The samples must be in ascending
    # order for diff to give non-negative gaps; on unsorted input log10 would
    # throw a DomainError at the first negative difference.
    xs = sort(x)
    scatter!(Arya.midpoint(xs), log10.(diff(xs)), alpha=0.1, label="exact")
    axislegend()
    fig
end

A validation plot for the AKDE method

# Bandwidth effects

In [None]:
fig, ax = FigAxis(
    xlabel=xlabel,
    ylabel="density"
)

# Rolling histograms with the window scaled around the reference width dx.
# The loop uses its own variable names: in an interactive session an assignment
# to `hr` inside a top-level loop would overwrite the global `hr` (the dx/3
# rolling histogram plotted by other cells).
for factor in [1/10, 1/3, 1, 3, 10]
    width = dx * factor
    hr_scaled = Arya.rolling_histogram(x, width, normalization=:pdf)
    lines!(hr_scaled.x, hr_scaled.values, label=string(round(width, digits=4)))
end

axislegend("bandwidth")

fig

In [None]:
fig, ax = FigAxis(
    xlabel=xlabel,
    ylabel="density"
)

# KDEs with the bandwidth scaled around dx/2.
# The legend shows the bandwidth actually passed to calc_kde (dx/2 * factor);
# it used to show factor * dx, twice the value in use. The loop also avoids
# rebinding the global `hr`, which holds the rolling histogram.
for factor in [1/10, 1/3, 1, 3, 10]
    kde_bw = dx/2 * factor
    kd_scaled = Arya.calc_kde(x, kde_bw)
    lines!(kd_scaled.x, kd_scaled.values, label=string(round(kde_bw, digits=4)))
end

axislegend("bandwidth")

fig

In [None]:
# Compare three ways of choosing histogram bins for the same data.
fig, ax = FigAxis(
    xlabel=xlabel,
    ylabel="density"
)


# 50 is passed as the bins argument (presumably the number of equal-width bins).
h_ew = Arya.histogram(x, 50, normalization=:pdf)
scatter!(midpoints(h_ew.bins), h_ew.values, label="equal width")



# Bins chosen by Arya.bins_equal_number, with n=20 forwarded to it.
h_en = Arya.histogram(x, Arya.bins_equal_number, normalization=:pdf, n=20)
scatter!(midpoints(h_en.bins), h_en.values, label="equal number")


# Bins chosen by Arya.bayesian_blocks.
h_bb = Arya.histogram(x, Arya.bayesian_blocks, normalization=:pdf)
scatter!(midpoints(h_bb.bins), h_bb.values, label="bayesian blocks")

axislegend(position=:lt)
fig

In [None]:
# Convenience methods: accept an Arya.Histogram and unpack its bins, values and
# errors for the array-based plotting functions.
function plot_Σ_hist!(h::Arya.Histogram; kwargs...)
    return plot_Σ_hist!(h.bins, h.values, h.err; kwargs...)
end

function plot_Σ!(h::Arya.Histogram; kwargs...)
    return plot_Σ!(midpoints(h.bins), h.values, h.err; kwargs...)
end

In [None]:
# Surface density from the different binning schemes. PDF-normalised histograms
# go through plot_Σ!; probability-mass histograms go through plot_Σ_hist!.
fig, ax = FigAxis(
    ylabel=L"\Sigma \ / \ \textrm{fraction arcmin^{-2}}",
    xlabel=xlabel,
)


plot_Σ!(h_ew, label="EW")

# Rebuilt here with probability-mass normalisation and n=32.
h_en = Arya.histogram(x, Arya.bins_equal_number, normalization=:probabilitymass, n=32)
plot_Σ_hist!(h_en, label="EN")

h_bb = Arya.histogram(x, Arya.bayesian_blocks, normalization=:pdf)

plot_Σ!(h_bb, label="BB")


# Same bayesian-blocks binning, now as probability mass for plot_Σ_hist!.
h_bb = Arya.histogram(x, Arya.bayesian_blocks, normalization=:probabilitymass)
plot_Σ_hist!(h_bb, label="BB (exact)")


axislegend(position=:lb)

fig

In [None]:
# Try the Knuth bin selection on the first ten points only.
Arya.bins_knuth(x[1:10])

# 2 dimensional

In [None]:

# Scatter of the tangent-plane positions, restricted to a ±2 window on both axes.
fig, ax = FigAxis(aspect=DataAspect(), xgridvisible=false,
ygridvisible=false,
    limits=(-2, 2, -2, 2)
)
scatter!(xi, eta, markersize=3, alpha=0.1, color=:black)
fig

In [None]:
# Build the 2D histogram first: the heatmap cell below needs `h2d`, which was
# previously defined only in the cell after its first use.
h2d = Arya.histogram2d(xi, eta, 100, limits=(-2, 2, -2, 2))

In [None]:
fig, ax = FigAxis(aspect=DataAspect())

# The plot handle is named `hm` rather than `h`, so the 1D histogram bound to
# `h` (whose bins later cells read via `h.bins`) is not replaced by a plot object.
hm = heatmap!(h2d, colorscale=log10, colorrange=(1, maximum(h2d.values)))

#scatter!(members.xi, members.eta, color=:black, markersize=1)
Colorbar(fig[1, 2], hm)
fig

In [None]:
# Bandwidths from Arya.bandwidth_knn, evaluated on the 2×N matrix of coordinates.
bw = Arya.bandwidth_knn(vcat(members.xi', members.eta'))

In [None]:
# 2D KDE using the kNN bandwidths scaled by 5.
# A comma was missing after the `kernel` keyword, which made this call a syntax
# error.
# NOTE(review): `Arya.kernel` looks truncated — confirm the intended kernel.
akde2d = Arya.kde2d(members.xi, members.eta, 5bw, bins=256, kernel=Arya.kernel,
    limits=(-2, 2, -2, 2))

In [None]:
fig, ax = FigAxis(aspect=DataAspect())

# Heatmap handles in these cells are named `hm` rather than `h`, so the 1D
# histogram bound to `h` (whose bins later cells read via `h.bins`) is not
# replaced by a plot object.
hm = heatmap!(akde2d, colorscale=log10, colorrange=(1e-2, maximum(akde2d.values)))

#scatter!(members.xi, members.eta, color=:black, markersize=1)
Colorbar(fig[1, 2], hm)
fig

In [None]:
fig, ax = FigAxis(aspect=DataAspect())

# Contours of the pseudo-log of the density in units of 0.1.
contour!(akde2d.x, akde2d.y, Makie.pseudolog10.(akde2d.values ./ 1e-1), levels=20)

fig

In [None]:
# Fixed-bandwidth comparison: the same value, 0.05, for every point.
kd2d = Arya.kde2d(members.xi, members.eta, fill(0.05, length(members.xi)), bins=256,
limits=(-2, 2, -2, 2))


In [None]:
fig, ax = FigAxis(aspect=DataAspect())

hm = heatmap!(kd2d, colorscale=log10, colorrange=(1e-2, maximum(kd2d.values)))

#scatter!(members.xi, members.eta, color=:black, markersize=1)
Colorbar(fig[1, 2], hm)
fig

In [None]:
fig, ax = FigAxis(aspect=DataAspect())

cs = contour!(kd2d.x, kd2d.y, Makie.pseudolog10.(kd2d.values ./ 1e-1), levels=20)

fig

In [None]:
# Reference estimate from KernelDensity.jl with bandwidth 0.05 on each axis and
# the same ±2 boundary.
kdkde = KD.kde((members.xi, members.eta),
    boundary=((-2,2), (-2, 2)),
    bandwidth=(0.05, 0.05)
    )


In [None]:
fig, ax = FigAxis(aspect=DataAspect())

hm = heatmap!(kdkde.x, kdkde.y, 
    kdkde.density, 
    colorscale=log10,
    colorrange=(1e-2, maximum(kdkde.density))
)
Colorbar(fig[1, 2], hm)
fig

In [None]:
fig, ax = FigAxis(aspect=DataAspect())

contour!(kdkde.x, kdkde.y, Makie.pseudolog10.(kdkde.density ./ 1e-1), levels=20)

fig

In [None]:
# Element-wise difference between the KernelDensity.jl and Arya.jl density grids.
# NOTE(review): assumes both grids have the same shape and orientation — confirm.
res = vec(kdkde.density .- kd2d.values)

hhh = Arya.histogram(res, 1000)

fig, ax = FigAxis(
    yscale=log10, 
    limits=(nothing, (nothing, 1e5)),
    xlabel="residual KernelDensity.jl - Arya.jl for density",
    ylabel="count"
)
scatter!(hhh)
fig


In [None]:

# Distribution of log10 density values from each implementation.
fig, ax = FigAxis(
    yscale=log10, 
    xlabel="log density",
    ylabel="count",
    limits=(nothing, (1, 1e4))
)


# Non-finite logs (e.g. from zero density) are dropped before histogramming.
res = log10.(kd2d.values)
res = res[isfinite.(res)]

hhh = Arya.histogram(res, 100)

scatter!(hhh, label="Arya")


res = log10.(kdkde.density)
res = res[isfinite.(res)]

hhh = Arya.histogram(res, 100)

scatter!(hhh, label="Kernel Density.jl")


axislegend()
fig


## 1D from the 2D

# Gaussian Process Regression

# Bootstrapping

In [None]:
"""
    resample(M, N=M)

Draw `N` indices uniformly at random, with replacement, from `1:M`.
"""
resample(M, N=M) = rand(1:M, N)

In [None]:
"""
    bootstrap(data, statistic; iterations=1000, sample_size=length(data))

Evaluate `statistic` on `data` and then on `iterations` resamples of `data`,
each drawn with replacement and of length `sample_size`. The first element of
the returned vector is the statistic of the original data; the bootstrap
replicates follow.
"""
function bootstrap(data, statistic; iterations=1000, sample_size=length(data))
    replicates = [statistic(data)]
    n = length(data)
    foreach(1:iterations) do _
        picks = resample(n, sample_size)
        push!(replicates, statistic(data[picks]))
    end
    return replicates
end

In [None]:
"""
    bootstrap(data, weights, statistic; iterations=1000, sample_size=length(data))

Weighted variant: `statistic(data, weights)` is evaluated on the original
arrays and then on `iterations` resamples, where the same randomly drawn
indices select both the data and their weights. The first element of the
returned vector is the statistic of the original data.
"""
function bootstrap(data, weights, statistic; iterations=1000, sample_size=length(data))
    replicates = [statistic(data, weights)]
    n = length(data)
    foreach(1:iterations) do _
        picks = resample(n, sample_size)
        push!(replicates, statistic(data[picks], weights[picks]))
    end
    return replicates
end

In [None]:
# Bootstrap the KDE values of log radius (calc_kde is called with data only).
kds = bootstrap(x, xx->Arya.calc_kde(xx).values)

In [None]:
# Bootstrap the weighted histogram of r_ell in the bins of `h`.
# NOTE(review): the data resampled here is linear r_ell, while the earlier
# histograms assigned to `h` were built from x = log10(r_ell); `weights` was also
# not filtered together with r_ell — confirm both are intended.
hs = bootstrap(r_ell, weights, (x, w)->Arya.simple_hist(x, h.bins, w)[1], 
    iterations=1000,)
    

In [None]:
# One column per replicate; column 1 comes from the un-resampled data.
m = hcat(hs...)

In [None]:
# Overlay every bootstrap replicate as a faint line.
fig = Figure()
ax = Axis(fig[1, 1])

for hi in hs
    lines!(Arya.midpoint(h.bins), hi, color=:black, alpha=0.1)
end

fig

In [None]:
# Per-bin mean and scatter over all bootstrap replicates. These are computed
# first because the analytic estimate in the next cell needs μ.
μ = vec(Arya.mean(m, dims=2))
σ = vec(Arya.std(m, dims=2))

fig, _, _ = errscatter(Arya.midpoint(h.bins), μ, yerr=σ)
#errscatter!(Arya.midpoint(h.bins), μ, yerr=sqrt.(μ))
scatter!(Arya.midpoint(h.bins), m[:, 1], color=COLORS[3])

fig

In [None]:
# Simple analytic error estimate: μ / sqrt(counts).
# NOTE(review): `counts` was last assigned the result of Arya.histogram on linear
# r_ell, not a vector of per-bin counts in the bins of `h` — confirm it lines up
# with μ.
sigma_simp = μ ./ sqrt.(counts)

In [None]:
# Ratio of the analytic to the bootstrap error in each bin.
scatter(Arya.midpoint(h.bins), sigma_simp ./ σ)

In [None]:
errscatter(Arya.midpoint(h.bins), μ, yerr=sqrt.(μ))


# MCMC histograms 

In [None]:
# Log radius again, as the input to the perturbed resampling below.
x = log10.(r_ell)

In [None]:
# Per-point uncertainty on x.
# NOTE(review): since x = log10.(r_ell), this difference is zero for positive
# r_ell — it looks like a placeholder; confirm the intended error model.
δx = x .- log10.(abs.(r_ell))

In [None]:
# Display the current weights.
weights

In [None]:
# Uncertainty on each weight: half of the weight itself.
δweights = 0.5weights

In [None]:
"""
    resample(x, dx, w, dw, size)

Draw `size` indices with replacement from `1:length(x)` and return the selected
values and weights `(x1, w1)`, each perturbed by standard-normal noise scaled by
the matching entries of `dx` and `dw`.
"""
function resample(x, dx, w, dw, size)
    picks = resample(length(x), size)

    # values are perturbed first, then weights, so the random draws are consumed
    # in a fixed order
    x_new = x[picks] .+ dx[picks] .* randn(size)
    w_new = w[picks] .+ dw[picks] .* randn(size)
    return x_new, w_new
end

In [None]:
# Quick check: draw ten perturbed (value, weight) pairs.
resample(x, δx, weights, δweights, 10)

In [None]:
"""
    bootstrap(data, ddata, weights, dweights, statistic; iterations=1000, sample_size=length(data))

Bootstrap with measurement noise: `statistic(data, weights)` is evaluated on the
original arrays and then on `iterations` resamples produced by
`resample(data, ddata, weights, dweights, sample_size)`. The first element of
the returned vector is the statistic of the original data.
"""
function bootstrap(data, ddata, weights, dweights, statistic; iterations=1000, sample_size=length(data))
    replicates = [statistic(data, weights)]
    foreach(1:iterations) do _
        x_new, w_new = resample(data, ddata, weights, dweights, sample_size)
        push!(replicates, statistic(x_new, w_new))
    end
    return replicates
end

In [None]:
# Histogram of log radius; its bins are reused for every replicate below.
h = Arya.histogram(x)

In [None]:
# Bootstrap the weighted histogram of x with perturbed values and weights.
# NOTE(review): `weights` was not filtered together with r_ell, so it may be
# longer than x in the first (un-resampled) evaluation — confirm.
hs = bootstrap(x, δx, weights, δweights, (x, w)->Arya.simple_hist(x, h.bins, w)[1], 
    iterations=1000,)

In [None]:
# One column per replicate; column 1 comes from the un-resampled data.
m = hcat(hs...)

In [None]:
# Overlay every replicate as a very faint line.
fig = Figure()
ax = Axis(fig[1, 1])

for hi in hs
    lines!(Arya.midpoint(h.bins), hi, color=:black, alpha=0.01)
end

fig

In [None]:
# Per-bin mean and scatter over all replicates.
μ = vec(Arya.mean(m, dims=2))
σ = vec(Arya.std(m, dims=2))

fig, _, _ = errscatter(Arya.midpoint(h.bins), μ, yerr=σ)
#errscatter!(Arya.midpoint(h.bins), μ, yerr=sqrt.(μ))
scatter!(Arya.midpoint(h.bins), m[:, 1], color=COLORS[3])

fig

In [None]:
# Unweighted histogram values in the same bins.
counts = Arya.histogram(x, h.bins).values;

In [None]:
# Histogram weighted by δweights, divided by sqrt(counts).
# NOTE(review): `dw_mean` and `w_mean` are not used in the visible notebook.
dw_mean = Arya.histogram(x, h.bins, weights=δweights).values ./ sqrt.(counts);

In [None]:
w_mean = Arya.histogram(x, h.bins, weights=weights).values ./ sqrt.(counts);

In [None]:
# Analytic error estimate per bin: a μ / sqrt(counts) term plus 0.5 μ / counts
# (0.5 presumably mirrors the relative weight error in δweights = 0.5weights).
# The duplicated `sigma_simp = sigma_simp = ...` chained assignment was redundant.
sigma_simp = (μ) ./ sqrt.(counts) .+ μ .* 0.5 ./ (counts)

In [None]:

# Ratio of the analytic to the bootstrap error; the horizontal line at 1 marks
# agreement.
fig, _, _ = errscatter(Arya.midpoint(h.bins), sigma_simp ./ σ,
    yerr = 0.3 ./ sqrt.(counts),
    axis=(; limits=(nothing, (0, 2)), ylabel="err analytic / bootstrap")
)
hlines!(1)
fig