In [116]:
using Distributions
using Random
using Plots
using DataFrames
using Query
include("./polya.jl")
using .Polya



In [None]:
function draw_dirichlet_multinomial(n::Integer, α::Vector{T1}, T::Real, K::Integer) where {T1 <: Real}
    cdf = DirichletMultinomial(n, T .* α ./ sum(α))
    return rand(cdf, K)
end

In [None]:
K = 10
p = ones(K) ./ K
T = 0.5
Xk = draw_dirichlet_multinomial(100, p, T, 10000)

In [None]:
Polya.mle(DirichletMultinomial, Xk)

In [62]:
function compute_KLD(share::Array{Float64,N}, p::Vector{Float64}) where {N}
    all(sum(share, dims=1) .≈ 1) || throw(ArgumentError("Shares have to sum to 1.")) 
    deviation = log.(share ./ p)
    # log(0) is fine because 0*log(0) = 0
    replace!(deviation, -Inf=>0)
    # convert to vector instead of 1xK array
    return sum(share .* deviation, dims=1)
end

compute_KLD (generic function with 1 method)

In [None]:
KLD = compute_KLD(Xk ./ sum(Xk, dims=1), p)
histogram(KLD[:])

In [None]:
P = DirichletMultinomial(20, T*p)
pmf = Polya.simulate_ECDF(P, 
    x -> compute_KLD(x ./ sum(x, dims=1),
        p), digits=3)

In [None]:
x = support(pmf)
p = cumsum(probs(pmf))
plot(x, p)

In [None]:
cdf(pmf, 2.2)

In [None]:
mean(pmf)

In [117]:
function compute_p_values(A)
    # exclude zero rows
    data = A[vec(maximum(A, dims=2) .> 0), :]

    K, N = size(data)
    H0_params = Polya.mle(DirichletMultinomial, data, tol=1e-4)
    H0_shares = H0_params.α ./ sum(H0_params.α)
    actual_KLD = compute_KLD(data ./ sum(data, dims=1), H0_shares)
    
    p = zeros(Float64, N)
    for i = 1:N
        # actual number of shipments treated as a parameter
        H1 = DirichletMultinomial(sum(data[:,i]), H0_params.α)
        pmf = Polya.simulate_ECDF(H1, 
            x -> compute_KLD(x ./ sum(x, dims=1), 
                    H0_params.α ./ H0_params.α0), 
            maxiter=1000, digits=2)
        p[i] = 1 - cdf(pmf, actual_KLD[1,i])
    end
    return p
end

compute_p_values (generic function with 1 method)

In [None]:
compute_p_values(Xk)

In [15]:
data = readtable("../temp/shipment-clean.csv")
"So as not to make notebook big."

UndefVarError: UndefVarError: readtable not defined

In [64]:
function get_destination_matrix(data; country::String, year::Int = 2017)
    return filter(row -> row.iso2_d == country && row.year == year, data)[:,4:end-1]
end

get_destination_matrix (generic function with 1 method)

In [65]:
russia = get_destination_matrix(data; country="RU", year=2017)

Unnamed: 0_level_0,shipments1,shipments2,shipments3,shipments4,shipments5,shipments6,shipments7
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,190,1,0,29,28,148,126
2,93,8,304,15,43,35,205
3,2,1,0,8,8,1,71
4,6,2,0,0,0,7,1
5,45,36,77,1676,6,1,91
6,3441,0,5061,2435,709,263,1188
7,1312,122,7411,44,138,80,7
8,116,7,365,1,2,16,71
9,6,0,2690,377,28,1,157
10,81,188,99,48,7,29,125


In [71]:
function flip(df::DataFrame) :: Array
    return Array(Array(df)')
end

flip (generic function with 1 method)

In [127]:
destinations = DataFrame(iso2_d=unique(data.iso2_d))
years = DataFrame(year=unique(data.year))
ps = copy(data[1:0,1:3])
ps.p = zeros(Float64, size(ps, 1))
for d in eachrow(destinations)
    for t in eachrow(years) 
        println(d[1], t[1])
        subset = get_destination_matrix(data, country=d[1], year=t[1])
        p = compute_p_values(flip(subset))
        new_batch = filter(row -> row.iso2_d == d[1] && row.year == t[1], data)[:,1:3]
        new_batch[:,:p] = p
        append!(ps, new_batch)
    end
end


AD2001
AD2002
AD2003
AD2004
AD2005
AD2006
AD2007
AD2008
AD2009
AD2010
AD2011
AD2012
AD2013
AD2014


InterruptException: InterruptException:

In [128]:
ps

Unnamed: 0_level_0,iso2_o,iso2_d,year,p
Unnamed: 0_level_1,String,String,Int64,Float64
1,AT,AD,2001,0.61962
2,BE,AD,2001,0.1001
3,BG,AD,2001,0.533534
4,CY,AD,2001,0.610611
5,CZ,AD,2001,0.901902
6,DE,AD,2001,0.219219
7,DK,AD,2001,0.001001
8,EE,AD,2001,0.00700701
9,ES,AD,2001,1.0
10,FI,AD,2001,0.0


In [77]:
Polya.gmm(DirichletMultinomial, flip(russia))

116.54745432892767


DirichletMultinomial{Float64}(
n: 3797679
α: [1.0889495101313467, 1.0072202239725665, 1.394056104553309, 1.079051026183397, 1.02293137920529, 1.2131008703527693, 1.0648582032481535, 1.05595673578739, 1.1110923991735677, 1.2408088056698747  …  2.4492384032733425, 1.3791008572455112, 3.0853218892030876, 1.0310769387411276, 1.0097877043382624, 1.0174510467495785, 1.855492277587067, 1.2723549286402664, 1.2695072410265378, 1.051831824438442]
)


In [74]:
size(russia)

(28, 96)

In [93]:
AD2001 = get_destination_matrix(data, country="AD", year=2001)

Unnamed: 0_level_0,shipments1,shipments2,shipments3,shipments4,shipments5,shipments6,shipments7
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0
6,0,0,0,3,1,0,0
7,0,0,0,10,0,0,0
8,0,0,0,0,0,0,0
9,52,980,616,788,6,50,195
10,0,0,0,0,0,0,0


In [97]:
A = [1 2; 0 0; 3 4; 5 6]

4×2 Array{Int64,2}:
 1  2
 0  0
 3  4
 5  6

In [122]:
A[1:0]

0-element Array{Int64,1}