In [6]:
using Distributions
using Random
using DataFrames
using CSVFiles
include("./polya.jl")
using .Polya



In [2]:
# Contribution of a single cell to the divergence: `s * log(s / q)`, with the
# convention that a zero share contributes exactly zero whatever `q` is
# (0 * log(0) = 0). Handling this per cell also covers `s == 0 && q == 0`,
# where `log(0 / 0)` would otherwise be NaN.
function _kld_term(s::Real, q::Real)
    T = float(promote_type(typeof(s), typeof(q)))
    return iszero(s) ? zero(T) : T(s * log(s / q))
end

"""
    compute_KLD(share, p)

Compute the Kullback-Leibler divergence of each column of `share` from the
reference probabilities `p`.

# Arguments
- `share`: array whose columns (slices along the first dimension) each sum to 1.
- `p`: reference probabilities, broadcast along the first dimension of `share`.

# Returns
`sum(share .* log.(share ./ p), dims=1)`. Because the reduction keeps the
reduced dimension, a `K×N` matrix yields a `1×N` matrix and a length-`K`
vector yields a 1-element vector.

# Throws
- `ArgumentError` if any column of `share` does not sum to (approximately) 1.
"""
function compute_KLD(share::AbstractArray{<:Real}, p::AbstractVector{<:Real})
    all(sum(share, dims=1) .≈ 1) || throw(ArgumentError("Shares have to sum to 1."))
    return sum(_kld_term.(share, p), dims=1)
end

compute_KLD (generic function with 1 method)

In [19]:
"""
    compute_p_values(A)

Return one simulated p-value per column of `A`.

Rows of `A` whose maximum is not positive are dropped. A `DirichletMultinomial`
is fitted to the remaining data with `Polya.mle`, and its normalised `α` serves
as the reference share vector. For every column the KL divergence of the
observed shares from that reference is compared with the distribution returned
by `Polya.simulate_ECDF` for a `DirichletMultinomial` with the column's own
total and the fitted `α`.

NOTE(review): presumably `Polya.simulate_ECDF` returns an empirical distribution
of the statistic over simulated draws; confirm against `polya.jl`.

# Returns
A `Vector{Float64}` with `1 - cdf(simulated, observed)` for each column.
"""
function compute_p_values(A)
    # exclude zero rows
    data = A[vec(maximum(A, dims=2) .> 0), :]

    H0_params = Polya.mle(DirichletMultinomial, data, tol=1e-4)
    H0_shares = H0_params.α ./ sum(H0_params.α)

    # One statistic for both observed and simulated data, so both are measured
    # against exactly the same reference shares (computed once, not per draw).
    statistic = x -> compute_KLD(x ./ sum(x, dims=1), H0_shares)
    actual_KLD = statistic(data)

    p = zeros(Float64, size(data, 2))
    for i in axes(data, 2)
        # actual number of shipments treated as a parameter
        H1 = DirichletMultinomial(sum(view(data, :, i)), H0_params.α)
        pmf = Polya.simulate_ECDF(H1, statistic, maxiter=10000, digits=3)
        p[i] = 1 - cdf(pmf, actual_KLD[1, i])
    end
    return p
end

compute_p_values (generic function with 1 method)

In [7]:
# Read the cleaned shipment table from disk and materialise it as a DataFrame.
data = load("../temp/shipment-clean.csv") |> DataFrame
# The string literal is the last expression of the cell, so it is what the
# notebook echoes instead of the table.
"So as not to make notebook big."

"So as not to make notebook big."

In [8]:
"""
    get_destination_matrix(data; country, year=2017)

Return the rows of `data` whose `iso2_d` equals `country` and whose `year`
equals `year`, keeping only columns 4 through the second-to-last one.
"""
function get_destination_matrix(data; country::String, year::Int = 2017)
    is_match(row) = row.iso2_d == country && row.year == year
    selected = filter(is_match, data)
    return selected[:, 4:end-1]
end

get_destination_matrix (generic function with 1 method)

In [9]:
"""
    flip(df)

Convert `df` to a matrix and return its adjoint as a freshly allocated array,
so rows of `df` become columns of the result.
"""
function flip(df::DataFrame)::Array
    as_matrix = Array(df)
    flipped = adjoint(as_matrix)
    return Array(flipped)
end

flip (generic function with 1 method)

In [20]:
# Destinations and years to process. The full set would be
# `unique(data.iso2_d)` and `unique(data.year)`; a fixed list is used instead.
destinations = [
    "ME", "MK", "AL", "TR", "BA", "AM", "AZ", "BY", "MD", "GE", "UA",
    "DZ", "EG", "IL", "JO", "LB", "LY", "MA", "PS", "SY", "TN", "RU",
]
years = [2016, 2017]

# Start from an empty frame holding the first three columns of `data`
# plus a Float64 column `p` for the results.
ps = copy(data[1:0, 1:3])
ps.p = Float64[]

for d in destinations, t in years
    println(d, t)
    shipments = get_destination_matrix(data, country=d, year=t)
    pvalues = compute_p_values(flip(shipments))
    # Same row filter as above, so the identifying columns are taken from the
    # same rows that produced `pvalues`.
    batch = filter(row -> row.iso2_d == d && row.year == t, data)[:, 1:3]
    batch[:, :p] = pvalues
    append!(ps, batch)
end


ME2016
ME2017
MK2016
MK2017
AL2016
AL2017
TR2016
TR2017
BA2016
BA2017
AM2016
AM2017
AZ2016
AZ2017
BY2016
BY2017
MD2016
MD2017
GE2016
GE2017
UA2016
UA2017
DZ2016
DZ2017
EG2016
EG2017
IL2016
IL2017
JO2016
JO2017
LB2016
LB2017
LY2016
LY2017
MA2016
MA2017
PS2016
PS2017
SY2016
SY2017
TN2016
TN2017
RU2016
RU2017


In [17]:
# Show the accumulated table of p-values as the cell's output.
ps

Unnamed: 0_level_0,iso2_o,iso2_d,year,p
Unnamed: 0_level_1,String,String,Int64,Float64
1,AT,ME,2017,0.977978
2,BE,ME,2017,0.0
3,BG,ME,2017,0.141141
4,CY,ME,2017,0.035035
5,CZ,ME,2017,0.0
6,DE,ME,2017,0.94995
7,DK,ME,2017,0.998999
8,EE,ME,2017,0.676677
9,ES,ME,2017,0.002002
10,FI,ME,2017,0.00600601


In [21]:
# Write the accumulated p-value table to disk.
ps |> save("../temp/p-values.csv")