In [6]:
using Distributions
using Random
using DataFrames
using CSVFiles
include("./polya.jl")
using .Polya



In [2]:
"""
    compute_KLD(share, p)

Compute the Kullback-Leibler divergence `sum(share .* log.(share ./ p))` along the
first dimension of `share`, relative to the reference shares `p`.

The result keeps the reduced dimension (size 1 along dimension 1), so a `K×M`
matrix of shares yields a `1×M` array and a vector yields a 1-element vector.

# Throws
- `ArgumentError` if the sums of `share` along dimension 1 are not all
  approximately 1.
"""
function compute_KLD(share::Array{Float64,N}, p::Vector{Float64}) where {N}
    if !all(sum(share, dims=1) .≈ 1)
        throw(ArgumentError("Shares have to sum to 1."))
    end
    # A zero share gives log(0) = -Inf; map it to 0 so that the term
    # 0 * log(0) contributes 0 to the sum instead of NaN.
    log_ratio = map(x -> x == -Inf ? 0.0 : x, log.(share ./ p))
    return sum(share .* log_ratio, dims=1)
end

compute_KLD (generic function with 1 method)

In [3]:
"""
    compute_p_values(A)

Return one p-value per column of `A`.

Rows of `A` whose maximum is not positive are dropped first. A
`DirichletMultinomial` is then fitted to the remaining data with `Polya.mle`, and
the fitted `α` normalised to sum to 1 serves as the null shares. For every column
the KL divergence of the column's observed shares from the null shares
(`compute_KLD`) is compared with the distribution returned by
`Polya.simulate_ECDF` for a `DirichletMultinomial` that has the column's own total
count and the fitted `α`. The p-value is `1 - cdf(simulated, observed)`.

# Returns
A `Vector{Float64}` with `size(A, 2)` entries.
"""
function compute_p_values(A)
    # exclude zero rows
    data = A[vec(maximum(A, dims=2) .> 0), :]

    H0_params = Polya.mle(DirichletMultinomial, data, tol=1e-4)
    H0_shares = H0_params.α ./ sum(H0_params.α)
    actual_KLD = compute_KLD(data ./ sum(data, dims=1), H0_shares)

    # The statistic applied to simulated draws; it uses the same null shares as
    # the observed statistic and does not depend on the column, so it is built
    # once outside the loop.
    simulated_KLD(x) = compute_KLD(x ./ sum(x, dims=1), H0_shares)

    p = zeros(Float64, size(data, 2))
    for i in axes(data, 2)
        # actual number of shipments treated as a parameter
        H1 = DirichletMultinomial(sum(@view data[:, i]), H0_params.α)
        pmf = Polya.simulate_ECDF(H1, simulated_KLD, maxiter=1000, digits=2)
        p[i] = 1 - cdf(pmf, actual_KLD[1, i])
    end
    return p
end

compute_p_values (generic function with 1 method)

In [7]:
# Load ../temp/shipment-clean.csv into a DataFrame bound to the global `data`.
data = DataFrame(load("../temp/shipment-clean.csv"))
# The trailing string literal is the last expression of the cell, so the notebook
# shows this short string instead of rendering the whole DataFrame.
"So as not to make notebook big."

"So as not to make notebook big."

In [8]:
"""
    get_destination_matrix(data; country, year=2017)

Select the rows of `data` whose `iso2_d` equals `country` and whose `year` equals
`year`, and return columns 4 through the second-to-last of that selection.
"""
function get_destination_matrix(data; country::String, year::Int = 2017)
    is_match(row) = row.iso2_d == country && row.year == year
    selected = filter(is_match, data)
    return selected[:, 4:end-1]
end

get_destination_matrix (generic function with 1 method)

In [9]:
"""
    flip(df::DataFrame)

Convert `df` to an array and return its adjoint materialised as a plain `Array`,
so that the columns of `df` become the rows of the result.
"""
function flip(df::DataFrame)::Array
    as_array = Array(df)
    return collect(adjoint(as_array))
end

flip (generic function with 1 method)

In [None]:
# Destinations and years to process. To cover everything present in the data
# instead, use unique(data.iso2_d) and unique(data.year).
destinations = ["ME", "MK", "AL", "TR", "BA", "AM", "AZ", "BY", "MD", "GE", "UA", "DZ", "EG", "IL", "JO", "LB", "LY", "MA", "PS", "SY", "TN", "RU", "UK"]
years = [2017]

# Start from an empty table that has the first three columns of `data` plus a
# Float64 column `p`; the per-destination results are appended to it below.
ps = copy(data[1:0, 1:3])
ps.p = Float64[]

for d in destinations, t in years
    # Progress marker, e.g. "ME2017".
    println(d, t)
    shipments = get_destination_matrix(data, country=d, year=t)
    pvalues = compute_p_values(flip(shipments))
    # Same row selection as above, keeping only the three identifying columns.
    batch = filter(row -> row.iso2_d == d && row.year == t, data)[:, 1:3]
    batch[:, :p] = pvalues
    append!(ps, batch)
end


ME2017
MK2017
AL2017
TR2017
BA2017
AM2017
AZ2017
BY2017
MD2017
GE2017
UA2017
DZ2017
EG2017
IL2017
JO2017
LB2017
LY2017
MA2017
PS2017
SY2017
TN2017
RU

In [13]:
# Display the accumulated table of p-values.
ps

Unnamed: 0_level_0,iso2_o,iso2_d,year,p
Unnamed: 0_level_1,String,String,Int64,Float64
1,AT,RU,2017,0.026026
2,BE,RU,2017,0.0
3,BG,RU,2017,0.162162
4,CY,RU,2017,0.0
5,CZ,RU,2017,0.551552
6,DE,RU,2017,1.0
7,DK,RU,2017,0.201201
8,EE,RU,2017,0.964965
9,ES,RU,2017,0.993994
10,FI,RU,2017,0.586587


In [14]:
# Write the table of p-values to ../temp/p-values.csv.
save("../temp/p-values.csv", ps)