# Step 1: Manipulate the Kleven-Best data, without splitting ages

In [1]:
# Load libraries
using Parameters
using StatFiles
using DataFrames
using Plots
using Trapz
using NumericalIntegration
using SpecialFunctions
using JLD2

In [2]:
# Add libraries
# using Pkg
# Pkg.add("StatFiles")

In [3]:
# Read the Stata file into a DataFrame, then convert every entry to Float64.
# NOTE(review): the Float64 conversion presumably assumes the file has no missing or
# non-numeric entries; confirm against the data.
dta = Float64.(load("data/new_dta/psid/psidtaxsim_20mil.dta") |> DataFrame);
ndta = nrow(dta); # Number of rows (observations) in the data


In [4]:
# Keyword-constructed container for the parameters that accompany the data.
# `e`, `zmin` and `α` have no defaults and must be supplied by the caller;
# `γ` and `R` fall back to the defaults below.
# TODO: the origin of some of these values is still unverified.
@with_kw struct Dta_params
    e                 # exponent applied to (1 - τ) when backing out ability from z
                      # (presumably an elasticity; confirm)
    zmin              # not referenced in the visible code; meaning to be confirmed
    α                 # not referenced in the visible code; meaning to be confirmed
    γ::Float64 = 10   # written to the output files alongside `e`
    R::Int64 = 4000   # not referenced in the visible code
end;

In [5]:
# Build the parameter struct from the first three entries of the `zparameters` column:
# entry 1 -> e, entry 2 -> zmin, entry 3 -> α. γ and R keep their defaults.
dtapars = let zp = dta.zparameters
    Dta_params(; e=zp[1], zmin=zp[2], α=zp[3])
end;

#### Define some useful functions

In [6]:
"""
    make_monotone(dist)

Return a copy of `dist` in which every entry that fails to exceed its predecessor is
replaced so that the sequence becomes increasing. `dist` itself is not modified.

Entries are visited in linear-index order, so both a plain vector and a 1×N row
(e.g. `v'`) are accepted; the result has the same container type as `copy(dist)`.

When `new[i] <= new[i-1]`, the function looks ahead for the first later entry that is
strictly greater than `new[i-1]` and sets `new[i]` one step along the straight line
joining `new[i-1]` to that entry. Consecutive offending entries therefore end up
linearly interpolated between their valid neighbours.

If no later entry is larger (a non-increasing tail), the entry is set equal to its
predecessor, so the tail of the result is flat rather than strictly increasing.
"""
function make_monotone(dist)
    new = copy(dist)
    ndata = length(dist)
    for i in 2:ndata
        prev = new[i-1]
        # Already strictly above the predecessor: nothing to repair.
        new[i] <= prev || continue

        # Everything before i is already increasing and hence <= prev, so only entries
        # after i can exceed it. Scanning with linear indices keeps this valid for
        # vectors and row matrices alike.
        nextind = i + 1
        while nextind <= ndata && !(new[nextind] > prev)
            nextind += 1
        end

        if nextind > ndata
            # No larger value remains: hold the last valid level.
            new[i] = prev
        else
            # One step of the linear interpolation between prev and new[nextind].
            new[i] = (new[nextind] + prev * (nextind - i)) / (nextind - i + 1)
        end
    end
    return new
end;

In [7]:
"""
    smooth_dist(dist, niter)

Return a smoothed copy of `dist` after `niter` passes of a three-point weighted moving
average (the same procedure as in Saez). `dist` itself is not modified.

In each pass every interior entry becomes `0.3*left + 0.4*self + 0.3*right`, computed
from the values of the previous pass. The first and last entries are never changed.
With `niter = 0`, or fewer than three entries, the result equals the input.
"""
function smooth_dist(dist, niter)
    prev = copy(dist)    # values from the previous pass (read only within a pass)
    cur = copy(dist)     # values being written in the current pass
    last_interior = length(dist) - 1
    for _ in 1:niter
        for j in 2:last_interior
            cur[j] = 0.3*prev[j-1] + 0.4*prev[j] + 0.3*prev[j+1]
        end
        # The next pass must read this pass's output.
        copyto!(prev, cur)
    end
    return cur
end;

#### Compute the distributions

In [8]:
# Ability levels (n = ability): back them out from z, then make monotone and smooth.

# Marginal tax rate as a fraction (`mtr` is divided by 100, so presumably stored in percent).
τ = dta.mtr ./ 100;

# n = z / (1 - τ)^e, element by element.
n = @. dta.z / (1 - τ)^dtapars.e;

# make_monotone receives n as a row (n'); the result is transposed back before
# 1000 smoothing passes.
n = smooth_dist(make_monotone(n')', 1000);


In [9]:
# Ability density: finite-difference ratio of `Hz` to n, padded with a trailing 0 so it
# has the same length as n, then smoothed with 1000 passes.
# NOTE(review): `Hz` is presumably the CDF of z evaluated on the data grid; confirm.
fn = smooth_dist(push!(diff(dta.Hz) ./ diff(n), 0), 1000);

# Integrate the density over n, rescale it so the total integral is 1, and recompute
# the CDF from the rescaled density.
Fn = cumul_integrate(n, fn); # Unnormalized CDF
fn = fn ./ Fn[end];          # Density now integrates to 1
Fn = cumul_integrate(n, fn); # CDF of the normalized density

In [10]:
# Pareto tail for the ability distribution, spliced in from the first observation
# with z above 2,000,000.

# n*f(n) / (1 - F(n)) is constant and equal to the shape parameter for an exact Pareto
# distribution, so its value at the splice point is used as the tail's shape parameter.
haz = @. n * fn / (1 - Fn);
pareto_ind = findfirst(>(2_000_000), dta.z);
pareto_α = haz[pareto_ind];

# φ is the mass above the splice point. The lower bound is chosen so that the Pareto
# survival function (pareto_lb / n)^pareto_α equals φ at n[pareto_ind], which keeps the
# CDF continuous at the splice.
φ = 1 - Fn[pareto_ind];
pareto_lb = n[pareto_ind] * φ^(1 / pareto_α);

# Overwrite density and CDF from the splice point upward with the Pareto forms
fn[pareto_ind:end] .= pareto_α .* pareto_lb^pareto_α ./ n[pareto_ind:end] .^ (1 + pareto_α);
Fn[pareto_ind:end] .= 1 .- (pareto_lb^pareto_α ./ n[pareto_ind:end] .^ pareto_α);

# Trimming point for the ability grid. An earlier version cut at n = 27,200:
#ntop = findfirst(n .> 27200) - 1;
# The current setting keeps every observation.
ntop = length(dta.z) # NEW/adjust

n = n[1:ntop];
fnnew = fn[1:ntop];
Fnnew = Fn[1:ntop];

# Re-normalize so that the CDF equals 1 at the last retained point
fn = fnnew ./ Fnnew[ntop];
Fn = Fnnew ./ Fnnew[ntop];


In [11]:
# Alternative z grid: ntop points equally spaced in logs, running from exp(5) (about 148)
# at i = 1 to exp(5 + 11.52) = exp(16.52) (about 1.5e7) at i = ntop.
# NOTE(review): the constants 5 and 11.52 are taken as given; their origin should be
# confirmed.
z2 = Float64[exp(5 + ((i - 1) * (11.52 / (ntop - 1)))) for i in 1:ntop];

In [12]:
# Save two JLD2 files that differ only in the z grid they store:
# the first keeps the z values from the data, the second uses the log-spaced grid z2.
# Both also store n, fn, ntop, γ and e.
for (path, zgrid) in (("data/zprimitives.jld2", dta.z), ("data/zprimitives2.jld2", z2))
    jldsave(path; z=zgrid, n=n, fn=fn, ntop=ntop, γ=dtapars.γ, e=dtapars.e)
end
