## Import libraries

In [1]:
import pandas as pd
import numpy as np
import numpyro
import numpyro.distributions as dist
from numpyro.distributions import Distribution, constraints
from numpyro.distributions.util import validate_sample
from numpyro.infer import MCMC, NUTS, Predictive, init_to_median
import jax
from jax import random
from jax.scipy.stats import gaussian_kde
import jax.numpy as jnp
import matplotlib.pyplot as plt
import seaborn as sns
import arviz as az
import os
import pickle
import yaml
import json

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Create random seed for JAX
rng_key = random.PRNGKey(0)

I0000 00:00:1698929558.171491    4381 tfrt_cpu_pjrt_client.cc:349] TfrtCpuClient created.


## Define Models

In [4]:
DISTRIBUTIONS = {
    "normal": dist.Normal,
    "half_normal": dist.HalfNormal,
    "student_t": dist.StudentT,
    "laplace": dist.Laplace,
    "uniform": dist.Uniform,
    "gamma": dist.Gamma,
    "log-normal": dist.LogNormal,
    "exponential": dist.Exponential,
}

In [23]:
def pooled(X, y, ind, features_names, from_posterior=None, **init_params_kwargs):
    prior_dist = init_params_kwargs.get("prior_dist", "normal")
    prior_params = init_params_kwargs.get("prior_params", {"loc": 0, "scale": 1})
    shape_dist = init_params_kwargs.get("shape_dist", "uniform")
    shape_params = init_params_kwargs.get("shape_params", {"low": 1, "high": 100})
    target_dist = init_params_kwargs.get("target_dist", "gamma")

    if from_posterior is None:
        avg_salary = numpyro.sample("avg_salary", DISTRIBUTIONS[prior_dist](**prior_params))
        priors = []
        for i, feature in enumerate(features_names):
            priors.append(numpyro.sample(f"beta_{feature}", DISTRIBUTIONS[prior_dist](**prior_params)))
    else:
        avg_salary = numpyro.sample("avg_salary", DISTRIBUTIONS[prior_dist](from_posterior["avg_salary"].mean(), from_posterior["avg_salary"].std()))
        priors = []
        for i, feature in enumerate(features_names):
            priors.append(numpyro.sample(f"beta_{feature}", DISTRIBUTIONS[prior_dist](from_posterior[f"beta_{feature}"].mean(), from_posterior[f"beta_{feature}"].std())))
    shape = numpyro.sample("shape", DISTRIBUTIONS[shape_dist](**shape_params))

    # Expected value
    mu = avg_salary
    for i, prior in enumerate(priors):
        mu += prior * X[:,i]
    mu = jnp.exp(mu)
    rate = shape / mu

    # Likelihood
    with numpyro.plate("data", X.shape[0]):
        numpyro.sample("salary_hat", DISTRIBUTIONS[target_dist](concentration=shape, rate=rate), obs=y)

def no_pooled_ind(X, y, ind, features_names, from_posterior=None, **init_params_kwargs):
    # Initial parameters
    prior_dist = init_params_kwargs.get("prior_dist", "normal")
    prior_params = init_params_kwargs.get("prior_params", {"loc": 0, "scale": 1})
    shape_dist = init_params_kwargs.get("shape_dist", "uniform")
    shape_params = init_params_kwargs.get("shape_params", {"low": 1, "high": 100})
    target_dist = init_params_kwargs.get("target_dist", "gamma")

    # Priors
    if from_posterior is None:
        with numpyro.plate("industry", 16):
            avg_salary = numpyro.sample("avg_salary", DISTRIBUTIONS[prior_dist](**prior_params))
            priors = []
            for i, feature in enumerate(features_names):
                priors.append(numpyro.sample(f"beta_{feature}", DISTRIBUTIONS[prior_dist](**prior_params)))
            shape = numpyro.sample("shape", DISTRIBUTIONS[shape_dist](**shape_params))
    else:
        with numpyro.plate("industry", 16):
            avg_salary = numpyro.sample("avg_salary", 
                                        DISTRIBUTIONS[prior_dist](from_posterior["avg_salary"].mean(axis=0), from_posterior["avg_salary"].std(axis=0)))
            priors = []
            for i, feature in enumerate(features_names):
                priors.append(numpyro.sample(f"beta_{feature}", 
                                             DISTRIBUTIONS[prior_dist](from_posterior[f"beta_{feature}"].mean(axis=0), from_posterior[f"beta_{feature}"].std(axis=0))))
            shape = numpyro.sample("shape", DISTRIBUTIONS[shape_dist](**shape_params))

    # Expected value
    mu = avg_salary[ind]
    for i, prior in enumerate(priors):
        mu += prior[ind] * X[:,i]
    mu = jnp.exp(mu)
    rate = shape[ind] / mu

    # Likelihood
    with numpyro.plate("data", X.shape[0]):
        numpyro.sample("salary_hat", DISTRIBUTIONS[target_dist](concentration=shape[ind], rate=rate), obs=y)

def no_pooled_occ(X, y, occ, features_names, from_posterior=None, **init_params_kwargs):
    # Initial parameters
    prior_dist = init_params_kwargs.get("prior_dist", "normal")
    prior_params = init_params_kwargs.get("prior_params", {"loc": 0, "scale": 1})
    shape_dist = init_params_kwargs.get("shape_dist", "uniform")
    shape_params = init_params_kwargs.get("shape_params", {"low": 1, "high": 100})
    target_dist = init_params_kwargs.get("target_dist", "gamma")

    # Priors
    if from_posterior is None:
        with numpyro.plate("occupation", 24):
            avg_salary = numpyro.sample("avg_salary", DISTRIBUTIONS[prior_dist](**prior_params))
            priors = []
            for i, feature in enumerate(features_names):
                priors.append(numpyro.sample(f"beta_{feature}", DISTRIBUTIONS[prior_dist](**prior_params)))
            shape = numpyro.sample("shape", DISTRIBUTIONS[shape_dist](**shape_params))
    else:
        with numpyro.plate("occupation", 24):
            avg_salary = numpyro.sample("avg_salary", 
                                        DISTRIBUTIONS[prior_dist](from_posterior["avg_salary"].mean(axis=0), from_posterior["avg_salary"].std(axis=0)))
            priors = []
            for i, feature in enumerate(features_names):
                priors.append(numpyro.sample(f"beta_{feature}", 
                                             DISTRIBUTIONS[prior_dist](from_posterior[f"beta_{feature}"].mean(axis=0), from_posterior[f"beta_{feature}"].std(axis=0))))
            shape = numpyro.sample("shape", DISTRIBUTIONS[shape_dist](**shape_params))

    # Expected value
    mu = avg_salary[occ]
    for i, prior in enumerate(priors):
        mu += prior[occ] * X[:,i]
    mu = jnp.exp(mu)
    rate = shape[occ] / mu

    # Likelihood
    with numpyro.plate("data", X.shape[0]):
        numpyro.sample("salary_hat", DISTRIBUTIONS[target_dist](concentration=shape[occ], rate=rate), obs=y)

def hierarchical_ind(X, y, ind, features_names, from_posterior=None, **init_params_kwargs):
    # Initial parameters
    mu_dist = init_params_kwargs.get("mu_dist", "normal")
    mu_params = init_params_kwargs.get("mu_params", {"loc": 0, "scale": 3})
    sigma_dist = init_params_kwargs.get("sigma_dist", "half_normal")
    sigma_params = init_params_kwargs.get("sigma_params", {"scale": 3})
    shape_dist = init_params_kwargs.get("shape_dist", "uniform")
    shape_params = init_params_kwargs.get("shape_params", {"low": 1, "high": 100})
    target_dist = init_params_kwargs.get("target_dist", "gamma")

    # Hyperpriors
    mus = []
    sigmas = []
    if from_posterior is None:
        mu_avg_salary = numpyro.sample("mu_avg_salary", DISTRIBUTIONS[mu_dist](**mu_params))
        sigma_avg_salary = numpyro.sample("sigma_avg_salary", DISTRIBUTIONS[sigma_dist](**sigma_params))
        
        for feature in features_names:
            mus.append(numpyro.sample(f"mu_{feature}", DISTRIBUTIONS[mu_dist](**mu_params)))
            sigmas.append(numpyro.sample(f"sigma_{feature}", DISTRIBUTIONS[sigma_dist](**sigma_params)))
    else:
        mu_avg_salary = numpyro.sample("mu_avg_salary", 
                                       DISTRIBUTIONS[mu_dist](from_posterior["mu_avg_salary"].mean(axis=0), from_posterior["mu_avg_salary"].std(axis=0)))
        sigma_avg_salary = numpyro.sample("sigma_avg_salary", 
                                          DISTRIBUTIONS[sigma_dist](from_posterior["sigma_avg_salary"].mean(axis=0)))
        
        for feature in features_names:
            mus.append(numpyro.sample(f"mu_{feature}", 
                                      DISTRIBUTIONS[mu_dist](from_posterior[f"mu_{feature}"].mean(axis=0), from_posterior[f"mu_{feature}"].std(axis=0))))
            sigmas.append(numpyro.sample(f"sigma_{feature}", 
                                         DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_{feature}"].mean(axis=0))))

    with numpyro.plate(f"industry", 16):
        offset_avg_salary = numpyro.sample("offset_avg_salary", DISTRIBUTIONS["normal"](loc=0, scale=1))
        avg_salary = numpyro.deterministic("avg_salary", mu_avg_salary + offset_avg_salary * sigma_avg_salary)
        priors = []
        for i, feature in enumerate(features_names):
            offset = numpyro.sample(f"offset_{feature}", DISTRIBUTIONS["normal"](loc=0, scale=1))
            priors.append(numpyro.deterministic(f"beta_{feature}", mus[i] + offset * sigmas[i]))
        shape = numpyro.sample("shape", DISTRIBUTIONS[shape_dist](**shape_params))

    # Expected value
    mu = avg_salary[ind]
    for i, feature in enumerate(features_names):
        mu += priors[i][ind] * X[:,i]

    mu = jnp.exp(mu)
    rate = shape[ind] / mu

    # Likelihood
    with numpyro.plate("data", X.shape[0]):
        numpyro.sample("salary_hat", DISTRIBUTIONS[target_dist](concentration=shape[ind], rate=rate), obs=y)

def hierarchical_occ(X, y, occ, features_names, from_posterior=None, **init_params_kwargs):
    # Initial parameters
    mu_dist = init_params_kwargs.get("mu_dist", "normal")
    mu_params = init_params_kwargs.get("mu_params", {"loc": 0, "scale": 3})
    sigma_dist = init_params_kwargs.get("sigma_dist", "half_normal")
    sigma_params = init_params_kwargs.get("sigma_params", {"scale": 3})
    shape_dist = init_params_kwargs.get("shape_dist", "uniform")
    shape_params = init_params_kwargs.get("shape_params", {"low": 1, "high": 100})
    target_dist = init_params_kwargs.get("target_dist", "gamma")

    # Hyperpriors
    mus = []
    sigmas = []
    if from_posterior is None:
        mu_avg_salary = numpyro.sample("mu_avg_salary", DISTRIBUTIONS[mu_dist](**mu_params))
        sigma_avg_salary = numpyro.sample("sigma_avg_salary", DISTRIBUTIONS[sigma_dist](**sigma_params))
        
        for feature in features_names:
            mus.append(numpyro.sample(f"mu_{feature}", DISTRIBUTIONS[mu_dist](**mu_params)))
            sigmas.append(numpyro.sample(f"sigma_{feature}", DISTRIBUTIONS[sigma_dist](**sigma_params)))
    else:
        mu_avg_salary = numpyro.sample("mu_avg_salary", 
                                       DISTRIBUTIONS[mu_dist](from_posterior["mu_avg_salary"].mean(axis=0), from_posterior["mu_avg_salary"].std(axis=0)))
        sigma_avg_salary = numpyro.sample("sigma_avg_salary", 
                                          DISTRIBUTIONS[sigma_dist](from_posterior["sigma_avg_salary"].mean(axis=0)))
        
        for feature in features_names:
            mus.append(numpyro.sample(f"mu_{feature}", 
                                      DISTRIBUTIONS[mu_dist](from_posterior[f"mu_{feature}"].mean(axis=0), from_posterior[f"mu_{feature}"].std(axis=0))))
            sigmas.append(numpyro.sample(f"sigma_{feature}", 
                                         DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_{feature}"].mean(axis=0))))

    with numpyro.plate(f"occupation", 24):
        offset_avg_salary = numpyro.sample("offset_avg_salary", DISTRIBUTIONS["normal"](loc=0, scale=1))
        avg_salary = numpyro.deterministic("avg_salary", mu_avg_salary + offset_avg_salary * sigma_avg_salary)
        priors = []
        for i, feature in enumerate(features_names):
            offset = numpyro.sample(f"offset_{feature}", DISTRIBUTIONS["normal"](loc=0, scale=1))
            priors.append(numpyro.deterministic(f"beta_{feature}", mus[i] + offset * sigmas[i]))
        shape = numpyro.sample("shape", DISTRIBUTIONS[shape_dist](**shape_params))

    # Expected value
    mu = avg_salary[occ]
    for i, feature in enumerate(features_names):
        mu += priors[i][occ] * X[:,i]

    mu = jnp.exp(mu)
    rate = shape[occ] / mu

    # Likelihood
    with numpyro.plate("data", X.shape[0]):
        numpyro.sample("salary_hat", DISTRIBUTIONS[target_dist](concentration=shape[occ], rate=rate), obs=y)

def hierarchical_ind_occ(X, y, ind, occ, features_names, from_posterior=None, **init_params_kwargs):
    # Initial parameters
    mu_dist = init_params_kwargs.get("mu_dist", "normal")
    mu_params = init_params_kwargs.get("mu_params", {"loc": 0, "scale": 3})
    sigma_dist = init_params_kwargs.get("sigma_dist", "half_normal")
    sigma_params = init_params_kwargs.get("sigma_params", {"scale": 3})
    shape_dist = init_params_kwargs.get("shape_dist", "uniform")
    shape_params = init_params_kwargs.get("shape_params", {"low": 1, "high": 100})
    target_dist = init_params_kwargs.get("target_dist", "gamma")

    # Hyperpriors
    mus_ind = []
    sigmas_ind = []
    mus_occ = []
    sigmas_occ = []
    for dim in ["ind", "occ"]:
        if from_posterior is None:
            if dim == "ind":
                mu_avg_salary_ind = numpyro.sample(f"mu_avg_salary_ind", DISTRIBUTIONS[mu_dist](**mu_params))
                sigma_avg_salary_ind = numpyro.sample(f"sigma_avg_salary_ind", DISTRIBUTIONS[sigma_dist](**sigma_params))
                for feature in features_names:
                    mus_ind.append(numpyro.sample(f"mu_{feature}_{dim}", DISTRIBUTIONS[mu_dist](**mu_params)))
                    sigmas_ind.append(numpyro.sample(f"sigma_{feature}_{dim}", DISTRIBUTIONS[sigma_dist](**sigma_params)))
            else:
                mu_avg_salary_occ = numpyro.sample(f"mu_avg_salary_occ", DISTRIBUTIONS[mu_dist](**mu_params))
                sigma_avg_salary_occ = numpyro.sample(f"sigma_avg_salary_occ", DISTRIBUTIONS[sigma_dist](**sigma_params))
                for feature in features_names:
                    mus_occ.append(numpyro.sample(f"mu_{feature}_{dim}", DISTRIBUTIONS[mu_dist](**mu_params)))
                    sigmas_occ.append(numpyro.sample(f"sigma_{feature}_{dim}", DISTRIBUTIONS[sigma_dist](**sigma_params)))
            
        else:
            if dim == "ind":
                mu_avg_salary_ind = numpyro.sample(f"mu_avg_salary_ind", 
                                            DISTRIBUTIONS[mu_dist](from_posterior[f"mu_avg_salary_ind"].mean(axis=0), 
                                                                   from_posterior[f"mu_avg_salary_ind"].std(axis=0)))
                sigma_avg_salary_ind = numpyro.sample(f"sigma_avg_salary_ind", 
                                                DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_avg_salary_ind"].mean(axis=0)))
                for feature in features_names:
                    mus_ind.append(numpyro.sample(f"mu_{feature}_{dim}", 
                                            DISTRIBUTIONS[mu_dist](from_posterior[f"mu_{feature}_{dim}"].mean(axis=0), 
                                                                   from_posterior[f"mu_{feature}_{dim}"].std(axis=0))))
                    sigmas_ind.append(numpyro.sample(f"sigma_{feature}_{dim}", 
                                                DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_{feature}_{dim}"].mean(axis=0))))
            else:
                mu_avg_salary_occ = numpyro.sample(f"mu_avg_salary_occ", 
                                            DISTRIBUTIONS[mu_dist](from_posterior[f"mu_avg_salary_occ"].mean(axis=0), 
                                                                   from_posterior[f"mu_avg_salary_occ"].std(axis=0)))
                sigma_avg_salary_occ = numpyro.sample(f"sigma_avg_salary_occ", 
                                                DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_avg_salary_occ"].mean(axis=0)))
                for feature in features_names:
                    mus_occ.append(numpyro.sample(f"mu_{feature}_{dim}", 
                                            DISTRIBUTIONS[mu_dist](from_posterior[f"mu_{feature}_{dim}"].mean(axis=0), 
                                                                   from_posterior[f"mu_{feature}_{dim}"].std(axis=0))))
                    sigmas_occ.append(numpyro.sample(f"sigma_{feature}_{dim}", 
                                                DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_{feature}_{dim}"].mean(axis=0))))
            
    priors_ind = []
    priors_occ = []
    with numpyro.plate(f"industry", 16):
        offset_avg_salary_ind = numpyro.sample(f"offset_avg_salary_ind", DISTRIBUTIONS["normal"](loc=0, scale=1))
        avg_salary_ind = numpyro.deterministic(f"avg_salary_ind", mu_avg_salary_ind + offset_avg_salary_ind * sigma_avg_salary_ind)
        for i, feature in enumerate(features_names):
            offset = numpyro.sample(f"offset_{feature}_ind", DISTRIBUTIONS["normal"](loc=0, scale=1))
            priors_ind.append(numpyro.deterministic(f"beta_{feature}_ind", mus_ind[i] + offset * sigmas_ind[i]))
        shape_ind = numpyro.sample("shape_ind", DISTRIBUTIONS[shape_dist](**shape_params))
    
    with numpyro.plate(f"occupation", 24):
        offset_avg_salary_occ = numpyro.sample(f"offset_avg_salary_occ", DISTRIBUTIONS["normal"](loc=0, scale=1))
        avg_salary_occ = numpyro.deterministic(f"avg_salary_occ", mu_avg_salary_occ + offset_avg_salary_occ * sigma_avg_salary_occ)
        for i, feature in enumerate(features_names):
            offset = numpyro.sample(f"offset_{feature}_occ", DISTRIBUTIONS["normal"](loc=0, scale=1))
            priors_occ.append(numpyro.deterministic(f"beta_{feature}_occ", mus_occ[i] + offset * sigmas_occ[i]))
        shape_occ = numpyro.sample("shape_occ", DISTRIBUTIONS[shape_dist](**shape_params))


    # Expected value
    mu = avg_salary_ind[ind] + avg_salary_occ[occ]
    for i, feature in enumerate(features_names):
        mu += priors_ind[i][ind] * X[:,i] + priors_occ[i][occ] * X[:,i]

    shape = shape_ind[ind] + shape_occ[occ]

    mu = jnp.exp(mu)
    rate = shape / mu

    # Likelihood
    with numpyro.plate("data", X.shape[0]):
        numpyro.sample("salary_hat", DISTRIBUTIONS[target_dist](concentration=shape, rate=rate), obs=y)

def hierarchical_lognormal(X, y, ind, occ, features_names, from_posterior=None, **init_params_kwargs):
    # Initial parameters
    mu_dist = init_params_kwargs.get("mu_dist", "normal")
    mu_params = init_params_kwargs.get("mu_params", {"loc": 0, "scale": 3})
    sigma_dist = init_params_kwargs.get("sigma_dist", "half_normal")
    sigma_params = init_params_kwargs.get("sigma_params", {"scale": 3})
    shape_dist = init_params_kwargs.get("shape_dist", "exponential")
    shape_params = init_params_kwargs.get("shape_params", {"rate": 1})
    target_dist = init_params_kwargs.get("target_dist", "log-normal")

    # Hyperpriors
    mus_ind = []
    sigmas_ind = []
    mus_occ = []
    sigmas_occ = []
    for dim in ["ind", "occ"]:
        if from_posterior is None:
            if dim == "ind":
                mu_avg_salary_ind = numpyro.sample(f"mu_avg_salary_ind", DISTRIBUTIONS[mu_dist](**mu_params))
                sigma_avg_salary_ind = numpyro.sample(f"sigma_avg_salary_ind", DISTRIBUTIONS[sigma_dist](**sigma_params))
                for feature in features_names:
                    mus_ind.append(numpyro.sample(f"mu_{feature}_{dim}", DISTRIBUTIONS[mu_dist](**mu_params)))
                    sigmas_ind.append(numpyro.sample(f"sigma_{feature}_{dim}", DISTRIBUTIONS[sigma_dist](**sigma_params)))
            else:
                mu_avg_salary_occ = numpyro.sample(f"mu_avg_salary_occ", DISTRIBUTIONS[mu_dist](**mu_params))
                sigma_avg_salary_occ = numpyro.sample(f"sigma_avg_salary_occ", DISTRIBUTIONS[sigma_dist](**sigma_params))
                for feature in features_names:
                    mus_occ.append(numpyro.sample(f"mu_{feature}_{dim}", DISTRIBUTIONS[mu_dist](**mu_params)))
                    sigmas_occ.append(numpyro.sample(f"sigma_{feature}_{dim}", DISTRIBUTIONS[sigma_dist](**sigma_params)))
            
        else:
            if dim == "ind":
                mu_avg_salary_ind = numpyro.sample(f"mu_avg_salary_ind", 
                                            DISTRIBUTIONS[mu_dist](from_posterior[f"mu_avg_salary_ind"].mean(axis=0), from_posterior[f"mu_avg_salary_ind"].std(axis=0)))
                sigma_avg_salary_ind = numpyro.sample(f"sigma_avg_salary_ind", 
                                                DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_avg_salary_ind"].mean(axis=0)))
                for feature in features_names:
                    mus_ind.append(numpyro.sample(f"mu_{feature}_{dim}", 
                                            DISTRIBUTIONS[mu_dist](from_posterior[f"mu_{feature}_{dim}"].mean(axis=0), from_posterior[f"mu_{feature}_{dim}"].std(axis=0))))
                    sigmas_ind.append(numpyro.sample(f"sigma_{feature}_{dim}", 
                                                DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_{feature}_{dim}"].mean(axis=0))))
            else:
                mu_avg_salary_occ = numpyro.sample(f"mu_avg_salary_occ", 
                                            DISTRIBUTIONS[mu_dist](from_posterior[f"mu_avg_salary_occ"].mean(axis=0), from_posterior[f"mu_avg_salary_occ"].std(axis=0)))
                sigma_avg_salary_occ = numpyro.sample(f"sigma_avg_salary_occ", 
                                                DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_avg_salary_occ"].mean(axis=0)))
                for feature in features_names:
                    mus_occ.append(numpyro.sample(f"mu_{feature}_{dim}", 
                                            DISTRIBUTIONS[mu_dist](from_posterior[f"mu_{feature}_{dim}"].mean(axis=0), from_posterior[f"mu_{feature}_{dim}"].std(axis=0))))
                    sigmas_occ.append(numpyro.sample(f"sigma_{feature}_{dim}", 
                                                DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_{feature}_{dim}"].mean(axis=0))))
            
    priors_ind = []
    priors_occ = []
    with numpyro.plate(f"industry", 16):
        offset_avg_salary_ind = numpyro.sample(f"offset_avg_salary_ind", DISTRIBUTIONS["normal"](loc=0, scale=1))
        avg_salary_ind = numpyro.deterministic(f"avg_salary_ind", mu_avg_salary_ind + offset_avg_salary_ind * sigma_avg_salary_ind)
        for i, feature in enumerate(features_names):
                offset = numpyro.sample(f"offset_{feature}_ind", DISTRIBUTIONS["normal"](loc=0, scale=1))
                priors_ind.append(numpyro.deterministic(f"beta_{feature}_ind", mus_ind[i] + offset * sigmas_ind[i]))
    
    with numpyro.plate(f"occupation", 24):
        offset_avg_salary_occ = numpyro.sample(f"offset_avg_salary_occ", DISTRIBUTIONS["normal"](loc=0, scale=1))
        avg_salary_occ = numpyro.deterministic(f"avg_salary_occ", mu_avg_salary_occ + offset_avg_salary_occ * sigma_avg_salary_occ)
        for i, feature in enumerate(features_names):
                offset = numpyro.sample(f"offset_{feature}_occ", DISTRIBUTIONS["normal"](loc=0, scale=1))
                priors_occ.append(numpyro.deterministic(f"beta_{feature}_occ", mus_ind[i] + offset * sigmas_ind[i]))

    sigma = numpyro.sample("shape", DISTRIBUTIONS[shape_dist](**shape_params))

    # Expected value
    mu = avg_salary_ind[ind] + avg_salary_occ[occ]
    for i, feature in enumerate(features_names):
        mu += priors_ind[i][ind] * X[:,i] + priors_occ[i][occ] * X[:,i]

    # Likelihood
    with numpyro.plate("data", X.shape[0]):
        numpyro.sample("salary_hat", DISTRIBUTIONS[target_dist](loc=mu, scale=sigma), obs=y)

def hierarchical_normal(X, y, ind, occ, features_names, from_posterior=None, **init_params_kwargs):
    # Initial parameters
    mu_dist = init_params_kwargs.get("mu_dist", "normal")
    mu_params = init_params_kwargs.get("mu_params", {"loc": 0, "scale": 3})
    sigma_dist = init_params_kwargs.get("sigma_dist", "half_normal")
    sigma_params = init_params_kwargs.get("sigma_params", {"scale": 3})
    shape_dist = init_params_kwargs.get("shape_dist", "half_normal")
    shape_params = init_params_kwargs.get("shape_params", {"rate": 10})
    target_dist = init_params_kwargs.get("target_dist", "normal")

    # Hyperpriors
    mus_ind = []
    sigmas_ind = []
    mus_occ = []
    sigmas_occ = []
    for dim in ["ind", "occ"]:
        if from_posterior is None:
            if dim == "ind":
                mu_avg_salary_ind = numpyro.sample(f"mu_avg_salary_ind", DISTRIBUTIONS[mu_dist](**mu_params))
                sigma_avg_salary_ind = numpyro.sample(f"sigma_avg_salary_ind", DISTRIBUTIONS[sigma_dist](**sigma_params))
                for feature in features_names:
                    mus_ind.append(numpyro.sample(f"mu_{feature}_{dim}", DISTRIBUTIONS[mu_dist](**mu_params)))
                    sigmas_ind.append(numpyro.sample(f"sigma_{feature}_{dim}", DISTRIBUTIONS[sigma_dist](**sigma_params)))
            else:
                mu_avg_salary_occ = numpyro.sample(f"mu_avg_salary_occ", DISTRIBUTIONS[mu_dist](**mu_params))
                sigma_avg_salary_occ = numpyro.sample(f"sigma_avg_salary_occ", DISTRIBUTIONS[sigma_dist](**sigma_params))
                for feature in features_names:
                    mus_occ.append(numpyro.sample(f"mu_{feature}_{dim}", DISTRIBUTIONS[mu_dist](**mu_params)))
                    sigmas_occ.append(numpyro.sample(f"sigma_{feature}_{dim}", DISTRIBUTIONS[sigma_dist](**sigma_params)))
            
        else:
            if dim == "ind":
                mu_avg_salary_ind = numpyro.sample(f"mu_avg_salary_ind", 
                                            DISTRIBUTIONS[mu_dist](from_posterior[f"mu_avg_salary_ind"].mean(axis=0), from_posterior[f"mu_avg_salary_ind"].std(axis=0)))
                sigma_avg_salary_ind = numpyro.sample(f"sigma_avg_salary_ind", 
                                                DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_avg_salary_ind"].mean(axis=0)))
                for feature in features_names:
                    mus_ind.append(numpyro.sample(f"mu_{feature}_{dim}", 
                                            DISTRIBUTIONS[mu_dist](from_posterior[f"mu_{feature}_{dim}"].mean(axis=0), from_posterior[f"mu_{feature}_{dim}"].std(axis=0))))
                    sigmas_ind.append(numpyro.sample(f"sigma_{feature}_{dim}", 
                                                DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_{feature}_{dim}"].mean(axis=0))))
            else:
                mu_avg_salary_occ = numpyro.sample(f"mu_avg_salary_occ", 
                                            DISTRIBUTIONS[mu_dist](from_posterior[f"mu_avg_salary_occ"].mean(axis=0), from_posterior[f"mu_avg_salary_occ"].std(axis=0)))
                sigma_avg_salary_occ = numpyro.sample(f"sigma_avg_salary_occ", 
                                                DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_avg_salary_occ"].mean(axis=0)))
                for feature in features_names:
                    mus_occ.append(numpyro.sample(f"mu_{feature}_{dim}", 
                                            DISTRIBUTIONS[mu_dist](from_posterior[f"mu_{feature}_{dim}"].mean(axis=0), from_posterior[f"mu_{feature}_{dim}"].std(axis=0))))
                    sigmas_occ.append(numpyro.sample(f"sigma_{feature}_{dim}", 
                                                DISTRIBUTIONS[sigma_dist](from_posterior[f"sigma_{feature}_{dim}"].mean(axis=0))))
            
    priors_ind = []
    priors_occ = []
    with numpyro.plate(f"industry", 16):
        offset_avg_salary_ind = numpyro.sample(f"offset_avg_salary_ind", DISTRIBUTIONS["normal"](loc=0, scale=1))
        avg_salary_ind = numpyro.deterministic(f"avg_salary_ind", mu_avg_salary_ind + offset_avg_salary_ind * sigma_avg_salary_ind)
        for i, feature in enumerate(features_names):
                offset = numpyro.sample(f"offset_{feature}_ind", DISTRIBUTIONS["normal"](loc=0, scale=1))
                priors_ind.append(numpyro.deterministic(f"beta_{feature}_ind", mus_ind[i] + offset * sigmas_ind[i]))
    
    with numpyro.plate(f"occupation", 24):
        offset_avg_salary_occ = numpyro.sample(f"offset_avg_salary_occ", DISTRIBUTIONS["normal"](loc=0, scale=1))
        avg_salary_occ = numpyro.deterministic(f"avg_salary_occ", mu_avg_salary_occ + offset_avg_salary_occ * sigma_avg_salary_occ)
        for i, feature in enumerate(features_names):
                offset = numpyro.sample(f"offset_{feature}_occ", DISTRIBUTIONS["normal"](loc=0, scale=1))
                priors_occ.append(numpyro.deterministic(f"beta_{feature}_occ", mus_ind[i] + offset * sigmas_ind[i]))

    sigma = numpyro.sample("shape", DISTRIBUTIONS[shape_dist](**shape_params))

    # Expected value
    mu = avg_salary_ind[ind] + avg_salary_occ[occ]
    for i, feature in enumerate(features_names):
        mu += priors_ind[i][ind] * X[:,i] + priors_occ[i][occ] * X[:,i]

    # Likelihood
    with numpyro.plate("data", X.shape[0]):
        numpyro.sample("salary_hat", DISTRIBUTIONS[target_dist](loc=mu, scale=sigma), obs=y)

## Define functions

In [6]:
def filter_data(year, data, columns=None, occ_dim=False, samples=None):
    # Prepare data for running the model
    if columns is None:
        columns = ["exp","sex","no_edu","elementary_edu", "highschool_edu", "postsec_edu",
                "undergrad_edu", "graduate_edu", "age", "tenure", "union", "public_sector", "self_emp"]
        
    if samples is None:
        dataset = data.query(f'year == {year}').copy()
    else:
        dataset = data.query(f'year == {year}').sample(samples, random_state=0).copy()

    X = dataset[columns].values
    y = dataset["salary"].values
    ind = dataset["ind_codes"].values
    occ = dataset["occ_codes"].values
    if occ_dim:
        return X, y, ind, occ
    else:
        return X, y, ind

In [7]:
def create_model(model_type):
    if model_type == "pooled":
        return pooled
    elif model_type == "no_pooled_ind":
        return no_pooled_ind
    elif model_type == "no_pooled_occ":
        return no_pooled_occ
    elif model_type == "hierarchical_ind":
        return hierarchical_ind
    elif model_type == "hierarchical_occ":
        return hierarchical_occ
    elif model_type == "hierarchical_ind_occ":
        return hierarchical_ind_occ
    elif model_type == "hierarchical_lognormal":
        return hierarchical_lognormal
    elif model_type == "hierarchical_normal":
        return hierarchical_normal
    else:
        raise ValueError("Invalid model type")

In [8]:
def set_coords(mcmc, dimensions, categories, data):
    model_coords = {"coords": {dim: categories[i] for i, dim in enumerate(dimensions)}}
    model_coords["coords"]["obs"] = np.arange(0,data.shape[0])
    model_coords["dims"] = {}
    for latent_var in mcmc._states['z'].keys():
        if any(latent_var.startswith(field) for field in ["avg_","beta_"]):
            model_coords["dims"][latent_var] = ["industry"] if latent_var.endswith("ind") else ["occupation"]
    return model_coords

In [9]:
def export_model_outputs(mcmc, model, path, *model_params, **model_coords):
    # Export mcmc
    with open(f"{path}/model.pickle", "wb") as file:
        pickle.dump(mcmc, file)
    # Create posterior predictive samples
    predictive = Predictive(model, mcmc.get_samples())
    posterior_samples = predictive(rng_key, *model_params)
    # Add posterior predictive samples to trace
    if model_coords=={}:
        trace = az.from_numpyro(mcmc, posterior_predictive=posterior_samples)
    else:
        trace = az.from_numpyro(mcmc, posterior_predictive=posterior_samples, coords=model_coords["coords"], dims=model_coords["dims"])
    # Export trace
    trace.to_netcdf(f"{path}/trace.nc")
    # Export summary
    summary = az.summary(trace, round_to=5)
    summary.to_csv(f"{path}/summary.csv")
    # Return max Rhat
    return summary["r_hat"].max()   

## Import Data

In [10]:
# Load data and workflow
data = pd.read_csv('../datasets/model_dataset_cleaned.csv')

In [11]:
# Convert industries and occupations to categorical and create codes columns
ind_cat = [
    'agriculture',
    'forestry/oil/mining',
    'utilities',
    'construction',
    'manufacturing',
    'trade',
    'transportation',
    'info/culture',
    'finance/real estate',
    'scientific/technical',
    'business support',
    'education',
    'health/social',
    'accommodation/food',
    'other services',
    'public admin']
data["industry"] = pd.Categorical(data["industry"], categories=ind_cat)
data["ind_codes"] = data["industry"].cat.codes

occ_cat = ['senior management',
    'middle management',
    'business/finance professional',
    'secretarial/administrative',
    'natural/sciences professional',
    'technical specialist',
    'health professional',
    'health assistant',
    'teachers/professors',
    'government/religion services',
    'protective services',
    'childcare/home support',
    'art/culture occupations',
    'clerical/supervisor',
    'chefs/food services',
    'sales/service',
    'clerks/cashiers',
    'construction trades',
    'transport/equipment operators',
    'trade helper/labourer',
    'trades contractors/supervisors',
    'other trades',
    'operators/assemblers',
    'manufacturing labourer']
data["occup"] = pd.Categorical(data["occup"], categories=occ_cat)
data["occ_codes"] = data["occup"].cat.codes

In [12]:
# Get unique years and sort them
years = data["year"].unique()
years.sort()

# Define features
feature_names = ["exp","sex","no_edu","elementary_edu", "highschool_edu", "postsec_edu",
"undergrad_edu", "graduate_edu", "age", "tenure", "union", "public_sector", "self_emp"]

In [13]:
# Data split (training and testing)
# NOTE: Data before 2008 is used for training and data after 2008 is used for validating the model
data = data.query("year < 2008").copy()

In [14]:
# Create dictionary to store standardization parameters
standardization_params = {
    "exp": {"mean": data["exp"].mean(), "std": data["exp"].std()},
    "age": {"mean": data["age"].mean(), "std": data["age"].std()},
    "tenure": {"mean": data["tenure"].mean(), "std": data["tenure"].std()}
}

# Export standardization parameters (for be used in the validation step)
with open("src/standardization_params.json", "w") as file:
    json.dump(standardization_params, file)

# Standardize data
data["exp"] = (data["exp"] - data["exp"].mean()) / data["exp"].std()
data["age"] = (data["age"] - data["age"].mean()) / data["age"].std()
data["tenure"] = (data["tenure"] - data["tenure"].mean()) / data["tenure"].std()

## Run Models

### Pooled

In [2]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "pooled"
model_type = "pooled" # NOTE: pooled, no_pooled, hierarchical

In [22]:
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    X, y, ind = filter_data(year, data)
    # Run model
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, feature_names]
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:10<00:00,  5.39it/s]


Max Rhat: 1.00421
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:12<00:00, 15.14it/s]


Max Rhat: 1.00822
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:09<00:00, 28.60it/s]


Max Rhat: 1.00494
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:34<00:00, 57.66it/s]


Max Rhat: 1.0022
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:24<00:00, 81.09it/s] 


Max Rhat: 1.0026
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:25<00:00, 78.76it/s] 


Max Rhat: 1.0022
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:22<00:00, 88.95it/s] 


Max Rhat: 1.00231
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:23<00:00, 83.64it/s] 


Max Rhat: 1.00228
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:21<00:00, 91.92it/s] 


Max Rhat: 1.00228
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:29<00:00, 66.84it/s] 


Max Rhat: 1.00326
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:23<00:00, 83.86it/s] 


Max Rhat: 1.00224
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:24<00:00, 82.54it/s] 


Max Rhat: 1.00303


### Pooled | Regularized 

In [23]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "pooled-reg"
model_type = "pooled" # NOTE: pooled, no_pooled, hierarchical
init_params_kwargs = {
    "prior_dist": "laplace",
    "prior_params": {"loc": 0, "scale": 0.01},
    "shape_dist": "uniform",
    "shape_params": {"low": 1, "high": 100},
    "target_dist": "gamma"
}

In [24]:
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    X, y, ind = filter_data(year, data)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names, **init_params_kwargs)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, feature_names]
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names, samples, **init_params_kwargs)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:20<00:00, 24.97it/s]


Max Rhat: 1.00299
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:55<00:00, 35.93it/s]


Max Rhat: 1.00339
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:59<00:00, 33.80it/s]


Max Rhat: 1.00283
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:01<00:00, 32.54it/s]


Max Rhat: 1.00389
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:08<00:00, 29.12it/s]


Max Rhat: 1.00514
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:49<00:00, 18.32it/s]


Max Rhat: 1.00588
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:05<00:00, 30.51it/s]


Max Rhat: 1.00248
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:03<00:00, 31.69it/s]


Max Rhat: 1.00512
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:09<00:00, 28.74it/s]


Max Rhat: 1.00308
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:03<00:00, 31.64it/s]


Max Rhat: 1.00199
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:05<00:00, 30.34it/s]


Max Rhat: 1.00332
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:00<00:00, 32.94it/s]


Max Rhat: 1.00396


### No-pooled | Industry

In [16]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "no-pooled-ind"
model_type = "no_pooled_ind" # NOTE: pooled, no_pooled, hierarchical

In [17]:
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    X, y, ind = filter_data(year, data)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, feature_names]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [50:59<00:00,  1.53s/it] 


Max Rhat: 1.00839
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [16:49<00:00,  1.98it/s] 


Max Rhat: 1.00877
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [11:08<00:00,  2.99it/s] 


Max Rhat: 1.00583
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [08:50<00:00,  3.77it/s] 


Max Rhat: 1.00576
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:57<00:00,  4.79it/s] 


Max Rhat: 1.00641
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:12<00:00, 27.60it/s]


Max Rhat: 1.00541
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:26<00:00,  5.17it/s] 


Max Rhat: 1.00596
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:31<00:00,  5.11it/s] 


Max Rhat: 1.00603
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:24<00:00,  5.20it/s] 


Max Rhat: 1.0051
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:27<00:00,  5.16it/s] 


Max Rhat: 1.00529
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [07:03<00:00,  4.72it/s] 


Max Rhat: 1.00464
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:20<00:00,  5.26it/s] 


Max Rhat: 1.00449


### No-pooled | Occupation

In [15]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "no-pooled-occ"
model_type = "no_pooled_occ" # NOTE: pooled, no_pooled, hierarchical

In [16]:
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    X, y, _, occ = filter_data(year, data, occ_dim=True)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, occ, feature_names)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, occ, feature_names]
        model_coords = set_coords(mcmc, "occupation", occ_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, occ, feature_names, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:27:50<00:00,  2.64s/it]


Max Rhat: 1.00993
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [18:01<00:00,  1.85it/s] 


Max Rhat: 1.00603
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [11:49<00:00,  2.82it/s] 


Max Rhat: 1.00564
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [09:52<00:00,  3.37it/s] 


Max Rhat: 1.0052
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [08:03<00:00,  4.14it/s] 


Max Rhat: 1.0076
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:20<00:00, 24.96it/s]


Max Rhat: 1.00821
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [07:41<00:00,  4.33it/s] 


Max Rhat: 1.00706
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [07:27<00:00,  4.47it/s] 


Max Rhat: 1.00718
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:50<00:00,  4.87it/s] 


Max Rhat: 1.00694
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [07:26<00:00,  4.48it/s] 


Max Rhat: 1.00778
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [07:54<00:00,  4.21it/s] 


Max Rhat: 1.0081
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [07:06<00:00,  4.68it/s] 


Max Rhat: 1.00797


### No-pooled | Regularized

In [25]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "no-pooled-reg"
model_type = "no_pooled" # NOTE: pooled, no_pooled, hierarchical
init_params_kwargs = {
    "prior_dist": "laplace",
    "prior_params": {"loc": 0, "scale": 0.01},
    "shape_dist": "uniform",
    "shape_params": {"low": 1, "high": 100},
    "target_dist": "gamma"
}

In [26]:
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    X, y, ind = filter_data(year, data)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names, **init_params_kwargs)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, feature_names]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names, samples, **init_params_kwargs)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [07:50<00:00,  4.25it/s]


Max Rhat: 1.00939
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [07:02<00:00,  4.73it/s]


Max Rhat: 1.00933
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [07:34<00:00,  4.40it/s]


Max Rhat: 1.00907
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:25<00:00,  5.19it/s]


Max Rhat: 1.00947
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:44<00:00,  4.95it/s]


Max Rhat: 1.0085
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:12<00:00,  5.37it/s]


Max Rhat: 1.00769
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [05:15<00:00,  6.33it/s]


Max Rhat: 1.00787
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:51<00:00,  4.86it/s]


Max Rhat: 1.01007
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [05:54<00:00,  5.63it/s]


Max Rhat: 1.00809
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [05:02<00:00,  6.61it/s]


Max Rhat: 1.00754
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [05:04<00:00,  6.56it/s]


Max Rhat: 1.01012
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [05:16<00:00,  6.32it/s]


Max Rhat: 1.0074


### Hierarchical | Industry

In [20]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "hierarchical-ind"
model_type = "hierarchical_ind" # NOTE: pooled, no_pooled, hierarchical

In [21]:
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    X, y, ind = filter_data(year, data)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, feature_names]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:19:46<00:00,  2.39s/it]


Max Rhat: 1.01987
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [44:57<00:00,  1.35s/it] 


Max Rhat: 1.01035
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [24:56<00:00,  1.34it/s] 


Max Rhat: 1.00777
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [12:42<00:00,  2.62it/s] 


Max Rhat: 1.00562
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [09:27<00:00,  3.53it/s] 


Max Rhat: 1.00967
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:09<00:00, 15.41it/s]


Max Rhat: 1.01051
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [09:09<00:00,  3.64it/s] 


Max Rhat: 1.00655
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [08:53<00:00,  3.75it/s] 


Max Rhat: 1.00637
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [11:27<00:00,  2.91it/s] 


Max Rhat: 1.00691
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [12:47<00:00,  2.60it/s] 


Max Rhat: 1.0045
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [13:49<00:00,  2.41it/s] 


Max Rhat: 1.00572
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [12:25<00:00,  2.68it/s] 


Max Rhat: 1.00519


### Hierarchical | Occupation

In [24]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "hierarchical-occ"
model_type = "hierarchical_occ" # NOTE: pooled, no_pooled, hierarchical

In [25]:
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    X, y, _, occ = filter_data(year, data, occ_dim=True)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, occ, feature_names)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, occ, feature_names]
        model_coords = set_coords(mcmc, "occupation", occ_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, occ, feature_names, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, occ, feature_names]
        model_coords = set_coords(mcmc, "occupation", occ_cat, X)
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:54:48<00:00,  3.44s/it]


Max Rhat: 1.03455
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:42:46<00:00,  3.08s/it]


Max Rhat: 1.01478
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [37:26<00:00,  1.12s/it] 


Max Rhat: 1.00663
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [19:24<00:00,  1.72it/s] 


Max Rhat: 1.00628
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [14:51<00:00,  2.24it/s] 


Max Rhat: 1.00931
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:10<00:00, 15.35it/s]


Max Rhat: 1.00749
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [13:14<00:00,  2.52it/s] 


Max Rhat: 1.00721
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [12:28<00:00,  2.67it/s] 


Max Rhat: 1.00769
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [12:11<00:00,  2.74it/s] 


Max Rhat: 1.0073
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [12:47<00:00,  2.61it/s] 


Max Rhat: 1.00676
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [13:07<00:00,  2.54it/s] 


Max Rhat: 1.00779
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [12:17<00:00,  2.71it/s] 


Max Rhat: 1.0078


### Hierarchical | Regularized

In [27]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "hierarchical-reg"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical
init_params_kwargs = {
    "mu_dist": "laplace",
    "mu_params": {"loc": 0, "scale": 0.001},
    "sigma_dist": "half_normal",
    "sigma_params": {"scale": 0.001},
    "shape_dist": "uniform",
    "shape_params": {"low": 1, "high": 100},
    "target_dist": "gamma"
}

In [28]:
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    X, y, ind = filter_data(year, data)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names, **init_params_kwargs)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, feature_names]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names, samples, **init_params_kwargs)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [09:59<00:00,  3.34it/s]


Max Rhat: 1.00698
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:43<00:00,  4.96it/s]


Max Rhat: 1.01155
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [04:58<00:00,  6.70it/s]


Max Rhat: 1.01038
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [04:13<00:00,  7.90it/s]


Max Rhat: 1.00669
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [03:42<00:00,  8.97it/s]


Max Rhat: 1.00713
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:56<00:00, 11.30it/s]


Max Rhat: 1.00597
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [03:14<00:00, 10.28it/s]


Max Rhat: 1.00511
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:48<00:00, 11.85it/s]


Max Rhat: 1.00635
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [03:10<00:00, 10.52it/s]


Max Rhat: 1.00578
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:44<00:00, 12.15it/s]


Max Rhat: 1.00838
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [03:25<00:00,  9.74it/s]


Max Rhat: 1.00691
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:58<00:00, 11.20it/s]


Max Rhat: 1.00697


### Hierarchical | Industry & Occupation

In [15]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "hierarchical-ind-occ"
model_type = "hierarchical_ind_occ" # NOTE: pooled, no_pooled, hierarchical

In [16]:
with open("../outputs/hierarchical-ind-occ/1999/model.pickle", "rb") as file:
    mcmc = pickle.load(file)
samples = mcmc.get_samples()

In [17]:
years = list(range(2000, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    X, y, ind, occ = filter_data(year, data, columns=None, occ_dim=True)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, occ, feature_names)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, occ, feature_names]
        model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, occ, feature_names, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, occ, feature_names]
        model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:48:06<00:00,  5.04s/it]  


Max Rhat: 1.01917
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [25:43<00:00,  1.30it/s]


Max Rhat: 1.00697
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:43:37<00:00,  4.91s/it]  


Max Rhat: 1.00957
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:38:07<00:00,  4.74s/it]  


Max Rhat: 1.00893
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:32:47<00:00,  2.78s/it]


Max Rhat: 1.00796
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:29:29<00:00,  2.68s/it]


Max Rhat: 1.00944
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:26:47<00:00,  2.60s/it]


Max Rhat: 1.00554
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [49:40<00:00,  1.49s/it] 


Max Rhat: 1.00878


### Hierarchical log-normal | Industry & Occupation

In [148]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "hierarchical-lognormal"
model_type = "hierarchical_lognormal" # NOTE: pooled, no_pooled, hierarchical

In [149]:
with open("../outputs/hierarchical-lognormal/1998/model.pickle", "rb") as file:
    mcmc = pickle.load(file)
samples = mcmc.get_samples()

In [150]:
years = list(range(1999, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    X, y, ind, occ = filter_data(year, data, columns=None, occ_dim=True)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, occ, feature_names)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, occ, feature_names]
        model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, occ, feature_names, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, occ, feature_names]
        model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [55:20<00:00,  1.66s/it]


Max Rhat: 1.00777
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


warmup:  32%|███▏      | 630/2000 [20:54<44:01,  1.93s/it] 

### Hierarchical normal | Industry & Occupation

In [None]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "hierarchical-normal"
model_type = "hierarchical_normal" # NOTE: pooled, no_pooled, hierarchical

In [None]:
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    X, y, ind, occ = filter_data(year, data, columns=None, occ_dim=True)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, occ, feature_names)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, occ, feature_names]
        model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, occ, feature_names, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:13:46<00:00,  2.21s/it]


Max Rhat: 1.00709
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [33:47<00:00,  1.01s/it]


Max Rhat: 1.00937
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [08:01<00:00,  4.15it/s]


Max Rhat: 1.00789
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [18:48<00:00,  1.77it/s]


Max Rhat: 1.00654
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [22:00<00:00,  1.51it/s] 


Max Rhat: 1.00677
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [11:04<00:00,  3.01it/s]


Max Rhat: 1.00553
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [12:47<00:00,  2.61it/s] 


Max Rhat: 1.0086
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [10:36<00:00,  3.14it/s]


Max Rhat: 1.00899
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [09:47<00:00,  3.40it/s]


Max Rhat: 1.00602
>>>>>>>>>>>>>>>>> year 2008 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [08:43<00:00,  3.82it/s]


Max Rhat: 1.00788
>>>>>>>>>>>>>>>>> year 2009 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [07:17<00:00,  4.57it/s]


Max Rhat: 1.00648
>>>>>>>>>>>>>>>>> year 2010 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:57<00:00,  4.79it/s]


Max Rhat: 1.00683
>>>>>>>>>>>>>>>>> year 2011 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [07:45<00:00,  4.30it/s]


Max Rhat: 1.00777


## Run models (variable selection)

### Variable Selection | VS1 - NO self_emp

In [28]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "VS1-no_self_emp"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical

In [29]:
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    columns = ["exp","sex","elementary_edu", "highschool_edu", "postsec_edu",
                "undergrad_edu", "graduate_edu", "age", "tenure", "union",
                "part_time", "public_sector"]
    X, y, ind = filter_data(year, data, columns=columns)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, columns]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:59<00:00, 11.16it/s]


Max Rhat: 1.00705
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:58<00:00, 16.82it/s]


Max Rhat: 1.00772
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:05<00:00, 15.90it/s]


Max Rhat: 1.00586
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:54<00:00, 17.45it/s]


Max Rhat: 1.00715
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:08<00:00, 15.53it/s]


Max Rhat: 1.00753
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:38<00:00, 20.40it/s]


Max Rhat: 1.00779
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:55<00:00, 17.37it/s]


Max Rhat: 1.00815
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:05<00:00, 16.00it/s]


Max Rhat: 1.00559
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:49<00:00, 18.33it/s]


Max Rhat: 1.00548
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:31<00:00, 13.19it/s]


Max Rhat: 1.0081


### Variable Selection | VS2 - NO public sector

In [30]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "VS2-no_public_sector"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical

In [31]:
compilate_samples = []
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    columns = ["exp","sex","elementary_edu", "highschool_edu", "postsec_edu",
                "undergrad_edu", "graduate_edu", "age", "tenure", "union",
                "part_time"]
    X, y, ind = filter_data(year, data, columns=columns)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, columns]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:49<00:00, 11.78it/s]


Max Rhat: 1.00695
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:08<00:00, 15.50it/s]


Max Rhat: 1.00663
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:01<00:00, 16.40it/s]


Max Rhat: 1.00809
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:00<00:00, 16.66it/s]


Max Rhat: 1.00506
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:59<00:00, 16.77it/s]


Max Rhat: 1.00749
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:23<00:00, 24.00it/s]


Max Rhat: 1.00491
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:53<00:00, 17.67it/s]


Max Rhat: 1.00669
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:49<00:00, 18.31it/s]


Max Rhat: 1.00918
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:01<00:00, 16.52it/s]


Max Rhat: 1.00694
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:51<00:00, 17.89it/s]


Max Rhat: 1.00651


### Variable Selection | VS3 - NO part_time

In [32]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "VS3-no_part_time"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical

In [33]:
compilate_samples = []
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    columns = ["exp","sex","elementary_edu", "highschool_edu", "postsec_edu",
                "undergrad_edu", "graduate_edu", "age", "tenure", "union"]
    X, y, ind = filter_data(year, data, columns=columns)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, columns]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:42<00:00, 12.31it/s]


Max Rhat: 1.00527
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:49<00:00, 18.29it/s]


Max Rhat: 1.00464
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:51<00:00, 17.86it/s]


Max Rhat: 1.00535
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:50<00:00, 18.07it/s]


Max Rhat: 1.00575
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:53<00:00, 17.68it/s]


Max Rhat: 1.00463
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:33<00:00, 21.41it/s]


Max Rhat: 1.00478
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:51<00:00, 17.99it/s]


Max Rhat: 1.00898
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:46<00:00, 18.80it/s]


Max Rhat: 1.00686
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:00<00:00, 16.59it/s]


Max Rhat: 1.00531
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:47<00:00, 18.54it/s]


Max Rhat: 1.00471


### Variable Selection | VS4 - NO union

In [34]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "VS4-no_union"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical

In [35]:
compilate_samples = []
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    columns = ["exp","sex","elementary_edu", "highschool_edu", "postsec_edu",
                "undergrad_edu", "graduate_edu", "age", "tenure"]
    X, y, ind = filter_data(year, data, columns=columns)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, columns]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [03:03<00:00, 10.92it/s]


Max Rhat: 1.00581
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:49<00:00, 18.31it/s]


Max Rhat: 1.00636
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:47<00:00, 18.54it/s]


Max Rhat: 1.00629
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:53<00:00, 17.63it/s]


Max Rhat: 1.00515
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:49<00:00, 18.28it/s]


Max Rhat: 1.01087
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:22<00:00, 24.22it/s]


Max Rhat: 1.00679
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:47<00:00, 18.64it/s]


Max Rhat: 1.00836
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:38<00:00, 20.36it/s]


Max Rhat: 1.00758
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:42<00:00, 19.50it/s]


Max Rhat: 1.00774
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:50<00:00, 18.04it/s]


Max Rhat: 1.00427


### Variable Selection | VS5 - NO tenure

In [36]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "VS5-no_tenure"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical

In [37]:
compilate_samples = []
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    columns = ["exp","sex","elementary_edu", "highschool_edu", "postsec_edu",
                "undergrad_edu", "graduate_edu", "age"]
    X, y, ind = filter_data(year, data, columns=columns)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, columns]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:18<00:00, 14.47it/s]


Max Rhat: 1.00504
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:42<00:00, 19.43it/s]


Max Rhat: 1.00735
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


warmup:   1%|▏         | 29/2000 [00:33<37:34,  1.14s/it]  


KeyboardInterrupt: 

### Variable Selection | VS6 - NO Age

In [None]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "VS6-no_age"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical

In [None]:
compilate_samples = []
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    columns = ["exp","sex","elementary_edu", "highschool_edu", "postsec_edu",
                "undergrad_edu", "graduate_edu"]
    X, y, ind = filter_data(year, data, columns=columns)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, columns]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:39<00:00, 12.56it/s]


Max Rhat: 1.00474
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:47<00:00, 18.54it/s]


Max Rhat: 1.00614
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:42<00:00, 19.52it/s]


Max Rhat: 1.0071
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:47<00:00, 18.62it/s]


Max Rhat: 1.00598
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:32<00:00, 21.60it/s]


Max Rhat: 1.00384
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:32<00:00, 21.51it/s]


Max Rhat: 1.00587
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:20<00:00, 24.73it/s]


Max Rhat: 1.00592
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:30<00:00, 22.09it/s]


Max Rhat: 1.01022
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:27<00:00, 22.98it/s]


Max Rhat: 1.01124
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:37<00:00, 20.47it/s]


Max Rhat: 1.00832
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:33<00:00, 21.30it/s]


Max Rhat: 1.00511
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:27<00:00, 22.93it/s]


Max Rhat: 1.00441
>>>>>>>>>>>>>>>>> year 2008 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:31<00:00, 21.92it/s]


Max Rhat: 1.00504
>>>>>>>>>>>>>>>>> year 2009 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:39<00:00, 20.03it/s]


Max Rhat: 1.00602
>>>>>>>>>>>>>>>>> year 2010 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:35<00:00, 20.99it/s]


Max Rhat: 1.00594
>>>>>>>>>>>>>>>>> year 2011 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:28<00:00, 22.54it/s]


Max Rhat: 1.00426


### Variable Selection | VS7 - No Education Level

In [None]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "VS7-no_edu"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical

In [None]:
compilate_samples = []
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    columns = ["exp","sex"]
    X, y, ind = filter_data(year, data, columns=columns)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, columns]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:10<00:00, 28.26it/s]


Max Rhat: 1.0062
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:54<00:00, 36.42it/s]


Max Rhat: 1.00489
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:52<00:00, 38.13it/s]


Max Rhat: 1.00545
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:54<00:00, 36.85it/s]


Max Rhat: 1.00486
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:53<00:00, 37.07it/s]


Max Rhat: 1.00254
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:44<00:00, 45.01it/s] 


Max Rhat: 1.0043
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:53<00:00, 37.25it/s]


Max Rhat: 1.00351
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:55<00:00, 35.95it/s]


Max Rhat: 1.00543
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:58<00:00, 34.05it/s]


Max Rhat: 1.00615
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:50<00:00, 39.51it/s]


Max Rhat: 1.00625
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:00<00:00, 32.98it/s]


Max Rhat: 1.00451
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:57<00:00, 34.87it/s]


Max Rhat: 1.00773
>>>>>>>>>>>>>>>>> year 2008 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:51<00:00, 38.80it/s]


Max Rhat: 1.01413
>>>>>>>>>>>>>>>>> year 2009 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:56<00:00, 35.44it/s]


Max Rhat: 1.00697
>>>>>>>>>>>>>>>>> year 2010 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:50<00:00, 39.76it/s]


Max Rhat: 1.00591
>>>>>>>>>>>>>>>>> year 2011 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:48<00:00, 41.53it/s] 


Max Rhat: 1.00369


### Variable Selection | VS8 - No Sex

In [None]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "VS8-no_sex"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical

In [None]:
compilate_samples = []
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    columns = ["exp"]
    X, y, ind = filter_data(year, data, columns=columns)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, columns]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:29<00:00, 22.35it/s]


Max Rhat: 1.00783
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:49<00:00, 40.18it/s]


Max Rhat: 1.01649
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:45<00:00, 43.80it/s] 


Max Rhat: 1.007
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:42<00:00, 46.80it/s] 


Max Rhat: 1.01442
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:37<00:00, 52.86it/s] 


Max Rhat: 1.01009
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:31<00:00, 63.23it/s] 


Max Rhat: 1.00558
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:37<00:00, 53.39it/s] 


Max Rhat: 1.0086
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:35<00:00, 56.59it/s] 


Max Rhat: 1.00466
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:43<00:00, 45.71it/s] 


Max Rhat: 1.00529
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:39<00:00, 51.15it/s]


Max Rhat: 1.01144
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:41<00:00, 48.40it/s] 


Max Rhat: 1.00547
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:39<00:00, 50.95it/s] 


Max Rhat: 1.00666
>>>>>>>>>>>>>>>>> year 2008 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:40<00:00, 49.03it/s] 


Max Rhat: 1.00589
>>>>>>>>>>>>>>>>> year 2009 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:37<00:00, 53.17it/s] 


Max Rhat: 1.00705
>>>>>>>>>>>>>>>>> year 2010 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:36<00:00, 54.43it/s] 


Max Rhat: 1.00812
>>>>>>>>>>>>>>>>> year 2011 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:34<00:00, 57.52it/s] 


Max Rhat: 1.00543


### Variable Selection | VS9 - No exp

In [None]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "VS9-no_exp"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical

In [None]:
compilate_samples = []
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    columns = []
    X, y, ind = filter_data(year, data, columns=columns)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, columns]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:22<00:00, 24.33it/s]


Max Rhat: 1.02031
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:35<00:00, 55.71it/s]


Max Rhat: 1.02626
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:31<00:00, 63.29it/s]


Max Rhat: 1.00541
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:29<00:00, 68.06it/s]


Max Rhat: 1.01406
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:33<00:00, 59.02it/s] 


Max Rhat: 1.01917
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:17<00:00, 111.86it/s]


Max Rhat: 1.00862
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:27<00:00, 72.84it/s] 


Max Rhat: 1.01585
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:28<00:00, 70.64it/s] 


Max Rhat: 1.01622
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:27<00:00, 72.37it/s] 


Max Rhat: 1.01406
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:29<00:00, 66.99it/s]


Max Rhat: 1.02289
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:28<00:00, 69.95it/s] 


Max Rhat: 1.0064
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:34<00:00, 58.24it/s]


Max Rhat: 1.0053
>>>>>>>>>>>>>>>>> year 2008 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:28<00:00, 70.24it/s] 


Max Rhat: 1.00668
>>>>>>>>>>>>>>>>> year 2009 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:27<00:00, 72.91it/s] 


Max Rhat: 1.00503
>>>>>>>>>>>>>>>>> year 2010 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:25<00:00, 78.12it/s] 


Max Rhat: 1.01216
>>>>>>>>>>>>>>>>> year 2011 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:25<00:00, 79.07it/s] 


Max Rhat: 1.00631


# Alternative models

### Hierarchical | All vars - No: age & self_emp

In [29]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "ALT-no_age_self_emp"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical

In [30]:
compilate_samples = []
years = list(range(1996, 2012))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    columns = ['exp','sex','elementary_edu','highschool_edu','postsec_edu','undergrad_edu','graduate_edu',
               'tenure','union','part_time','public_sector']
    X, y, ind = filter_data(year, data, columns=columns)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, columns]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:41<00:00, 12.38it/s]


Max Rhat: 1.00646
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:08<00:00, 15.51it/s]


Max Rhat: 1.00656
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:00<00:00, 16.63it/s]


Max Rhat: 1.0072
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:51<00:00, 17.96it/s]


Max Rhat: 1.00554
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:02<00:00, 16.38it/s]


Max Rhat: 1.0055
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:26<00:00, 22.99it/s]


Max Rhat: 1.00904
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:57<00:00, 17.03it/s]


Max Rhat: 1.00594
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:50<00:00, 18.14it/s]


Max Rhat: 1.00658
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:55<00:00, 17.36it/s]


Max Rhat: 1.00635
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:03<00:00, 16.22it/s]


Max Rhat: 1.00411
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:14<00:00, 14.92it/s]


Max Rhat: 1.00569
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:32<00:00, 13.08it/s]


Max Rhat: 1.00499
>>>>>>>>>>>>>>>>> year 2008 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:18<00:00, 14.48it/s]


Max Rhat: 1.00594
>>>>>>>>>>>>>>>>> year 2009 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:57<00:00, 16.98it/s]


Max Rhat: 1.00647
>>>>>>>>>>>>>>>>> year 2010 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:48<00:00, 18.37it/s]


Max Rhat: 1.0074
>>>>>>>>>>>>>>>>> year 2011 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:50<00:00, 18.07it/s]


Max Rhat: 1.00647


### Hierarchical | All vars - No: age & self_emp & union

In [14]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "ALT-no_age_self_emp_union"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical

In [15]:
compilate_samples = []
years = list(range(1996, 2012))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    columns = ['exp','sex','elementary_edu','highschool_edu','postsec_edu','undergrad_edu','graduate_edu',
               'tenure','part_time','public_sector']
    X, y, ind = filter_data(year, data, columns=columns)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, columns]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:46<00:00, 12.04it/s]


Max Rhat: 1.00847
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:55<00:00, 17.30it/s]


Max Rhat: 1.00566
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:53<00:00, 17.68it/s]


Max Rhat: 1.0085
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:47<00:00, 18.53it/s]


Max Rhat: 1.00565
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:51<00:00, 17.97it/s]


Max Rhat: 1.0063
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:35<00:00, 20.88it/s]


Max Rhat: 1.00553
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:01<00:00, 16.43it/s]


Max Rhat: 1.00585
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:59<00:00, 16.74it/s]


Max Rhat: 1.00488
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:02<00:00, 16.38it/s]


Max Rhat: 1.00668
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:02<00:00, 16.36it/s]


Max Rhat: 1.00649
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:02<00:00, 16.26it/s]


Max Rhat: 1.00509
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:04<00:00, 16.03it/s]


Max Rhat: 1.00598
>>>>>>>>>>>>>>>>> year 2008 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:01<00:00, 16.40it/s]


Max Rhat: 1.00584
>>>>>>>>>>>>>>>>> year 2009 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:03<00:00, 16.26it/s]


Max Rhat: 1.00605
>>>>>>>>>>>>>>>>> year 2010 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:58<00:00, 16.86it/s]


Max Rhat: 1.00668
>>>>>>>>>>>>>>>>> year 2011 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:22<00:00, 14.05it/s]


Max Rhat: 1.00609


### Hierarchical | All vars - No: age & self_emp & part_time

In [16]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "ALT-no_age_self_emp_part_time"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical

In [17]:
compilate_samples = []
years = list(range(1996, 2012))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    columns = ['exp','sex','elementary_edu','highschool_edu','postsec_edu','undergrad_edu','graduate_edu',
               'tenure','union','public_sector']
    X, y, ind = filter_data(year, data, columns=columns)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, columns]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, columns, samples)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:45<00:00, 12.10it/s]


Max Rhat: 1.00634
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:01<00:00, 16.40it/s]


Max Rhat: 1.00554
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [04:32<00:00,  7.35it/s] 


Max Rhat: 1.00525
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:57<00:00, 16.97it/s]


Max Rhat: 1.0056
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:58<00:00, 16.81it/s]


Max Rhat: 1.00695
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:28<00:00, 22.53it/s]


Max Rhat: 1.00629
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:49<00:00, 18.21it/s]


Max Rhat: 1.00931
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:51<00:00, 18.01it/s]


Max Rhat: 1.008
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:50<00:00, 18.18it/s]


Max Rhat: 1.00544
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:49<00:00, 18.32it/s]


Max Rhat: 1.00592
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:55<00:00, 17.33it/s]


Max Rhat: 1.0067
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:58<00:00, 16.86it/s]


Max Rhat: 1.00542
>>>>>>>>>>>>>>>>> year 2008 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:59<00:00, 16.69it/s]


Max Rhat: 1.00524
>>>>>>>>>>>>>>>>> year 2009 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:59<00:00, 16.79it/s]


Max Rhat: 1.0076
>>>>>>>>>>>>>>>>> year 2010 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:57<00:00, 17.08it/s]


Max Rhat: 1.00587
>>>>>>>>>>>>>>>>> year 2011 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:54<00:00, 17.47it/s]


Max Rhat: 1.00592


### Hierarchical | Weakly informative priors

In [18]:
# Run settings
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "hierarchical-weak-priors"
model_type = "hierarchical" # NOTE: pooled, no_pooled, hierarchical
init_params_kwargs = {
    "mu_dist": "normal",
    "mu_params": {"loc": 0, "scale": 10},
    "sigma_dist": "half_normal",
    "sigma_params": {"scale": 10},
    "shape_dist": "uniform",
    "shape_params": {"low": 1, "high": 100},
    "target_dist": "gamma"
}

In [19]:
years = list(range(1996, 2012))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(f"{OUTPUT_PATH}"):
                os.makedirs(f"{OUTPUT_PATH}")
    # Filter data
    X, y, ind = filter_data(year, data)
    # Run model
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    if year == 1996:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names, **init_params_kwargs)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        model_params = [X, None, ind, feature_names]
        model_coords = set_coords(mcmc, "industry", ind_cat, X)
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    else:
        mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
        mcmc.run(rng_key, X, y, ind, feature_names, samples, **init_params_kwargs)
        samples = mcmc.get_samples()
        # Save model outputs and calculate max Rhat
        max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [03:43<00:00,  8.93it/s]


Max Rhat: 1.00547
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:13<00:00, 14.93it/s]


Max Rhat: 1.00729
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:11<00:00, 15.21it/s]


Max Rhat: 1.0078
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:02<00:00, 16.34it/s]


Max Rhat: 1.00644
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:09<00:00, 15.39it/s]


Max Rhat: 1.00673
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:48<00:00, 18.37it/s]


Max Rhat: 1.00455
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:02<00:00, 16.37it/s]


Max Rhat: 1.00556
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:09<00:00, 15.47it/s]


Max Rhat: 1.00723
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:11<00:00, 15.20it/s]


Max Rhat: 1.00485
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:05<00:00, 15.99it/s]


Max Rhat: 1.0064
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:15<00:00, 14.78it/s]


Max Rhat: 1.00727
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:03<00:00, 16.20it/s]


Max Rhat: 1.0056
>>>>>>>>>>>>>>>>> year 2008 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:05<00:00, 15.90it/s]


Max Rhat: 1.00488
>>>>>>>>>>>>>>>>> year 2009 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:04<00:00, 16.00it/s]


Max Rhat: 1.00766
>>>>>>>>>>>>>>>>> year 2010 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:56<00:00, 17.11it/s]


Max Rhat: 1.00809
>>>>>>>>>>>>>>>>> year 2011 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:02<00:00, 16.37it/s]


Max Rhat: 1.00683
