## Import libraries

In [1]:
import pandas as pd
import numpy as np
import numpyro
import numpyro.distributions as dist
from numpyro.distributions import Distribution, constraints
from numpyro.distributions.util import validate_sample
from numpyro.infer import MCMC, NUTS, Predictive, init_to_median
import jax
from jax import random
from jax.scipy.stats import gaussian_kde
import jax.numpy as jnp
import matplotlib.pyplot as plt
import seaborn as sns
import arviz as az
import os
import pickle
import yaml
import json

In [2]:
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [3]:
# Base JAX PRNG key (seed 0); used by the MCMC runs and by the
# posterior-predictive sampling in export_model_outputs
rng_key = random.PRNGKey(0)

I0000 00:00:1702560206.811986    4789 tfrt_cpu_pjrt_client.cc:349] TfrtCpuClient created.


## Define Models

In [4]:
# Lookup table from the string names accepted by the models' keyword options
# ("prior_dist", "mu_dist", "sigma_dist", "shape_dist", "target_dist") to the
# NumPyro distribution classes they stand for.
# NOTE(review): "log-normal" is hyphenated while the other multi-word keys use
# underscores; callers must use the exact spelling shown here.
DISTRIBUTIONS = {
    "normal": dist.Normal,
    "half_normal": dist.HalfNormal,
    "student_t": dist.StudentT,
    "laplace": dist.Laplace,
    "uniform": dist.Uniform,
    "gamma": dist.Gamma,
    "log-normal": dist.LogNormal,
    "exponential": dist.Exponential,
}

In [5]:
def pooled(X, y, ind, features_names, from_posterior=None, **init_params_kwargs):
    """Fully pooled regression: one intercept and one coefficient per feature.

    The linear predictor is exponentiated to give the mean, and the likelihood
    is built with ``concentration=shape`` and ``rate=shape / mean``.

    Args:
        X: 2-D feature array; column ``i`` pairs with ``features_names[i]``.
        y: Observed target passed as ``obs`` to the ``salary_hat`` site.
        ind: Accepted but not used by this model.
        features_names: Feature names used to build the ``beta_<feature>``
            site names.
        from_posterior: Optional mapping from site name to posterior draws.
            When given, the intercept and coefficient priors are built from
            the mean and standard deviation of the matching draws instead of
            ``prior_params``.
        **init_params_kwargs: Optional ``prior_dist``, ``prior_params``,
            ``shape_dist``, ``shape_params`` and ``target_dist`` overrides;
            the ``*_dist`` values are keys of ``DISTRIBUTIONS``.
    """
    options = init_params_kwargs
    coef_cls = DISTRIBUTIONS[options.get("prior_dist", "normal")]
    coef_kwargs = options.get("prior_params", {"loc": 0, "scale": 1})
    shape_cls = DISTRIBUTIONS[options.get("shape_dist", "uniform")]
    shape_kwargs = options.get("shape_params", {"low": 1, "high": 100})
    likelihood_cls = DISTRIBUTIONS[options.get("target_dist", "gamma")]

    def coef_prior(site):
        # Default prior, or one centred on the supplied draws for this site
        if from_posterior is None:
            return coef_cls(**coef_kwargs)
        draws = from_posterior[site]
        return coef_cls(draws.mean(), draws.std())

    # Priors (site order: intercept, coefficients, shape)
    avg_salary = numpyro.sample("avg_salary", coef_prior("avg_salary"))
    betas = [
        numpyro.sample(f"beta_{name}", coef_prior(f"beta_{name}"))
        for name in features_names
    ]
    shape = numpyro.sample("shape", shape_cls(**shape_kwargs))

    # Expected value on the log scale, accumulated feature by feature
    linear = avg_salary
    for col, beta in enumerate(betas):
        linear = linear + beta * X[:, col]
    mean = jnp.exp(linear)
    rate = shape / mean

    # Likelihood
    with numpyro.plate("data", X.shape[0]):
        numpyro.sample("salary_hat", likelihood_cls(concentration=shape, rate=rate), obs=y)

def no_pooled_ind_occ(X, y, ind, occ, features_names, from_posterior=None, **init_params_kwargs):
    """Unpooled model with separate parameters per industry and per occupation.

    Every industry (plate of size 16) and every occupation (plate of size 24)
    gets its own intercept, one coefficient per feature and a shape term. The
    industry and occupation contributions are added, the linear predictor is
    exponentiated to give the mean, and the likelihood is built with
    ``concentration=shape`` and ``rate=shape / mean``.

    Args:
        X: 2-D feature array; column ``i`` pairs with ``features_names[i]``.
        y: Observed target passed as ``obs`` to the ``salary_hat`` site.
        ind: Per-row index into the industry-level parameters.
        occ: Per-row index into the occupation-level parameters.
        features_names: Feature names used to build the
            ``beta_<feature>_ind`` / ``beta_<feature>_occ`` site names.
        from_posterior: Optional mapping from site name to posterior draws.
            When given, intercept and coefficient priors use the mean and
            standard deviation of the draws along axis 0; the shape priors
            always come from ``shape_params``.
        **init_params_kwargs: Optional ``prior_dist``, ``prior_params``,
            ``shape_dist``, ``shape_params`` and ``target_dist`` overrides;
            the ``*_dist`` values are keys of ``DISTRIBUTIONS``.
    """
    # Initial parameters
    options = init_params_kwargs
    coef_cls = DISTRIBUTIONS[options.get("prior_dist", "normal")]
    coef_kwargs = options.get("prior_params", {"loc": 0, "scale": 1})
    shape_cls = DISTRIBUTIONS[options.get("shape_dist", "uniform")]
    shape_kwargs = options.get("shape_params", {"low": 1, "high": 100})
    likelihood_cls = DISTRIBUTIONS[options.get("target_dist", "gamma")]

    def coef_prior(site):
        # Default prior, or one centred on the supplied draws for this site
        if from_posterior is None:
            return coef_cls(**coef_kwargs)
        draws = from_posterior[site]
        return coef_cls(draws.mean(axis=0), draws.std(axis=0))

    # Priors (site order inside each plate: intercept, coefficients, shape)
    with numpyro.plate("industry", 16):
        avg_salary_ind = numpyro.sample("avg_salary_ind", coef_prior("avg_salary_ind"))
        priors_ind = [
            numpyro.sample(f"beta_{name}_ind", coef_prior(f"beta_{name}_ind"))
            for name in features_names
        ]
        shape_ind = numpyro.sample("shape_ind", shape_cls(**shape_kwargs))

    with numpyro.plate("occupation", 24):
        avg_salary_occ = numpyro.sample("avg_salary_occ", coef_prior("avg_salary_occ"))
        priors_occ = [
            numpyro.sample(f"beta_{name}_occ", coef_prior(f"beta_{name}_occ"))
            for name in features_names
        ]
        shape_occ = numpyro.sample("shape_occ", shape_cls(**shape_kwargs))

    # Expected value on the log scale
    linear = avg_salary_ind[ind] + avg_salary_occ[occ]
    for col, (beta_ind, beta_occ) in enumerate(zip(priors_ind, priors_occ)):
        linear = linear + (beta_ind[ind] * X[:, col] + beta_occ[occ] * X[:, col])

    shape = shape_ind[ind] + shape_occ[occ]

    mean = jnp.exp(linear)
    rate = shape / mean

    # Likelihood
    with numpyro.plate("data", X.shape[0]):
        numpyro.sample("salary_hat", likelihood_cls(concentration=shape, rate=rate), obs=y)

def hierarchical_ind_occ(X, y, ind, occ, features_names, from_posterior=None, **init_params_kwargs):
    """Hierarchical model with industry- and occupation-level parameters.

    For each dimension ("ind", "occ") a location and a scale hyperprior is
    sampled for the intercept and for every feature. Group-level parameters
    are then built in non-centred form (``mu + offset * sigma`` with a
    standard-normal ``offset``) inside a plate of size 16 (industry) or 24
    (occupation), together with a per-group shape term. The two dimensions'
    contributions are added, the linear predictor is exponentiated to give the
    mean, and the likelihood is built with ``concentration=shape`` and
    ``rate=shape / mean``.

    Args:
        X: 2-D feature array; column ``i`` pairs with ``features_names[i]``.
        y: Observed target passed as ``obs`` to the ``salary_hat`` site.
        ind: Per-row index into the industry-level parameters.
        occ: Per-row index into the occupation-level parameters.
        features_names: Feature names used to build the site names.
        from_posterior: Optional mapping from site name to posterior draws.
            When given, each ``mu_*`` hyperprior uses the mean and standard
            deviation of its draws (axis 0) and each ``sigma_*`` hyperprior is
            built from the mean of its draws alone.
        **init_params_kwargs: Optional ``mu_dist``, ``mu_params``,
            ``sigma_dist``, ``sigma_params``, ``shape_dist``, ``shape_params``
            and ``target_dist`` overrides; the ``*_dist`` values are keys of
            ``DISTRIBUTIONS``.
    """
    # Initial parameters
    options = init_params_kwargs
    loc_cls = DISTRIBUTIONS[options.get("mu_dist", "normal")]
    loc_kwargs = options.get("mu_params", {"loc": 0, "scale": 3})
    scale_cls = DISTRIBUTIONS[options.get("sigma_dist", "half_normal")]
    scale_kwargs = options.get("sigma_params", {"scale": 3})
    shape_cls = DISTRIBUTIONS[options.get("shape_dist", "uniform")]
    shape_kwargs = options.get("shape_params", {"low": 1, "high": 100})
    likelihood_cls = DISTRIBUTIONS[options.get("target_dist", "gamma")]

    def loc_prior(site):
        if from_posterior is None:
            return loc_cls(**loc_kwargs)
        draws = from_posterior[site]
        return loc_cls(draws.mean(axis=0), draws.std(axis=0))

    def scale_prior(site):
        if from_posterior is None:
            return scale_cls(**scale_kwargs)
        return scale_cls(from_posterior[site].mean(axis=0))

    # Hyperpriors: all "ind" sites first, then all "occ" sites
    hyper = {}
    for dim in ("ind", "occ"):
        loc_site = f"mu_avg_salary_{dim}"
        scale_site = f"sigma_avg_salary_{dim}"
        intercept_loc = numpyro.sample(loc_site, loc_prior(loc_site))
        intercept_scale = numpyro.sample(scale_site, scale_prior(scale_site))
        coef_locs = []
        coef_scales = []
        for feature in features_names:
            coef_locs.append(
                numpyro.sample(f"mu_{feature}_{dim}", loc_prior(f"mu_{feature}_{dim}")))
            coef_scales.append(
                numpyro.sample(f"sigma_{feature}_{dim}", scale_prior(f"sigma_{feature}_{dim}")))
        hyper[dim] = (intercept_loc, intercept_scale, coef_locs, coef_scales)

    # Group-level parameters in non-centred form
    groups = {}
    for dim, plate_name, size in (("ind", "industry", 16), ("occ", "occupation", 24)):
        intercept_loc, intercept_scale, coef_locs, coef_scales = hyper[dim]
        with numpyro.plate(plate_name, size):
            z_intercept = numpyro.sample(
                f"offset_avg_salary_{dim}", DISTRIBUTIONS["normal"](loc=0, scale=1))
            intercept = numpyro.deterministic(
                f"avg_salary_{dim}", intercept_loc + z_intercept * intercept_scale)
            betas = []
            for k, feature in enumerate(features_names):
                z_beta = numpyro.sample(
                    f"offset_{feature}_{dim}", DISTRIBUTIONS["normal"](loc=0, scale=1))
                betas.append(numpyro.deterministic(
                    f"beta_{feature}_{dim}", coef_locs[k] + z_beta * coef_scales[k]))
            group_shape = numpyro.sample(f"shape_{dim}", shape_cls(**shape_kwargs))
        groups[dim] = (intercept, betas, group_shape)

    avg_salary_ind, priors_ind, shape_ind = groups["ind"]
    avg_salary_occ, priors_occ, shape_occ = groups["occ"]

    # Expected value on the log scale
    linear = avg_salary_ind[ind] + avg_salary_occ[occ]
    for col, (beta_ind, beta_occ) in enumerate(zip(priors_ind, priors_occ)):
        linear = linear + (beta_ind[ind] * X[:, col] + beta_occ[occ] * X[:, col])

    shape = shape_ind[ind] + shape_occ[occ]

    mean = jnp.exp(linear)
    rate = shape / mean

    # Likelihood
    with numpyro.plate("data", X.shape[0]):
        numpyro.sample("salary_hat", likelihood_cls(concentration=shape, rate=rate), obs=y)

## Define functions

In [6]:
def filter_data(year, data, columns=None, occ_dim=False, samples=None):
    """Extract the model inputs for a single year.

    Args:
        year: Value matched against the ``year`` column.
        data: DataFrame holding ``year``, ``salary``, ``ind_codes``,
            ``occ_codes`` and the feature columns.
        columns: Feature columns for ``X``; defaults to the 13 standard
            features when ``None``.
        occ_dim: When true, the occupation codes are returned as well.
        samples: Optional number of rows to draw from the year's rows
            (``random_state=0``); all rows are used when ``None``.

    Returns:
        ``(X, y, ind)``, or ``(X, y, ind, occ)`` when ``occ_dim`` is true,
        each taken from the corresponding column(s) via ``.values``.
    """
    feature_cols = columns
    if feature_cols is None:
        feature_cols = ["exp","sex","no_edu","elementary_edu", "highschool_edu", "postsec_edu",
                        "undergrad_edu", "graduate_edu", "age", "tenure", "union", "public_sector", "self_emp"]

    # Restrict to the requested year, optionally sub-sampling it
    yearly = data.query(f'year == {year}')
    if samples is not None:
        yearly = yearly.sample(samples, random_state=0)
    yearly = yearly.copy()

    arrays = (
        yearly[feature_cols].values,
        yearly["salary"].values,
        yearly["ind_codes"].values,
        yearly["occ_codes"].values,
    )
    return arrays if occ_dim else arrays[:3]

In [7]:
def create_model(model_type):
    """Return the model function registered under ``model_type``.

    Accepted values are ``"pooled"``, ``"no_pooled_ind_occ"`` and
    ``"hierarchical_ind_occ"``.

    Raises:
        ValueError: If ``model_type`` is not one of the accepted values.
    """
    known_types = ("pooled", "no_pooled_ind_occ", "hierarchical_ind_occ")
    if model_type not in known_types:
        raise ValueError("Invalid model type")
    if model_type == "pooled":
        return pooled
    if model_type == "no_pooled_ind_occ":
        return no_pooled_ind_occ
    return hierarchical_ind_occ

In [8]:
def set_coords(mcmc, dimensions, categories, data):
    """Build the ``coords`` / ``dims`` labelling used when exporting a trace.

    Args:
        mcmc: Fitted MCMC object; the site names are read from
            ``mcmc._states['z']``.
        dimensions: Dimension names, either a sequence (e.g.
            ``["industry", "occupation"]``) or a single name given as a string.
        categories: Category labels for each dimension, in the same order as
            ``dimensions``. When ``dimensions`` is a single string this is the
            label list for that one dimension.
        data: Array whose first axis length sets the ``obs`` coordinate.

    Returns:
        Dict with two keys: ``"coords"`` (dimension name -> labels, plus
        ``"obs"``) and ``"dims"`` (site name -> list with the dimension it is
        indexed by, for sites starting with ``avg_`` or ``beta_``).
    """
    # A bare string would otherwise be iterated character by character and
    # produce one bogus coordinate per letter, so wrap the single-name form.
    if isinstance(dimensions, str):
        dimensions = [dimensions]
        categories = [categories]

    coords = {dim: categories[i] for i, dim in enumerate(dimensions)}
    coords["obs"] = np.arange(0, data.shape[0])

    dims = {}
    for latent_var in mcmc._states['z'].keys():
        if latent_var.startswith(("avg_", "beta_")):
            # Site names ending in "ind" are per-industry; the rest per-occupation
            dims[latent_var] = ["industry"] if latent_var.endswith("ind") else ["occupation"]

    return {"coords": coords, "dims": dims}

In [9]:
def export_model_outputs(mcmc, model, path, *model_params, **model_coords):
    """Persist a fitted run under ``path`` and report its worst R-hat.

    Writes ``model.pickle`` (the pickled ``mcmc`` object), ``trace.nc`` (the
    ArviZ trace including posterior-predictive samples) and ``summary.csv``
    (the ArviZ summary) into ``path``.

    Args:
        mcmc: Fitted MCMC object.
        model: Model function used for posterior-predictive sampling.
        path: Existing output directory.
        *model_params: Positional arguments forwarded to the model when
            drawing posterior-predictive samples.
        **model_coords: Optional ``coords`` and ``dims`` entries forwarded to
            ``az.from_numpyro``; omitted entirely when no keywords are given.

    Returns:
        The maximum of the ``r_hat`` column of the summary.
    """
    # Export mcmc
    with open(f"{path}/model.pickle", "wb") as handle:
        pickle.dump(mcmc, handle)

    # Posterior predictive samples, drawn with the module-level rng_key
    posterior_samples = Predictive(model, mcmc.get_samples())(rng_key, *model_params)

    # Attach the predictive samples (and labels, when supplied) to the trace
    labelling = {}
    if model_coords != {}:
        labelling = {"coords": model_coords["coords"], "dims": model_coords["dims"]}
    trace = az.from_numpyro(mcmc, posterior_predictive=posterior_samples, **labelling)

    # Export trace and summary
    trace.to_netcdf(f"{path}/trace.nc")
    summary = az.summary(trace, round_to=5)
    summary.to_csv(f"{path}/summary.csv")

    return summary["r_hat"].max()

## Import Data

In [10]:
# Load the cleaned modelling dataset
data = pd.read_csv('../datasets/model_dataset_cleaned.csv')

In [11]:
# Convert industries and occupations to categorical and create codes columns.
# The codes are each row's position in the lists below; the models use them to
# index the per-industry / per-occupation parameters, and the list lengths
# (16 and 24) match the plate sizes hard-coded in the model functions.
# NOTE(review): pandas gives code -1 to any value missing from `categories`;
# presumably the cleaned dataset contains none -- TODO confirm.
ind_cat = [
    'agriculture',
    'forestry/oil/mining',
    'utilities',
    'construction',
    'manufacturing',
    'trade',
    'transportation',
    'info/culture',
    'finance/real estate',
    'scientific/technical',
    'business support',
    'education',
    'health/social',
    'accommodation/food',
    'other services',
    'public admin']
data["industry"] = pd.Categorical(data["industry"], categories=ind_cat)
data["ind_codes"] = data["industry"].cat.codes

occ_cat = ['senior management',
    'middle management',
    'business/finance professional',
    'secretarial/administrative',
    'natural/sciences professional',
    'technical specialist',
    'health professional',
    'health assistant',
    'teachers/professors',
    'government/religion services',
    'protective services',
    'childcare/home support',
    'art/culture occupations',
    'clerical/supervisor',
    'chefs/food services',
    'sales/service',
    'clerks/cashiers',
    'construction trades',
    'transport/equipment operators',
    'trade helper/labourer',
    'trades contractors/supervisors',
    'other trades',
    'operators/assemblers',
    'manufacturing labourer']
data["occup"] = pd.Categorical(data["occup"], categories=occ_cat)
data["occ_codes"] = data["occup"].cat.codes

In [12]:
# Get unique years and sort them (in place).
# NOTE: `years` is reassigned with an explicit range by each run cell below.
years = data["year"].unique()
years.sort()

# Define features: same names and order as the default `columns` of
# filter_data, so feature_names[i] labels column i of the X it returns
feature_names = ["exp","sex","no_edu","elementary_edu", "highschool_edu", "postsec_edu",
"undergrad_edu", "graduate_edu", "age", "tenure", "union", "public_sector", "self_emp"]

In [13]:
# Data split (training and testing)
# NOTE: Only rows with year < 2008 are kept here for training; the later data
# is intended for validating the model
data = data.query("year < 2008").copy()

In [14]:
# Mean and standard deviation of each continuous column, taken from the
# training data before it is rescaled
scaled_columns = ("exp", "age", "tenure")
standardization_params = {
    col: {"mean": data[col].mean(), "std": data[col].std()}
    for col in scaled_columns
}

# Export standardization parameters (to be used in the validation step)
with open("src/standardization_params.json", "w") as file:
    json.dump(standardization_params, file)

# Standardize data: z-score each continuous column in place
for col in scaled_columns:
    data[col] = (data[col] - data[col].mean()) / data[col].std()

## Run Models

### Pooled

In [2]:
# Run settings
# tune -> num_warmup, draws -> num_samples, accept_prob -> target_accept_prob,
# chains -> num_chains; model_name is the output sub-folder under ../outputs
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "pooled"
model_type = "pooled" # NOTE: one of "pooled", "no_pooled_ind_occ", "hierarchical_ind_occ" (see create_model)

In [22]:
# Fit the pooled model year by year. 1996 is the cold start (default priors);
# every later year receives the previous year's posterior draws (`samples`),
# which the model uses to build its priors.
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    # Filter data
    X, y, ind = filter_data(year, data)
    # Run model
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
    run_args = [X, y, ind, feature_names]
    if year != 1996:
        # `samples` still holds the previous year's posterior at this point
        run_args.append(samples)
    mcmc.run(rng_key, *run_args)
    samples = mcmc.get_samples()
    # Save model outputs and calculate max Rhat.
    # The predictive inputs are rebuilt every year: reusing the 1996 list would
    # draw posterior-predictive samples for the wrong year's rows.
    model_params = [X, None, ind, feature_names]
    max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [06:10<00:00,  5.39it/s]


Max Rhat: 1.00421
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:12<00:00, 15.14it/s]


Max Rhat: 1.00822
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [01:09<00:00, 28.60it/s]


Max Rhat: 1.00494
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:34<00:00, 57.66it/s]


Max Rhat: 1.0022
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:24<00:00, 81.09it/s] 


Max Rhat: 1.0026
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:25<00:00, 78.76it/s] 


Max Rhat: 1.0022
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:22<00:00, 88.95it/s] 


Max Rhat: 1.00231
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:23<00:00, 83.64it/s] 


Max Rhat: 1.00228
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:21<00:00, 91.92it/s] 


Max Rhat: 1.00228
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:29<00:00, 66.84it/s] 


Max Rhat: 1.00326
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:23<00:00, 83.86it/s] 


Max Rhat: 1.00224
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [00:24<00:00, 82.54it/s] 


Max Rhat: 1.00303


### No-pooled

In [17]:
# Run settings
# tune -> num_warmup, draws -> num_samples, accept_prob -> target_accept_prob,
# chains -> num_chains; model_name is the output sub-folder under ../outputs
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "no-pooled"
# model_type must be a name accepted by create_model
model_type = "no_pooled_ind_occ"

In [18]:
# Fit the unpooled model year by year. 1996 is the cold start (default priors);
# every later year receives the previous year's posterior draws (`samples`),
# which the model uses to build its priors.
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    # Filter data
    X, y, ind, occ = filter_data(year, data, columns=None, occ_dim=True)
    # Run model (the same key is derived for every year)
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
    run_args = [X, y, ind, occ, feature_names]
    if year != 1996:
        # `samples` still holds the previous year's posterior at this point
        run_args.append(samples)
    mcmc.run(rng_key, *run_args)
    samples = mcmc.get_samples()
    # Save model outputs and calculate max Rhat.
    # Predictive inputs and coordinates are rebuilt every year so they match the
    # current year's rows, and both dimensions are passed as lists so the
    # industry and occupation sites get their category labels.
    model_params = [X, None, ind, occ, feature_names]
    model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
    max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:44:29<00:00,  4.93s/it]  


Max Rhat: 1.01184
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [32:45<00:00,  1.02it/s] 


Max Rhat: 1.00921
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [19:36<00:00,  1.70it/s] 


Max Rhat: 1.00636
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [16:48<00:00,  1.98it/s] 


Max Rhat: 1.00693
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [12:06<00:00,  2.75it/s] 


Max Rhat: 1.00761
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [02:05<00:00, 15.97it/s]


Max Rhat: 1.00544
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [09:09<00:00,  3.64it/s] 


Max Rhat: 1.00487
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [09:03<00:00,  3.68it/s] 


Max Rhat: 1.00494
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [08:37<00:00,  3.87it/s] 


Max Rhat: 1.00591
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [08:55<00:00,  3.73it/s] 


Max Rhat: 1.00636
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [09:27<00:00,  3.52it/s] 


Max Rhat: 1.00706
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [08:38<00:00,  3.85it/s] 


Max Rhat: 1.00738


### Hierarchical

In [15]:
# Run settings
# tune -> num_warmup, draws -> num_samples, accept_prob -> target_accept_prob,
# chains -> num_chains; model_name is the output sub-folder under ../outputs
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "hierarchical-ind-occ"
model_type = "hierarchical_ind_occ" # NOTE: one of "pooled", "no_pooled_ind_occ", "hierarchical_ind_occ" (see create_model)

In [16]:
# Resume from a previous session: reload the pickled 1999 run so its posterior
# draws can be passed to the first year of the loop below.
# NOTE: pickle.load executes arbitrary code; only load files produced by this notebook.
with open("../outputs/hierarchical-ind-occ/1999/model.pickle", "rb") as file:
    mcmc = pickle.load(file)
samples = mcmc.get_samples()

In [17]:
# Fit the hierarchical model for each remaining year, chaining posteriors:
# each year's run receives the draws in `samples` from the year before.
years = list(range(2000, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Make sure the per-year output folder exists
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)
    # Inputs for this year
    X, y, ind, occ = filter_data(year, data, columns=None, occ_dim=True)
    # Sampler setup (the same key is derived for every year)
    rng_key, rng_key_ = random.split(random.PRNGKey(0))
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
    # 1996 is the cold start; any other year also passes the previous posterior
    previous_posterior = () if year == 1996 else (samples,)
    mcmc.run(rng_key, X, y, ind, occ, feature_names, *previous_posterior)
    samples = mcmc.get_samples()
    # Save model outputs and calculate max Rhat
    model_params = [X, None, ind, occ, feature_names]
    model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
    max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:48:06<00:00,  5.04s/it]  


Max Rhat: 1.01917
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [25:43<00:00,  1.30it/s]


Max Rhat: 1.00697
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:43:37<00:00,  4.91s/it]  


Max Rhat: 1.00957
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:38:07<00:00,  4.74s/it]  


Max Rhat: 1.00893
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:32:47<00:00,  2.78s/it]


Max Rhat: 1.00796
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:29:29<00:00,  2.68s/it]


Max Rhat: 1.00944
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:26:47<00:00,  2.60s/it]


Max Rhat: 1.00554
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [49:40<00:00,  1.49s/it] 


Max Rhat: 1.00878


## Run models (variable selection)

### Variable Selection | VS1 - NO self_emp

In [33]:
# Run settings
# tune -> num_warmup, draws -> num_samples, accept_prob -> target_accept_prob,
# chains -> num_chains; model_name is the output sub-folder under ../outputs
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "VS1-no_self"
model_type = "hierarchical_ind_occ" # NOTE: one of "pooled", "no_pooled_ind_occ", "hierarchical_ind_occ" (see create_model)

In [35]:
# Resume from a previous session: reload the pickled 2005 run so its posterior
# draws can be passed to the first year of the loop below.
# NOTE: pickle.load executes arbitrary code; only load files produced by this notebook.
with open("../outputs/VS1-no_self/2005/model.pickle", "rb") as file:
    mcmc = pickle.load(file)
samples = mcmc.get_samples()

In [36]:
# Fit the VS1 variant (reduced feature set) for the remaining years, chaining
# posteriors: each year's run receives the draws in `samples` from the year before.
years = list(range(2006, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Make sure the per-year output folder exists
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)
    # Inputs for this year, restricted to the selected features
    columns = ["exp","sex","elementary_edu", "highschool_edu", "postsec_edu",
                "undergrad_edu", "graduate_edu", "age", "tenure", "union", "public_sector"]
    X, y, ind, occ = filter_data(year, data, columns=columns, occ_dim=True)
    # Sampler setup (the same key is derived for every year)
    rng_key, rng_key_ = random.split(random.PRNGKey(0))
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
    # 1996 is the cold start; any other year also passes the previous posterior
    previous_posterior = () if year == 1996 else (samples,)
    mcmc.run(rng_key, X, y, ind, occ, columns, *previous_posterior)
    samples = mcmc.get_samples()
    # Save model outputs and calculate max Rhat
    model_params = [X, None, ind, occ, columns]
    model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
    max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [50:04<00:00,  1.50s/it] 


Max Rhat: 1.00727
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [43:29<00:00,  1.30s/it] 


Max Rhat: 1.00931


### Variable Selection | VS2 - NO public sector

In [37]:
# Run settings
# tune -> num_warmup, draws -> num_samples, accept_prob -> target_accept_prob,
# chains -> num_chains; model_name is the output sub-folder under ../outputs
tune = 1000
draws = 1000
accept_prob = 0.95
chains = 4
model_name = "VS2-no_public"
model_type = "hierarchical_ind_occ" # NOTE: one of "pooled", "no_pooled_ind_occ", "hierarchical_ind_occ" (see create_model)

In [38]:
# Fit the VS2 variant (reduced feature set) year by year. 1996 is the cold
# start (default priors); every later year receives the previous year's
# posterior draws (`samples`), which the model uses to build its priors.
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    # Filter data
    columns = ["exp","sex","elementary_edu", "highschool_edu", "postsec_edu",
                "undergrad_edu", "graduate_edu", "age", "tenure", "union"]
    X, y, ind, occ = filter_data(year, data, columns=columns, occ_dim=True)
    # Run model (the same key is derived for every year)
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
    run_args = [X, y, ind, occ, columns]
    if year != 1996:
        # `samples` still holds the previous year's posterior at this point
        run_args.append(samples)
    mcmc.run(rng_key, *run_args)
    samples = mcmc.get_samples()
    # Save model outputs and calculate max Rhat.
    # Predictive inputs and coordinates are rebuilt every year: reusing the
    # 1996 values would draw posterior-predictive samples for the wrong rows.
    model_params = [X, None, ind, occ, columns]
    model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
    max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:41:20<00:00,  4.84s/it]  


Max Rhat: 1.30492
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:38:43<00:00,  4.76s/it]  


Max Rhat: 1.1172
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:42:24<00:00,  4.87s/it]  


Max Rhat: 1.03316
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:37:53<00:00,  4.74s/it]  


Max Rhat: 1.01834
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:38:50<00:00,  4.77s/it]  


Max Rhat: 1.01429
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [11:39<00:00,  2.86it/s]


Max Rhat: 1.00732
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:02:59<00:00,  3.69s/it]  


Max Rhat: 1.01419
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:24:12<00:00,  2.53s/it]


Max Rhat: 1.00873
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:13:24<00:00,  2.20s/it]


Max Rhat: 1.00936
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [49:50<00:00,  1.50s/it] 


Max Rhat: 1.00555
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [48:15<00:00,  1.45s/it] 


Max Rhat: 1.00679
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [30:03<00:00,  1.11it/s] 


Max Rhat: 1.0076


### Variable Selection | VS3 - NO union

In [29]:
# Run settings for the "VS3 - NO union" variable-selection experiment
model_name = "VS3-no_union"  # also used as the output sub-folder name
# NOTE (original author): pooled, no_pooled, hierarchical -- presumably the
# other values create_model accepts; TODO confirm against create_model
model_type = "hierarchical_ind_occ"
chains = 4
accept_prob = 0.95  # passed to NUTS as target_accept_prob
tune = draws = 1000  # warm-up iterations and retained draws

In [30]:
# Fit the model once per year. 1996 is fitted on its own; every later year is
# additionally given the previous year's posterior draws (`samples`) as an
# extra positional model argument (presumably the model's `from_posterior`
# input -- confirm against create_model).
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder (no error if it already exists)
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    # Filter data
    columns = ["exp","sex","elementary_edu", "highschool_edu", "postsec_edu",
                "undergrad_edu", "graduate_edu", "age", "tenure"]
    X, y, ind, occ = filter_data(year, data, columns=columns, occ_dim=True)
    # Run model (the seed is reset every year, so each fit starts from the same key)
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
    if year == 1996:
        mcmc.run(rng_key, X, y, ind, occ, columns)
    else:
        mcmc.run(rng_key, X, y, ind, occ, columns, samples)
    samples = mcmc.get_samples()
    # Save model outputs and calculate max Rhat.
    # The export inputs are rebuilt for EVERY year: previously they were only
    # computed in the 1996 branch, so later years were exported with the 1996
    # data and with coordinates derived from the 1996 fit.
    model_params = [X, None, ind, occ, columns]
    model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
    max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [3:00:37<00:00,  5.42s/it]  


Max Rhat: 1.50904
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:44:31<00:00,  4.94s/it]  


Max Rhat: 1.16325
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:42:40<00:00,  4.88s/it]  


Max Rhat: 1.09388
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:37:27<00:00,  4.72s/it]  


Max Rhat: 1.05701
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:39:12<00:00,  4.78s/it]  


Max Rhat: 1.02636
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [11:01<00:00,  3.02it/s]


Max Rhat: 1.00603
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:27:02<00:00,  4.41s/it]  


Max Rhat: 1.0077
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:25:32<00:00,  2.57s/it]


Max Rhat: 1.00795
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:15:57<00:00,  2.28s/it]


Max Rhat: 1.0098
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [50:06<00:00,  1.50s/it] 


Max Rhat: 1.01189
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [48:21<00:00,  1.45s/it] 


Max Rhat: 1.00839
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [30:57<00:00,  1.08it/s] 


Max Rhat: 1.01163


### Variable Selection | VS4 - NO tenure

In [33]:
# Run settings for the "VS4 - NO tenure" variable-selection experiment
model_name = "VS4-no_tenure"  # also used as the output sub-folder name
# NOTE (original author): pooled, no_pooled, hierarchical -- presumably the
# other values create_model accepts; TODO confirm against create_model
model_type = "hierarchical_ind_occ"
chains = 4
accept_prob = 0.95  # passed to NUTS as target_accept_prob
tune = draws = 1000  # warm-up iterations and retained draws

In [34]:
# Fit the model once per year. 1996 is fitted on its own; every later year is
# additionally given the previous year's posterior draws (`samples`) as an
# extra positional model argument (presumably the model's `from_posterior`
# input -- confirm against create_model).
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder (no error if it already exists)
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    # Filter data
    columns = ["exp","sex","elementary_edu", "highschool_edu", "postsec_edu",
                "undergrad_edu", "graduate_edu", "age"]
    X, y, ind, occ = filter_data(year, data, columns=columns, occ_dim=True)
    # Run model (the seed is reset every year, so each fit starts from the same key)
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
    if year == 1996:
        mcmc.run(rng_key, X, y, ind, occ, columns)
    else:
        mcmc.run(rng_key, X, y, ind, occ, columns, samples)
    samples = mcmc.get_samples()
    # Save model outputs and calculate max Rhat.
    # The export inputs are rebuilt for EVERY year: previously they were only
    # computed in the 1996 branch, so later years were exported with the 1996
    # data and with coordinates derived from the 1996 fit.
    model_params = [X, None, ind, occ, columns]
    model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
    max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:50:44<00:00,  5.12s/it]  


Max Rhat: 1.2246
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:58:42<00:00,  5.36s/it]  


Max Rhat: 1.05379
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [3:00:33<00:00,  5.42s/it]  


Max Rhat: 1.02247
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:51:39<00:00,  5.15s/it]  


Max Rhat: 1.01598
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:46:56<00:00,  5.01s/it]  


Max Rhat: 1.00695
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [10:29<00:00,  3.18it/s]


Max Rhat: 1.00616
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:44:28<00:00,  3.13s/it] 


Max Rhat: 1.01257
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:25:57<00:00,  2.58s/it]


Max Rhat: 1.01578
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [55:35<00:00,  1.67s/it] 


Max Rhat: 1.00716
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [50:17<00:00,  1.51s/it] 


Max Rhat: 1.0108
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [51:23<00:00,  1.54s/it] 


Max Rhat: 1.00667
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [30:12<00:00,  1.10it/s] 


Max Rhat: 1.0161


### Variable Selection | VS5 - NO Age

In [18]:
# Run settings for the "VS5 - NO Age" variable-selection experiment
model_name = "VS5-no_age"  # also used as the output sub-folder name
# NOTE (original author): pooled, no_pooled, hierarchical -- presumably the
# other values create_model accepts; TODO confirm against create_model
model_type = "hierarchical_ind_occ"
chains = 4
accept_prob = 0.95  # passed to NUTS as target_accept_prob
tune = draws = 1000  # warm-up iterations and retained draws

In [19]:
# Resume the VS5 run: reload the object pickled for 1998 and take its draws,
# which the loop below passes to the first year it fits.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# produced by this project.
checkpoint_path = "../outputs/VS5-no_age/1998/model.pickle"
with open(checkpoint_path, "rb") as file:
    mcmc = pickle.load(file)
samples = mcmc.get_samples()

In [20]:
# Continue the year-by-year fits from 1999 onwards. `samples` must already
# hold the previous year's posterior draws (loaded from the pickle above).
model = create_model(model_type)
years = list(range(1999, 2008))
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Make sure this year's output folder exists
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)
    # Filter data
    columns = [
        "exp", "sex", "elementary_edu", "highschool_edu", "postsec_edu",
        "undergrad_edu", "graduate_edu",
    ]
    X, y, ind, occ = filter_data(year, data, columns=columns, occ_dim=True)
    # Run model (the seed is reset every year)
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
    # 1996 is fitted without prior draws; any other year also receives `samples`
    run_args = [X, y, ind, occ, columns]
    if year != 1996:
        run_args.append(samples)
    mcmc.run(rng_key, *run_args)
    samples = mcmc.get_samples()
    # Save model outputs and calculate max Rhat
    model_params = [X, None, ind, occ, columns]
    model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
    max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:33:39<00:00,  4.61s/it]  


Max Rhat: 1.01093
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [2:35:33<00:00,  4.67s/it]  


Max Rhat: 1.00871
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [08:56<00:00,  3.73it/s]


Max Rhat: 1.00699
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:31:30<00:00,  2.75s/it]


Max Rhat: 1.00769
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:21:31<00:00,  2.45s/it]


Max Rhat: 1.00931
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [47:26<00:00,  1.42s/it] 


Max Rhat: 1.00691
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [45:59<00:00,  1.38s/it] 


Max Rhat: 1.00612
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [34:57<00:00,  1.05s/it] 


Max Rhat: 1.01407
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [26:45<00:00,  1.25it/s] 


Max Rhat: 1.00877


### Variable Selection | VS6 - NO Education Level

In [22]:
# Run settings for the "VS6 - NO Education Level" variable-selection experiment
model_name = "VS6-no_edu"  # also used as the output sub-folder name
# NOTE (original author): pooled, no_pooled, hierarchical -- presumably the
# other values create_model accepts; TODO confirm against create_model
model_type = "hierarchical_ind_occ"
chains = 4
accept_prob = 0.95  # passed to NUTS as target_accept_prob
tune = draws = 1000  # warm-up iterations and retained draws

In [23]:
# Resume the VS6 run: reload the object pickled for 1997 and take its draws,
# which the loop below passes to the first year it fits.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# produced by this project.
checkpoint_path = "../outputs/VS6-no_edu/1997/model.pickle"
with open(checkpoint_path, "rb") as file:
    mcmc = pickle.load(file)
samples = mcmc.get_samples()

In [24]:
# Continue the year-by-year fits from 1998 onwards. `samples` must already
# hold the previous year's posterior draws (loaded from the pickle above).
model = create_model(model_type)
years = list(range(1998, 2008))
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Make sure this year's output folder exists
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)
    # Filter data
    columns = ["exp", "sex"]
    X, y, ind, occ = filter_data(year, data, columns=columns, occ_dim=True)
    # Run model (the seed is reset every year)
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
    # 1996 is fitted without prior draws; any other year also receives `samples`
    run_args = [X, y, ind, occ, columns]
    if year != 1996:
        run_args.append(samples)
    mcmc.run(rng_key, *run_args)
    samples = mcmc.get_samples()
    # Save model outputs and calculate max Rhat
    model_params = [X, None, ind, occ, columns]
    model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
    max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:09:46<00:00,  2.09s/it]


Max Rhat: 1.04588
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:09:11<00:00,  2.08s/it]


Max Rhat: 1.02692
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:25:55<00:00,  2.58s/it]


Max Rhat: 1.01098
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [07:32<00:00,  4.42it/s]


Max Rhat: 1.00506
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [40:03<00:00,  1.20s/it] 


Max Rhat: 1.00388
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [35:39<00:00,  1.07s/it] 


Max Rhat: 1.00804
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [19:50<00:00,  1.68it/s] 


Max Rhat: 1.00731
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [20:39<00:00,  1.61it/s] 


Max Rhat: 1.00524
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [14:38<00:00,  2.28it/s] 


Max Rhat: 1.01499
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [11:14<00:00,  2.97it/s] 


Max Rhat: 1.00778


### Variable Selection | VS7 - No Sex

In [25]:
# Run settings for the "VS7 - No Sex" variable-selection experiment
model_name = "VS7-no_sex"  # also used as the output sub-folder name
# NOTE (original author): pooled, no_pooled, hierarchical -- presumably the
# other values create_model accepts; TODO confirm against create_model
model_type = "hierarchical_ind_occ"
chains = 4
accept_prob = 0.95  # passed to NUTS as target_accept_prob
tune = draws = 1000  # warm-up iterations and retained draws

In [26]:
# Fit the model once per year. 1996 is fitted on its own; every later year is
# additionally given the previous year's posterior draws (`samples`) as an
# extra positional model argument (presumably the model's `from_posterior`
# input -- confirm against create_model).
years = list(range(1996, 2008))
model = create_model(model_type)
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Create output folder (no error if it already exists)
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    # Filter data
    columns = ["exp"]
    X, y, ind, occ = filter_data(year, data, columns=columns, occ_dim=True)
    # Run model (the seed is reset every year, so each fit starts from the same key)
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
    if year == 1996:
        mcmc.run(rng_key, X, y, ind, occ, columns)
    else:
        mcmc.run(rng_key, X, y, ind, occ, columns, samples)
    samples = mcmc.get_samples()
    # Save model outputs and calculate max Rhat.
    # The export inputs are rebuilt for EVERY year: previously they were only
    # computed in the 1996 branch, so later years were exported with the 1996
    # data and with coordinates derived from the 1996 fit.
    model_params = [X, None, ind, occ, columns]
    model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
    max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1996 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:06:05<00:00,  1.98s/it]


Max Rhat: 1.15142
>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:02:53<00:00,  1.89s/it]


Max Rhat: 1.04196
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:04:39<00:00,  1.94s/it]


Max Rhat: 1.01609
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:03:40<00:00,  1.91s/it]


Max Rhat: 1.00728
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:02:12<00:00,  1.87s/it]


Max Rhat: 1.0114
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [04:18<00:00,  7.72it/s]


Max Rhat: 1.00791
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [37:13<00:00,  1.12s/it] 


Max Rhat: 1.00815
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [19:13<00:00,  1.73it/s]


Max Rhat: 1.01435
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [19:02<00:00,  1.75it/s]


Max Rhat: 1.00796
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [15:23<00:00,  2.16it/s] 


Max Rhat: 1.0128
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [11:36<00:00,  2.87it/s] 


Max Rhat: 1.0081
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [10:41<00:00,  3.12it/s]


Max Rhat: 1.00953


### Variable Selection | VS8 - No exp

In [19]:
# Run settings for the "VS8 - No exp" variable-selection experiment
model_name = "VS8-no_exp"  # also used as the output sub-folder name
# NOTE (original author): pooled, no_pooled, hierarchical -- presumably the
# other values create_model accepts; TODO confirm against create_model
model_type = "hierarchical_ind_occ"
chains = 4
accept_prob = 0.95  # passed to NUTS as target_accept_prob
tune = draws = 1000  # warm-up iterations and retained draws

In [20]:
# Resume the VS8 run: reload the object pickled for 1996 and take its draws,
# which the loop below passes to the first year it fits.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# produced by this project.
checkpoint_path = "../outputs/VS8-no_exp/1996/model.pickle"
with open(checkpoint_path, "rb") as file:
    mcmc = pickle.load(file)
samples = mcmc.get_samples()

In [21]:
# Continue the year-by-year fits from 1997 onwards. `samples` must already
# hold the previous year's posterior draws (loaded from the pickle above).
model = create_model(model_type)
years = list(range(1997, 2008))
for year in years:
    print(f">>>>>>>>>>>>>>>>> year {year} <<<<<<<<<<<<<<<<<<<")
    # Make sure this year's output folder exists
    OUTPUT_PATH = f"../outputs/{model_name}/{year}"
    if not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)
    # Filter data (no feature columns are requested in this variant)
    columns = []
    X, y, ind, occ = filter_data(year, data, columns=columns, occ_dim=True)
    # Run model (the seed is reset every year)
    rng_key = random.PRNGKey(0)
    rng_key, rng_key_ = random.split(rng_key)
    kernel = NUTS(model, target_accept_prob=accept_prob, init_strategy=init_to_median(num_samples=200))
    mcmc = MCMC(kernel, num_warmup=tune, num_samples=draws, num_chains=chains, chain_method="vectorized")
    # 1996 is fitted without prior draws; any other year also receives `samples`
    run_args = [X, y, ind, occ, columns]
    if year != 1996:
        run_args.append(samples)
    mcmc.run(rng_key, *run_args)
    samples = mcmc.get_samples()
    # Save model outputs and calculate max Rhat
    model_params = [X, None, ind, occ, columns]
    model_coords = set_coords(mcmc, ["industry","occupation"], [ind_cat, occ_cat], X)
    max_rhat = export_model_outputs(mcmc, model, OUTPUT_PATH, *model_params, **model_coords)
    print(f"Max Rhat: {max_rhat}")

>>>>>>>>>>>>>>>>> year 1997 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:02:28<00:00,  1.87s/it]


Max Rhat: 1.08313
>>>>>>>>>>>>>>>>> year 1998 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:03:45<00:00,  1.91s/it]


Max Rhat: 1.03699
>>>>>>>>>>>>>>>>> year 1999 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [1:02:02<00:00,  1.86s/it]


Max Rhat: 1.01461
>>>>>>>>>>>>>>>>> year 2000 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [56:57<00:00,  1.71s/it] 


Max Rhat: 1.0107
>>>>>>>>>>>>>>>>> year 2001 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [03:59<00:00,  8.37it/s]


Max Rhat: 1.0091
>>>>>>>>>>>>>>>>> year 2002 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [34:26<00:00,  1.03s/it]


Max Rhat: 1.01093
>>>>>>>>>>>>>>>>> year 2003 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [27:32<00:00,  1.21it/s]


Max Rhat: 1.008
>>>>>>>>>>>>>>>>> year 2004 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [20:13<00:00,  1.65it/s]


Max Rhat: 1.00994
>>>>>>>>>>>>>>>>> year 2005 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [17:39<00:00,  1.89it/s]


Max Rhat: 1.01386
>>>>>>>>>>>>>>>>> year 2006 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [15:19<00:00,  2.18it/s]


Max Rhat: 1.01035
>>>>>>>>>>>>>>>>> year 2007 <<<<<<<<<<<<<<<<<<<


sample: 100%|██████████| 2000/2000 [11:10<00:00,  2.98it/s]


Max Rhat: 1.00872
