In [1]:
import pymc as pm
import pandas as pd
import numpy as np
import arviz as az

%load_ext lab_black
%load_ext watermark

# Paraguay vaccination status

This example goes over a multilevel or hierarchical logistic regression model. It also shows how to use the PyMC coordinate system.

Adapted from [unit 7: paraguay.odc](https://raw.githubusercontent.com/areding/6420-pymc/main/original_examples/Codes4Unit7/paraguay.odc)

Data can be found [here](https://raw.githubusercontent.com/areding/6420-pymc/main/data/paraguay.csv).

## Associated lecture video: Unit 7 Lesson 19

In [2]:
%%html
<!-- Lecture video embed. The YouTube iframe player expects the video ID as a path segment (/embed/<id>); playlist parameters follow as the query string. -->
<iframe width="560" height="315" src="https://www.youtube.com/embed/xomK4tcePmc?list=PLv0FeK5oXK4l-RdT6DWJj0_upJOG2WKNO&index=81" frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>

## Problem statement

This example considers factors influencing the vaccination status of 3424 children of 2552 mothers in 264 clusters in Paraguay. In this analysis, we're specifically interested in mother-level factors related to child immunization. However, outcomes also vary randomly from cluster to cluster, and the model needs to account for that.

- `ID3`: cluster number
- `VACCODE`: 1 if fully immunized, 0 otherwise
- `LB.TOT`: number of live births
- `MAGE2`: 1 if mother's age is under 20, 0 otherwise
- `UN2`: 1 if consensual union, 0 otherwise
- `TOILET2`: 1 if unsafe toilet in household, 0 otherwise
- `PR.SPOC1`: 1 if spouse is an unskilled laborer, 0 otherwise
- `SPANISH2`: 1 if Spanish is not the household language, 0 otherwise

-----

We need to add a random effect by cluster. This is a good use case for the PyMC coordinate system.

In [3]:
# Load the Paraguay vaccination dataset (relative path into the repo's data
# directory), then print column names, dtypes and non-null counts as a quick
# sanity check.
data = pd.read_csv(filepath_or_buffer="../data/paraguay.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3424 entries, 0 to 3423
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   ID3       3424 non-null   int64
 1   VACCODE   3424 non-null   int64
 2   LB.TOT    3424 non-null   int64
 3   MAGE2     3424 non-null   int64
 4   UN2       3424 non-null   int64
 5   TOILET2   3424 non-null   int64
 6   PR.SPOC1  3424 non-null   int64
 7   SPANISH2  3424 non-null   int64
dtypes: int64(8)
memory usage: 214.1 KB


In [4]:
# Response vector: VACCODE is 1 if the child is fully immunized, 0 otherwise.
y = data["VACCODE"].to_numpy()
# Per-row cluster labels kept in their own array. Note that this name is
# reassigned to the distinct cluster labels when the coordinates are built
# in a later cell.
clusters = data["ID3"].to_numpy()
# Predictor matrix: every column except the response and the cluster label.
X = data.drop(columns=["VACCODE", "ID3"]).to_numpy()
# Prepend a column of ones so the first coefficient acts as the intercept.
X_aug = np.column_stack((np.ones(X.shape[0]), X))
y.shape, clusters.shape, X_aug.shape

((3424,), (3424,), (3424, 7))

In [5]:
# Number of regression coefficients: one per column of the 2-D design matrix
# (the intercept column included).
cols = X_aug.shape[-1]

In [6]:
# Build the named coordinates used by the model.
# pd.factorize encodes the ID3 (cluster) column as integer codes, one per row
# (cluster_idx), and returns the distinct ID3 values those codes refer to
# (clusters).
cluster_idx, clusters = pd.factorize(data["ID3"])
coords = dict(cluster=clusters, id=data.index.to_numpy())

In [8]:
# Seed for the sampler so that Restart & Run All reproduces the same draws.
RANDOM_SEED = 1

# Hierarchical logistic regression with a random intercept per cluster.
# The coords dict is passed to pm.Model so that dims="cluster" and dims="id"
# below resolve to the coordinates defined in the previous cell.
with pm.Model(coords=coords) as m:
    # Observed inputs registered as (mutable) data containers.
    X_data = pm.Data("X_data", X_aug, mutable=True)
    y_data = pm.Data("y_data", y, mutable=True)
    # Row -> cluster lookup: integer code of the cluster each row belongs to.
    clust_idx = pm.Data("cluster_idx", cluster_idx, dims="id", mutable=True)

    # Precision of the cluster effects; its reciprocal is tracked so the
    # between-cluster variance shows up directly in the trace.
    cluster_tau = pm.Gamma("cluster_tau", 0.01, 0.01)
    cluster_variance = pm.Deterministic("cluster_variance", 1 / cluster_tau)
    # Regression coefficients (intercept first), one per design-matrix column.
    beta = pm.Normal("beta", 0, tau=1e-3, shape=cols)

    # One zero-mean random effect per cluster, sharing the precision above.
    cluster_effect = pm.Normal("cluster_effect", 0, tau=cluster_tau, dims="cluster")
    # NOTE: despite its name, `p` is the linear predictor on the logit
    # (log-odds) scale, not a probability; it is passed as `logit_p` below.
    p = pm.math.dot(X_data, beta) + cluster_effect[clust_idx]

    pm.Bernoulli("likelihood", logit_p=p, observed=y_data)

    trace = pm.sample(3000, random_seed=RANDOM_SEED)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
  aesara_function = aesara.function(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [cluster_tau, beta, cluster_effect]


  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
Sampling 4 chains for 1_000 tune and 3_000 draw iterations (4_000 + 12_000 draws total) took 30 seconds.


In [9]:
# Posterior summary restricted to the regression coefficients and the
# between-cluster variance; filter_vars="like" matches variables whose names
# contain the given strings.
az.summary(
    data=trace,
    var_names=["beta", "cluster_variance"],
    filter_vars="like",
)

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta[0],1.446,0.131,1.201,1.697,0.001,0.001,10492.0,9339.0,1.0
beta[1],-0.071,0.015,-0.098,-0.043,0.0,0.0,16619.0,9409.0,1.0
beta[2],-0.565,0.204,-0.952,-0.18,0.001,0.001,18952.0,9730.0,1.0
beta[3],-0.195,0.098,-0.376,-0.009,0.001,0.001,20799.0,8769.0,1.0
beta[4],-0.693,0.131,-0.941,-0.455,0.001,0.001,12931.0,9684.0,1.0
beta[5],-0.284,0.11,-0.484,-0.071,0.001,0.001,12379.0,9379.0,1.0
beta[6],-0.62,0.097,-0.805,-0.44,0.001,0.001,15200.0,9748.0,1.0
cluster_variance,0.53,0.094,0.359,0.705,0.002,0.001,3169.0,5538.0,1.0


In [None]:
# Record the environment used to produce this notebook: Python/IPython
# versions (-v) and the versions of the imported packages (--iversions).
# Relies on the watermark extension loaded in the first cell.
%watermark --iversions -v