In [1]:
import pymc as pm
import pandas as pd
import arviz as az
import numpy as np
"PyMC version used: {}".format(pm.__version__)

'PyMC version used: 4.0.0b4'

In [13]:
#initialize the data
def mask_nans(a):
    x = a.copy()
    return np.ma.masked_values(np.nan_to_num(x, nan=-1), value=-1)

raw_X = np.array([24, 32, 48, 56, np.nan, 70, 72, 75, 80, 96])
raw_Y = np.array([102.8, 104.5, 106.5, 107.0, 107.1, 105.1, 103.9, np.nan, 103.2, 102.1])
X = mask_nans(raw_X)
Y = mask_nans(raw_Y)

In [14]:
X

masked_array(data=[24.0, 32.0, 48.0, 56.0, --, 70.0, 72.0, 75.0, 80.0,
                   96.0],
             mask=[False, False, False, False,  True, False, False, False,
                   False, False],
       fill_value=-1.0)

In [15]:
#setup a model
with pm.Model() as m:

    #impute data
    imputed_x = pm.Uniform("x_imputed", lower=10, upper=150, observed=X)
    x_squared = pm.Deterministic("x_sq", pm.math.sqr(imputed_x)) #<- this would be much more logical! However, model fails with it...
    #x_squared = pm.Uniform("x_sq", lower=100, upper=150 * 150, observed=X ** 2) # <- this seems to work
    
    #regression coefficients
    n_coeff = 3
    coeffs = pm.Normal("coeffs", mu=0, tau = 1e-6, shape=n_coeff)
    
    d = pm.math.concatenate([np.ones(X.shape), imputed_x], axis=1)
    d = pm.math.concatenate([np.ones(X.shape), imputed_x, x_squared], axis=1)
    mu = pm.Deterministic("mu", pm.math.dot(d, coeffs))    

    #error term
    error_tau = pm.Gamma("error_tau", alpha=0.001, beta=0.001)      
    error_var = pm.Deterministic("error_var", 1 / error_tau)    
    
    #predictions     
    pred = pm.Normal("outcomes", mu=mu, tau = error_tau, observed=Y)
    
    #R-squared
    n = X.shape[0]
    p = n_coeff
    sse = error_var * (n - p)
    centered_y = Y - Y.mean()
    sst = pm.math.dot(centered_y.T, centered_y)
    br2 = pm.Deterministic("br2", pm.math.maximum(1 - sse/sst, 0))    
    adjusted_br2 = pm.Deterministic("adj_br2", 1 - (1 - br2) * (n - 1) / (n - p))
    
    
    # sample
    trace_multi = pm.sample(
        draws=5000,        
        tune=1000,
        cores=4,
        init="jitter+adapt_diag",
        chains=2,
        random_seed=50
    )
    
    #run posterior predictive checks
    ppc_multi = pm.sample_posterior_predictive(trace_multi)



ValueError: Axis value 1 is out of range for the given input dimensions

In [10]:
az.summary(trace_multi, hdi_prob=0.95)

  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)


Unnamed: 0,mean,sd,hdi_2.5%,hdi_97.5%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
"coeffs[0, 0]",97.396,3.38,90.291,103.971,0.148,0.105,418.0,915.0,1.03
"coeffs[1, 0]",0.319,0.125,0.061,0.565,0.005,0.004,396.0,877.0,1.03
"coeffs[2, 0]",-0.003,0.001,-0.005,-0.001,0.0,0.0,388.0,1076.0,1.02
outcomes_missing[0],104.907,1.513,101.937,107.869,0.05,0.036,683.0,2042.0,1.01
x_imputed_missing[0],54.38,14.733,25.513,82.918,0.482,0.341,856.0,833.0,1.02
error_tau,0.887,0.589,0.068,2.11,0.147,0.106,21.0,40.0,1.07
"x_sq[0, 0]",576.0,0.0,576.0,576.0,0.0,0.0,10000.0,10000.0,
"x_sq[1, 0]",1024.0,0.0,1024.0,1024.0,0.0,0.0,10000.0,10000.0,
"x_sq[2, 0]",2304.0,0.0,2304.0,2304.0,0.0,0.0,10000.0,10000.0,
"x_sq[3, 0]",3136.0,0.0,3136.0,3136.0,0.0,0.0,10000.0,10000.0,


In [11]:
def r_squared(ppc, Y):
    y_pred = np.array(ppc.posterior_predictive.outcomes.mean(axis=(0, 1)))
    return az.r2_score(Y, y_pred)

In [12]:
y_pred = np.array(ppc_multi.posterior_predictive.outcomes.mean(axis=(0, 1)))

In [13]:
az.r2_score(Y.data[~Y.mask], y_pred[~Y.mask])

r2        0.775252
r2_std    0.000000
dtype: float64

In [16]:
test = pm.draw(pm.HalfCauchy.dist(beta=10), 10000)

test.min(), test.max(), test.mean()



(2.2397017675653404e-05, 199286.61738084015, 91.34647148699511)