In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pystan
import numpy as np
import pandas as pd
import pymc3 as pm
import arviz as az
import scipy as sp
import matplotlib.pyplot as plt

# Select the "arviz-darkgrid" plotting style for the figures in this notebook.
az.style.use('arviz-darkgrid')

  from ._conv import register_converters as _register_converters


## 14M2

In [74]:
# Load the primate milk data (semicolon-delimited) and derive the model covariates.
df = pd.read_csv("/Users/benjaminwee/Documents/courses/resources/Rethinking/Data/milk.csv", sep=";")
df.loc[:, "neocortex.prop"] = df["neocortex.perc"] / 100  # percent -> proportion
df.loc[:, "logmass"] = np.log(df["mass"])

# prep data: plain NumPy arrays to hand to the Stan model
neocortex = df["neocortex.prop"].values
logmass = df["logmass"].values
kcal = df["kcal.per.g"].values

# Create missing values
# The NaN mask is computed once and reused by every quantity below.
nc_missing = np.isnan(neocortex)
# Running count of missing cases at each missing row (1, 2, ...), 0 at observed rows.
miss_idx = nc_missing * np.cumsum(nc_missing)
# Used to define length of missing vector; vectorised sum, stored as a plain Python int.
N_miss = int(nc_missing.sum())
# -1 is a placeholder at the missing rows; observed values pass through unchanged.
Ncobs = np.where(nc_missing, -1, neocortex)

Fit only the full model: a linear model for `kcal` on neocortex proportion and log mass, with the missing neocortex values imputed.

In [33]:
# Stan program for 14M2: regress kcal on neocortex proportion (NC) and log mass while
# imputing the missing neocortex values. Each missing value is a parameter in
# nc_impute, and all NC values (observed and imputed) share a Normal(mu_nc, sigma_nc)
# distribution.
m_14_m2 = '''
data {
  int N;
  vector[N] NCobs;                        // neocortex proportion; -1 is a placeholder where missing (like -999 in pymc3)
  int N_miss;
  int<lower=0,upper=N_miss> miss_idx[N];  // 0 = observed, k > 0 = use nc_impute[k]
  vector[N] logmass;
  vector[N] kcal;
}
parameters {
  real a;
  real bn;
  real bm;
  real<lower=0> sigma;
  real mu_nc; // Missings parameters
  real<lower=0> sigma_nc;
  vector[N_miss] nc_impute; // Imputed vector
}
model {
  vector[N] mu;
  vector[N] NC;

  // priors
  target += normal_lpdf(a  | 0, 100);
  target += normal_lpdf(bn | 0, 10);
  target += normal_lpdf(bm | 0, 10);
  target += cauchy_lpdf(sigma | 0, 1);
  target += normal_lpdf(mu_nc | 0.5, 1); // Priors for variable with missings
  target += cauchy_lpdf(sigma_nc | 0, 1);

  // combine observed and estimates for missing:
  // rows flagged in miss_idx take the matching imputed parameter instead of the placeholder
  NC = NCobs;
  for(i in 1:N) if(miss_idx[i] > 0) NC[i] = nc_impute[miss_idx[i]];

  // impute missing
  target += normal_lpdf(NC | mu_nc, sigma_nc);

  // linear model
  mu = a + bn * NC + bm * logmass;

  // likelihood
  target += normal_lpdf(kcal | mu, sigma);
}
'''

In [26]:
# Compile the Stan source held in `m_14_m2` into a PyStan model object.
m14_m2 = pystan.StanModel(model_code=m_14_m2)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_dbe49388a87296b143021c919deffd71 NOW.


In [75]:
# Data handed to the sampler; the keys are the names declared in the Stan data block.
df_m = {
    "N": len(df),
    "NCobs": Ncobs,
    "N_miss": N_miss,
    "miss_idx": miss_idx,
    "logmass": logmass,
    "kcal": kcal,
}

# Sample the 14M2 model, leaving every sampler setting at its default.
fit_m14_m2 = m14_m2.sampling(data=df_m)

Add log mass as a predictor in the imputation model for neocortex, so the mean of the imputed values depends on `logmass`.

In [60]:
# Stan program extending the 14M2 model: the mean of the neocortex distribution is no
# longer a single parameter but a linear function of log mass
# (mu_nc = a_nc + bm_nc * logmass), so imputed values are informed by body mass.
m_14_m3 = '''
data {
  int N;
  vector[N] NCobs;                        // neocortex proportion; -1 is a placeholder where missing (like -999 in pymc3)
  int N_miss;
  int<lower=0,upper=N_miss> miss_idx[N];  // 0 = observed, k > 0 = use nc_impute[k]
  vector[N] logmass;
  vector[N] kcal;
}
parameters {
  real a;
  real bn;
  real bm;
  real<lower=0> sigma;
  real<lower=0> sigma_nc;
  vector[N_miss] nc_impute; // Imputed vector
  real a_nc;
  real bm_nc;
}
model {
  vector[N] mu;
  vector[N] mu_nc; // Not a parameter anymore but part of the model
  vector[N] NC;

  // priors
  target += normal_lpdf(a  | 0, 100);
  target += normal_lpdf(bn | 0, 10);
  target += normal_lpdf(bm | 0, 10);
  target += cauchy_lpdf(sigma | 0, 1);
  target += cauchy_lpdf(sigma_nc | 0, 1);
  target += normal_lpdf(a_nc | 0.5 , 1);
  target += normal_lpdf(bm_nc| 0, 10);

  // combine observed and estimates for missing:
  // rows flagged in miss_idx take the matching imputed parameter instead of the placeholder
  NC = NCobs;
  for(i in 1:N) if(miss_idx[i] > 0) NC[i] = nc_impute[miss_idx[i]];

  // impute missing
  mu_nc = a_nc + bm_nc * logmass;
  target += normal_lpdf(NC | mu_nc, sigma_nc);

  // linear model
  mu = a + bn * NC + bm * logmass;

  // likelihood
  target += normal_lpdf(kcal | mu, sigma);
}
'''

In [61]:
# Compile the Stan source held in `m_14_m3` into a PyStan model object.
m14_m3 = pystan.StanModel(model_code=m_14_m3)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_2ce76bb37431eeab409c7b1250f16406 NOW.


In [64]:
# Display the neocortex vector passed to Stan; -1 entries are the missing-value placeholders.
Ncobs

array([ 0.5516, -1.    , -1.    , -1.    , -1.    ,  0.6454,  0.6454,
        0.6764, -1.    ,  0.6885,  0.5885,  0.6169,  0.6032, -1.    ,
       -1.    ,  0.6997, -1.    ,  0.7041, -1.    ,  0.734 , -1.    ,
        0.6753, -1.    ,  0.7126,  0.726 , -1.    ,  0.7024,  0.763 ,
        0.7549])

In [76]:
# Sample the second model with the same data dict as 14M2 (both programs declare the same data block).
fit_m14_m3 = m14_m3.sampling(data = df_m)

In [77]:
# Wrap both PyStan fits as ArviZ objects so they can be summarised with az.summary.
samples_m14_m2 = az.from_pystan(posterior = fit_m14_m2)
samples_m14_m3 = az.from_pystan(posterior = fit_m14_m3)

In [78]:
# Posterior summary table for the constant-mean imputation model.
az.summary(samples_m14_m2)

Unnamed: 0,mean,sd,hpd_3%,hpd_97%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
a,-0.529,0.481,-1.362,0.447,0.017,0.012,764.0,764.0,761.0,1533.0,1.01
bn,1.891,0.749,0.459,3.241,0.027,0.019,742.0,742.0,737.0,1440.0,1.01
bm,-0.069,0.023,-0.111,-0.024,0.001,0.0,1114.0,1114.0,1103.0,2214.0,1.01
sigma,0.133,0.024,0.092,0.179,0.001,0.0,1585.0,1575.0,1623.0,2239.0,1.0
mu_nc,0.672,0.014,0.645,0.696,0.0,0.0,3157.0,3157.0,3183.0,2813.0,1.0
sigma_nc,0.061,0.011,0.043,0.081,0.0,0.0,1782.0,1668.0,2039.0,2246.0,1.0
nc_impute[0],0.633,0.05,0.535,0.726,0.001,0.001,3457.0,3436.0,3468.0,2719.0,1.0
nc_impute[1],0.627,0.052,0.531,0.729,0.001,0.001,2883.0,2801.0,2936.0,2246.0,1.0
nc_impute[2],0.624,0.052,0.519,0.713,0.001,0.001,3340.0,3269.0,3358.0,1997.0,1.0
nc_impute[3],0.654,0.048,0.563,0.74,0.001,0.001,3239.0,3161.0,3285.0,2531.0,1.0


In [79]:
# Posterior summary table for the imputation model whose mean depends on log mass.
az.summary(samples_m14_m3)

Unnamed: 0,mean,sd,hpd_3%,hpd_97%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
a,-0.849,0.486,-1.715,0.161,0.014,0.01,1266.0,1266.0,1285.0,1675.0,1.01
bn,2.406,0.762,0.789,3.724,0.022,0.015,1252.0,1252.0,1269.0,1674.0,1.01
bm,-0.087,0.024,-0.13,-0.042,0.001,0.0,1683.0,1683.0,1698.0,2152.0,1.01
sigma,0.129,0.023,0.088,0.168,0.0,0.0,2530.0,2400.0,2720.0,2465.0,1.0
sigma_nc,0.042,0.008,0.03,0.056,0.0,0.0,1995.0,1918.0,2163.0,2079.0,1.0
nc_impute[0],0.632,0.035,0.564,0.697,0.001,0.0,3917.0,3893.0,3907.0,2841.0,1.0
nc_impute[1],0.629,0.037,0.561,0.696,0.001,0.0,3594.0,3582.0,3615.0,2945.0,1.0
nc_impute[2],0.62,0.036,0.552,0.685,0.001,0.0,3166.0,3133.0,3198.0,2601.0,1.0
nc_impute[3],0.647,0.034,0.583,0.71,0.001,0.0,4538.0,4526.0,4568.0,2978.0,1.0
nc_impute[4],0.663,0.036,0.597,0.733,0.001,0.0,3249.0,3210.0,3319.0,2628.0,1.0


## 14M3

In [84]:
# Load the WaffleDivorce data (semicolon-delimited). The delimiter is passed by keyword:
# positional arguments after the path are deprecated in pandas 1.x and rejected in pandas 2.x.
d = pd.read_csv("/Users/benjaminwee/Documents/courses/resources/Rethinking//Data/WaffleDivorce.csv", sep=";")
d["log_population"] = np.log(d["Population"])

# Arrays handed to the Stan model.
div_obs = d["Divorce"].values                # observed divorce rate
div_sd = d["Divorce SE"].values              # reported standard error of the divorce rate
div_sd_2 = d["Divorce SE"].values*2          # the same standard errors, doubled
A = d["MedianAgeMarriage"].values
R = d["Marriage"].values
N = len(d)

In [82]:
# Stan program for the divorce-rate regression with measurement error on the outcome.
# Dest is a vector of latent divorce rates, one per row, modelled as
# Normal(a + ba * A + br * R, sigma); each observed div_obs is then modelled as
# Normal(Dest, div_sd), with div_sd supplied as data.
# The generated quantities block recomputes mu and stores
# log_lik[i] = normal_lpdf(Dest[i] | mu[i], sigma), i.e. the log density of the latent
# estimate rather than of div_obs.
# NOTE(review): if log_lik is meant for WAIC/LOO on the observed data, confirm that
# this is the intended pointwise term.
m_14_m4 = '''
data {
  int N;
  vector[N] A;
  vector[N] R;
  vector[N] div_obs;
  vector[N] div_sd;
}
parameters {
  real a;
  real ba;
  real br;
  real<lower=0> sigma;
  vector[N] Dest;
}
model {
  vector[N] mu; 
  // priors
  target += normal_lpdf(a | 0, 10);
  target += normal_lpdf(ba | 0, 1);
  target += normal_lpdf(br | 0, 1);
  target += exponential_lpdf(sigma | 1);
  
  // linear model
  mu = a + ba * A + br * R;
  
  // likelihood
  target += normal_lpdf(Dest | mu, sigma);
  
  // prior for estimates
  target += normal_lpdf(div_obs | Dest, div_sd);
}
generated quantities {
  vector[N] log_lik;
  {
  vector[N] mu;
  mu = a + ba * A + br * R;
  for(i in 1:N) log_lik[i] = normal_lpdf(Dest[i] | mu[i], sigma);
  }
}
'''

In [83]:
# Compile the Stan source held in `m_14_m4` into a PyStan model object.
m14_m4 = pystan.StanModel(model_code=m_14_m4)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_38418bab1d4a08d658f84d5616481821 NOW.


In [86]:
# `m_14_m5` is never defined in this notebook, so compiling from it raises NameError on
# a fresh kernel. The compile log recorded for this cell shows the same model hash as
# m14_m4, i.e. the same Stan program, so reuse that compiled model instead of
# compiling it a second time.
m14_m5 = m14_m4

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_38418bab1d4a08d658f84d5616481821 NOW.


In [88]:
# Data with the standard errors as reported in the dataset.
m14_df = {
    "N": N,
    "A": A,
    "R": R,
    "div_obs": div_obs,
    "div_sd": div_sd,
}

# Sample the measurement-error model with default sampler settings.
fit_m14_m4 = m14_m4.sampling(data=m14_df)



In [89]:
# Same data as before, except the standard errors are the doubled ones (div_sd_2).
m14_df_2 = {
    "N": N,
    "A": A,
    "R": R,
    "div_obs": div_obs,
    "div_sd": div_sd_2,
}

# Reuse the compiled m14_m4 program; only the data differ.
fit_m14_m5 = m14_m4.sampling(data=m14_df_2)



In [90]:
# Wrap both fits as ArviZ objects. Only `posterior` is passed, so log_lik is treated as
# an ordinary posterior variable (it appears as rows in the summary tables).
samples_m14_m4 = az.from_pystan(posterior = fit_m14_m4)
samples_m14_m5 = az.from_pystan(posterior = fit_m14_m5)

In [91]:
# Posterior summary table for the fit using the reported standard errors.
az.summary(samples_m14_m4)

Unnamed: 0,mean,sd,hpd_3%,hpd_97%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
a,21.002,6.204,8.206,31.850,0.146,0.104,1795.0,1795.0,1800.0,1918.0,1.0
ba,-0.538,0.201,-0.891,-0.123,0.005,0.003,1821.0,1821.0,1829.0,1991.0,1.0
br,0.129,0.074,-0.015,0.262,0.002,0.001,2261.0,2212.0,2265.0,2324.0,1.0
sigma,1.109,0.200,0.755,1.482,0.006,0.004,1205.0,1205.0,1181.0,1690.0,1.0
Dest[0],11.757,0.670,10.490,13.020,0.009,0.007,5334.0,5291.0,5333.0,2785.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
log_lik[45],-1.193,0.284,-1.714,-0.691,0.006,0.004,2286.0,2286.0,2094.0,2583.0,1.0
log_lik[46],-1.161,0.263,-1.686,-0.723,0.007,0.005,1506.0,1506.0,1445.0,1953.0,1.0
log_lik[47],-1.326,0.475,-2.190,-0.659,0.011,0.008,1749.0,1723.0,1708.0,2128.0,1.0
log_lik[48],-1.286,0.361,-1.986,-0.718,0.007,0.005,3073.0,3073.0,2813.0,3008.0,1.0


In [92]:
# Posterior summary table for the fit using the doubled standard errors.
az.summary(samples_m14_m5)

Unnamed: 0,mean,sd,hpd_3%,hpd_97%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
a,18.740,6.046,7.524,30.228,0.398,0.284,231.0,227.0,226.0,467.0,1.04
ba,-0.539,0.191,-0.889,-0.160,0.013,0.009,222.0,221.0,218.0,503.0,1.04
br,0.238,0.079,0.080,0.373,0.005,0.004,250.0,228.0,245.0,541.0,1.06
sigma,0.351,0.205,0.063,0.703,0.061,0.044,11.0,11.0,12.0,30.0,1.26
Dest[0],10.062,0.485,9.253,11.024,0.047,0.034,107.0,104.0,103.0,550.0,1.04
...,...,...,...,...,...,...,...,...,...,...,...
log_lik[45],-0.261,0.870,-1.706,1.410,0.148,0.106,34.0,34.0,29.0,50.0,1.10
log_lik[46],-0.108,0.910,-1.648,1.535,0.211,0.152,19.0,19.0,17.0,56.0,1.16
log_lik[47],-0.216,0.891,-1.763,1.466,0.176,0.126,26.0,26.0,21.0,59.0,1.13
log_lik[48],-0.138,0.880,-1.712,1.410,0.187,0.134,22.0,22.0,20.0,60.0,1.14


## 14H3

In [93]:
# Simulate 10 complete cases with y ~ Normal(x, 1), then append one extra case whose
# x is missing and whose y is an extreme value (100).
rng = np.random.RandomState(1403)  # fixed seed so the simulated data are reproducible
x_cc = rng.randn(10)
y_cc = rng.normal(loc=x_cc)
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
x = np.concatenate((x_cc, np.nan), axis=None)
y = np.concatenate((y_cc, 100.0), axis=None)

# Create missing values
# The NaN mask is computed once and reused by every quantity below.
x_missing = np.isnan(x)
# Running count of missing cases at each missing row (1, 2, ...), 0 at observed rows.
miss_idx = x_missing * np.cumsum(x_missing)
# Used to define length of missing vector; stored as a plain Python int.
N_miss = int(x_missing.sum())
# -1 is a placeholder at the missing rows; observed values pass through unchanged.
xobs = np.where(x_missing, -1, x)

In [124]:
# Stan program for 14H3: regress y on x where some x values are missing. Each missing
# x is a parameter in nc_impute, and all x values (observed and imputed) get a
# Normal(0, 1) distribution.
m_14_h3 = '''
data {
  int N;
  vector[N] xobs;                         // x with -1 as a placeholder where missing (like -999 in pymc3)
  int N_miss;
  int<lower=0,upper=N_miss> miss_idx[N];  // 0 = observed, k > 0 = use nc_impute[k]
  vector[N] x;                            // declared as data but not referenced by the model block
  vector[N] y;
}
parameters {
  real a;
  real b;
  real<lower=0> sigma;
  vector[N_miss] nc_impute; // Imputed vector
}
model {
  vector[N] mu;
  vector[N] xobsNC;

  // priors
  target += normal_lpdf(a  | 0, 100);
  target += normal_lpdf(b | 0, 100);
  target += cauchy_lpdf(sigma | 0,1);

  // combine observed and estimates for missing:
  // rows flagged in miss_idx take the matching imputed parameter instead of the placeholder
  xobsNC = xobs;
  for(i in 1:N) if(miss_idx[i] > 0) xobsNC[i] = nc_impute[miss_idx[i]];

  // impute missing
  target += normal_lpdf(xobsNC | 0, 1);

  // linear model
  mu = a + b * xobsNC;

  // likelihood
  target += normal_lpdf(y | mu, sigma);
}
'''

In [125]:
# Compile the Stan source held in `m_14_h3` into a PyStan model object.
m14_h3 = pystan.StanModel(model_code=m_14_h3)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_ba55bab12cbdd6f6b289606a1e15657e NOW.


In [109]:
# Stan program for the complete-case comparison: a plain linear regression of y_cc on
# x_cc with no imputation.
m_14_h3_cc = '''
data {
  int N;
  vector[N] x_cc;
  vector[N] y_cc;
}
parameters {
  real a;
  real b;
  real<lower=0> sigma;
}
model {
  vector[N] mu;

  // priors
  target += normal_lpdf(a  | 0, 100);
  target += normal_lpdf(b | 0, 100);
  target += cauchy_lpdf(sigma | 0, 1);

  // linear model
  mu = a + b * x_cc;

  // likelihood
  target += normal_lpdf(y_cc | mu, sigma);
}
'''

In [110]:
# Compile the Stan source held in `m_14_h3_cc` into a PyStan model object.
m14_h3_cc = pystan.StanModel(model_code=m_14_h3_cc)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_e259a01d35736a64821cf36b04c58890 NOW.


In [121]:
# Data for the imputation model: all 11 cases, including the one with missing x.
m14_h3_df = {
    "N": len(x),
    "x": x,
    "y": y,
    "N_miss": N_miss,
    "miss_idx": miss_idx,
    "xobs": xobs,
}

# Sample, convert to an ArviZ object, then show the posterior summary table.
fit_m_14_h3 = m14_h3.sampling(data=m14_h3_df)
samples_m_14_h3 = az.from_pystan(posterior=fit_m_14_h3)

az.summary(samples_m_14_h3)

Unnamed: 0,mean,sd,hpd_3%,hpd_97%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
a,9.594,10.389,-9.739,29.797,0.254,0.201,1677.0,1340.0,1777.0,1844.0,1.0
b,-1.579,11.119,-19.898,19.803,0.267,0.189,1739.0,1739.0,1769.0,2597.0,1.0
sigma,30.587,7.876,18.327,45.748,0.171,0.127,2109.0,1934.0,2375.0,2157.0,1.0
nc_impute[0],-0.119,1.387,-2.55,2.495,0.034,0.024,1712.0,1712.0,1721.0,2612.0,1.0


In [116]:
# Data for the complete-case model: only the 10 fully observed cases.
m14_h3_cc_df = {
    "N": len(x_cc),
    "x_cc": x_cc,
    "y_cc": y_cc,
}

# Sample, convert to an ArviZ object, then show the posterior summary table.
fit_m_14_h3_cc = m14_h3_cc.sampling(data=m14_h3_cc_df)
samples_m_14_h3_cc = az.from_pystan(posterior=fit_m_14_h3_cc)

az.summary(samples_m_14_h3_cc)

Unnamed: 0,mean,sd,hpd_3%,hpd_97%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
a,0.339,0.483,-0.519,1.304,0.01,0.009,2285.0,1463.0,2548.0,2401.0,1.0
b,1.296,0.402,0.526,2.055,0.009,0.006,2231.0,2231.0,2309.0,1785.0,1.0
sigma,1.35,0.375,0.766,2.006,0.009,0.007,1581.0,1487.0,1777.0,2120.0,1.0
