# Example of `InferenceData` schema in PyStan
The description of the `InferenceData` structure can be found [here](schema.html).

In [1]:
import arviz as az
import pystan
import pandas as pd
import numpy as np
import xarray
# Ask xarray to use its HTML display style for objects shown in this notebook;
# the trailing semicolon keeps the cell from echoing the call's return value.
xarray.set_options(display_style="html");

In [2]:
# Load the regression data; the first CSV column (developer names) becomes the index.
data = pd.read_csv("linear_regression_data.csv", index_col=0)

# One entry per developer, taken from the index.
names = data.index.values
N = len(names)

# Pull each column out as a plain array for the Stan data dictionaries.
time_since_joined = data["time"].values
slack_comments = data["comments"].values
github_commits = data["commits"].values

# Show the loaded table as the cell output.
data

Unnamed: 0,comments,commits,time
Alice,7500,25,4.5
Bob,10100,32,6.0
Cole,18600,49,7.0
Danielle,25200,66,12.0
Erika,27500,96,18.0


In [3]:
# Inputs for out-of-sample predictions: developers without observations,
# each paired (by position) with a time-since-joined value.
candidate_devs = [
    "Francis",
    "Gerard",
]
candidate_devs_time = np.array(
    [
        3.6,  # Francis
        5.1,  # Gerard
    ]
)
N_pred = len(candidate_devs)

In [4]:
# Prior-only Stan program: every quantity is drawn with an RNG inside
# `generated quantities`, so there is no `parameters` or `model` block.
# It produces prior draws of the regression coefficients and scales, plus
# prior predictive draws of both observed variables for each developer.
linreg_prior_code = """
data {
  int<lower=0> N;
  real time_since_joined[N];
}

generated quantities {
    // slack comments regression: intercept, slope and noise scale
    real b0;
    real b1;
    real log_b_sigma;
    real<lower=0> b_sigma;

    // github commits regression: intercept, slope and noise scale
    real c0;
    real c1;
    real log_c_sigma;
    real<lower=0> c_sigma;

    // prior predictive draws
    vector[N] slack_comments_hat;
    vector[N] github_commits_hat;

    b0 = normal_rng(0,200);
    b1 = normal_rng(0,200);
    // absolute value of a normal draw gives a half-normal scale
    b_sigma = fabs(normal_rng(0,300));
    log_b_sigma = log(b_sigma);

    c0 = normal_rng(0,10);
    c1 = normal_rng(0,10);
    c_sigma = fabs(normal_rng(0,6));
    // log of the commits scale (must use c_sigma, not b_sigma)
    log_c_sigma = log(c_sigma);

    for (n in 1:N) {
        slack_comments_hat[n] = normal_rng(b0 + b1 * time_since_joined[n], b_sigma);
        github_commits_hat[n] = normal_rng(c0 + c1 * time_since_joined[n], c_sigma);
    }
}
"""
sm_prior = pystan.StanModel(model_code=linreg_prior_code)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_0d5510d8a96e25d9a54b4530edc08cfc NOW.


In [5]:
# Draw from the prior-only model. That program consists solely of a
# `generated quantities` block, so it is run with the Fixed_param algorithm
# and no warmup iterations.
linreg_prior_data_dict = dict(N=N, time_since_joined=time_since_joined)
prior = sm_prior.sampling(
    data=linreg_prior_data_dict,
    algorithm='Fixed_param',
    chains=1,
    iter=150,
    warmup=0,
)

In [6]:
# Stan program for the two linear regressions (slack comments and github
# commits against time since joining). Besides the parameters it generates
# the pointwise log likelihood, posterior predictive draws for the observed
# developers and out-of-sample predictions for the candidate developers.
linreg_code = """
data {
  int<lower=0> N;
  vector<lower=0>[N] time_since_joined;
  vector<lower=0>[N] slack_comments;
  vector<lower=0>[N] github_commits;

  // out of sample prediction
  int<lower=0> N_pred;
  vector<lower=0>[N_pred] time_since_joined_pred;
}

parameters {
  real b0;
  real b1;
  real log_b_sigma;

  real c0;
  real c1;
  real log_c_sigma;
}

transformed parameters {
  real<lower=0> b_sigma = exp(log_b_sigma);
  real<lower=0> c_sigma = exp(log_c_sigma);
}

model {
  // priors: same distributions the prior-only model draws from
  b0 ~ normal(0,200);
  b1 ~ normal(0,200);
  b_sigma ~ normal(0,300);

  c0 ~ normal(0,10);
  c1 ~ normal(0,10);
  c_sigma ~ normal(0,6);

  // The sampled parameters are log_b_sigma and log_c_sigma while the priors
  // above are stated on their exp() transforms, so add the log Jacobian
  // log|d exp(x) / dx| = x of each transform.
  target += log_b_sigma + log_c_sigma;

  // likelihood
  slack_comments ~ normal(b0 + b1 * time_since_joined, b_sigma);
  github_commits ~ normal(c0 + c1 * time_since_joined, c_sigma);
}

generated quantities {
    // elementwise log likelihood
    vector[N] log_likelihood_slack_comments;
    vector[N] log_likelihood_github_commits;

    // posterior predictive
    vector[N] slack_comments_hat;
    vector[N] github_commits_hat;

    // out of sample prediction
    vector[N_pred] slack_comments_pred;
    vector[N_pred] github_commits_pred;

    // elementwise log likelihood and posterior predictive
    for (n in 1:N) {
        log_likelihood_slack_comments[n] = normal_lpdf(slack_comments[n] | b0 + b1 * time_since_joined[n], b_sigma);
        slack_comments_hat[n] = normal_rng(b0 + b1 * time_since_joined[n], b_sigma);

        log_likelihood_github_commits[n] = normal_lpdf(github_commits[n] | c0 + c1 * time_since_joined[n], c_sigma);
        github_commits_hat[n] = normal_rng(c0 + c1 * time_since_joined[n], c_sigma);
    }

    // out of sample prediction
    for (n in 1:N_pred) {
        slack_comments_pred[n] = normal_rng(b0 + b1 * time_since_joined_pred[n], b_sigma);
        github_commits_pred[n] = normal_rng(c0 + c1 * time_since_joined_pred[n], c_sigma);
    }
}
"""
sm = pystan.StanModel(model_code=linreg_code)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_bb366f8ad241e32840c67395e3324113 NOW.


In [7]:
# Values for the `data` block of the regression model: the observations
# followed by the inputs used for out-of-sample prediction.
linreg_data_dict = {
    "N": N,
    "slack_comments": slack_comments,
    "github_commits": github_commits,
    "time_since_joined": time_since_joined,
    "N_pred" : N_pred,
    "time_since_joined_pred" : candidate_devs_time,
}

# Fit the model with four chains of 200 iterations each.
posterior = sm.sampling(
    data=linreg_data_dict,
    chains=4,
    iter=200,
)



In [8]:
# Convert both PyStan fits into one InferenceData object, telling ArviZ which
# Stan variables belong to which group and how their dimensions are labelled.
idata_stan = az.from_pystan(
    # groups taken from the posterior fit
    posterior=posterior,
    posterior_predictive=["slack_comments_hat","github_commits_hat"],
    log_likelihood={
        "slack_comments": "log_likelihood_slack_comments",
        "github_commits": "log_likelihood_github_commits"
    },
    predictions=["slack_comments_pred", "github_commits_pred"],
    # groups taken from the prior fit
    prior=prior,
    prior_predictive=["slack_comments_hat","github_commits_hat"],
    # data passed to the model
    observed_data=["slack_comments","github_commits"],
    constant_data=["time_since_joined"],
    predictions_constant_data=["time_since_joined_pred"],
    # labelled dimensions and their coordinate values
    coords={"developer": names, "candidate developer" : candidate_devs},
    dims={
        # variables with one entry per observed developer
        **{
            var_name: ["developer"]
            for var_name in (
                "slack_comments",
                "github_commits",
                "slack_comments_hat",
                "github_commits_hat",
                "time_since_joined",
            )
        },
        # variables with one entry per candidate developer
        **{
            var_name: ["candidate developer"]
            for var_name in (
                "slack_comments_pred",
                "github_commits_pred",
                "time_since_joined_pred",
            )
        },
    },
)

In [9]:
# Show the InferenceData object; its repr lists the groups it contains.
idata_stan

Inference data with groups:
	> posterior
	> sample_stats
	> log_likelihood
	> posterior_predictive
	> observed_data
	> constant_data
	> prior
	> sample_stats_prior
	> prior_predictive
	> predictions
	> predictions_constant_data

In this example, the dimensions of each variable are a combination of the following four: `chain`, `draw`, `developer` and `candidate developer`. Moreover, each dimension has specific coordinate values. For `chain` and `draw`, these are integer identifiers starting at `0`; for the `developer` dimension, the coordinate values are the strings `["Alice", "Bob", "Cole", "Danielle", "Erika"]`; and for the `candidate developer` dimension, used by the variables in the `predictions` and `predictions_constant_data` groups, they are the strings `["Francis", "Gerard"]`.

In [10]:
# Posterior group, built from the `posterior` fit passed to az.from_pystan.
idata_stan.posterior

In [11]:
# sample_stats group of the InferenceData object.
# NOTE(review): presumably the sampler statistics of the posterior fit — confirm against the schema.
idata_stan.sample_stats

In [12]:
# log_likelihood group, requested through the `log_likelihood` mapping in az.from_pystan.
idata_stan.log_likelihood

In [13]:
# posterior_predictive group, requested with `slack_comments_hat` and `github_commits_hat`.
idata_stan.posterior_predictive

In [14]:
# observed_data group, requested with `slack_comments` and `github_commits`.
idata_stan.observed_data

In [15]:
# constant_data group, requested with `time_since_joined`.
idata_stan.constant_data

In [16]:
# Prior group, built from the `prior` fit passed to az.from_pystan.
idata_stan.prior

In [17]:
# sample_stats_prior group of the InferenceData object.
# NOTE(review): presumably the sampler statistics of the prior fit — confirm against the schema.
idata_stan.sample_stats_prior

In [18]:
# prior_predictive group, requested with `slack_comments_hat` and `github_commits_hat`.
idata_stan.prior_predictive

In [19]:
# predictions group, requested with `slack_comments_pred` and `github_commits_pred`.
idata_stan.predictions

In [20]:
# predictions_constant_data group, requested with `time_since_joined_pred`.
idata_stan.predictions_constant_data