In [1]:
import arviz as az
import matplotlib.pyplot as plt
import pyjags as pj
import pandas as pd
import scipy.stats as ss
import numpy as np
from IPython.display import Image

### **1. Bayesian hierarchical normal linear regression - Univariate**

#### 1a. Univariate prior formulation

In [2]:
df = pd.read_csv('../data/lifeexpdiff.csv', sep=',', header=0, index_col=[0])
years = np.array([1960, 1970, 1980, 1990, 2000, 2010])
data = {'Y': np.ma.masked_invalid(df.values),
        'X': np.ma.masked_invalid(years),
        'Xbar': np.ma.masked_invalid(np.mean(years))}

inits = [{'tausq.y': 1, 'beta1': 0, 'beta2': 0, 'sigma.alpha1': 1, 'sigma.alpha2': 1},
             {'tausq.y': 100, 'beta1': 100, 'beta2': 100, 'sigma.alpha1': 0.1, 'sigma.alpha2': 0.1},
             {'tausq.y': 0.01, 'beta1': -100, 'beta2': -100, 'sigma.alpha1': 10, 'sigma.alpha2': 10}]


In [3]:
jags_model_string = '''
  data {
      dim.Y <- dim(Y)
    }

    model {
      for(i in 1:dim.Y[1]) {

        for(j in 1:dim.Y[2]) {
          Y[i,j] ~ dnorm(mu[i,j], tausq.y)
          mu[i,j] <- alpha[i,1] + alpha[i,2] * (X[j] - Xbar)
        }

        alpha[i,1] ~ dnorm(beta1, 1 / sigma.alpha1^2)
        alpha[i,2] ~ dnorm(beta2, 1 / sigma.alpha2^2)
      }

      tausq.y ~ dgamma(0.001, 0.001)
      sigma.y <- 1 / sqrt(tausq.y)

      beta1 ~ dnorm(0.0, 1.0E-6)
      beta2 ~ dnorm(0.0, 1.0E-6)
      sigma.alpha1 ~ dexp(0.001)
      sigma.alpha2 ~ dexp(0.001)
    }
    '''

In [4]:
jags_model \
    = pj.Model(code=jags_model_string,
               init=inits,
               data=data,
               chains=3,
               )

adapting: iterations 3000 of 3000, elapsed 0:00:00, remaining 0:00:00


#### 1b. Univariate model summary

In [5]:
# Usin 100000 iterations
samples_1 = jags_model.sample(iterations=140000, vars=["beta1", "beta2", "sigma.y", "sigma.alpha1", "sigma.alpha2"])
idata = az.from_pyjags(samples_1)

sampling: iterations 126726 of 420000, elapsed 0:00:06, remaining 0:00:14
sampling: iterations 336576 of 420000, elapsed 0:00:16, remaining 0:00:04
sampling: iterations 420000 of 420000, elapsed 0:00:20, remaining 0:00:00


In [6]:
func_dict = {
    "Mean":np.mean,
    "SD": np.std,
    "SE": lambda x: np.std(x) / np.sqrt(np.size(x)),
    "2.5%": lambda x: np.percentile(x, 2.5),
    "25%": lambda x: np.percentile(x, 25),
    "50%": lambda x: np.percentile(x, 50),
    "75%": lambda x: np.percentile(x, 75),
    "97.5%": lambda x: np.percentile(x, 97.5),
}

In [7]:
# burning the first 40000 iterations
az.summary(idata.posterior.sel(draw=slice(40000, 140000))[["beta1", "beta2", "sigma.y", "sigma.alpha1", "sigma.alpha2"]], stat_funcs=func_dict, extend=False)

Unnamed: 0,Mean,SD,SE,2.5%,25%,50%,75%,97.5%
beta1,0.116,0.191,0.0,-0.259,-0.011,0.117,0.243,0.492
beta2,-0.005,0.004,0.0,-0.013,-0.008,-0.005,-0.003,0.002
sigma.y,0.382,0.019,0.0,0.347,0.369,0.382,0.395,0.423
sigma.alpha1,1.329,0.141,0.0,1.088,1.23,1.317,1.416,1.638
sigma.alpha2,0.024,0.003,0.0,0.019,0.022,0.024,0.026,0.031


#### 1c.The 95% equal-tailed posterior credible intervals for $\beta_1$ and $\beta_2$

* 95% posterior predictive interval for $\beta_1$: [-0.258, 0.491]
* 95% posterior predictive interval for $\beta_2$: [-0.013, 0.002]

The 95% equal-tailed posterior credible intervals for $\beta_1$ and $\beta_2$ contains 0 because they change with respect of the national life expectancy is close to zero. 

### **2. Bayesian hierarchical normal linear regression - Bivariate**

#### 2a. Bivariate prior formulation

In [8]:
data = {'Y': np.ma.masked_invalid(df.values),
        'X': np.ma.masked_invalid(years),
        'Xbar': np.ma.masked_invalid(np.mean(years)),
        'Omega0': np.ma.masked_invalid(np.array([[1, 0], [0, 0.0005]])),
        'mu0': np.ma.masked_invalid(np.array([0, 0])),
        'Sigma0.inv': np.ma.masked_invalid(np.array([[1e-6, 0], [0, 1e-6]]))}

inits = [{'tausq.y': 1, 'beta': np.array([0, 0]), 'Omega.inv': np.identity(2)},
         {'tausq.y': 100, 'beta': np.array([100, 100]), 'Omega.inv': 100 * np.identity(2)},
         {'tausq.y': 0.1, 'beta': np.array([-100, -100]), 'Omega.inv': 0.01 * np.identity(2)}]

In [9]:
jags_model_string = '''
  data {
      dim.Y <- dim(Y)
    }
    model {
      for(i in 1:dim.Y[1]) {

        for(j in 1:dim.Y[2]) {
          Y[i,j] ~ dnorm(mu[i,j], tausq.y)
          mu[i,j] <- alpha[i,1] + alpha[i,2] * (X[j] - Xbar)
        }

        alpha[i,1:2] ~ dmnorm(beta, Omega.inv)
      }

      tausq.y ~ dgamma(0.001, 0.001)
      sigma.y <- 1 / sqrt(tausq.y)

      beta ~ dmnorm(mu0, Sigma0.inv)
      Omega.inv ~ dwish(2*Omega0, 2)
      Omega <- inverse(Omega.inv)

      rho <- Omega[1,2] / sqrt(Omega[1,1] * Omega[2,2])
      rho.ind <- rho >= 0
    }
    '''


In [10]:
jags_model \
    = pj.Model(code=jags_model_string,
               init=inits,
               data=data,
               chains=3,
               )

In [11]:
samples_1 = jags_model.sample(iterations=14000, vars=["beta", "sigma.y", "rho", 'Omega', 'rho.ind'])
samples_1['Omega'] = np.reshape(samples_1['Omega'], [4, 14000, 3])

sampling: iterations 42000 of 42000, elapsed 0:00:03, remaining 0:00:00


In [12]:
idata = az.from_pyjags(samples_1)

#### 2b. Bivariate model summary

In [13]:
# burning the first 4000 iterations
az.summary(idata.posterior.sel(draw=slice(4000, 14000))[["beta", "sigma.y", "rho", 'Omega', 'rho.ind']], stat_funcs=func_dict, extend=False)

Unnamed: 0,Mean,SD,SE,2.5%,25%,50%,75%,97.5%
beta[0],0.116,0.189,0.001,-0.26,-0.008,0.118,0.243,0.486
beta[1],-0.006,0.004,0.0,-0.013,-0.008,-0.006,-0.003,0.002
sigma.y,0.383,0.019,0.0,0.347,0.37,0.382,0.396,0.423
rho,0.173,0.146,0.001,-0.124,0.076,0.179,0.274,0.446
Omega[0],1.75,0.369,0.002,1.167,1.488,1.707,1.96,2.602
Omega[1],0.006,0.005,0.0,-0.004,0.002,0.005,0.009,0.016
Omega[2],0.006,0.005,0.0,-0.004,0.002,0.005,0.009,0.016
Omega[3],0.001,0.0,0.0,0.0,0.0,0.001,0.001,0.001
rho.ind,0.878,0.327,0.002,0.0,1.0,1.0,1.0,1.0


#### 2c. The posterior probability for $\rho >= 0$

In [14]:
rho_ge_0 = round(idata.posterior['rho.ind'].mean().values * 100, 2)
rho_ge_0

87.72

The posterior probability for $\rho >= 0$ is  87.72%. Which means that slope and intercept parameter are correlated

### **3. GRADUATE SECTION**

#### 3a. Conditionally iid

Consider Bernoulli (0 or 1) random variables $Y_1$ and $Y_2$. $Y_1$ and $Y_2$ are $conditionally \  iid$ given $X$.

**i.** The joint probability for $Y_1$ anda $Y_2$ is given by

\begin{equation} \label{eq:1}
Prob(Y1 = 1, Y2 = 1) = 0
\end{equation}


By the law of total probability we know that,

\begin{equation} \label{eq:2}
Prob(Y_1 = 1, Y_2= 1) \ \ =  \ \sum_x Prob(Y_1 = 1, Y_2 = 1 | X = x) Prob(X=x)
\end{equation}

As $Y_1$ and $Y_2$ are conditionally independent. Thus,

\begin{equation} \label{eq:3}
Prob(Y_1 = 1, Y_2= 1) \ \ =  \ \sum_x Prob(Y_1 = 1| x) Prob(Y_2 = 1 | X = x) Prob(X=x)
\end{equation}

We know that $Prob(Y_1 = 1, Y_2= 1) \ \ = 0$ 

\begin{equation} \label{eq:4}
0 =  \ \sum_x Prob(Y_1 = 1| x) Prob(Y_2 = 1 | X = x) Prob(X=x)
\end{equation}

Also that $Prob(Y_1 = 1 | X=x) \  =  \ Prob(Y_2= 1 | X=x) \ \ =  g(x)$ then,


\begin{equation} \label{eq:5}
0 =  \ \sum_x g(x)^{2} Prob(X=x)
\end{equation}

From \ref{eq:5} we have that for $Prob(X=x) > 0$, the $g(x)^2$ term needs to be $0$ to fullfil the equality. Therefore, this implies that $Prob(Y_1 = 1 | X = x) \ \ =  \ Prob(Y_2 = 1 | X = x)$ are also $0$
$\\ \\
$



**ii.** Show that $g(x) = 0$ such that $Prob(X=x)>0$ implies that $Prob(Y_1=1) = Prob(Y_2=1) = 0$

\begin{align*} \label{eq:1}
Prob(Y_1 = 1) \ \ &=  \ \sum_x Prob(Y_1 = 1| x) Prob(X=x) \\ \\
0 \ \ &=  \ \sum_x g(x) Prob(X=x)
\end{align*}

Since the we need to fullfill the equality, $g(x)$ needs to be 0 since  $Prob(X=x) > 0 $. Similarly for $Y_2$. Therefore $Prob(Y_1=1) = Prob(Y_2=1) = 0$

#### 3b. $Y_1$ and $Y_2$ are exchangeable

**i)**

\begin{equation}
Prob(Y_1 = 1; Y_2 = 0)  \ \ =  \ \ Prob(Y_1 = 0; Y_2 = 1) = 1/2
\end{equation}

The joint distribution of $Y_1$ and $Y_2$ are exhangeable because they have a symetric probability matrix as follows:

$$
\begin{array}{ccc}
& & Y_1 \\
& Y_2 & \begin{bmatrix} 0  & 1/2 \\ 1/2 & 0 \end{bmatrix}
\end{array}
$$


We can exhange them $Y_1$ and $Y_2$ and the joint probability matrix will still be the same

$$
\begin{array}{ccc}
& & Y_2 \\
& Y_1 & \begin{bmatrix} 0  & 1/2 \\ 1/2 & 0 \end{bmatrix}
\end{array}
$$



**ii)**  Suppose for contradiction that there exists a discrete random variables $X$ such that $Y_1$ and $Y_2$. $Y_1$ and $Y_2$ are $conditionally \  iid$ given $X$.


\begin{equation} 
Prob(Y_1 = 1; Y_2 = 0)  \ \ =  \ \ Prob(Y_1 = 0; Y_2 = 1) = 1/2
\end{equation}

Also we know from **3a** that $g(x) = 0$ for all $X$ such that $Prob(X = x) > 0$. Thus, 

\begin{align} \label{eq:67}
Prob(Y_1 = 1; Y_2 = 0) &=  \ \sum_x g(x)^{2} Prob(X=x) \\ \\
1/2 &=  \ \sum_x g(x)^{2} Prob(X=x)
\end{align}

$g(x)$ and $Prob(X=x)$ must be greater than zero to fullfill the equality in eq\ref{eq:67}. Therefore, there are not such X. 