In [414]:
# Load package
import numpy as np; from scipy import stats; import matplotlib.pyplot as plt; import pymc as pm;import arviz as az; 
import math; import pandas as pd

# Task 1: Real data set example 

Replicate application in section 5 from O. Fonseca, T. C., R. Ferreira, M. A., & Migon, H. S. (2014). Objective Bayesian analysis for the Student-t regression model. Biometrika, 101(1), 252–252. https://doi.org/10.1093/biomet/asu001

In [3]:
# Load data
data = pd.read_csv('PublicSchools.csv')
print(data.head())

   Unnamed: 0  Expenditure  Income
0     Alabama          275    6247
1      Alaska          821   10851
2     Arizona          339    7374
3    Arkansas          275    6183
4  California          387    8850


In [4]:
# Extract data in vector form

# Predictors
X1 = data['Income'] # Linear 
X2 = data['Income']^(2) # Quadratic

# Outcome
true_y = data['Expenditure']

# Sample size = 50
n = 50

## Geweke's lamda = 0.1, Linear model

In [26]:
# Log probability of sigma prior: pdf = 1/x
def logsig(x):
    return -np.log(x)

In [29]:
model_0_1 = pm.Model()

with model_0_1:
    # Geweke prior on nu with lamda = 0.1
    nu = pm.Exponential('nu', lam = 0.1)
    
    # sigma prior 1/x
    sigma = pm.CustomDist('sigma',logp=logsig)
    
    # Flat prior on betas
    intercept = pm.Flat('intercept')
    beta_1 = pm.Flat('beta_1')
    
    # Linear combination of beta and x
    mu = pm.Deterministic('mu', intercept + beta_1*X1)

    # T regression likelihood
    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = n, observed = true_y)

    idata = pm.sample()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, intercept, beta_1]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 9 seconds.


In [80]:
# Posterior median
print("number of MCMC samples:", idata.posterior['nu'].values.flatten().shape[-1])
az.summary(idata,var_names=["nu","sigma","intercept","beta_1"], stat_focus="median", round_to=2)

number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,6.18,2.37,2.31,22.87,0.1,2227.71,2021.0,1.0
sigma,48.96,5.1,36.33,64.18,0.18,2139.09,1912.97,1.0
intercept,-80.23,43.61,-211.52,47.32,2.21,1624.88,1696.96,1.0
beta_1,0.06,0.01,0.04,0.08,0.0,1716.45,1698.63,1.0


## Geweke's lamda = 0.1, Quadratic model

In [33]:
model_0_1_quad = pm.Model()

with model_0_1_quad:
    # Geweke prior on nu with lamda = 0.1
    nu = pm.Exponential('nu', lam = 0.1)
    
    # sigma prior 1/x
    sigma = pm.CustomDist('sigma',logp=logsig)
    
    # Flat prior on betas
    intercept = pm.Flat('intercept')
    beta_1 = pm.Flat('beta_1')
    beta_2 = pm.Flat('beta_2')
       
    # Linear combination of beta and x
    mu = pm.Deterministic('mu', intercept + beta_1*X1 + beta_2*X2)

    # T regression likelihood
    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = n, observed = true_y)

    idata_quad = pm.sample()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, intercept, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 217 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 438 divergences after tuning. Increase `target_accept` or reparameterize.
Chain 0 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 1 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 2 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.


In [81]:
# Posterior median
print("number of MCMC samples:", idata_quad.posterior['nu'].values.flatten().shape[-1])
az.summary(idata_quad,var_names=["nu","sigma","intercept","beta_1","beta_2"], stat_focus="median", round_to=2)

number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,8.71,4.74,2.42,24.13,3.59,9.01,42.7,1.43
sigma,52.93,3.51,39.32,63.21,3.6,6.6,42.26,1.48
intercept,-91.42,32.5,-213.47,38.56,7.57,21.94,174.12,1.56
beta_1,-0.96,3.84,-10.35,8.94,2.7,6.43,13.21,2.4
beta_2,1.02,3.85,-8.88,10.41,2.7,6.43,12.93,2.41


In Geweke 0.1 prior, quadratic model gives higher estimate than linear model (similar with paper results 6.35 vs 8.07)

## Geweke's lamda = 1.0, Linear model

In [22]:
model_1_0 = pm.Model()

with model_1_0:
    # Geweke prior on nu with lamda = 1.0  
    nu = pm.Exponential('nu', lam = 1.0)
    
    # sigma prior 1/x
    sigma = pm.CustomDist('sigma',logp=logsig)
    
    # Flat prior on betas
    intercept = pm.Flat('intercept')
    beta_1 = pm.Flat('beta_1')
    
    # Linear combination of beta and x
    mu = pm.Deterministic('mu', intercept + beta_1*X1)

    # T regression likelihood
    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = n, observed = true_y)

    idata2 = pm.sample()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, intercept, beta_1]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 10 seconds.


In [82]:
# Posterior median
print("number of MCMC samples:", idata2.posterior['nu'].values.flatten().shape[-1])
az.summary(idata2,var_names=["nu","sigma","intercept","beta_1"], stat_focus="median", round_to=2)

number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,2.93,0.68,1.5,5.67,0.02,2466.26,2100.06,1.0
sigma,43.28,4.51,31.57,57.72,0.17,2410.82,1899.52,1.0
intercept,-71.59,42.87,-194.72,48.06,1.57,1939.69,2036.53,1.0
beta_1,0.06,0.01,0.04,0.07,0.0,1955.62,1981.61,1.0


## Gweke's lamda = 1.0, Quadratic model

In [37]:
model_1_0_quad = pm.Model()

with model_1_0_quad:
    # Geweke prior on nu with lamda = 1.0 
    nu = pm.Exponential('nu', lam = 1.0)
    
    # sigma prior 1/x
    sigma = pm.CustomDist('sigma',logp=logsig)
    
    # Flat prior on betas
    intercept = pm.Flat('intercept')
    beta_1 = pm.Flat('beta_1')
    beta_2 = pm.Flat('beta_2')
    
    # Linear combination of beta and x
    mu = pm.Deterministic('mu', intercept + beta_1*X1 + beta_2*X2)

    # T regression likelihood
    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = n, observed = true_y)

    idata2_quad = pm.sample()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, intercept, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 214 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
Chain 0 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 1 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 3 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.


In [83]:
# Posterior median
print("number of MCMC samples:", idata2_quad.posterior['nu'].values.flatten().shape[-1])
az.summary(idata2_quad,var_names=["nu","sigma","intercept","beta_1","beta_2"], stat_focus="median", round_to=2)

number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,3.52,0.57,1.65,5.84,0.31,12.74,112.01,1.24
sigma,41.3,2.88,34.12,56.84,2.83,11.57,233.66,1.12
intercept,-44.32,35.81,-182.19,12.7,5.89,10.13,85.67,1.55
beta_1,2.13,2.11,-4.36,7.35,1.17,5.4,33.47,1.87
beta_2,-2.08,2.11,-7.3,4.41,1.18,5.34,33.47,1.88


Geweke's prior with lamda = 1 leads to smaller nu estimates (consistent with paper results).

## Jeffreys prior, Linear model

In [48]:
import pytensor.tensor as pt
# Jeffreys prior pdf
def logJeff(x):
    return pt.log((x/(x+3))**(1/2)*(pt.polygamma(1,x/2) - pt.polygamma(1, (x+1)/2) - 2*(x+3)/(x*(x+1)**2))**(1/2))

In [49]:
modelJeff = pm.Model()

with modelJeff:
    # Jeffrey's prior on nu
    nu = pm.CustomDist('nu', logp=logJeff)
    
    # sigma prior 1/x
    sigma = pm.CustomDist('sigma',logp=logsig)
    
    # Flat prior on betas
    intercept = pm.Flat('intercept')
    beta_1 = pm.Flat('beta_1')
    
    # Linear combination of beta and x
    mu = pm.Deterministic('mu', intercept + beta_1*X1)

    # T regression likelihood
    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = n, observed = true_y)

    idataJeff = pm.sample()

  warn(
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
  warn(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, intercept, beta_1]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 14 seconds.


In [57]:
# Posterior median
print("number of MCMC samples:", idataJeff.posterior['nu'].values.flatten().shape[-1])
az.summary(idataJeff,var_names=["nu","sigma","intercept","beta_1"], stat_focus="median", round_to=2)

number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,4.3,1.47,1.89,17.44,0.08,1461.38,284.94,1.01
sigma,46.52,4.96,34.03,62.19,0.22,1572.16,1466.83,1.01
intercept,-72.46,43.57,-205.27,48.7,1.74,1480.17,1465.44,1.0
beta_1,0.06,0.01,0.04,0.08,0.0,1463.1,1358.44,1.0


- nu, sigma and intercept estimates are close to paper result
- intercept CI is off from paper result (-210.8, 53)
- beta_1 median estimiate is off from paper result : 583.2

## Jeffreys prior, Quadratic model

In [68]:
modelJeff_quad = pm.Model()

with modelJeff_quad:
    # Jeffrey's prior on nu
    nu = pm.CustomDist('nu', logp=logJeff)
    
    # sigma prior 1/x
    sigma = pm.CustomDist('sigma',logp=logsig)
    
    # Flat prior on betas
    intercept = pm.Flat('intercept')
    beta_1 = pm.Flat('beta_1')
    beta_2 = pm.Flat('beta_2')
    
    # Linear combination of beta and x
    mu = pm.Deterministic('mu', intercept + beta_1*X1 + beta_2*X2)

    # T regression likelihood
    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = n, observed = true_y)

    idataJeff_quad = pm.sample()

  warn(
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
  warn(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, intercept, beta_1, beta_2]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 312 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
Chain 0 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 2 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.
Chain 3 reached the maximum tree depth. Increase `max_treedepth`, increase `target_accept` or reparameterize.


In [69]:
# Posterior median
print("number of MCMC samples:", idataJeff_quad.posterior['nu'].values.flatten().shape[-1])
az.summary(idataJeff_quad,var_names=["nu","sigma","intercept","beta_1","beta_2"], stat_focus="median", round_to=2)

number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,4.49,0.96,1.83,8.44,0.15,24.72,239.82,1.57
sigma,46.25,2.8,33.83,56.83,0.01,93.28,180.93,1.55
intercept,-89.31,29.58,-190.46,29.4,0.07,50.15,218.88,1.54
beta_1,1.12,2.84,-3.16,9.25,2.58,7.59,44.39,1.44
beta_2,-1.08,2.86,-9.19,3.22,2.58,7.59,44.39,1.44


- nu and sigma estimates are close to paper result
- nu upper CI is a bit off from paper result: 24.92
- all beta estimates are very off from paper results: 899.7, -2077, 1789.7

## Potential explanation: direct substitution of MLE for betas?

In [78]:
lin = 899.7 -2077*X1 + 1789.7*X2

modelJeff_quad_MLE = pm.Model()

with modelJeff_quad_MLE:
    # Jeffrey's prior on nu
    nu = pm.CustomDist('nu', logp=logJeff, initval = 4.8)
    
    # sigma prior 1/x
    sigma = pm.CustomDist('sigma',logp=logsig, initval = 46)

    # T regression likelihood
    y = pm.StudentT('y', nu = nu, mu = lin, sigma = sigma, shape = n, observed = true_y)

    idataJeff_quad_MLE = pm.sample()

  warn(
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
  warn(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 16 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 1790 divergences after tuning. Increase `target_accept` or reparameterize.


In [79]:
az.summary(idataJeff_quad_MLE,var_names=["nu","sigma"], stat_focus="median", round_to=2)

Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,46.34,32.52,8.29,5037.51,8.39,90.23,33.45,1.07
sigma,2193129.58,142604.6,1842588.57,2669710.18,13759.88,205.53,129.42,1.01


nu and sigma estimates way off now.

# Task 2: Report format

# Task 3:  T regression in R

- use StLM function
- https://cran.r-project.org/web/packages/StReg/StReg.pdf

# Task 4: Simulation on $\nu$ profile likelihood by obtaining $\beta_\nu$ and $\sigma_\nu$

In [420]:
# Load packages
from scipy.optimize import minimize
from scipy import special

In [421]:
# sample data
x1 = stats.norm(loc = 0, scale = 1).rvs(100)
x2 = stats.norm(loc = 0, scale = 1).rvs(100)
x3 = stats.norm(loc = 0, scale = 1).rvs(100)
x4 = stats.norm(loc = 0, scale = 1).rvs(100)

n = 100

true_xb = 1 + x1*1 + 0.3*x2 + 0.9*x3 + 1*x4
true_nu = 5
hyper_sigma = math.sqrt(1.5)

true_Y= stats.t.rvs(df=true_nu, loc=true_xb, scale=hyper_sigma, random_state=None)

In [427]:
# Log likelihood function to be optimized
def get_beta_sigma(params, nu):
    intercept, beta_1, beta_2, beta_3, beta_4, sigma = params
    XB = intercept + beta_1*x1 + beta_2*x2 + beta_3*x3 + beta_4*x4

    # loglikelihood
    equation = n * np.log(special.gamma((nu + 1)/2)) + n* nu *0.5 * np.log(nu) - n * np.log(special.gamma(nu/2)) - 0.5*n*np.log(np.pi) - n * np.log(sigma) - 0.5 *(nu + 1)*np.sum(np.log(nu + ((true_Y - XB)/sigma)**2))
    return -equation

In [428]:
# Optimize function with initial guess
def optimize_over_nu(nu):
    initial_guess = [1, 1, 1, 1, 1, 1]
    result = minimize(get_beta_sigma, initial_guess, args=(nu,), method='BFGS')  # You can choose other optimization methods as well

    return result.x, result.fun  # Return the optimized parameters and the likelihood value

In [438]:
# Loop through different values of nu
nu_values = range(100)  # Any desired range of nu values here
best_likelihood = float('inf')
best_parameters = None

for nu in nu_values:
    parameters, likelihood = optimize_over_nu(nu)
    #print(f"For nu={nu}, optimized parameters: {parameters}, likelihood: {likelihood}")

    # Update best parameters if likelihood is greater
    if likelihood < best_likelihood:
        best_likelihood = likelihood
        best_parameters = parameters

print(f"The parameters giving the largest likelihood: {best_parameters}, likelihood: {best_likelihood}")

  equation = n * np.log(special.gamma((nu + 1)/2)) + n* nu *0.5 * np.log(nu) - n * np.log(special.gamma(nu/2)) - 0.5*n*np.log(np.pi) - n * np.log(sigma) - 0.5 *(nu + 1)*np.sum(np.log(nu + ((true_Y - XB)/sigma)**2))
  equation = n * np.log(special.gamma((nu + 1)/2)) + n* nu *0.5 * np.log(nu) - n * np.log(special.gamma(nu/2)) - 0.5*n*np.log(np.pi) - n * np.log(sigma) - 0.5 *(nu + 1)*np.sum(np.log(nu + ((true_Y - XB)/sigma)**2))
  equation = n * np.log(special.gamma((nu + 1)/2)) + n* nu *0.5 * np.log(nu) - n * np.log(special.gamma(nu/2)) - 0.5*n*np.log(np.pi) - n * np.log(sigma) - 0.5 *(nu + 1)*np.sum(np.log(nu + ((true_Y - XB)/sigma)**2))
  equation = n * np.log(special.gamma((nu + 1)/2)) + n* nu *0.5 * np.log(nu) - n * np.log(special.gamma(nu/2)) - 0.5*n*np.log(np.pi) - n * np.log(sigma) - 0.5 *(nu + 1)*np.sum(np.log(nu + ((true_Y - XB)/sigma)**2))
  equation = n * np.log(special.gamma((nu + 1)/2)) + n* nu *0.5 * np.log(nu) - n * np.log(special.gamma(nu/2)) - 0.5*n*np.log(np.pi) - n * n

The parameters giving the largest likelihood: [0.98536624 0.67204923 0.40009124 0.61678768 0.80376156 1.17711344], likelihood: 179.55656443047394


# Task 6: Use Profile Likelihood (with MLE fitted) and Jeffery Prior estimate $\nu$

In [439]:
intercept = 0.98536624
beta_1 = 0.67204923
beta_2 = 0.40009124
beta_3 = 0.61678768
beta_4 = 0.80376156
sigma = 1.17711344

mu = intercept + beta_1*x1 + beta_2*x2 + beta_3*x3 + beta_4*x4

In [440]:
def logJeff(x):
    return pt.log((x/(x+3))**(1/2)*(pt.polygamma(1,x/2) - pt.polygamma(1, (x+1)/2) - 2*(x+3)/(x*(x+1)**2))**(1/2))

modelJeff_MLE = pm.Model()

with modelJeff_MLE:
    nu = pm.CustomDist('nu', logp=logJeff, initval = 1)

    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = 100, observed = true_Y)

    idataJeff_MLE = pm.sample()

  warn(
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
  warn(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 2 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
There were 33 divergences after tuning. Increase `target_accept` or reparameterize.


In [441]:
print("number of MCMC samples:", idataJeff_MLE.posterior['nu'].values.flatten().shape[-1])
az.summary(idataJeff_MLE.posterior['nu'], stat_focus="median", round_to=2)

number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,4.63,1.01,2.66,9.22,0.06,1160.1,880.2,1.01


In [442]:
with modelJeff_MLE:
    map_est_Jeff_MLE = pm.find_MAP()
print(map_est_Jeff_MLE['nu'])

  warn(



4.030831329363366


# Task 5: Reference priors

## What is Ordering

BERGER, J. O., & BERNARDO, J. M. (1992). Ordered group reference priors with application to the multinomial problem. Biometrika, 79(1), 25–37. https://doi.org/10.1093/biomet/79.1.25

Wang, M., & Yang, M. (2016). Posterior property of Student-t linear regression model using objective priors. Statistics & Probability Letters, 113, 23–29. https://doi.org/10.1016/j.spl.2016.02.003

- 'parameters of interest' and 'nuisance parameters
- multiplying Fisher's information matrix in different order gives different joint distribution

In [402]:
# sample data
x1 = stats.norm(loc = 0, scale = 1).rvs(100)
x2 = stats.norm(loc = 0, scale = 1).rvs(100)
x3 = stats.norm(loc = 0, scale = 1).rvs(100)
x4 = stats.norm(loc = 0, scale = 1).rvs(100)

n = 100

true_xb = 1 + x1*1 + 0.3*x2 + 0.9*x3 + 1*x4
true_nu = 1
hyper_sigma = math.sqrt(1.5)

true_Y= stats.t.rvs(df=true_nu, loc=true_xb, scale=hyper_sigma, random_state=None)

In [268]:
# Define square root delta functions

def sqrtdelta1(x):
    return (pt.polygamma(1,x/2) - pt.polygamma(1, (x+1)/2) - 2*(x+5)/(x*(x+1)*(x+3)))**(1/2)

def sqrtdelta2(x):
    return (pt.polygamma(1,x/2) - pt.polygamma(1, (x+1)/2) - 2*(x+3)/(x*(x+1)**2))**(1/2)

In [269]:
## 1. R1

def r1sigma(x):
    return pt.log(sqrtdelta1(x))


model_r1 = pm.Model()

with model_r1:
    nu = pm.CustomDist('nu', logp=r1sigma, initval = 1)
    
    sigma = pm.CustomDist('sigma',logp=logsig, initval = 1)
    
    intercept = pm.Flat('intercept')
    beta_1 = pm.Flat('beta_1')
    beta_2 = pm.Flat('beta_2')
    beta_3 = pm.Flat('beta_3')
    beta_4 = pm.Flat('beta_4')
    
    mu = pm.Deterministic('mu', intercept + beta_1*x1 + beta_2*x2 + beta_3*x3 + beta_4*x4)

    ### Changed to a T distribution ###
    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = n, observed = true_Y)

    idata_r1 = pm.sample()

  warn(
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
  warn(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, intercept, beta_1, beta_2, beta_3, beta_4]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 9 seconds.
There were 199 divergences after tuning. Increase `target_accept` or reparameterize.


In [270]:
## 2. R2

def r2sigma(x):
    return pt.log(sqrtdelta2(x))


model_r2 = pm.Model()

with model_r2:
    nu = pm.CustomDist('nu', logp=r2sigma, initval = 1)
    
    sigma = pm.CustomDist('sigma',logp=logsig, initval = 1)
    
    intercept = pm.Flat('intercept')
    beta_1 = pm.Flat('beta_1')
    beta_2 = pm.Flat('beta_2')
    beta_3 = pm.Flat('beta_3')
    beta_4 = pm.Flat('beta_4')
    
    mu = pm.Deterministic('mu', intercept + beta_1*x1 + beta_2*x2 + beta_3*x3 + beta_4*x4)

    ### Changed to a T distribution ###
    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = n, observed = true_Y)

    idata_r2 = pm.sample()

  warn(
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
  warn(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, intercept, beta_1, beta_2, beta_3, beta_4]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 7 seconds.
There were 106 divergences after tuning. Increase `target_accept` or reparameterize.


In [271]:
## 3. R3

p = 5

def r3sigma(x):
    return pt.log(((x+1)/(x+3))**(p/2)*sqrtdelta1(x))


model_r3 = pm.Model()

with model_r3:
    nu = pm.CustomDist('nu', logp=r3sigma, initval = 1)
    
    sigma = pm.CustomDist('sigma',logp=logsig, initval = 1)
    
    intercept = pm.Flat('intercept')
    beta_1 = pm.Flat('beta_1')
    beta_2 = pm.Flat('beta_2')
    beta_3 = pm.Flat('beta_3')
    beta_4 = pm.Flat('beta_4')
    
    mu = pm.Deterministic('mu', intercept + beta_1*x1 + beta_2*x2 + beta_3*x3 + beta_4*x4)

    ### Changed to a T distribution ###
    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = n, observed = true_Y)

    idata_r3 = pm.sample()

  warn(
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
  warn(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, intercept, beta_1, beta_2, beta_3, beta_4]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 8 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 1040 divergences after tuning. Increase `target_accept` or reparameterize.


In [272]:
## 4. R4

p = 5

def r4sigma(x):
    return pt.log(((x+1)/(x+3))**(p/2)*sqrtdelta2(x))


model_r4 = pm.Model()

with model_r4:
    nu = pm.CustomDist('nu', logp=r4sigma, initval = 1)
    
    sigma = pm.CustomDist('sigma',logp=logsig, initval = 1)
    
    intercept = pm.Flat('intercept')
    beta_1 = pm.Flat('beta_1')
    beta_2 = pm.Flat('beta_2')
    beta_3 = pm.Flat('beta_3')
    beta_4 = pm.Flat('beta_4')
    
    mu = pm.Deterministic('mu', intercept + beta_1*x1 + beta_2*x2 + beta_3*x3 + beta_4*x4)

    ### Changed to a T distribution ###
    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = n, observed = true_Y)

    idata_r4 = pm.sample()

  warn(
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
  warn(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, intercept, beta_1, beta_2, beta_3, beta_4]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 8 seconds.
There were 278 divergences after tuning. Increase `target_accept` or reparameterize.


In [273]:
## 5. R5

p = 5

def psigma(x):
    return -(p+1)*np.log(x)

model_r5 = pm.Model()

with model_r5:
    nu = pm.CustomDist('nu', logp=r2sigma, initval = 1)
    
    sigma = pm.CustomDist('sigma',logp=psigma, initval = 1)
    
    intercept = pm.Flat('intercept')
    beta_1 = pm.Flat('beta_1')
    beta_2 = pm.Flat('beta_2')
    beta_3 = pm.Flat('beta_3')
    beta_4 = pm.Flat('beta_4')
    
    mu = pm.Deterministic('mu', intercept + beta_1*x1 + beta_2*x2 + beta_3*x3 + beta_4*x4)

    ### Changed to a T distribution ###
    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = n, observed = true_Y)

    idata_r5 = pm.sample()

  warn(
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
  warn(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, intercept, beta_1, beta_2, beta_3, beta_4]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 7 seconds.
There were 169 divergences after tuning. Increase `target_accept` or reparameterize.


In [274]:
## 6. R6

p = 5

model_r6 = pm.Model()

with model_r6:
    nu = pm.CustomDist('nu', logp=r1sigma, initval = 1)
    
    sigma = pm.CustomDist('sigma',logp=psigma, initval = 1)
    
    intercept = pm.Flat('intercept')
    beta_1 = pm.Flat('beta_1')
    beta_2 = pm.Flat('beta_2')
    beta_3 = pm.Flat('beta_3')
    beta_4 = pm.Flat('beta_4')
    
    mu = pm.Deterministic('mu', intercept + beta_1*x1 + beta_2*x2 + beta_3*x3 + beta_4*x4)

    ### Changed to a T distribution ###
    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = n, observed = true_Y)

    idata_r6 = pm.sample()

  warn(
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
  warn(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, intercept, beta_1, beta_2, beta_3, beta_4]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 7 seconds.
There were 29 divergences after tuning. Increase `target_accept` or reparameterize.


In [275]:
def logJeff(x):
    return pt.log((x/(x+3))**(1/2)*(pt.polygamma(1,x/2) - pt.polygamma(1, (x+1)/2) - 2*(x+3)/(x*(x+1)**2))**(1/2))

modelJeff = pm.Model()

with modelJeff:
    nu = pm.CustomDist('nu', logp=logJeff, initval = 1)
    
    sigma = pm.CustomDist('sigma',logp=logsig, initval = 1)
    
    intercept = pm.Flat('intercept')
    beta_1 = pm.Flat('beta_1')
    beta_2 = pm.Flat('beta_2')
    beta_3 = pm.Flat('beta_3')
    beta_4 = pm.Flat('beta_4')
    
    
    mu = pm.Deterministic('mu', intercept + beta_1*x1 + beta_2*x2 + beta_3*x3 + beta_4*x4)

    ### Changed to a T distribution ###
    y = pm.StudentT('y', nu = nu, mu = mu, sigma = sigma, shape = 100, observed = true_Y)

    idataJeff = pm.sample()

  warn(
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
  warn(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [nu, sigma, intercept, beta_1, beta_2, beta_3, beta_4]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 8 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
There were 72 divergences after tuning. Increase `target_accept` or reparameterize.


## Posterior median

In [276]:
print("number of MCMC samples:", idata_r1.posterior['nu'].values.flatten().shape[-1])
az.summary(idata_r1.posterior['nu'], stat_focus="median", round_to=2)

number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,4.34,1.31,2.15,15.18,0.07,1870.97,811.83,1.0


In [277]:
print("number of MCMC samples:", idata_r2.posterior['nu'].values.flatten().shape[-1])
az.summary(idata_r2.posterior['nu'], stat_focus="median", round_to=2)

number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,4.32,1.36,2.08,16.04,0.06,1418.25,480.96,1.0


In [278]:
print("number of MCMC samples:", idata_r3.posterior['nu'].values.flatten().shape[-1])
az.summary(idata_r3.posterior['nu'], stat_focus="median", round_to=2)


number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,5.28,1.73,2.39,19.14,0.14,412.95,422.84,1.02


In [279]:
print("number of MCMC samples:", idata_r4.posterior['nu'].values.flatten().shape[-1])
az.summary(idata_r4.posterior['nu'], stat_focus="median", round_to=2)

number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,4.92,1.64,2.29,22.19,0.1,1151.69,312.21,1.0


In [280]:
print("number of MCMC samples:", idata_r5.posterior['nu'].values.flatten().shape[-1])
az.summary(idata_r5.posterior['nu'], stat_focus="median", round_to=2)

number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,3.45,0.93,1.8,10.09,0.05,1552.34,427.06,1.01


In [281]:
print("number of MCMC samples:", idata_r6.posterior['nu'].values.flatten().shape[-1])
az.summary(idata_r6.posterior['nu'], stat_focus="median", round_to=2)

number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,3.59,1.01,1.82,11.44,0.05,1638.23,432.9,1.0


In [282]:
print("number of MCMC samples:", idataJeff.posterior['nu'].values.flatten().shape[-1])
az.summary(idataJeff.posterior['nu'], stat_focus="median", round_to=2)

number of MCMC samples: 4000


Unnamed: 0,median,mad,eti_3%,eti_97%,mcse_median,ess_median,ess_tail,r_hat
nu,4.38,1.43,1.8,15.41,0.08,979.39,75.86,1.01


## MAP

In [292]:
with model_r1:
    map_est_r1 = pm.find_MAP()
print(map_est_r1['nu'])

  warn(



3.128794866730754


In [293]:
with model_r2:
    map_est_r2 = pm.find_MAP()
print(map_est_r2['nu'])

  warn(



3.090151871725406


In [294]:
with model_r3:
    map_est_r3 = pm.find_MAP()
print(map_est_r3['nu'])

  warn(



3.4212955995536185


In [295]:
with model_r4:
    map_est_r4 = pm.find_MAP()
print(map_est_r4['nu'])

  warn(



3.376193930111923


In [296]:
with model_r5:
    map_est_r5 = pm.find_MAP()
print(map_est_r5['nu'])

  warn(



2.590546644108015


In [291]:
with model_r6:
    map_est_r6 = pm.find_MAP()
print(map_est_r6['nu'])

  warn(



2.6209153948222395


In [None]:
with modelJeff:
    map_est_Jeff = pm.find_MAP()
print(map_est_Jeff['nu'])