In [1]:
# 正規化なし、generated quantities あり
# Rhat 悪い

In [2]:
import numpy as np
from numpy.random import *
import pystan
import pandas as pd
import matplotlib.pyplot as plt
import arviz as az
from scipy.special import expit

In [3]:
# 「Pythonによる因果分析」第４章のデータを使った

In [4]:
num_data = 500

x_1 = randint(15, 76, num_data).astype('float')
e_z = randn(num_data)

z_base = x_1 - 40 + 1*e_z
z_prob = expit(z_base)

Z = np.array([])
for i in range(num_data):
    Z_i = np.random.choice(2, size=1, p=[1-z_prob[i], z_prob[i]])[0]
    Z = np.append(Z, Z_i)

e_y = randn(num_data)

Y = -x_1 + 40*Z + 100 + 2*e_y

In [5]:
df = pd.DataFrame({'Age': x_1,
                   'CM': Z,
                   'Sale_amount': Y,
                   })

df.head(10)

Unnamed: 0,Age,CM,Sale_amount
0,70.0,1.0,72.285842
1,62.0,1.0,78.833123
2,20.0,0.0,77.38739
3,28.0,0.0,71.21054
4,21.0,0.0,78.925717
5,50.0,1.0,90.225639
6,18.0,0.0,85.419263
7,64.0,1.0,74.748994
8,61.0,1.0,79.748851
9,48.0,1.0,93.087058


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          500 non-null    float64
 1   CM           500 non-null    float64
 2   Sale_amount  500 non-null    float64
dtypes: float64(3)
memory usage: 11.8 KB


In [7]:
Sale_amount = df['Sale_amount']
Age = df['Age']
CM = df['CM']

stan_data = {
    'N': num_data,
    'Sale_amount': Sale_amount,
    'Age': Age,
    'CM': CM
}

In [8]:
stan_code = """
data {
    int N;
    real Sale_amount[N];
    vector[N] Age;
    vector[N] CM;
}

parameters {
    real<upper=0> Intercept;
    real b_Age;
    real<lower=0> sigma;
    real a;
    real b;
    real c;
}

model {
    vector[N] prob = inv_logit(Intercept + b_Age * Age);
    Sale_amount ~ normal(a + b * Age + c * prob, sigma);
}

generated quantities {
    vector[N] Sales_amount_pred;
    for (i in 1:N) {
        Sales_amount_pred[i] = a + b * Age[i] + c * inv_logit(Intercept + b_Age * Age[i]);
    }
}
"""

In [9]:
sm = pystan.StanModel(model_code= stan_code)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_7c1b1bb4a76c136b3b65a9faa2479a8d NOW.


In [10]:
mcmc_result = sm.sampling(
    data=stan_data,
    chains=3,
    iter=3000,
    warmup=2000,
    thin=1
)



In [11]:
print(mcmc_result)

  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)


Inference for Stan model: anon_model_7c1b1bb4a76c136b3b65a9faa2479a8d.
3 chains, each with iter=3000; warmup=2000; thin=1; 
post-warmup draws per chain=1000, total post-warmup draws=3000.

                         mean se_mean     sd     2.5%      25%    50%    75%  97.5%  n_eff   Rhat
Intercept                -inf     nan    inf -1.6e308 -3.7e307 -2.4e4 -20.43 -14.62    nan    nan
b_Age                  128.97  267.29 336.32   -375.5   -116.3   0.31 561.79 605.04      2   6.95
sigma                    8.82    1.34   1.65     6.56     6.87   8.99  10.49  11.26      2    6.7
a                       80.53   10.76  13.24    62.21    65.88  79.84  96.01  98.43      2  11.65
b                       -0.15    0.43   0.53    -0.89    -0.81  -0.04    0.4    0.5      2  17.92
c                       15.45   25.86  37.32   -35.66   -27.91  31.95  34.47  97.65      2   4.38
Sales_amount_pred[1]    72.72    2.79   3.49    67.73    69.86  71.67  76.74  78.48      2   4.73
Sales_amount_pred[2]    78.

In [14]:
Sales_amount_pred = mcmc_result['Sales_amount_pred'].mean(axis = 0)
df['Sales_amount_pred'] = Sales_amount_pred
pd.set_option('display.max_rows', None)
df

Unnamed: 0,Age,CM,Sale_amount,Sales_amount_pred
0,70.0,1.0,72.285842,72.721965
1,62.0,1.0,78.833123,78.521291
2,20.0,0.0,77.38739,77.578381
3,28.0,0.0,71.21054,76.397617
4,21.0,0.0,78.925717,77.430849
5,50.0,1.0,90.225639,84.085513
6,18.0,0.0,85.419263,77.873422
7,64.0,1.0,74.748994,76.812807
8,61.0,1.0,79.748851,79.334886
9,48.0,1.0,93.087058,84.475148
