In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pystan
import seaborn as sns
from sklearn.metrics import r2_score

In [None]:
# Load the stations table; the first CSV column becomes the DataFrame index.
stations = pd.read_csv('input/stations_tokai3ken_edited.csv', index_col=0)

In [None]:
# Predictor table: daily passenger count (乗降客数（日）) next to the railway-line ID (路線ID).
X = stations.loc[:, ['乗降客数（日）', '路線ID']]
# Response: number of eating/drinking establishments (飲食店事業所数).
y = stations.loc[:, '飲食店事業所数']

In [None]:
# Map every railway-line ID to a 1-based integer code (the Stan program declares
# Rosen with lower=1). Codes follow the order of stations['路線ID'].unique().
rosen = {line_id: code for code, line_id in enumerate(stations['路線ID'].unique(), start=1)}

# Per-station line code, in row order. Series.map looks each ID up by value, so it
# does not depend on the index labels of the frame; the previous X['路線ID'][i]
# was a label lookup and is only correct when the index is exactly 0..N-1.
Rosen = X['路線ID'].map(rosen).tolist()

# Input dictionary for the Stan program; the keys match its `data` block.
data = dict(
    N_station=X.shape[0],
    N_rosen=len(rosen),
    Chukan=X['乗降客数（日）'].values,
    Rosen=Rosen,
    Inshoku=y.values
)

In [None]:
# Stan program for a hierarchical regression with Student-t (nu = 4) distributions:
#   data        - N_station observations; Chukan is the predictor, Inshoku the
#                 response, Rosen[i] the 1-based group code of observation i.
#   parameters  - per group: intercept a, slope b (>= 0) and scale s (>= 0),
#                 plus hyper-parameters mu_* / s_* for each of the three.
#   model       - a, b, s ~ student_t(4, mu_*, s_*); each Inshoku[i] ~
#                 student_t(4, a[g] + b[g]*Chukan[i], s[g]) with g = Rosen[i].
#   generated quantities - one predictive draw `predict[i]` per observation.
# No sampling statements are given for the hyper-parameters mu_* and s_*.
model = '''
data {
    int N_station;
    int N_rosen;
    vector<lower=0>[N_station] Chukan;
    int<lower=1> Rosen[N_station];
    vector<lower=0>[N_station] Inshoku;
}

parameters {
    real mu_a;
    real<lower=0> mu_b;
    real<lower=0> mu_s;
    real<lower=0> s_a;
    real<lower=0> s_b;
    real<lower=0> s_s;
    vector[N_rosen] a;
    vector<lower=0>[N_rosen] b;
    vector<lower=0>[N_rosen] s;
}

model {
    a ~ student_t(4, mu_a, s_a);
    b ~ student_t(4, mu_b, s_b);
    s ~ student_t(4, mu_s, s_s);
    for (i in 1:N_station)
        Inshoku[i] ~ student_t(4, a[Rosen[i]] + b[Rosen[i]]*Chukan[i], s[Rosen[i]]);
}

generated quantities{
    vector[N_station] predict;
    for (i in 1:N_station)
        predict[i] = student_t_rng(4, a[Rosen[i]] + b[Rosen[i]]*Chukan[i], s[Rosen[i]]);
}
'''

In [None]:
# Fixed RNG seed so that re-running the notebook gives the same draws.
SEED = 42
# Short exploratory run: 3 chains of 500 iterations each, the first 100 being warm-up.
# For a longer run use e.g. chains=4, iter=2000, warmup=500.
fit = pystan.stan(model_code=model, data=data, chains=3, iter=500, warmup=100, thin=1, seed=SEED)

In [None]:
# Show the fit object as the cell output.
fit

In [None]:
# Pull the MCMC draws as an unpermuted array, warm-up iterations included.
ms = fit.extract(inc_warmup=True, permuted=False)
# Names of the sampled quantities.
paraname = fit.sim['fnames_oi']

# Warm-up (burn-in) size as recorded on the fit.
iter_from = fit.sim['warmup']
# Iteration indices with the warm-up stretch left out.
iter_range = np.arange(iter_from, len(ms))

# NOTE: everything is to be drawn this time, so this full range is the one used.
iter_start = np.arange(len(ms))

In [None]:
# Plot styling: current seaborn colour palette, font scale 1, "ticks" axes style.
palette = sns.color_palette()
sns.set(font_scale=1)
sns.set_style("ticks")

# 2 x 3 grid of trace plots for the first six entries of `paraname`,
# all chains drawn in the same colour, over every iteration in `iter_start`.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

# axes.flat walks the grid row by row, so k equals the former i*3+j.
for k, ax in enumerate(axes.flat):
    ax.plot(iter_start, ms[iter_start, :, k], linewidth=2, color=palette[k])
    ax.set_title(paraname[k])
    ax.set_xlabel('mcmc_size')
    ax.set_ylabel('parameter')
    ax.grid(True)

# despine only affects axes that already exist, so it must run after the panels
# are created and be pointed at this figure; called before plt.subplots it had
# no effect on the plot and left an empty extra figure behind.
sns.despine(fig=fig, offset=10, trim=True)

# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

In [None]:
# Posterior summary as a DataFrame (rows: sampled quantities, columns: statistics).
# fit.summary() is called once and its result reused, instead of three separate calls.
fit_summary = fit.summary()
summary = pd.DataFrame(
    data=fit_summary['summary'],
    index=fit_summary['summary_rownames'],
    columns=fit_summary['summary_colnames'],
)

In [None]:
# Posterior medians ('50%' column) of the rows whose label starts with "predict".
is_predict_row = summary.index.str.startswith("predict")
pred = summary.loc[is_predict_row, '50%'].values

In [None]:
# Root-mean-square error between the observed values and the median predictions.
np.sqrt(np.square(y.values - pred).mean())

In [None]:
# Coefficient of determination (R^2) of the median predictions.
# r2_score is imported with the other dependencies in the notebook's first cell.
r2_score(y.values, pred)

In [None]:
# Predicted-vs-measured scatter with a dashed y = x reference line
# spanning the observed range of y.
fig, ax = plt.subplots()
ax.scatter(pred, y, edgecolors=(0, 0, 0))
diagonal = [y.min(), y.max()]
ax.plot(diagonal, diagonal, 'k--', lw=1)
ax.set(xlabel='Predicted', ylabel='Measured')
plt.show()

In [None]:
# Distinct railway-line IDs, from the same unique() call that the `rosen` code mapping is built from.
rosen_uniq = stations['路線ID'].unique()

In [None]:
# 8 x 8 grid: one panel per railway line (at most the first 63 lines) plus a
# pooled "total" panel in the bottom-right corner.
fig, axes = plt.subplots(nrows=8, ncols=8, figsize=(60,60))


def _line_medians(prefix):
    """Return the posterior medians of the rows labelled '<prefix>[...]', in table order.

    Rows are selected by label prefix and afterwards addressed by position, so the
    caller does not depend on whether the row labels count from 0 or from 1.
    """
    return summary.loc[summary.index.str.startswith(f'{prefix}['), '50%'].values


a_med, b_med, s_med = _line_medians('a'), _line_medians('b'), _line_medians('s')

# The last grid cell is reserved for the pooled panel, so one line fewer than the
# number of cells can be shown; fewer lines simply leave panels empty.
n_panels = min(len(rosen_uniq), axes.size - 1)

# axes.flat walks the grid row by row, so k equals the former i*8+j.
# Position k pairs the k-th row of each parameter with rosen_uniq[k].
for k, ax in zip(range(n_panels), axes.flat):
    a, b, s = a_med[k], b_med[k], s_med[k]
    # Stations of this line, ordered by daily passenger count.
    line_rows = stations[stations['路線ID'] == rosen_uniq[k]].sort_values(by='乗降客数（日）')
    xxx = line_rows["乗降客数（日）"]
    yyy = line_rows["飲食店事業所数"]
    ax.plot(xxx, yyy, 'o-')
    # Fitted line a + b*x from 0 to the largest passenger count; a straight
    # line needs only its two end points.
    xx = np.array([0, xxx.iloc[-1]])
    yy = a + b*xx
    ax.plot(xx, yy)
    if k == 3:
        # Panel 3 additionally lists station names. The leading spaces on the
        # continuation lines are part of the title text and are kept unchanged.
        ax.set_title(
                f'{k}, a={a:.2f}, b={b:.4f}, s={s:.2f}, {rosen_uniq[k]}\n \
                ochiaigawa, takenami, kokokei, minosakamoto, kamado,\
                \n sakashita, mizunami, tokishi, ena, nakatugawa, tajimi')
    else:
        ax.set_title(f'{k}, a={a:.2f}, b={b:.4f}, s={s:.2f}, {rosen_uniq[k]}')

# Pooled panel: every station, with the line given by the mu_a / mu_b medians.
a = summary.loc['mu_a', '50%']
b = summary.loc['mu_b', '50%']
s = summary.loc['mu_s', '50%']
stations_sorted = stations.sort_values(by='乗降客数（日）')
xxx = stations_sorted["乗降客数（日）"]
yyy = stations_sorted["飲食店事業所数"]
axes[7,7].plot(xxx, yyy, 'o')
xx = np.array([0, xxx.iloc[-1]])
yy = a + b*xx
axes[7,7].plot(xx, yy)
axes[7,7].set_title(f'{0}, a={a:.2f}, b={b:.4f}, s={s:.2f}, total')

# Hand-entered (x, y) points highlighted on the pooled panel:
# mizunami in red, the other four in yellow.
x_mizunami = 9441
y_mizunami = 22
x_tokishi = 11764
y_tokishi = 39
x_ena = 6367
y_ena = 31
x_nakatsugawa = 7096
y_nakatsugawa = 52
x_tajimi = 27188
y_tajimi = 50
axes[7,7].plot(x_mizunami, y_mizunami, 'ro', x_tokishi, y_tokishi, 'yo', x_ena, y_ena,
               'yo', x_nakatsugawa, y_nakatsugawa, 'yo', x_tajimi, y_tajimi, 'yo')

fig.savefig('output/hierarchical_bayes_joko', dpi=200)
# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

In [None]:
# Posterior medians of every summary row whose label contains "a".
a_summary = summary.loc[summary.index.str.contains("a"), '50%']

In [None]:
# Show the selected medians as the cell output.
a_summary

In [None]:
# Posterior medians of every summary row whose label contains "b".
b_summary = summary.loc[summary.index.str.contains("b"), '50%']

In [None]:
# Show the selected medians as the cell output.
b_summary

In [None]:
# Posterior medians of every summary row whose label contains "s"
# (this also matches the hyper-parameter rows s_a, s_b, s_s and mu_s).
s_summary = summary.loc[summary.index.str.contains("s"), '50%']

In [None]:
# Show the selected medians as the cell output.
s_summary