# 昼間人口

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pystan
import seaborn as sns

In [None]:
# Load the station-level table; the first CSV column becomes the DataFrame index.
stations = pd.read_csv('input/stations_tokai3ken_edited.csv', index_col=0)

In [None]:
# Predictors: daytime population ('昼間人口') and the route ID ('路線ID') of each station.
X = stations[['昼間人口', '路線ID']].copy()
# Response: number of eating/drinking establishments ('飲食店事業所数') per station.
y = stations['飲食店事業所数']

In [None]:
# Map every route ID to a 1-based integer code, numbered in order of first
# appearance in the station table (the Stan program declares Rosen with lower=1).
route_ids = stations['路線ID'].unique()
rosen = {route_id: code for code, route_id in enumerate(route_ids, start=1)}

# Iterate over the column's values instead of indexing with X['路線ID'][i]:
# `stations` is loaded with index_col=0, so the integer labels 0..N-1 are not
# guaranteed to exist, and a label-based lookup can raise KeyError or silently
# pick a different row.
Rosen = [rosen[route_id] for route_id in X['路線ID']]

# Inputs for the Stan program; the keys are the names declared in its `data` block.
data = dict(
    N_station=X.shape[0],
    N_rosen=len(rosen),
    Chukan=X['昼間人口'].values,
    Rosen=Rosen,
    Inshoku=y.values
)

In [None]:
# Stan program: per-route linear regression of the restaurant count (Inshoku)
# on daytime population (Chukan). Each route r has its own intercept a[r],
# slope b[r] and scale s[r]; observations follow a Student-t distribution with
# 4 degrees of freedom. No explicit priors are written — the parameters are
# constrained only by the bounds in their declarations. `predict` holds one
# random draw per station from the same Student-t.
model = '''
data {
    int N_station;
    int N_rosen;
    vector<lower=0>[N_station] Chukan;
    int<lower=1> Rosen[N_station];
    vector<lower=0>[N_station] Inshoku;
}

parameters {
    vector<lower=-10, upper=10>[N_rosen] a;
    vector<lower=0, upper=0.02>[N_rosen] b;
    vector<lower=0, upper=10>[N_rosen] s;
}

model {
    for (i in 1:N_station)
        Inshoku[i] ~ student_t(4, a[Rosen[i]] + b[Rosen[i]]*Chukan[i], s[Rosen[i]]);
}

generated quantities{
    vector[N_station] predict;
    for (i in 1:N_station)
        predict[i] = student_t_rng(4, a[Rosen[i]] + b[Rosen[i]]*Chukan[i], s[Rosen[i]]);
}
'''

階層ベイズモデルのときと同じように a, b, s の値域を制限せずに推定すると収束しないため、このモデルでは a, b, s に上下限を設けている。

In [None]:
# Fixed RNG seed so that re-running the notebook reproduces the same draws.
SEED = 42
# Compile the Stan program and sample 3 chains of 500 iterations each,
# with warmup=100 and no thinning.
fit = pystan.stan(model_code=model, data=data, chains=3, iter=500, warmup=100, thin=1, seed=SEED)

In [None]:
# Display the fit object.
fit

In [None]:
# Extract all draws, warmup included, as an unpermuted array
# (indexed further down as [iteration, chain, parameter]).
ms = fit.extract(inc_warmup=True, permuted=False)
n_draws = ms.shape[0]

# Flat parameter names taken from the fit.
paraname = fit.sim['fnames_oi']

# Warmup (burn-in) length, and the iteration indices that follow it.
iter_from = fit.sim['warmup']
iter_range = np.arange(iter_from, n_draws)

# NOTE: everything is plotted this time (warmup too), so this full range is the one to use.
iter_start = np.arange(0, n_draws)

In [None]:
# Colour palette (captured before sns.set, as in the original cell) and seaborn styling.
palette = sns.color_palette()
sns.set(font_scale=1)
sns.set_style("ticks")

# Trace plots of the first six parameters on a 2x3 grid.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

for k, ax in enumerate(axes.ravel()):
    # ms is indexed [iteration, chain, parameter]: one line per chain.
    ax.plot(iter_start, ms[iter_start, :, k],
            linewidth=2, color=palette[k])
    ax.set_title(paraname[k])
    ax.set_xlabel('mcmc_size')
    ax.set_ylabel('parameter')
    ax.grid(True)

# despine works on an existing figure, so it must run after the axes have been
# created and drawn; calling it beforehand never touches these subplots.
sns.despine(fig=fig, offset=10, trim=True)

# plt.show() instead of fig.show(): the latter only works with a GUI backend.
plt.show()

In [None]:
# Build a labelled DataFrame from the fit summary. fit.summary() is evaluated
# once and reused instead of being recomputed for data, index and columns.
fit_summary = fit.summary()
summary = pd.DataFrame(
    data=fit_summary['summary'],
    index=fit_summary['summary_rownames'],
    columns=fit_summary['summary_colnames'],
)

In [None]:
# Display the summary table.
summary

In [None]:
# Per-station prediction: the '50%' (median) column of every `predict...` row.
is_predict_row = summary.index.str.startswith("predict")
pred = summary.loc[is_predict_row, '50%'].values

In [None]:
# RMSE between the observed counts and the median predictions.
residuals = y.values - pred
np.sqrt(np.mean(residuals ** 2))

In [None]:
# R^2 score of the median predictions against the observed counts.
from sklearn.metrics import r2_score
r2_score(y.values, pred)

In [None]:
# Predicted vs. measured scatter with a dashed y = x reference line.
fig, ax = plt.subplots()
ax.scatter(pred, y, edgecolors=(0, 0, 0))
diagonal = [y.min(), y.max()]
ax.plot(diagonal, diagonal, 'k--', lw=1)
ax.set(xlabel='Predicted', ylabel='Measured')
plt.show()

In [None]:
# Route IDs in the order returned by value_counts() (most frequent first by default).
rosen_rank = stations['路線ID'].value_counts().index

In [None]:
# Display the route IDs ordered by station count.
rosen_rank

In [None]:
# Route IDs in order of first appearance — the same unique() order that was
# used to assign the integer route codes passed to Stan.
rosen_uniq = stations['路線ID'].unique()

In [None]:
# Number of distinct route IDs.
len(rosen_rank)

In [None]:
# One panel per route: observed stations (daytime population vs. restaurant
# count) overlaid with the line a + b*x built from the posterior medians.
#
# Coefficients are looked up by position (k-th route <-> k-th a[...] / b[...]
# row) rather than by the label f'a[{k}]': the row-label numbering (0- vs
# 1-based) has differed between PyStan releases, while the row order has not.
a_medians = summary.loc[summary.index.str.startswith('a['), '50%'].values
b_medians = summary.loc[summary.index.str.startswith('b['), '50%'].values

n_cols = 3
# Ceiling division: as many rows as needed to give every route a panel.
n_rows = -(-len(rosen_uniq) // n_cols)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 100), squeeze=False)
panels = axes.ravel()

# No plt.hold(True): it was removed from Matplotlib, and axes keep earlier
# artists by default anyway.
for k, route_id in enumerate(rosen_uniq):
    ax = panels[k]
    a, b = a_medians[k], b_medians[k]
    # Filter and sort once, then take both columns from the same frame.
    route = stations[stations['路線ID'] == route_id].sort_values(by='昼間人口')
    xxx = route["昼間人口"]
    yyy = route["飲食店事業所数"]
    ax.plot(xxx, yyy, 'o-')
    # A straight line only needs its two end points.
    xx = np.array([0, xxx.iloc[-1]])
    yy = a + b*xx
    ax.plot(xx, yy)
    ax.set_title(f'{k}, a={a:.2f}, b={b:.4f}, {route_id}')

# Hide panels left over when the grid has more cells than there are routes.
for ax in panels[len(rosen_uniq):]:
    ax.set_axis_off()

#fig.savefig('output/bayes_chukan', dpi=200)
plt.show()

In [None]:
# Same per-route panels as a wide 9-column grid, saved to disk.
#
# Coefficients are looked up by position (k-th route <-> k-th a[...] / b[...]
# row) rather than by the label f'a[{k}]': the row-label numbering (0- vs
# 1-based) has differed between PyStan releases, while the row order has not.
a_medians = summary.loc[summary.index.str.startswith('a['), '50%'].values
b_medians = summary.loc[summary.index.str.startswith('b['), '50%'].values

n_cols = 9
# Ceiling division: as many rows as needed to give every route a panel.
n_rows = -(-len(rosen_uniq) // n_cols)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(60, 40), squeeze=False)
panels = axes.ravel()

# No plt.hold(True): it was removed from Matplotlib, and axes keep earlier
# artists by default anyway.
for k, route_id in enumerate(rosen_uniq):
    ax = panels[k]
    a, b = a_medians[k], b_medians[k]
    # Filter and sort once, then take both columns from the same frame.
    route = stations[stations['路線ID'] == route_id].sort_values(by='昼間人口')
    xxx = route["昼間人口"]
    yyy = route["飲食店事業所数"]
    ax.plot(xxx, yyy, 'o-')
    # A straight line only needs its two end points.
    xx = np.array([0, xxx.iloc[-1]])
    yy = a + b*xx
    ax.plot(xx, yy)
    ax.set_title(f'{k}, a={a:.2f}, b={b:.4f}, {route_id}')

# Hide panels left over when the grid has more cells than there are routes.
for ax in panels[len(rosen_uniq):]:
    ax.set_axis_off()

fig.savefig('output/bayes_chukan', dpi=200)
plt.show()

In [None]:
# Chuo Main Line (中央本線)
# route ID: g_3.0
# summary: 3

# number of stations: 11
# Tajimi, Toki, Mizunami, Ena, Nakatsugawa (多治見、土岐、瑞浪、恵那、中津川)
# 5601, 2927, 2465, 3003, 4377  -- NOTE(review): presumably one value per station listed above; the quantity is not stated, confirm

In [None]:
# Close-up of route 'g_3.0': observed stations plus the fitted line a + b*x.
#
# The route's coefficients are found from the route ID itself: its position in
# unique() order (the order used to number routes for Stan) selects the matching
# a[...] / b[...] row. This avoids hard-coding a row label such as 'a[3]', whose
# numbering (0- vs 1-based) has differed between PyStan releases.
route_id = 'g_3.0'
route_pos = list(stations['路線ID'].unique()).index(route_id)
a = summary.loc[summary.index.str.startswith('a['), '50%'].values[route_pos]
b = summary.loc[summary.index.str.startswith('b['), '50%'].values[route_pos]

# Filter and sort once, then take both columns from the same frame.
route = stations[stations['路線ID'] == route_id].sort_values(by='昼間人口')
xxx = route["昼間人口"]
yyy = route["飲食店事業所数"]
plt.plot(xxx, yyy, 'o-')
# A straight line only needs its two end points.
xx = np.array([0, xxx.iloc[-1]])
yy = a + b*xx
plt.plot(xx, yy)

In [None]:
# Same route 'g_3.0' plot, but with the intercept read from a 'mu_a' summary row.
# NOTE(review): the Stan program in this notebook declares only a, b, s and the
# generated `predict`, so no 'mu_a' row is expected in `summary` and this lookup
# would raise KeyError — presumably left over from a hierarchical variant of the
# model; confirm, then either restore that model or drop this cell.
a = summary.loc['mu_a', '50%']
b = summary.loc['b[3]', '50%']
xxx = stations[stations['路線ID'] == 'g_3.0'].sort_values(by='昼間人口')["昼間人口"]
yyy = stations[stations['路線ID'] == 'g_3.0'].sort_values(by='昼間人口')["飲食店事業所数"]
plt.plot(xxx, yyy, 'o-')
# b (a NumPy scalar from the summary table) times a range yields an array of y values.
xx = range(int(xxx.tolist()[-1]))
yy = a + b*xx
plt.plot(xx, yy)

In [None]:
# Posterior medians of the per-route intercepts. Rows are selected by the 'a['
# prefix; a substring match on "a" would also pick up any other row whose name
# happens to contain that letter.
a_summary = summary.loc[summary.index.str.startswith('a['), '50%']

In [None]:
# Display the intercept medians.
a_summary

In [None]:
# Posterior medians of the per-route slopes. Rows are selected by the 'b['
# prefix; a substring match on "b" would also pick up any other row whose name
# happens to contain that letter.
b_summary = summary.loc[summary.index.str.startswith('b['), '50%']

In [None]:
# Display the slope medians.
b_summary

In [None]:
# Posterior medians of the per-route scales. Rows are selected by the 's['
# prefix; a substring match on "s" would also pick up any other row whose name
# happens to contain that letter.
s_summary = summary.loc[summary.index.str.startswith('s['), '50%']

In [None]:
# Display the scale medians.
s_summary

乗降客数と比べて、まずまずいい感じにできてる

（階層モデル版で）b を mu_b と s_b から生成するときの分布は、normal や student_t(4) よりも cauchy の方が、RMSE・R2 ともに良い結果になった。

ただ、b[3]が大きすぎて、b[4]が小さすぎる問題は直らない。なんでこうなるのか不明。