In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pystan
import seaborn as sns

In [None]:
# Explanatory variables; the first CSV column is used as the row index.
X = pd.read_csv(
    './variables/devision_rosenID.csv',
    index_col=0,
)
# Target values; the column name is supplied explicitly through `names`.
# NOTE(review): presumably target.csv has no header row -- confirm.
y = pd.read_csv(
    './variables/target.csv',
    names=['飲食店数'],
    index_col=0,
)

In [None]:
def zscore(x, axis=None):
    """Standardize ``x`` by subtracting its mean and dividing by its standard deviation.

    Parameters
    ----------
    x : numpy.ndarray or pandas.Series / pandas.DataFrame
        Values to standardize; must provide ``.mean(axis=...)``.
    axis : int or None, optional
        Axis handed unchanged to ``x.mean`` and ``np.std``.

    Returns
    -------
    Same kind of container as ``x - x.mean(axis=axis)``
        ``(x - mean) / std``.  Where the standard deviation is exactly zero
        (constant data) the values are only centred, i.e. they become 0
        instead of NaN/inf.
    """
    center = x.mean(axis=axis)
    spread = np.std(x, axis=axis)
    # A constant column has zero spread; dividing by it would inject NaN/inf
    # into the result, so divide by 1 there instead.
    spread = np.where(spread == 0, 1.0, spread)
    return (x - center) / spread

In [None]:
# Standardize every column except the last one, column by column, writing the
# result back into X.
# NOTE(review): presumably the last column is '路線ID', which has to stay
# unscaled because it is used as a grouping key below -- confirm against the CSV.
X.iloc[:, :-1] = zscore(X.iloc[:, :-1], axis=0)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

In [None]:
# Map each distinct route ID to a 1-based integer code, in order of first
# appearance in X (matches the `int<lower=1>` index arrays of the Stan model).
rosen = {route_id: code for code, route_id in enumerate(X['路線ID'].unique(), start=1)}

# Encode the route of every row *by position*.  After train_test_split the
# index labels are a shuffled subset of the original ones, so looking rows up
# with `series[i]` for i in range(len(...)) either raises KeyError or returns
# codes that are not aligned with the rows of X_train / X_test.
Rosen_train = X_train['路線ID'].map(rosen).tolist()
Rosen_test = X_test['路線ID'].map(rosen).tolist()

In [None]:
# The route is now carried by Rosen_train / Rosen_test, so take the ID column
# out of both feature tables.
X_train = X_train.drop(columns='路線ID')
X_test = X_test.drop(columns='路線ID')

In [None]:
# Inputs handed to Stan; every key corresponds to a name declared in the
# `data` block of the model string.
data = {
    'N_train': len(X_train),
    'N_test': len(X_test),
    'N_explanatory': X_train.shape[1],
    'N_rosen': len(rosen),
    'X_train': X_train.values,
    'X_test': X_test.values,
    'Rosen_train': Rosen_train,
    'Rosen_test': Rosen_test,
    'Y_train': y_train['飲食店数'].values,
    'Y_test': y_test['飲食店数'].values,
}

In [None]:
# Stan program for a hierarchical regression with one intercept `a`, one
# column of coefficients in `b` and one scale `s` per route (N_rosen groups).
# - Each training target follows a Student-t distribution with 4 degrees of
#   freedom, located at a[route] + X_train[i,] * b[, route] with scale s[route].
# - The per-route parameters are themselves Student-t(4) distributed around
#   the mu_* hyperparameters with the s_* hyperparameters as scales.
# - No explicit priors are stated for the hyperparameters.
# - `generated quantities` draws one predictive value per test row.
# - Y_test is declared as data but is not referenced by the rest of the program.
model = '''
data {
    int N_train;
    int N_test;
    int N_explanatory;
    int N_rosen;
    matrix[N_train, N_explanatory] X_train;
    matrix[N_test, N_explanatory] X_test;
    int<lower=1> Rosen_train[N_train];
    int<lower=1> Rosen_test[N_test];
    vector<lower=0>[N_train] Y_train;
    vector<lower=0>[N_test] Y_test;
}

parameters {
    real mu_a;
    real mu_b[N_explanatory];
    real<lower=0> mu_s;
    real<lower=0> s_a;
    real<lower=0> s_b[N_explanatory];
    real<lower=0> s_s;
    vector[N_rosen] a;
    matrix[N_explanatory, N_rosen] b;
    vector<lower=0>[N_rosen] s;
}

model {
    a ~ student_t(4, mu_a, s_a);
    for (i in 1:N_explanatory)
        b[i,] ~ student_t(4, mu_b[i], s_b[i]);
    s ~ student_t(4, mu_s, s_s);
    for (i in 1:N_train)
        Y_train[i] ~ student_t(4, a[Rosen_train[i]] + X_train[i,]*b[,Rosen_train[i]], s[Rosen_train[i]]);
}

generated quantities{
    vector[N_test] predict;
    for (i in 1:N_test)
        predict[i] = student_t_rng(4, a[Rosen_test[i]] + X_test[i,]*b[,Rosen_test[i]], s[Rosen_test[i]]);
}
'''

In [None]:
# Compile the model and sample: 3 chains, 500 iterations each, of which the
# first 100 are warmup, no thinning.
# A fixed seed makes the draws repeatable after a kernel restart (same value
# as the random_state used for the train/test split).
fit = pystan.stan(model_code=model, data=data, chains=3, iter=500, warmup=100, thin=1, seed=123)

In [None]:
# Display the fit object as the cell output.
fit

In [None]:
# Extract the MCMC draws, unpermuted and with the warmup iterations included.
ms = fit.extract(inc_warmup=True, permuted=False)
# Size of the warmup (burn-in) phase.
iter_from = fit.sim['warmup']
# Iteration indices with the warmup phase left out.
iter_range = np.arange(iter_from, len(ms))
# Names of the sampled quantities.
paraname = fit.sim['fnames_oi']

# NOTE: everything is plotted this time (warmup included), so this full range
# is the one used below.
iter_start = np.arange(len(ms))

In [None]:
# Seaborn colour palette (not referenced again in the visible cells).
palette = sns.color_palette()
# Global seaborn look: default font scale, "ticks" style.
sns.set(font_scale=1)
sns.set_style("ticks")

# Trace plots of the first 12 sampled quantities, one panel each
# (panel k shows quantity k, filled row by row).
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(15, 10))

for k, ax in enumerate(axes.flat):
    ax.plot(iter_start, ms[iter_start, :, k])
    ax.set_title(paraname[k])
    ax.set_xlabel('mcmc_size')
    ax.set_ylabel('parameter')
    ax.grid(True)

# despine only touches axes that already exist, so it has to run after the
# panels are drawn; called before plt.subplots it had no effect on this figure.
sns.despine(fig=fig, offset=10, trim=True)
# Keep titles and x-labels of neighbouring rows from overlapping.
fig.tight_layout()
plt.show()

In [None]:
# Table of posterior summary statistics: one row per sampled quantity, one
# column per statistic.  fit.summary() is evaluated once and reused rather
# than being recomputed for each of the three pieces.
fit_summary = fit.summary()
summary = pd.DataFrame(
    data=fit_summary['summary'],
    index=fit_summary['summary_rownames'],
    columns=fit_summary['summary_colnames'],
)

In [None]:
# Display the summary table as the cell output.
summary

In [None]:
# Point prediction per test row: the posterior median ('50%') of each
# summary row whose name starts with "predict".
predict_rows = summary.query('index.str.startswith("predict")', engine='python')
pred = predict_rows['50%'].values

In [None]:
# RMSE of the posterior-median predictions on the held-out rows.
# y_test is a one-column DataFrame, so `y_test.values` has shape (N, 1);
# subtracting the 1-D `pred` from it broadcasts to an (N, N) matrix of all
# pairwise differences.  Use the target column as a 1-D array instead.
np.sqrt(np.mean((y_test['飲食店数'].values - pred) ** 2))

In [None]:
from sklearn.metrics import r2_score
# R^2 score of the posterior-median predictions against the held-out targets.
r2_score(y_test.values, pred)

In [None]:
# Predicted (x) against measured (y) values for the held-out rows.
fig, ax = plt.subplots()
ax.scatter(pred, y_test, edgecolors=(0, 0, 0))
# Dashed y = x reference line spanning the range of the measured values.
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'k--', lw=1)
ax.set(xlabel='Predicted', ylabel='Measured')
plt.show()

In [None]:
def plot_actual_predicted(actual, predicted):
    """Scatter actual and predicted values, ordered by ascending actual value.

    Parameters
    ----------
    actual, predicted : array-like
        Equal-length collections of values.  Both are flattened to 1-D, so a
        one-column DataFrame, a Series, a list or an ndarray are all accepted;
        the two inputs are paired by position, not by index label.

    Predictions are drawn in green and actual values in blue, with the rank of
    the actual value on the x axis; the figure is then shown.
    """
    paired = pd.DataFrame({
        'actual': np.asarray(actual).ravel(),
        'predicted': np.asarray(predicted).ravel(),
    }).sort_values('actual')
    rank = range(len(paired))
    plt.scatter(rank, paired['predicted'], color='green')
    plt.scatter(rank, paired['actual'], color='blue')
    plt.show()

# Compare the held-out targets with the posterior-median predictions.
plot_actual_predicted(y_test, pred)

In [None]:
# Distinct route IDs in order of first appearance in `stations`.
# NOTE(review): `stations` is not defined in any cell visible in this
# notebook -- it has to be loaded before this cell can run on a fresh kernel.
rosen_uniq = stations['路線ID'].unique()

In [None]:
# 8 x 8 grid: one panel per route for the first 63 routes, plus one pooled
# panel (hyperparameter medians against all stations) in the last slot.
# NOTE(review): the Stan model in this notebook declares `b` as a matrix and
# `mu_b` as an array, so the single-index labels 'b[k]' / 'mu_b' looked up
# below may not exist in `summary` -- confirm which model these plots target.
fig, axes = plt.subplots(nrows=8, ncols=8, figsize=(60,60))

for i in range(8):
    for j in range(8):
        # The former plt.hold(True) call is gone: pyplot.hold was removed from
        # Matplotlib, and axes keep earlier artists by default anyway.
        a = summary.loc[f'a[{i*8+j}]', '50%']
        b = summary.loc[f'b[{i*8+j}]', '50%']
        s = summary.loc[f's[{i*8+j}]', '50%']
        # Stations of this route, ordered by the x variable (filtered and sorted once).
        line_stations = stations[stations['路線ID'] == rosen_uniq[i*8+j]].sort_values(by='昼間人口')
        xxx = line_stations["昼間人口"]
        yyy = line_stations["飲食店事業所数"]
        axes[i,j].plot(xxx, yyy, 'o-')
        # Fitted line a + b*x from 0 up to the largest x value of the route.
        xx = range(int(xxx.tolist()[-1]))
        yy = a + b*xx
        axes[i,j].plot(xx, yy)
        if i*8+j == 3:
            axes[i,j].set_title(
                f'{i*8+j}, a={a:.2f}, b={b:.4f}, s={s:.2f}, {rosen_uniq[i*8+j]}\n \
                ochiaigawa, takenami, kokokei, minosakamoto, kamado,\
                \n sakashita, mizunami, tokishi, ena, nakatugawa, tajimi')
        else:
            axes[i,j].set_title(f'{i*8+j}, a={a:.2f}, b={b:.4f}, s={s:.2f}, {rosen_uniq[i*8+j]}')
        if i*8+j == 62: # There are really 64 routes, but 64 + 1 = 65 panels cannot be tiled neatly, so the last one is left out
            break
# Pooled panel in the bottom-right slot.
a = summary.loc['mu_a', '50%']
b = summary.loc['mu_b', '50%']
s = summary.loc['mu_s', '50%']
all_stations = stations.sort_values(by='昼間人口')
xxx = all_stations["昼間人口"]
yyy = all_stations["飲食店事業所数"]
axes[7,7].plot(xxx, yyy, 'o-')
xx = range(int(xxx.tolist()[-1]))
yy = a + b*xx
axes[7,7].plot(xx, yy)
axes[7,7].set_title(f'{0}, a={a:.2f}, b={b:.4f}, s={s:.2f}, total')

fig.savefig('output/hierarchical_bayes_chukan', dpi=200)
plt.show()

In [None]:
# Observed points and fitted line for summary entry 3 (route 'g_3.0').
route_rows = stations[stations['路線ID'] == 'g_3.0'].sort_values(by='昼間人口')
xxx = route_rows["昼間人口"]
yyy = route_rows["飲食店事業所数"]
a = summary.loc['a[3]', '50%']
b = summary.loc['b[3]', '50%']
plt.plot(xxx, yyy, 'o-')
# Line a + b*x from 0 up to the largest x value of the route.
xx = range(int(xxx.tolist()[-1]))
yy = a + b*xx
plt.plot(xx, yy)
plt.title('a= \n \
ochiaigawa, takenami, kokokei, minosakamoto, kamado, \n sakashita, mizunami, tokishi, ena, nakatugawa, tajimi')

In [None]:
#中央本線
#路線ID g_3.0
#summary 3

#駅数 11
#多治見、土岐、瑞浪、恵那、中津川
#5601, 2927, 2465, 3003, 4377

In [None]:
# All stations together with the line given by the hyperparameter medians.
stations_sorted = stations.sort_values(by='昼間人口')
xxx = stations_sorted["昼間人口"]
yyy = stations_sorted["飲食店事業所数"]
a = summary.loc['mu_a', '50%']
b = summary.loc['mu_b', '50%']
plt.plot(xxx, yyy, 'o-')
# Line a + b*x from 0 up to the largest x value.
xx = range(int(xxx.tolist()[-1]))
yy = a + b*xx
plt.plot(xx, yy)

In [None]:
# Posterior medians ('50%') of every summary row whose name contains "a".
# NOTE(review): this is a substring match, so with the parameter names of the
# Stan model it selects a[...], mu_a and s_a -- confirm that is intended.
a_summary = summary.query('index.str.contains("a")', engine='python')['50%']

In [None]:
# Display the selected medians as the cell output.
a_summary

In [None]:
# Posterior medians ('50%') of every summary row whose name contains "b".
# NOTE(review): this is a substring match, so with the parameter names of the
# Stan model it selects b[...], mu_b[...] and s_b[...] -- confirm that is intended.
b_summary = summary.query('index.str.contains("b")', engine='python')['50%']

In [None]:
# Display the selected medians as the cell output.
b_summary

In [None]:
# Posterior medians ('50%') of every summary row whose name contains "s".
# NOTE(review): this is a substring match, so besides s[...], mu_s and s_s it
# also selects s_a and s_b[...] -- confirm whether those rows are wanted here.
s_summary = summary.query('index.str.contains("s")', engine='python')['50%']

In [None]:
# Display the selected medians as the cell output.
s_summary