In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pystan
import seaborn as sns

In [None]:
# Explanatory variables; the first CSV column becomes the index.
# NOTE(review): the code below treats '路線ID' (railway line ID) as the last column -- confirm.
X = pd.read_csv(
    './variables/devision_rosenID.csv',
    index_col=0,
)
# Target table with a single named column '飲食店数' (number of restaurants).
y = pd.read_csv(
    './variables/target.csv',
    index_col=0,
    names=['飲食店数'],
)

In [None]:
# Keep only the rows whose line ID (for X) or index label (for y) starts with 'm'.
# NOTE(review): X and y are filtered on different keys; verify that the
# surviving rows are the same stations in the same order.
X = X.loc[X['路線ID'].str.startswith('m')]
y = y.loc[y.index.str.startswith('m')]

In [None]:
def zscore(x, axis=None):
    """Standardise ``x``: subtract its mean and divide by its standard deviation.

    The mean is ``x.mean(axis=axis)`` and the deviation is
    ``np.std(x, axis=axis)``; both are reduced along ``axis``.
    """
    centred = x - x.mean(axis=axis)
    return centred / np.std(x, axis=axis)

In [None]:
# Standardise every column except the last one.
# Open question from the author: does each variable really follow a normal distribution?
# Whole columns are replaced (rather than written through .iloc) so that
# integer-typed columns can become float without an in-place dtype conflict.
X[X.columns[:-1]] = zscore(X.iloc[:, :-1], axis=0)

In [None]:
# Map each line ID to a 1-based integer code, in order of first appearance
# (the Stan program declares Rosen with lower=1).
rosen = {line_id: code
         for code, line_id in enumerate(X['路線ID'].unique(), start=1)}
# Look the codes up by value with .map instead of X['路線ID'][i]: an integer key
# on a label-indexed, filtered Series is ambiguous between label and position.
Rosen = X['路線ID'].map(rosen).tolist()
X = X.drop(['路線ID'], axis=1)

In [None]:
# Inputs for the Stan program; the keys match the names in its data block.
data = {
    'N_station': X.shape[0],
    'N_explanatory': X.shape[1],
    'N_rosen': len(rosen),
    'X': X.values,
    'Rosen': Rosen,
    'Y': y['飲食店数'].values,
}

In [None]:
# Stan program: a normal regression of Y on X in which the intercept `a`,
# the coefficients `b` and the noise scale `s` are indexed by railway line (Rosen).
# Each group of per-line parameters gets a normal distribution whose location and
# scale (mu_*, s_*) are themselves parameters; no explicit prior statements are
# written for those hyper-parameters.
# `predict` (generated quantities) draws one value per station from the same
# normal used in the likelihood.
# NOTE(review): `s` is constrained to be >= 0 but its normal(mu_s, s_s) statement
# has no T[0,] truncation -- confirm this is intended.
model = '''
data {
    int N_station;
    int N_explanatory;
    int N_rosen;
    matrix[N_station, N_explanatory] X;
    int<lower=1> Rosen[N_station];
    vector<lower=0>[N_station] Y;
}

parameters {
    real mu_a;
    real mu_b[N_explanatory];
    real<lower=0> mu_s;
    real<lower=0> s_a;
    real<lower=0> s_b[N_explanatory];
    real<lower=0> s_s;
    vector[N_rosen] a;
    matrix[N_explanatory, N_rosen] b;
    vector<lower=0>[N_rosen] s;
}

model {
    a ~ normal(mu_a, s_a);
    for (i in 1:N_explanatory)
        b[i,] ~ normal(mu_b[i], s_b[i]);
    s ~ normal(mu_s, s_s);
    for (i in 1:N_station)
        Y[i] ~ normal(a[Rosen[i]] + X[i,]*b[,Rosen[i]], s[Rosen[i]]);
}

generated quantities{
    vector[N_station] predict;
    for (i in 1:N_station)
        predict[i] = normal_rng(a[Rosen[i]] + X[i,]*b[,Rosen[i]], s[Rosen[i]]);
}
'''

In [None]:
# Quick run: 3 chains x 500 iterations, the first 100 of each being warm-up.
# A fixed seed makes the draws repeatable across kernel restarts.
fit = pystan.stan(model_code=model, data=data, chains=3, iter=500, warmup=100, thin=1, seed=42)
# Longer alternative kept for reference:
#fit = pystan.stan(model_code=model, data=data, chains=4, iter=2000, warmup=500, thin=1)

In [None]:
# Display the column labels of X.
X.columns

In [None]:
# Display the fit object.
fit

効き具合（mu_b[:]のmeanの大きさ）は昼間人口がダントツ、続いて乗降客数。人口総数はほとんど効いてない。

ただ、路線毎にだいぶ傾向違いそう。

In [None]:
# Pull the MCMC draws, keeping the chains separate and the warm-up draws included.
ms = fit.extract(inc_warmup=True, permuted=False)
# Number of warm-up (burn-in) iterations.
iter_from = fit.sim['warmup']
# Iteration indices with the warm-up part left out.
iter_range = np.arange(iter_from, ms.shape[0])
# Flattened parameter names.
paraname = fit.sim['fnames_oi']

# Every iteration, warm-up included: this is what the trace plots use,
# because the whole run should be drawn this time.
iter_start = np.arange(ms.shape[0])

In [None]:
# seaborn colour palette (kept available; the default line colours are used below).
palette = sns.color_palette()
sns.set(font_scale=1)
sns.set_style("ticks")

# 4 x 3 grid of trace plots for the first 12 entries of `paraname`,
# one line per chain, warm-up included.
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(15, 10))

for k, ax in enumerate(axes.flat):
    ax.plot(iter_start, ms[iter_start, :, k])
    ax.set_title(paraname[k])
    ax.set_xlabel('mcmc_size')
    ax.set_ylabel('parameter')
    ax.grid(True)

# despine must run after the axes exist and be pointed at this figure;
# called earlier it only creates an empty figure and styles nothing.
sns.despine(fig=fig, offset=10, trim=True)
fig.tight_layout()
# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

In [None]:
# Build the posterior summary table. fit.summary() is evaluated once and
# reused instead of being recomputed for each of the three fields.
fit_summary = fit.summary()
summary = pd.DataFrame(
    data=fit_summary['summary'],
    index=fit_summary['summary_rownames'],
    columns=fit_summary['summary_colnames'],
)

In [None]:
# Display the summary table.
summary

In [None]:
# Posterior medians of every summary row whose name contains "a".
a_summary = summary.loc[summary.index.str.contains("a"), '50%']

In [None]:
# Posterior medians of every summary row whose name contains "b".
b_summary = summary.loc[summary.index.str.contains("b"), '50%']

In [None]:
# Posterior medians of every summary row whose name contains "s".
s_summary = summary.loc[summary.index.str.contains("s"), '50%']

In [None]:
# Posterior medians of the rows whose name starts with "predict", as a plain array.
pred = summary.loc[summary.index.str.startswith("predict"), '50%'].values

In [None]:
# Root-mean-square error between the observed counts and `pred`.
np.sqrt(np.mean(np.square(y['飲食店数'].values - pred)))

In [None]:
from sklearn.metrics import r2_score
# Coefficient of determination of `pred` against the observed target.
r2_score(y_true=y.values, y_pred=pred)

In [None]:
# Predicted (x) against measured (y); the dashed line is the diagonal
# spanning the range of the measured values.
fig, ax = plt.subplots()
ax.scatter(pred, y['飲食店数'].values, edgecolors=(0, 0, 0))
ax.plot(
    [y['飲食店数'].min(), y['飲食店数'].max()],
    [y['飲食店数'].min(), y['飲食店数'].max()],
    'k--',
    lw=1,
)
ax.set(xlabel='Predicted', ylabel='Measured')
plt.show()

In [None]:
def plot_actual_predicted(actual, predicted):
    """Scatter predicted (green) and actual (blue) values, ordered by actual value.

    The x position of each point is its rank after sorting by ``actual``.
    """
    ordered = pd.DataFrame({'actual': actual, 'predicted': predicted}).sort_values(['actual'])
    rank = range(len(ordered))
    for column, colour in (('predicted', 'green'), ('actual', 'blue')):
        plt.scatter(rank, ordered[column], color=colour)
    plt.show()

# Plot the observed counts together with `pred`.
plot_actual_predicted(y['飲食店数'].values, pred)

In [None]:
# Reload the raw table to get back the '路線ID' column that was dropped from X.
# The same 'm' filter that was applied to the model input is re-applied, so the
# rows still correspond one-to-one to y and pred.
X = pd.read_csv('./variables/devision_rosenID.csv', index_col=0)
X = X[X['路線ID'].str.startswith('m')]

In [None]:
# One row per station: observed count, predicted count and line ID.
# The resulting index is taken from X['路線ID'].
actual_predicted = pd.DataFrame(
    {
        'actual': y['飲食店数'].values,
        'predicted': pred,
        'rosen': X['路線ID'],
    }
)

In [None]:
def plot_actual_predicted(actual, predicted, title=None):
    """Scatter predicted (green) and actual (blue) values, ordered by actual value.

    Parameters
    ----------
    actual, predicted
        Equal-length sequences of observed and predicted values.
    title : str, optional
        Plot title; omitted when ``None``, so two-argument calls also work.
    """
    ordered = pd.DataFrame({'actual': actual, 'predicted': predicted}).sort_values(['actual'])
    positions = range(ordered.shape[0])
    # Each series needs a label, otherwise plt.legend() has nothing to show.
    plt.scatter(positions, ordered['predicted'], color='green', label='predicted')
    plt.scatter(positions, ordered['actual'], color='blue', label='actual')
    if title is not None:
        plt.title(title)
    # The x axis is only a rank, so its tick labels are hidden.
    plt.tick_params(labelbottom=False, labelleft=True, labelright=False, labeltop=False)
    plt.legend()
    plt.show()

In [None]:
#for i in range(len(rosen)):
#    actual = actual_predicted[actual_predicted['rosen'] == list(rosen.keys())[i]]['actual']
#    predicted = actual_predicted[actual_predicted['rosen'] == list(rosen.keys())[i]]['predicted']
#    title = list(rosen.keys())[i]
#    plot_actual_predicted(actual, predicted, title)

In [None]:
# One panel per railway line: predicted (green) and actual (blue) bars,
# stations ordered by actual value. Only the first 64 lines fit in the grid.
fig, axes = plt.subplots(nrows=8, ncols=8, figsize=(60, 60))

w = 0.3
line_ids = list(rosen.keys())  # built once instead of on every lookup
for idx, ax in enumerate(axes.flat):
    if idx >= len(line_ids):
        # Fewer lines than panels: hide the spare panel instead of raising IndexError.
        ax.set_visible(False)
        continue
    title = line_ids[idx]
    tmp = actual_predicted.loc[actual_predicted['rosen'] == title,
                               ['actual', 'predicted']].sort_values(['actual'])
    positions = np.arange(tmp.shape[0])

    # No Axes.hold(True): it was removed in Matplotlib 3.0, and successive
    # bar() calls on one Axes already overlay.
    ax.bar(positions, tmp['predicted'], width=w, color='green', alpha=0.8)
    ax.bar(positions + w, tmp['actual'], width=w, color='blue', alpha=0.8)
    ax.legend(['predicted', 'actual'])
    if idx == 3:
        # The fourth panel additionally lists names in its title.
        ax.set_title(
            f'{title}\n \
                ochiaigawa, takenami, kokokei, minosakamoto, kamado,\
                \n sakashita, mizunami, tokishi, ena, nakatugawa, tajimi')
    else:
        ax.set_title(f'{title}')
    ax.tick_params(
        labelbottom=False, labelleft=True, labelright=False, labeltop=False)

fig.savefig('output/actual_predicted', dpi=200)
# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

In [None]:
#中央本線
#路線ID g_3.0
#summary 3

#駅数 11
#多治見、土岐、瑞浪、恵那、中津川
#5601, 2927, 2465, 3003, 4377

瑞浪はほぼ実際と予測が同じ。まあ、多治見やら恵那やらはだいぶ実際が多い。

In [None]:
def plot_feature_importances(df, n_rosen, variable):
    """Draw a horizontal bar chart of the first ``n_rosen`` rows of ``df`` and save it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'value'`` column (bar lengths) and a ``'rosen'``
        column (tick labels).
    n_rosen : int
        Number of leading rows to draw; also sets the figure height.
    variable : str
        Used as the plot title and as the file name under ``output/``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, unchanged.
    """
    # n_rosen // 3 is 0 for fewer than three rows and Matplotlib rejects a
    # zero-height figure, so the height is clamped to at least 1.
    plt.figure(figsize=(10, max(1, n_rosen // 3)))
    ax = plt.subplot()

    # Bar positions are the index labels in reverse order (with a default
    # 0..n-1 index this places the first row at the top).
    positions = list(reversed(list(df.index[:n_rosen])))
    ax.barh(positions,
            df['value'].head(n_rosen),
            align='center', edgecolor='k')

    ax.set_yticks(positions)
    ax.set_yticklabels(df['rosen'].head(n_rosen))

    plt.xlabel('value')
    plt.title(variable)
    plt.savefig(f'output/{variable}')
    plt.show()

    return df

In [None]:
# Per-line values from a_summary, skipping its first two rows
# (presumably mu_a and s_a -- verify against the summary row order).
variable = 'seppen'
n_rosen = len(rosen)
data = pd.DataFrame(
    [rosen.keys(), a_summary.iloc[2:]],
    index=['rosen', 'value'],
).T
plot_feature_importances(data, n_rosen, variable)

In [None]:
from scipy.stats import norm

In [None]:
# Overlay Normal(mu_a, s_a), built from the posterior medians of the
# hyper-parameters, on the distribution of the values held in `data`.
loc = summary.loc['mu_a', '50%']
scale = summary.loc['s_a', '50%']

sns.distplot(data.value.astype(float), fit=norm)
x = np.linspace(norm.ppf(0.01, loc, scale),
                norm.ppf(0.99, loc, scale), 100)
plt.plot(x, norm.pdf(x, loc, scale),
        'r-', lw=2, alpha=0.6, label='norm pdf')
# The red curve carries a label, so draw the legend that displays it.
plt.legend()
plt.show()

In [None]:
# Every 7th entry of b_summary starting at position 14.
# NOTE(review): presumably the first 14 rows are mu_b/s_b and the stride of 7
# is the number of explanatory variables -- confirm against the summary rows.
variable = 'joko'
n_rosen = len(rosen)
data = pd.DataFrame(
    [rosen.keys(), b_summary.iloc[14::7]],
    index=['rosen', 'value'],
).T
plot_feature_importances(data, n_rosen, variable)

In [None]:
# Overlay Normal(mu_b[0], s_b[0]), built from the posterior medians of the
# hyper-parameters, on the distribution of the values held in `data`.
loc = summary.loc['mu_b[0]', '50%']
scale = summary.loc['s_b[0]', '50%']

sns.distplot(data.value.astype(float), fit=norm)
x = np.linspace(norm.ppf(0.01, loc, scale),
                norm.ppf(0.99, loc, scale), 100)
plt.plot(x, norm.pdf(x, loc, scale),
        'r-', lw=2, alpha=0.6, label='norm pdf')
# The red curve carries a label, so draw the legend that displays it.
plt.legend()
plt.show()

In [None]:
# Every 7th entry of b_summary starting at position 15.
variable = 'jinko'
n_rosen = len(rosen)
data = pd.DataFrame(
    [rosen.keys(), b_summary.iloc[15::7]],
    index=['rosen', 'value'],
).T
plot_feature_importances(data, n_rosen, variable)

In [None]:
# Overlay Normal(mu_b[1], s_b[1]), built from the posterior medians of the
# hyper-parameters, on the distribution of the values held in `data`.
loc = summary.loc['mu_b[1]', '50%']
scale = summary.loc['s_b[1]', '50%']

sns.distplot(data.value.astype(float), fit=norm)
x = np.linspace(norm.ppf(0.01, loc, scale),
                norm.ppf(0.99, loc, scale), 100)
plt.plot(x, norm.pdf(x, loc, scale),
        'r-', lw=2, alpha=0.6, label='norm pdf')
# The red curve carries a label, so draw the legend that displays it.
plt.legend()
plt.show()

In [None]:
# Every 7th entry of b_summary starting at position 16.
variable = 'chukan'
n_rosen = len(rosen)
data = pd.DataFrame(
    [rosen.keys(), b_summary.iloc[16::7]],
    index=['rosen', 'value'],
).T
plot_feature_importances(data, n_rosen, variable)

In [None]:
# Overlay Normal(mu_b[2], s_b[2]), built from the posterior medians of the
# hyper-parameters, on the distribution of the values held in `data`.
loc = summary.loc['mu_b[2]', '50%']
scale = summary.loc['s_b[2]', '50%']

sns.distplot(data.value.astype(float), fit=norm)
x = np.linspace(norm.ppf(0.01, loc, scale),
                norm.ppf(0.99, loc, scale), 100)
plt.plot(x, norm.pdf(x, loc, scale),
        'r-', lw=2, alpha=0.6, label='norm pdf')
# The red curve carries a label, so draw the legend that displays it.
plt.legend()
plt.show()

In [None]:
# Every 7th entry of b_summary starting at position 17.
variable = 'dansei'
n_rosen = len(rosen)
data = pd.DataFrame(
    [rosen.keys(), b_summary.iloc[17::7]],
    index=['rosen', 'value'],
).T
plot_feature_importances(data, n_rosen, variable)

In [None]:
# Overlay Normal(mu_b[3], s_b[3]), built from the posterior medians of the
# hyper-parameters, on the distribution of the values held in `data`.
loc = summary.loc['mu_b[3]', '50%']
scale = summary.loc['s_b[3]', '50%']

sns.distplot(data.value.astype(float), fit=norm)
x = np.linspace(norm.ppf(0.01, loc, scale),
                norm.ppf(0.99, loc, scale), 100)
plt.plot(x, norm.pdf(x, loc, scale),
        'r-', lw=2, alpha=0.6, label='norm pdf')
# The red curve carries a label, so draw the legend that displays it.
plt.legend()
plt.show()

In [None]:
# Every 7th entry of b_summary starting at position 18.
variable = 'kodomo'
n_rosen = len(rosen)
data = pd.DataFrame(
    [rosen.keys(), b_summary.iloc[18::7]],
    index=['rosen', 'value'],
).T
plot_feature_importances(data, n_rosen, variable)

In [None]:
# Overlay Normal(mu_b[4], s_b[4]), built from the posterior medians of the
# hyper-parameters, on the distribution of the values held in `data`.
loc = summary.loc['mu_b[4]', '50%']
scale = summary.loc['s_b[4]', '50%']

sns.distplot(data.value.astype(float), fit=norm)
x = np.linspace(norm.ppf(0.01, loc, scale),
                norm.ppf(0.99, loc, scale), 100)
plt.plot(x, norm.pdf(x, loc, scale),
        'r-', lw=2, alpha=0.6, label='norm pdf')
# The red curve carries a label, so draw the legend that displays it.
plt.legend()
plt.show()

In [None]:
# Every 7th entry of b_summary starting at position 19.
variable = 'otoshiyori'
n_rosen = len(rosen)
data = pd.DataFrame(
    [rosen.keys(), b_summary.iloc[19::7]],
    index=['rosen', 'value'],
).T
plot_feature_importances(data, n_rosen, variable)

In [None]:
# Overlay Normal(mu_b[5], s_b[5]), built from the posterior medians of the
# hyper-parameters, on the distribution of the values held in `data`.
loc = summary.loc['mu_b[5]', '50%']
scale = summary.loc['s_b[5]', '50%']

sns.distplot(data.value.astype(float), fit=norm)
x = np.linspace(norm.ppf(0.01, loc, scale),
                norm.ppf(0.99, loc, scale), 100)
plt.plot(x, norm.pdf(x, loc, scale),
        'r-', lw=2, alpha=0.6, label='norm pdf')
# The red curve carries a label, so draw the legend that displays it.
plt.legend()
plt.show()

In [None]:
# Every 7th entry of b_summary starting at position 20.
variable = 'setaininzu'
n_rosen = len(rosen)
data = pd.DataFrame(
    [rosen.keys(), b_summary.iloc[20::7]],
    index=['rosen', 'value'],
).T
plot_feature_importances(data, n_rosen, variable)

In [None]:
# Overlay Normal(mu_b[6], s_b[6]), built from the posterior medians of the
# hyper-parameters, on the distribution of the values held in `data`.
loc = summary.loc['mu_b[6]', '50%']
scale = summary.loc['s_b[6]', '50%']

sns.distplot(data.value.astype(float), fit=norm)
x = np.linspace(norm.ppf(0.01, loc, scale),
                norm.ppf(0.99, loc, scale), 100)
plt.plot(x, norm.pdf(x, loc, scale),
        'r-', lw=2, alpha=0.6, label='norm pdf')
# The red curve carries a label, so draw the legend that displays it.
plt.legend()
plt.show()

中央本線、別に特徴がある路線ってわけでもない。

In [None]:
# Signed percentage error relative to the actual value.
actual_predicted['ratio'] = (
    (actual_predicted['predicted'] - actual_predicted['actual'])
    * 100 / actual_predicted['actual']
)
# Division by an actual value of 0 yields +/-inf; turn those into NaN.
actual_predicted = actual_predicted.replace([-np.inf, np.inf], np.nan)

In [None]:
# The 20 rows with the largest percentage error.
actual_predicted.sort_values(by='ratio', ascending=False).head(20)

In [None]:
# The 20 rows with the smallest percentage error.
actual_predicted.sort_values(by='ratio').head(20)

In [None]:
# Signed absolute error: predicted minus actual.
actual_predicted['difference'] = actual_predicted['predicted'].sub(actual_predicted['actual'])

In [None]:
# The 20 rows with the largest predicted-minus-actual difference.
actual_predicted.sort_values(by='difference', ascending=False).head(20)

In [None]:
# The 20 rows with the smallest predicted-minus-actual difference.
actual_predicted.sort_values(by='difference').head(20)

In [None]:
# Mean signed percentage error.
actual_predicted.loc[:, 'ratio'].mean()

In [None]:
# Mean signed difference.
actual_predicted.loc[:, 'difference'].mean()

In [None]:
# Mean absolute percentage error.
actual_predicted['ratio'].abs().mean()

In [None]:
# Mean absolute difference.
actual_predicted['difference'].abs().mean()

In [None]:
# Distribution of the percentage error, NaN rows excluded.
sns.distplot(actual_predicted['ratio'].dropna())

In [None]:
# Distribution of the predicted-minus-actual difference.
sns.distplot(actual_predicted.loc[:, 'difference'])