In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pystan import StanModel

In [None]:
loserwinner = pd.read_csv('output/data_kaimei.csv', index_col=0)

In [None]:
loserwinner = loserwinner.reset_index(drop=True)

In [None]:
loserwinner

In [None]:
len(loserwinner['loser'].unique())

In [None]:
len(loserwinner['loser'].value_counts()[loserwinner['loser'].value_counts()<20])

In [None]:
len(loserwinner['winner'].value_counts()[loserwinner['winner'].value_counts()<20])

In [None]:
rare_rikishies = []
rare_rikishies.append(loserwinner['loser'].value_counts()[loserwinner['loser'].value_counts()<20].index)
rare_rikishies.append(loserwinner['winner'].value_counts()[loserwinner['winner'].value_counts()<20].index)
rare_rikishies = [rare_rikishi for loser_winner in rare_rikishies for rare_rikishi in loser_winner]

In [None]:
for rare_rikishi in rare_rikishies:
    loserwinner = loserwinner[loserwinner['loser'] != rare_rikishi]
    loserwinner = loserwinner[loserwinner['winner'] != rare_rikishi]
loserwinner = loserwinner.reset_index(drop=True)

In [None]:
loserwinner

In [None]:
len(loserwinner['loser'].unique())

In [None]:
len(loserwinner['winner'].unique())

In [None]:
rikishi_id = dict(zip(loserwinner['loser'].unique(), range(1, len(loserwinner['loser'].unique())+1)))

In [None]:
rikishi_id

In [None]:
loserwinner = loserwinner.replace(rikishi_id)

In [None]:
loserwinner

In [None]:
loserwinner.info()

In [None]:
loserwinner['loser'].min()

In [None]:
loserwinner['loser'].max()

In [None]:
loserwinner['year'] = loserwinner['year']-2000

In [None]:
N = loserwinner['loser'].max()
M = loserwinner.shape[0]
L = loserwinner['year'].max()
Id = list(loserwinner[['loser', 'winner', 'year']].values)

data = dict(
    N=N,
    M=M,
    L=L,
    Id=Id
)

In [None]:
# Stan program for the bout model, kept as a string and compiled later.
#
# data
#   N, M, L : integers; Id is an M x 3 integer array whose entries are all
#             constrained to 1..N. Columns 1 and 2 are used as wrestler indices
#             and column 3 as the row (year) index of `strength`.
#   NOTE(review): the upper=N bound therefore also applies to the year column,
#   so data with L > N would be rejected - confirm this is intended.
#
# parameters / transformed parameters
#   performance[i] : ordered pair per bout, i.e. performance[i, 1] < performance[i, 2]
#                    (column 1 of Id is the loser, column 2 the winner).
#   strength       : L x N matrix; the first N-1 columns are free (`strength0`) and
#                    column N is minus their row sum, so every row sums to zero.
#   stability      : one positive scale per wrestler.
#
# model
#   row 1 of strength ~ normal(0, s_strength); each later row ~ normal(previous
#   row, s_time_strength); stability ~ gamma(10, 10); each performance value ~
#   student_t with 1 degree of freedom, located at strength[year, wrestler] with
#   scale stability[wrestler]. s_strength and s_time_strength have no explicit
#   prior statement (only the lower=0 bound).
model = '''
data {
    int N;
    int M;
    int L;
    int<lower=1, upper=N> Id[M, 3];
}

parameters {
    ordered[2] performance[M];
    matrix[L, N-1] strength0;
    real<lower=0> s_strength;
    real<lower=0> s_time_strength;
    vector<lower=0>[N] stability;
}

transformed parameters {
    matrix[L, N] strength;
    strength[, 1:N-1] = strength0;
    for (i in 1:L)
        strength[i, N] = -sum(strength0[i, ]);
}

model {
    strength[1, ] ~ normal(0, s_strength);
    for (k in 2:L)
        strength[k, ] ~ normal(strength[k-1, ], s_time_strength);
    stability ~ gamma(10, 10);
    for (i in 1:M)
        for (j in 1:2)
            performance[i, j] ~ student_t(1, strength[Id[i, 3], Id[i, j]], stability[Id[i, j]]);
}
'''

In [None]:
stanmodel = StanModel(model_code=model)

In [None]:
fit = stanmodel.vb(data=data, seed=1234)
#234s

In [None]:
#with open('2001to2018_advi_time.pkl', 'wb') as f:
#    pickle.dump(stanmodel, f)
#    pickle.dump(fit, f)

In [None]:
vb_sample = pd.read_csv(fit['args']['sample_file'].decode('utf-8'), comment='#')
vb_sample = vb_sample.drop([0,1])

In [None]:
strength = vb_sample.filter(regex='strength\.\d+')

In [None]:
strength18 = vb_sample.filter(regex='strength\.18\.\d+')

In [None]:
strength18

In [None]:
stability = vb_sample.filter(regex='stability\.\d+')

In [None]:
rikishi_df = pd.DataFrame(index=range(1, len(rikishi_id)+1), columns=['strength', 'stability'])

In [None]:
rikishi_df['id'] = rikishi_df.index

In [None]:
rikishi_df['strength'] = np.mean(strength18).values
rikishi_df['stability'] = np.mean(stability).values

In [None]:
#strength10 = vb_sample.filter(regex='strength\.10\.\d+')

In [None]:
rikishi_df['strength'] = np.mean(strength18).values
rikishi_df['stability'] = np.mean(stability).values

In [None]:
rikishi_df

In [None]:
rikishi_df = rikishi_df.sort_values(by='strength', ascending=False)

In [None]:
rikishi_df['rank'] = range(1, len(rikishi_df)+1)

In [None]:
#rikishi_df

In [None]:
rikishi_id_inverse = dict(zip(rikishi_id.values(), rikishi_id.keys()))

In [None]:
rikishi_df.index = rikishi_df['id'].replace(rikishi_id_inverse).values

In [None]:
rikishi_df

In [None]:
# Wrestler names to flag ('geneki' presumably means currently active wrestlers).
geneki = [
    '鶴竜', '白鵬', '稀勢の里', '豪栄道', '高安', '栃ノ心', '御嶽海',
    '逸ノ城', '玉鷲', '貴景勝', '勢', '魁聖', '豊山', '千代大龍',
    '正代', '遠藤', '千代の国', '阿炎', '妙義龍', '朝乃山', '輝',
    '阿武咲', '松鳳山', '栃煌山', '宝富士', '琴奨菊', '北勝富士', '大翔丸',
    '碧山', '大栄翔', '佐田の海', '旭大星', '隠岐の海', '錦木', '竜電',
    '貴ノ岩', '隆の勝', '千代丸', '千代翔馬', '嘉風', '琴勇輝', '石浦',
]

In [None]:
rikishi_df['geneki'] = rikishi_df.index.isin(geneki)

In [None]:
#rikishi_df

In [None]:
#rikishi_df = rikishi_df[rikishi_df['geneki'] == True]

In [None]:
strong = pd.DataFrame(data=[rikishi_df['rank'].values, rikishi_df['id'].values, rikishi_df.index.values],
#                      index=['rank', 'id', 'name']).T.iloc[:20, :]
                      index=['rank', 'id', 'name']).T

In [None]:
strong

In [None]:
# Percentiles to summarise, and the matching column labels 'p<percentile>'
# followed by one extra label 'x'.
probs = (10, 25, 50, 75, 90)
cols = [f'p{p}' for p in probs] + ['x']

In [None]:
transitions = pd.Panel(major_axis=range(L), minor_axis=cols)

In [None]:
transitions

In [None]:
strength

In [None]:
# Build, for every wrestler in `strong`, a table of strength percentiles with
# one row per year step.
for position in range(len(strong)):
    rikishi = strong['id'][position]
    # Columns are named 'strength.<year>.<wrestler>': keep this wrestler's.
    own_columns = strength.columns.str.endswith(f'.{rikishi}')
    draws = strength.loc[:, own_columns]
    # Percentiles across draws -> one row per entry of `probs`; transpose so
    # that rows are year steps and columns are percentiles.
    transition = pd.DataFrame(np.percentile(draws, probs, axis=0)).T
    transition.columns = ['p{}'.format(p) for p in probs]
    # Calendar year for plotting: row 0 corresponds to 2001.
    transition['x'] = transition.index + 2001
    transitions[position] = transition

In [None]:
# Median strength per year for every wrestler in `strong`.
fig, ax = plt.subplots(figsize=(10, 6))
# pyplot.get_cmap is used because plt.cm.get_cmap is gone in recent Matplotlib.
cmap = plt.get_cmap('tab10')

for i in range(len(strong)):
    c = cmap(i%10)  # cycle through the 10 colours of tab10
    ax.plot('x', 'p50', data=transitions[i], color=c)
# Disabled: shaded 10-90 and 25-75 percentile bands.
#    ax.fill_between('x', 'p10', 'p90', data=transitions[i], color=c, alpha=0.1)
#    ax.fill_between('x', 'p25', 'p75', data=transitions[i], color=c, alpha=0.2)
ax.legend(strong['name'])

ax.set(xlabel='year', ylabel='strength')
ax.set_xticks(range(2001, 2019))
# Save BEFORE show(): once the figure has been shown, savefig() on the pyplot
# state machine writes a new, empty figure.
fig.savefig('5years_advi_time', dpi=200)
plt.show()

デビュー前・引退後・休場中の年も区別せずに strength を推定してしまうため、その期間のグラフはおかしな値になる。

対象を2015年以降に絞れば、この問題は和らぐ。

In [None]:
check = loserwinner.groupby(['loser', 'year']).sum().index.values

In [None]:
check_df = pd.DataFrame(index=range(1, 1+loserwinner['loser'].max()))
for i in range(len(check)):
    rikishi = check[i][0]
    year = check[i][1]-1
    check_df.loc[rikishi, year] = True

In [None]:
id_rank = dict(zip(strong['id'], strong['rank']))

In [None]:
id_rank

In [None]:
for id in range(1, 1+len(strong)):
    rank = id_rank[id]
    for j in range(check_df.shape[1]):
        if check_df.loc[id, j] != True:
            transitions[rank-1].loc[j] = np.nan

In [None]:
# Median strength per year for every wrestler in `strong`, after the years
# without bouts have been blanked out in `transitions`.
fig, ax = plt.subplots(figsize=(10, 6))
# pyplot.get_cmap is used because plt.cm.get_cmap is gone in recent Matplotlib.
cmap = plt.get_cmap('tab10')

for i in range(len(strong)):
    c = cmap(i%10)  # cycle through the 10 colours of tab10
    ax.plot('x', 'p50', data=transitions[i], color=c)
# Disabled: shaded 10-90 and 25-75 percentile bands.
#    ax.fill_between('x', 'p10', 'p90', data=transitions[i], color=c, alpha=0.1)
#    ax.fill_between('x', 'p25', 'p75', data=transitions[i], color=c, alpha=0.2)
ax.legend(strong['name'])

ax.set(xlabel='year', ylabel='strength')
ax.set_xticks(range(2001, 2019))
# Save BEFORE show(): once the figure has been shown, savefig() on the pyplot
# state machine writes a new, empty figure.
# NOTE(review): same file name as the earlier plot cell, so that image is overwritten.
fig.savefig('5years_advi_time', dpi=200)
plt.show()