* 投球数が多い5投手について数え上げた確率とIOHMMで推測した確率とのKLを測るコード

In [None]:

import os
import logging
import resource
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from IOHMM import SemiSupervisedIOHMM
from IOHMM import CrossEntropyMNL, DiscreteMNL

In [4]:
# smoothing を加えた DiscreteMNL
class SmoothedDiscreteMNL(DiscreteMNL):
    def __init__(self, solver='lbfgs', alpha=1.0, **kw):
        super().__init__(solver=solver, **kw)
        self.alpha = alpha
    def predict_log_proba(self, X):
        logp = super().predict_log_proba(X)
        p    = np.exp(logp)
        p += self.alpha
        p /= p.sum(axis=1, keepdims=True)
        return np.log(p)
    def loglike_per_sample(self, X, y):
        y_idx = y.ravel().astype(int)
        lp    = self.predict_log_proba(X)
        return lp[np.arange(len(y_idx)), y_idx]

In [15]:
import pandas as pd
df = pd.read_csv('df_clean_zone_cluster.csv')

In [16]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr
from sklearn.preprocessing import LabelEncoder

# 対象投手とカバレッジ（balls-strikes組み合わせ）
pitcher_id = "543037"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikesの全組合せ

# モデルとエンコーダの読み込み
model_dir = f"iohmm_{pitcher_id}"
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# zoneのラベルエンコーダ
le_zone = encs["le_zone"]
K = len(le_zone.classes_)

# データの読み込みと遷移カウントに基づく遷移確率行列の取得
df = pd.read_csv("df_clean_zone_cluster.csv")
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)

zone_to_idx = {int(z): i for i, z in enumerate(sorted(df_p["zone"].dropna().unique()))}

# 遷移確率を数え上げて正規化
counted_trans_dict = {}
for b, s in covs:
    records = []
    for _, g in df_p.groupby("game_pk"):
        g = g[(g["balls"] == b) & (g["strikes"] == s)]
        z = g["zone"].dropna().astype(int).map(zone_to_idx).values
        for i in range(len(z) - 1):
            records.append((z[i], z[i + 1]))
    cm = pd.DataFrame(records, columns=["prev", "next"]).value_counts().unstack(fill_value=0).reindex(index=range(K), columns=range(K), fill_value=0)
    cm_norm = cm.div(cm.sum(axis=1).replace(0, 1), axis=0).values
    counted_trans_dict[(b, s)] = cm_norm

# モデル予測による遷移確率行列の取得
model_trans_dict = {}
for b, s in covs:
    proba_mat = np.zeros((K, K))
    for i in range(K):
        x = np.array([[b, s]])
        log_probs = model.model_transition[i].predict_log_proba(x)
        proba_mat[i] = np.exp(log_probs[0])
    model_trans_dict[(b, s)] = proba_mat

# KLダイバージェンスを計算（counted: P, model: Q）
kl_dict = {}
for b, s in covs:
    P = counted_trans_dict[(b, s)]
    Q = model_trans_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_dict[(b, s)] = kl_values

# 結果を1つのDataFrameにまとめる
kl_df = pd.DataFrame([
    {"balls": b, "strikes": s, "prev_zone": i, "KL_divergence": kl_dict[(b, s)][i]}
    for (b, s) in covs for i in range(K)
])




In [17]:
kl_df

Unnamed: 0,balls,strikes,prev_zone,KL_divergence
0,0,0,0,0.076612
1,0,0,1,0.049162
2,0,0,2,0.127980
3,0,0,3,0.074569
4,0,0,4,0.046816
...,...,...,...,...
151,3,2,8,0.169970
152,3,2,9,0.130808
153,3,2,10,0.302028
154,3,2,11,0.569317


In [18]:
avg_kl = kl_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       1.614537
10      3        1       0.847223
6       2        0       0.641246
11      3        2       0.254077
7       2        1       0.223226
2       0        2       0.200287
8       2        2       0.177874
5       1        2       0.163910
3       1        0       0.144354
4       1        1       0.104069
1       0        1       0.089178
0       0        0       0.060306


In [19]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr
from sklearn.preprocessing import LabelEncoder

# 対象投手とカバレッジ（balls-strikes組み合わせ）
pitcher_id = "554430"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikesの全組合せ

# モデルとエンコーダの読み込み
model_dir = f"iohmm_{pitcher_id}"
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# zoneのラベルエンコーダ
le_zone = encs["le_zone"]
K = len(le_zone.classes_)

# データの読み込みと遷移カウントに基づく遷移確率行列の取得
df = pd.read_csv("df_clean_zone_cluster.csv")
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)

zone_to_idx = {int(z): i for i, z in enumerate(sorted(df_p["zone"].dropna().unique()))}

# 遷移確率を数え上げて正規化
counted_trans_dict = {}
for b, s in covs:
    records = []
    for _, g in df_p.groupby("game_pk"):
        g = g[(g["balls"] == b) & (g["strikes"] == s)]
        z = g["zone"].dropna().astype(int).map(zone_to_idx).values
        for i in range(len(z) - 1):
            records.append((z[i], z[i + 1]))
    cm = pd.DataFrame(records, columns=["prev", "next"]).value_counts().unstack(fill_value=0).reindex(index=range(K), columns=range(K), fill_value=0)
    cm_norm = cm.div(cm.sum(axis=1).replace(0, 1), axis=0).values
    counted_trans_dict[(b, s)] = cm_norm

# モデル予測による遷移確率行列の取得
model_trans_dict = {}
for b, s in covs:
    proba_mat = np.zeros((K, K))
    for i in range(K):
        x = np.array([[b, s]])
        log_probs = model.model_transition[i].predict_log_proba(x)
        proba_mat[i] = np.exp(log_probs[0])
    model_trans_dict[(b, s)] = proba_mat

# KLダイバージェンスを計算（counted: P, model: Q）
kl_dict = {}
for b, s in covs:
    P = counted_trans_dict[(b, s)]
    Q = model_trans_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_dict[(b, s)] = kl_values

# 結果を1つのDataFrameにまとめる
kl_df = pd.DataFrame([
    {"balls": b, "strikes": s, "prev_zone": i, "KL_divergence": kl_dict[(b, s)][i]}
    for (b, s) in covs for i in range(K)
])




In [20]:
avg_kl = kl_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       1.505134
10      3        1       0.945325
6       2        0       0.584534
2       0        2       0.252407
7       2        1       0.247263
11      3        2       0.216397
3       1        0       0.122515
5       1        2       0.108713
8       2        2       0.107930
4       1        1       0.091411
1       0        1       0.078349
0       0        0       0.051507


In [22]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr
from sklearn.preprocessing import LabelEncoder

# 対象投手とカバレッジ（balls-strikes組み合わせ）
pitcher_id = "605400"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikesの全組合せ

# モデルとエンコーダの読み込み
model_dir = f"iohmm_{pitcher_id}"
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# zoneのラベルエンコーダ
le_zone = encs["le_zone"]
K = len(le_zone.classes_)

# データの読み込みと遷移カウントに基づく遷移確率行列の取得
df = pd.read_csv("df_clean_zone_cluster.csv")
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)

zone_to_idx = {int(z): i for i, z in enumerate(sorted(df_p["zone"].dropna().unique()))}

# 遷移確率を数え上げて正規化
counted_trans_dict = {}
for b, s in covs:
    records = []
    for _, g in df_p.groupby("game_pk"):
        g = g[(g["balls"] == b) & (g["strikes"] == s)]
        z = g["zone"].dropna().astype(int).map(zone_to_idx).values
        for i in range(len(z) - 1):
            records.append((z[i], z[i + 1]))
    cm = pd.DataFrame(records, columns=["prev", "next"]).value_counts().unstack(fill_value=0).reindex(index=range(K), columns=range(K), fill_value=0)
    cm_norm = cm.div(cm.sum(axis=1).replace(0, 1), axis=0).values
    counted_trans_dict[(b, s)] = cm_norm

# モデル予測による遷移確率行列の取得
model_trans_dict = {}
for b, s in covs:
    proba_mat = np.zeros((K, K))
    for i in range(K):
        x = np.array([[b, s]])
        log_probs = model.model_transition[i].predict_log_proba(x)
        proba_mat[i] = np.exp(log_probs[0])
    model_trans_dict[(b, s)] = proba_mat

# KLダイバージェンスを計算（counted: P, model: Q）
kl_dict = {}
for b, s in covs:
    P = counted_trans_dict[(b, s)]
    Q = model_trans_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_dict[(b, s)] = kl_values

# 結果を1つのDataFrameにまとめる
kl_df = pd.DataFrame([
    {"balls": b, "strikes": s, "prev_zone": i, "KL_divergence": kl_dict[(b, s)][i]}
    for (b, s) in covs for i in range(K)
])




In [23]:
avg_kl = kl_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       1.816070
10      3        1       1.185207
6       2        0       0.720375
11      3        2       0.287797
7       2        1       0.198350
2       0        2       0.174270
8       2        2       0.158780
3       1        0       0.143066
5       1        2       0.101155
1       0        1       0.089815
4       1        1       0.082273
0       0        0       0.058859


In [24]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr
from sklearn.preprocessing import LabelEncoder

# 対象投手とカバレッジ（balls-strikes組み合わせ）
pitcher_id = "621244"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikesの全組合せ

# モデルとエンコーダの読み込み
model_dir = f"iohmm_{pitcher_id}"
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# zoneのラベルエンコーダ
le_zone = encs["le_zone"]
K = len(le_zone.classes_)

# データの読み込みと遷移カウントに基づく遷移確率行列の取得
df = pd.read_csv("df_clean_zone_cluster.csv")
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)

zone_to_idx = {int(z): i for i, z in enumerate(sorted(df_p["zone"].dropna().unique()))}

# 遷移確率を数え上げて正規化
counted_trans_dict = {}
for b, s in covs:
    records = []
    for _, g in df_p.groupby("game_pk"):
        g = g[(g["balls"] == b) & (g["strikes"] == s)]
        z = g["zone"].dropna().astype(int).map(zone_to_idx).values
        for i in range(len(z) - 1):
            records.append((z[i], z[i + 1]))
    cm = pd.DataFrame(records, columns=["prev", "next"]).value_counts().unstack(fill_value=0).reindex(index=range(K), columns=range(K), fill_value=0)
    cm_norm = cm.div(cm.sum(axis=1).replace(0, 1), axis=0).values
    counted_trans_dict[(b, s)] = cm_norm

# モデル予測による遷移確率行列の取得
model_trans_dict = {}
for b, s in covs:
    proba_mat = np.zeros((K, K))
    for i in range(K):
        x = np.array([[b, s]])
        log_probs = model.model_transition[i].predict_log_proba(x)
        proba_mat[i] = np.exp(log_probs[0])
    model_trans_dict[(b, s)] = proba_mat

# KLダイバージェンスを計算（counted: P, model: Q）
kl_dict = {}
for b, s in covs:
    P = counted_trans_dict[(b, s)]
    Q = model_trans_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_dict[(b, s)] = kl_values

# 結果を1つのDataFrameにまとめる
kl_df = pd.DataFrame([
    {"balls": b, "strikes": s, "prev_zone": i, "KL_divergence": kl_dict[(b, s)][i]}
    for (b, s) in covs for i in range(K)
])




In [25]:
avg_kl = kl_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       1.392329
10      3        1       1.055013
6       2        0       0.640527
7       2        1       0.379220
11      3        2       0.378772
8       2        2       0.316372
2       0        2       0.209389
3       1        0       0.205217
5       1        2       0.199589
4       1        1       0.176587
1       0        1       0.076782
0       0        0       0.066568


In [26]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr
from sklearn.preprocessing import LabelEncoder

# 対象投手とカバレッジ（balls-strikes組み合わせ）
pitcher_id = "656302"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikesの全組合せ

# モデルとエンコーダの読み込み
model_dir = f"iohmm_{pitcher_id}"
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# zoneのラベルエンコーダ
le_zone = encs["le_zone"]
K = len(le_zone.classes_)

# データの読み込みと遷移カウントに基づく遷移確率行列の取得
df = pd.read_csv("df_clean_zone_cluster.csv")
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)

zone_to_idx = {int(z): i for i, z in enumerate(sorted(df_p["zone"].dropna().unique()))}

# 遷移確率を数え上げて正規化
counted_trans_dict = {}
for b, s in covs:
    records = []
    for _, g in df_p.groupby("game_pk"):
        g = g[(g["balls"] == b) & (g["strikes"] == s)]
        z = g["zone"].dropna().astype(int).map(zone_to_idx).values
        for i in range(len(z) - 1):
            records.append((z[i], z[i + 1]))
    cm = pd.DataFrame(records, columns=["prev", "next"]).value_counts().unstack(fill_value=0).reindex(index=range(K), columns=range(K), fill_value=0)
    cm_norm = cm.div(cm.sum(axis=1).replace(0, 1), axis=0).values
    counted_trans_dict[(b, s)] = cm_norm

# モデル予測による遷移確率行列の取得
model_trans_dict = {}
for b, s in covs:
    proba_mat = np.zeros((K, K))
    for i in range(K):
        x = np.array([[b, s]])
        log_probs = model.model_transition[i].predict_log_proba(x)
        proba_mat[i] = np.exp(log_probs[0])
    model_trans_dict[(b, s)] = proba_mat

# KLダイバージェンスを計算（counted: P, model: Q）
kl_dict = {}
for b, s in covs:
    P = counted_trans_dict[(b, s)]
    Q = model_trans_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_dict[(b, s)] = kl_values

# 結果を1つのDataFrameにまとめる
kl_df = pd.DataFrame([
    {"balls": b, "strikes": s, "prev_zone": i, "KL_divergence": kl_dict[(b, s)][i]}
    for (b, s) in covs for i in range(K)
])




In [27]:
avg_kl = kl_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       1.814296
10      3        1       0.952332
6       2        0       0.500761
7       2        1       0.273593
2       0        2       0.270043
11      3        2       0.266489
8       2        2       0.147460
4       1        1       0.145959
5       1        2       0.144891
3       1        0       0.140531
1       0        1       0.123873
0       0        0       0.058667


In [44]:
from sklearn.preprocessing import LabelEncoder

# データ読み込み
df = pd.read_csv("df_clean_zone_cluster.csv")
df_p = df[df["pitcher"].astype(str) == pitcher_id].copy()

# zone, pitch_cluster_label のエンコーディング
le_zone = LabelEncoder()
le_zone.fit(df_p["zone"].dropna())

le_pcl = LabelEncoder()
le_pcl.fit(df_p["pitch_cluster_label"].dropna())

le_zcl = LabelEncoder()
le_zcl.fit(df_p["zone_cluster_label"])

# クラス数
K = len(le_zone.classes_)
C = len(le_pcl.classes_)
Z = len(le_zcl.classes_)

KeyError: 'zone_cluster_label'

In [32]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr
from sklearn.preprocessing import LabelEncoder

# パラメータ
pitcher_id = "543037"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikes 全組合せ

# モデルとエンコーダの読み込み
model_dir = f"iohmm_{pitcher_id}"
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# データの読み込み
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)
df_p = df_p[df_p["pitch_cluster_label"].notna() & df_p["zone"].notna()]

# zoneとpitch_cluster_labelのエンコード
df_p["zone_enc"] = le_zone.transform(df_p["zone"].astype(str))
df_p["pcl_enc"] = le_pcl.transform(df_p["pitch_cluster_label"].astype(str))

# 出力確率（数え上げ）: P(pcl | zone, b, s)
emit_count_dict = {}
for b, s in covs:
    mat = np.zeros((K, C))
    for _, g in df_p.groupby("game_pk"):
        g_cov = g[(g["balls"] == b) & (g["strikes"] == s)]
        for _, row in g_cov.iterrows():
            z = row["zone_enc"]
            pcl = row["pcl_enc"]
            mat[z, pcl] += 1
    row_sums = mat.sum(axis=1, keepdims=True)
    prob_mat = np.divide(mat, row_sums, out=np.zeros_like(mat), where=row_sums != 0)
    emit_count_dict[(b, s)] = prob_mat

# モデル出力確率: P(pcl | zone, b, s)
emit_model_dict = {}
for b, s in covs:
    x = np.array([[b, s]])
    mat = np.zeros((K, C))
    for z in range(K):
        model_list = model.model_emissions[z]
        if isinstance(model_list, list):
            model_z = model_list[0]
        else:
            model_z = model_list
        log_probs = model_z.predict_log_proba(x)[0]
        mat[z] = np.exp(log_probs)
    emit_model_dict[(b, s)] = mat

# KLダイバージェンスの計算（counted: P, model: Q）
kl_emit_dict = {}
for b, s in covs:
    P = emit_count_dict[(b, s)]
    Q = emit_model_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_emit_dict[(b, s)] = kl_values

# 結果を1つのDataFrameにまとめる
kl_emit_df = pd.DataFrame([
    {"balls": b, "strikes": s, "zone": z, "KL_divergence": kl_emit_dict[(b, s)][z]}
    for (b, s) in covs for z in range(K)
])




In [34]:
avg_kl = kl_emit_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       1.175521
10      3        1       0.634854
11      3        2       0.533916
2       0        2       0.497581
5       1        2       0.442524
8       2        2       0.431066
1       0        1       0.407101
6       2        0       0.403660
4       1        1       0.368819
0       0        0       0.357283
7       2        1       0.313542
3       1        0       0.309520


In [35]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr
from sklearn.preprocessing import LabelEncoder

# パラメータ
pitcher_id = "554430"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikes 全組合せ

# モデルとエンコーダの読み込み
model_dir = f"iohmm_{pitcher_id}"
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# データの読み込み
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)
df_p = df_p[df_p["pitch_cluster_label"].notna() & df_p["zone"].notna()]

# zoneとpitch_cluster_labelのエンコード
df_p["zone_enc"] = le_zone.transform(df_p["zone"].astype(str))
df_p["pcl_enc"] = le_pcl.transform(df_p["pitch_cluster_label"].astype(str))

# 出力確率（数え上げ）: P(pcl | zone, b, s)
emit_count_dict = {}
for b, s in covs:
    mat = np.zeros((K, C))
    for _, g in df_p.groupby("game_pk"):
        g_cov = g[(g["balls"] == b) & (g["strikes"] == s)]
        for _, row in g_cov.iterrows():
            z = row["zone_enc"]
            pcl = row["pcl_enc"]
            mat[z, pcl] += 1
    row_sums = mat.sum(axis=1, keepdims=True)
    prob_mat = np.divide(mat, row_sums, out=np.zeros_like(mat), where=row_sums != 0)
    emit_count_dict[(b, s)] = prob_mat

# モデル出力確率: P(pcl | zone, b, s)
emit_model_dict = {}
for b, s in covs:
    x = np.array([[b, s]])
    mat = np.zeros((K, C))
    for z in range(K):
        model_list = model.model_emissions[z]
        if isinstance(model_list, list):
            model_z = model_list[0]
        else:
            model_z = model_list
        log_probs = model_z.predict_log_proba(x)[0]
        mat[z] = np.exp(log_probs)
    emit_model_dict[(b, s)] = mat

# KLダイバージェンスの計算（counted: P, model: Q）
kl_emit_dict = {}
for b, s in covs:
    P = emit_count_dict[(b, s)]
    Q = emit_model_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_emit_dict[(b, s)] = kl_values

# 結果を1つのDataFrameにまとめる
kl_emit_df = pd.DataFrame([
    {"balls": b, "strikes": s, "zone": z, "KL_divergence": kl_emit_dict[(b, s)][z]}
    for (b, s) in covs for z in range(K)
])




In [36]:
avg_kl = kl_emit_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       1.126182
10      3        1       0.703438
6       2        0       0.677568
2       0        2       0.612585
11      3        2       0.532782
3       1        0       0.524700
7       2        1       0.519810
5       1        2       0.508469
1       0        1       0.500827
0       0        0       0.498934
8       2        2       0.446031
4       1        1       0.417994


In [37]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr
from sklearn.preprocessing import LabelEncoder

# パラメータ
pitcher_id = "605400"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikes 全組合せ

# モデルとエンコーダの読み込み
model_dir = f"iohmm_{pitcher_id}"
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# データの読み込み
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)
df_p = df_p[df_p["pitch_cluster_label"].notna() & df_p["zone"].notna()]

# zoneとpitch_cluster_labelのエンコード
df_p["zone_enc"] = le_zone.transform(df_p["zone"].astype(str))
df_p["pcl_enc"] = le_pcl.transform(df_p["pitch_cluster_label"].astype(str))

# 出力確率（数え上げ）: P(pcl | zone, b, s)
emit_count_dict = {}
for b, s in covs:
    mat = np.zeros((K, C))
    for _, g in df_p.groupby("game_pk"):
        g_cov = g[(g["balls"] == b) & (g["strikes"] == s)]
        for _, row in g_cov.iterrows():
            z = row["zone_enc"]
            pcl = row["pcl_enc"]
            mat[z, pcl] += 1
    row_sums = mat.sum(axis=1, keepdims=True)
    prob_mat = np.divide(mat, row_sums, out=np.zeros_like(mat), where=row_sums != 0)
    emit_count_dict[(b, s)] = prob_mat

# モデル出力確率: P(pcl | zone, b, s)
emit_model_dict = {}
for b, s in covs:
    x = np.array([[b, s]])
    mat = np.zeros((K, C))
    for z in range(K):
        model_list = model.model_emissions[z]
        if isinstance(model_list, list):
            model_z = model_list[0]
        else:
            model_z = model_list
        log_probs = model_z.predict_log_proba(x)[0]
        mat[z] = np.exp(log_probs)
    emit_model_dict[(b, s)] = mat

# KLダイバージェンスの計算（counted: P, model: Q）
kl_emit_dict = {}
for b, s in covs:
    P = emit_count_dict[(b, s)]
    Q = emit_model_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_emit_dict[(b, s)] = kl_values

# 結果を1つのDataFrameにまとめる
kl_emit_df = pd.DataFrame([
    {"balls": b, "strikes": s, "zone": z, "KL_divergence": kl_emit_dict[(b, s)][z]}
    for (b, s) in covs for z in range(K)
])




In [38]:
avg_kl = kl_emit_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       1.339858
2       0        2       0.652588
5       1        2       0.631766
8       2        2       0.600523
11      3        2       0.593814
10      3        1       0.555577
6       2        0       0.550990
3       1        0       0.533816
0       0        0       0.510209
1       0        1       0.506580
4       1        1       0.505533
7       2        1       0.459481


In [39]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr
from sklearn.preprocessing import LabelEncoder

# パラメータ
pitcher_id = "621244"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikes 全組合せ

# モデルとエンコーダの読み込み
model_dir = f"iohmm_{pitcher_id}"
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# データの読み込み
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)
df_p = df_p[df_p["pitch_cluster_label"].notna() & df_p["zone"].notna()]

# zoneとpitch_cluster_labelのエンコード
df_p["zone_enc"] = le_zone.transform(df_p["zone"].astype(str))
df_p["pcl_enc"] = le_pcl.transform(df_p["pitch_cluster_label"].astype(str))

# 出力確率（数え上げ）: P(pcl | zone, b, s)
emit_count_dict = {}
for b, s in covs:
    mat = np.zeros((K, C))
    for _, g in df_p.groupby("game_pk"):
        g_cov = g[(g["balls"] == b) & (g["strikes"] == s)]
        for _, row in g_cov.iterrows():
            z = row["zone_enc"]
            pcl = row["pcl_enc"]
            mat[z, pcl] += 1
    row_sums = mat.sum(axis=1, keepdims=True)
    prob_mat = np.divide(mat, row_sums, out=np.zeros_like(mat), where=row_sums != 0)
    emit_count_dict[(b, s)] = prob_mat

# モデル出力確率: P(pcl | zone, b, s)
emit_model_dict = {}
for b, s in covs:
    x = np.array([[b, s]])
    mat = np.zeros((K, C))
    for z in range(K):
        model_list = model.model_emissions[z]
        if isinstance(model_list, list):
            model_z = model_list[0]
        else:
            model_z = model_list
        log_probs = model_z.predict_log_proba(x)[0]
        mat[z] = np.exp(log_probs)
    emit_model_dict[(b, s)] = mat

# KLダイバージェンスの計算（counted: P, model: Q）
kl_emit_dict = {}
for b, s in covs:
    P = emit_count_dict[(b, s)]
    Q = emit_model_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_emit_dict[(b, s)] = kl_values

# 結果を1つのDataFrameにまとめる
kl_emit_df = pd.DataFrame([
    {"balls": b, "strikes": s, "zone": z, "KL_divergence": kl_emit_dict[(b, s)][z]}
    for (b, s) in covs for z in range(K)
])




In [40]:
avg_kl = kl_emit_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       1.049838
10      3        1       0.847155
6       2        0       0.704852
11      3        2       0.617248
2       0        2       0.597673
0       0        0       0.559236
3       1        0       0.546099
7       2        1       0.512056
1       0        1       0.511113
8       2        2       0.507745
5       1        2       0.506980
4       1        1       0.478709


In [41]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr
from sklearn.preprocessing import LabelEncoder

# パラメータ
pitcher_id = "656302"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikes 全組合せ

# モデルとエンコーダの読み込み
model_dir = f"iohmm_{pitcher_id}"
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# データの読み込み
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)
df_p = df_p[df_p["pitch_cluster_label"].notna() & df_p["zone"].notna()]

# zoneとpitch_cluster_labelのエンコード
df_p["zone_enc"] = le_zone.transform(df_p["zone"].astype(str))
df_p["pcl_enc"] = le_pcl.transform(df_p["pitch_cluster_label"].astype(str))

# 出力確率（数え上げ）: P(pcl | zone, b, s)
emit_count_dict = {}
for b, s in covs:
    mat = np.zeros((K, C))
    for _, g in df_p.groupby("game_pk"):
        g_cov = g[(g["balls"] == b) & (g["strikes"] == s)]
        for _, row in g_cov.iterrows():
            z = row["zone_enc"]
            pcl = row["pcl_enc"]
            mat[z, pcl] += 1
    row_sums = mat.sum(axis=1, keepdims=True)
    prob_mat = np.divide(mat, row_sums, out=np.zeros_like(mat), where=row_sums != 0)
    emit_count_dict[(b, s)] = prob_mat

# モデル出力確率: P(pcl | zone, b, s)
emit_model_dict = {}
for b, s in covs:
    x = np.array([[b, s]])
    mat = np.zeros((K, C))
    for z in range(K):
        model_list = model.model_emissions[z]
        if isinstance(model_list, list):
            model_z = model_list[0]
        else:
            model_z = model_list
        log_probs = model_z.predict_log_proba(x)[0]
        mat[z] = np.exp(log_probs)
    emit_model_dict[(b, s)] = mat

# KLダイバージェンスの計算（counted: P, model: Q）
kl_emit_dict = {}
for b, s in covs:
    P = emit_count_dict[(b, s)]
    Q = emit_model_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_emit_dict[(b, s)] = kl_values

# 結果を1つのDataFrameにまとめる
kl_emit_df = pd.DataFrame([
    {"balls": b, "strikes": s, "zone": z, "KL_divergence": kl_emit_dict[(b, s)][z]}
    for (b, s) in covs for z in range(K)
])




In [42]:
avg_kl = kl_emit_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       1.111204
10      3        1       0.692248
6       2        0       0.508759
2       0        2       0.485177
5       1        2       0.410300
11      3        2       0.396879
7       2        1       0.384881
8       2        2       0.355294
0       0        0       0.347320
3       1        0       0.317755
1       0        1       0.315112
4       1        1       0.304890


In [45]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr

# --- パラメータ定義 ---
pitcher_id = "543037"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikesの全組合せ
model_dir = f"iohmm_{pitcher_id}"

# --- モデルとエンコーダ読み込み ---
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# --- ラベルエンコーダ取得 ---

# --- データ読み込み ---
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)

# zoneとzone_clusterのindex変換
zone_to_idx = {int(z): i for i, z in enumerate(sorted(df_p["zone"].dropna().unique()))}
zcl_to_idx = {z: i for i, z in enumerate(sorted(df_p["zone_cluster"].dropna().unique()))}

# --- カウントベースの出力確率（zone_cluster） ---
counted_emit_dict = {}
for b, s in covs:
    records = []
    for _, g in df_p.groupby("game_pk"):
        g = g[(g["balls"] == b) & (g["strikes"] == s)]
        if g.empty:
            continue
        for _, row in g.iterrows():
            if pd.isna(row["zone"]) or pd.isna(row["zone_cluster"]):
                continue
            z = zone_to_idx[int(row["zone"])]
            zcl = zcl_to_idx[row["zone_cluster"]]
            records.append((z, zcl))
    cm = pd.DataFrame(records, columns=["zone", "zcl"]).value_counts().unstack(fill_value=0).reindex(index=range(K), columns=range(C), fill_value=0)
    cm_norm = cm.div(cm.sum(axis=1).replace(0, 1), axis=0).values
    counted_emit_dict[(b, s)] = cm_norm

# --- モデルによる出力確率（zone_cluster） ---
emit_model_dict = {}
for b, s in covs:
    x = np.array([[b, s]])
    mat = np.zeros((K, C))
    for z in range(K):
        model_list = model.model_emissions[z]
        log_probs = model_list[1].predict_log_proba(x)[0]  # zone_clusterに対する予測
        mat[z] = np.exp(log_probs)
    emit_model_dict[(b, s)] = mat

# --- KLダイバージェンスの計算（zone_cluster） ---
kl_emit_dict = {}
for b, s in covs:
    P = counted_emit_dict[(b, s)]
    Q = emit_model_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_emit_dict[(b, s)] = kl_values

# --- 結果のDataFrame作成 ---
kl_emit_df = pd.DataFrame([
    {"balls": b, "strikes": s, "zone": i, "KL_divergence": kl_emit_dict[(b, s)][i]}
    for (b, s) in covs for i in range(K)
])





In [46]:
avg_kl = kl_emit_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       0.850935
10      3        1       0.848933
6       2        0       0.804367
2       0        2       0.792280
8       2        2       0.764177
4       1        1       0.763132
5       1        2       0.758909
7       2        1       0.758862
1       0        1       0.748068
3       1        0       0.743693
0       0        0       0.741611
11      3        2       0.736493


In [47]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr

# --- パラメータ定義 ---
pitcher_id = "554430"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikesの全組合せ
model_dir = f"iohmm_{pitcher_id}"

# --- モデルとエンコーダ読み込み ---
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# --- ラベルエンコーダ取得 ---

# --- データ読み込み ---
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)

# zoneとzone_clusterのindex変換
zone_to_idx = {int(z): i for i, z in enumerate(sorted(df_p["zone"].dropna().unique()))}
zcl_to_idx = {z: i for i, z in enumerate(sorted(df_p["zone_cluster"].dropna().unique()))}

# --- カウントベースの出力確率（zone_cluster） ---
counted_emit_dict = {}
for b, s in covs:
    records = []
    for _, g in df_p.groupby("game_pk"):
        g = g[(g["balls"] == b) & (g["strikes"] == s)]
        if g.empty:
            continue
        for _, row in g.iterrows():
            if pd.isna(row["zone"]) or pd.isna(row["zone_cluster"]):
                continue
            z = zone_to_idx[int(row["zone"])]
            zcl = zcl_to_idx[row["zone_cluster"]]
            records.append((z, zcl))
    cm = pd.DataFrame(records, columns=["zone", "zcl"]).value_counts().unstack(fill_value=0).reindex(index=range(K), columns=range(C), fill_value=0)
    cm_norm = cm.div(cm.sum(axis=1).replace(0, 1), axis=0).values
    counted_emit_dict[(b, s)] = cm_norm

# --- モデルによる出力確率（zone_cluster） ---
emit_model_dict = {}
for b, s in covs:
    x = np.array([[b, s]])
    mat = np.zeros((K, C))
    for z in range(K):
        model_list = model.model_emissions[z]
        log_probs = model_list[1].predict_log_proba(x)[0]  # zone_clusterに対する予測
        mat[z] = np.exp(log_probs)
    emit_model_dict[(b, s)] = mat

# --- KLダイバージェンスの計算（zone_cluster） ---
kl_emit_dict = {}
for b, s in covs:
    P = counted_emit_dict[(b, s)]
    Q = emit_model_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_emit_dict[(b, s)] = kl_values

# --- 結果のDataFrame作成 ---
kl_emit_df = pd.DataFrame([
    {"balls": b, "strikes": s, "zone": i, "KL_divergence": kl_emit_dict[(b, s)][i]}
    for (b, s) in covs for i in range(K)
])





In [48]:
avg_kl = kl_emit_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       0.829931
6       2        0       0.804630
10      3        1       0.788382
2       0        2       0.780102
1       0        1       0.778672
11      3        2       0.777789
4       1        1       0.762478
8       2        2       0.761013
5       1        2       0.759582
3       1        0       0.759061
7       2        1       0.758227
0       0        0       0.751155


In [49]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr

# --- パラメータ定義 ---
pitcher_id = "605400"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikesの全組合せ
model_dir = f"iohmm_{pitcher_id}"

# --- モデルとエンコーダ読み込み ---
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# --- ラベルエンコーダ取得 ---

# --- データ読み込み ---
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)

# zoneとzone_clusterのindex変換
zone_to_idx = {int(z): i for i, z in enumerate(sorted(df_p["zone"].dropna().unique()))}
zcl_to_idx = {z: i for i, z in enumerate(sorted(df_p["zone_cluster"].dropna().unique()))}

# --- カウントベースの出力確率（zone_cluster） ---
counted_emit_dict = {}
for b, s in covs:
    records = []
    for _, g in df_p.groupby("game_pk"):
        g = g[(g["balls"] == b) & (g["strikes"] == s)]
        if g.empty:
            continue
        for _, row in g.iterrows():
            if pd.isna(row["zone"]) or pd.isna(row["zone_cluster"]):
                continue
            z = zone_to_idx[int(row["zone"])]
            zcl = zcl_to_idx[row["zone_cluster"]]
            records.append((z, zcl))
    cm = pd.DataFrame(records, columns=["zone", "zcl"]).value_counts().unstack(fill_value=0).reindex(index=range(K), columns=range(C), fill_value=0)
    cm_norm = cm.div(cm.sum(axis=1).replace(0, 1), axis=0).values
    counted_emit_dict[(b, s)] = cm_norm

# --- モデルによる出力確率（zone_cluster） ---
emit_model_dict = {}
for b, s in covs:
    x = np.array([[b, s]])
    mat = np.zeros((K, C))
    for z in range(K):
        model_list = model.model_emissions[z]
        log_probs = model_list[1].predict_log_proba(x)[0]  # zone_clusterに対する予測
        mat[z] = np.exp(log_probs)
    emit_model_dict[(b, s)] = mat

# --- KLダイバージェンスの計算（zone_cluster） ---
kl_emit_dict = {}
for b, s in covs:
    P = counted_emit_dict[(b, s)]
    Q = emit_model_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_emit_dict[(b, s)] = kl_values

# --- 結果のDataFrame作成 ---
kl_emit_df = pd.DataFrame([
    {"balls": b, "strikes": s, "zone": i, "KL_divergence": kl_emit_dict[(b, s)][i]}
    for (b, s) in covs for i in range(K)
])





In [50]:
avg_kl = kl_emit_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       0.906687
5       1        2       0.801888
1       0        1       0.772025
2       0        2       0.771582
8       2        2       0.768706
11      3        2       0.759483
10      3        1       0.759169
7       2        1       0.758615
0       0        0       0.755755
3       1        0       0.753422
4       1        1       0.752897
6       2        0       0.748489


In [51]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr

# --- パラメータ定義 ---
pitcher_id = "621244"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikesの全組合せ
model_dir = f"iohmm_{pitcher_id}"

# --- モデルとエンコーダ読み込み ---
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# --- ラベルエンコーダ取得 ---

# --- データ読み込み ---
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)

# zoneとzone_clusterのindex変換
zone_to_idx = {int(z): i for i, z in enumerate(sorted(df_p["zone"].dropna().unique()))}
zcl_to_idx = {z: i for i, z in enumerate(sorted(df_p["zone_cluster"].dropna().unique()))}

# --- カウントベースの出力確率（zone_cluster） ---
counted_emit_dict = {}
for b, s in covs:
    records = []
    for _, g in df_p.groupby("game_pk"):
        g = g[(g["balls"] == b) & (g["strikes"] == s)]
        if g.empty:
            continue
        for _, row in g.iterrows():
            if pd.isna(row["zone"]) or pd.isna(row["zone_cluster"]):
                continue
            z = zone_to_idx[int(row["zone"])]
            zcl = zcl_to_idx[row["zone_cluster"]]
            records.append((z, zcl))
    cm = pd.DataFrame(records, columns=["zone", "zcl"]).value_counts().unstack(fill_value=0).reindex(index=range(K), columns=range(C), fill_value=0)
    cm_norm = cm.div(cm.sum(axis=1).replace(0, 1), axis=0).values
    counted_emit_dict[(b, s)] = cm_norm

# --- モデルによる出力確率（zone_cluster） ---
emit_model_dict = {}
for b, s in covs:
    x = np.array([[b, s]])
    mat = np.zeros((K, C))
    for z in range(K):
        model_list = model.model_emissions[z]
        log_probs = model_list[1].predict_log_proba(x)[0]  # zone_clusterに対する予測
        mat[z] = np.exp(log_probs)
    emit_model_dict[(b, s)] = mat

# --- KLダイバージェンスの計算（zone_cluster） ---
kl_emit_dict = {}
for b, s in covs:
    P = counted_emit_dict[(b, s)]
    Q = emit_model_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_emit_dict[(b, s)] = kl_values

# --- 結果のDataFrame作成 ---
kl_emit_df = pd.DataFrame([
    {"balls": b, "strikes": s, "zone": i, "KL_divergence": kl_emit_dict[(b, s)][i]}
    for (b, s) in covs for i in range(K)
])





In [52]:
avg_kl = kl_emit_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
10      3        1       0.824854
6       2        0       0.815631
9       3        0       0.814288
7       2        1       0.790924
11      3        2       0.784655
2       0        2       0.774638
8       2        2       0.772567
4       1        1       0.769902
3       1        0       0.768816
0       0        0       0.752705
5       1        2       0.750116
1       0        1       0.728560


In [53]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.special import rel_entr

# --- パラメータ定義 ---
pitcher_id = "656302"
covs = [(b, s) for b in range(4) for s in range(3)]  # balls-strikesの全組合せ
model_dir = f"iohmm_{pitcher_id}"

# --- モデルとエンコーダ読み込み ---
with open(os.path.join(model_dir, "model.pkl"), "rb") as mf:
    model = pickle.load(mf)
with open(os.path.join(model_dir, "encoders.pkl"), "rb") as ef:
    encs = pickle.load(ef)

# --- ラベルエンコーダ取得 ---

# --- データ読み込み ---
df_p = df[df["pitcher"].astype(str) == pitcher_id].sort_values("game_pk").reset_index(drop=True)

# zoneとzone_clusterのindex変換
zone_to_idx = {int(z): i for i, z in enumerate(sorted(df_p["zone"].dropna().unique()))}
zcl_to_idx = {z: i for i, z in enumerate(sorted(df_p["zone_cluster"].dropna().unique()))}

# --- カウントベースの出力確率（zone_cluster） ---
counted_emit_dict = {}
for b, s in covs:
    records = []
    for _, g in df_p.groupby("game_pk"):
        g = g[(g["balls"] == b) & (g["strikes"] == s)]
        if g.empty:
            continue
        for _, row in g.iterrows():
            if pd.isna(row["zone"]) or pd.isna(row["zone_cluster"]):
                continue
            z = zone_to_idx[int(row["zone"])]
            zcl = zcl_to_idx[row["zone_cluster"]]
            records.append((z, zcl))
    cm = pd.DataFrame(records, columns=["zone", "zcl"]).value_counts().unstack(fill_value=0).reindex(index=range(K), columns=range(C), fill_value=0)
    cm_norm = cm.div(cm.sum(axis=1).replace(0, 1), axis=0).values
    counted_emit_dict[(b, s)] = cm_norm

# --- モデルによる出力確率（zone_cluster） ---
emit_model_dict = {}
for b, s in covs:
    x = np.array([[b, s]])
    mat = np.zeros((K, C))
    for z in range(K):
        model_list = model.model_emissions[z]
        log_probs = model_list[1].predict_log_proba(x)[0]  # zone_clusterに対する予測
        mat[z] = np.exp(log_probs)
    emit_model_dict[(b, s)] = mat

# --- KLダイバージェンスの計算（zone_cluster） ---
kl_emit_dict = {}
for b, s in covs:
    P = counted_emit_dict[(b, s)]
    Q = emit_model_dict[(b, s)]
    kl_matrix = rel_entr(P, Q)
    kl_values = np.sum(kl_matrix, axis=1)
    kl_emit_dict[(b, s)] = kl_values

# --- 結果のDataFrame作成 ---
kl_emit_df = pd.DataFrame([
    {"balls": b, "strikes": s, "zone": i, "KL_divergence": kl_emit_dict[(b, s)][i]}
    for (b, s) in covs for i in range(K)
])





In [54]:
avg_kl = kl_emit_df.groupby(['balls', 'strikes'])['KL_divergence'].mean().reset_index()
print(avg_kl.sort_values('KL_divergence', ascending=False))


    balls  strikes  KL_divergence
9       3        0       0.858208
5       1        2       0.802076
2       0        2       0.799723
6       2        0       0.784798
1       0        1       0.775778
7       2        1       0.768416
11      3        2       0.764091
0       0        0       0.762101
8       2        2       0.761411
10      3        1       0.760017
3       1        0       0.751198
4       1        1       0.741240
