### <font color="orange">ライブラリ読み込み</font>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

In [3]:
# Display several DataFrames side by side (horizontally).
def show_many_dfs(*dfs, n=10):
    """Build an object whose HTML repr lays the given DataFrames out in a row.

    Args:
        *dfs: DataFrames to render. Each one is truncated with ``head(n)``.
        n: Number of leading rows shown for every DataFrame.

    Returns:
        An instance of a small local class exposing ``_repr_html_``; the HTML
        is produced when that method is called, not when this function runs.
    """
    class HorizontalDisplay:
        """Renders each frame inside its own left-floated ``<div>``."""

        def _repr_html_(self):
            cells = []
            for frame in dfs:
                frame_html = frame.head(n)._repr_html_()
                cells.append(
                    '<div style="float: left; padding: 5px;">{}</div>'.format(frame_html)
                )
            return ''.join(cells)

    return HorizontalDisplay()

### <font color="orange">データ読み込み</font>

In [4]:
data_path = "./make_data_for_mainmodel/maked_data.csv"

In [5]:
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,pitch_name,game_year,game_month,game_day,home_team,LAA_score,ENEMY_score,score_difference,is_LAA_score_is_higher,stand,on_3b,on_2b,on_1b,count_runner,inning,pitch_number,pitch_count_per_inning,pitch_count_per_game,outs_when_up,balls,strikes,sz_top,sz_bot,is_first_pitch_number,release_speed,release_pos_x,release_pos_y,is_incourse_ball,is_high_ball,release_pos_z,release_spin_rate,release_extension,plate_x,plate_z,events,description,zone,type,hc_x,hc_y,one_pitch_ago,two_pitches_ago,three_pitches_ago
0,1,2018,4,1,1,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,3.54,1.6,1,,,,,,,,,,,,,,,,,,,
1,0,2018,4,1,1,0,0,0,0,0,0,0,0,0,1,2,2,2,0,0,1,3.33,1.46,0,96.4,-2.16,54.05,1.0,0.0,6.2,2107.0,6.4,-0.36,2.81,19.0,2.0,4.0,0.0,0.0,0.0,4-Seam Fastball,,
2,0,2018,4,1,1,0,0,0,0,0,0,0,0,0,1,3,3,3,0,0,2,3.59,1.66,0,81.5,-2.36,54.42,0.0,1.0,6.03,2355.0,6.1,0.21,3.43,19.0,9.0,2.0,0.0,0.0,0.0,Sweeper,4-Seam Fastball,
3,0,2018,4,1,1,0,0,0,0,0,0,0,0,0,1,4,4,4,0,1,2,3.33,1.46,0,83.6,-2.36,54.43,0.0,1.0,6.01,2368.0,6.1,0.24,3.67,19.0,0.0,2.0,1.0,0.0,0.0,Sweeper,Sweeper,4-Seam Fastball
4,1,2018,4,1,1,0,0,0,0,1,0,0,0,0,1,1,5,5,1,0,0,3.22,1.4,1,,,,,,,,,,,,,,,,,,,


In [6]:
print(f"2022年 : {data[data['game_year'].isin([2022])].shape}")

2022年 : (2629, 43)


### <font color="orange">データタイプ確認</font>

In [7]:
# Check whether any column has a dtype other than int or float.
data.select_dtypes(exclude=["int", "float"]).columns.to_list()

['one_pitch_ago', 'two_pitches_ago', 'three_pitches_ago']

### <font color="orange">train_data, test_dataに分割</font>

In [8]:
def make_train_test_data(select_game_year, csv_path=None, train_ratio=0.8):
    """Load the pitch data, keep the requested seasons and split it by row order.

    Args:
        select_game_year: Iterable of ``game_year`` values to keep.
        csv_path: CSV file to read. When ``None`` the notebook-level
            ``data_path`` variable is used.
        train_ratio: Fraction of the filtered rows, counted from the top,
            that goes to the training set; the remainder is the test set.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. ``X_*`` hold every
        column except ``pitch_name`` and the three date columns; ``y_*`` hold
        ``pitch_name``.

    Raises:
        ValueError: If the set of label values seen in the training part
            differs from the set seen in the test part.
    """
    source = data_path if csv_path is None else csv_path
    data = pd.read_csv(source)

    categorical_columns = [
        "outs_when_up",
        "balls",
        "strikes",
        "events",
        "description",
        "zone",
        "type",
        "one_pitch_ago",
        "two_pitches_ago",
        "three_pitches_ago",
    ]

    # Cast before filtering by year so the category sets are derived from the
    # whole file rather than from the selected seasons only.
    data[categorical_columns] = data[categorical_columns].astype("category")

    # Keep only the requested seasons, then drop the date columns.
    data = data[data["game_year"].isin(select_game_year)]
    data = data.drop(columns=["game_year", "game_month", "game_day"])

    # NOTE(review): the CSV's "Unnamed: 0" column (it looks like a saved row
    # index in the displayed head) is left in X as a feature — confirm that
    # this is intended.
    X = data.drop("pitch_name", axis=1)
    y = data["pitch_name"]

    # Split by position: leading rows -> train, trailing rows -> test.
    split_index = int(len(data) * train_ratio)
    X_train = X.iloc[:split_index]
    y_train = y.iloc[:split_index]
    X_test = X.iloc[split_index:]
    y_test = y.iloc[split_index:]

    # Compare the label values themselves, not just how many distinct values
    # there are: equal counts do not imply the same classes on both sides.
    train_labels = set(y_train.dropna().unique())
    test_labels = set(y_test.dropna().unique())
    if train_labels != test_labels:
        raise ValueError("train_dataとtest_dataの要素が一致していません。")

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = make_train_test_data(select_game_year=[2021])

# Sanity check: shapes of the train and test feature frames.
X_train.shape, X_test.shape

((1621, 39), (406, 39))

In [9]:
# Check which columns are stored with the category dtype.
X_train.select_dtypes(include="category").columns

Index(['outs_when_up', 'balls', 'strikes', 'events', 'description', 'zone',
       'type', 'one_pitch_ago', 'two_pitches_ago', 'three_pitches_ago'],
      dtype='object')

### <font color="orange">モデル学習、予測</font>

In [10]:
# カスタムメトリック定義
from sklearn.metrics import f1_score, accuracy_score, log_loss

def custom_f1(y_pred, dataset):
    """Custom evaluation function reporting the macro-averaged F1 score.

    Args:
        y_pred: Predicted scores; they are rounded with ``np.round`` to obtain
            hard labels before scoring.
        dataset: Object exposing ``get_label()``, which supplies the true labels.

    Returns:
        Tuple ``("f1", score, True)``; the trailing ``True`` is the
        higher-is-better flag of LightGBM's ``feval`` convention.
    """
    labels = dataset.get_label()
    hard_preds = np.round(y_pred)
    return "f1", f1_score(y_true=labels, y_pred=hard_preds, average="macro"), True

def custom_accuracy(y_pred, dataset):
    """Custom evaluation function reporting plain accuracy.

    Args:
        y_pred: Predicted scores; they are rounded with ``np.round`` to obtain
            hard labels before scoring.
        dataset: Object exposing ``get_label()``, which supplies the true labels.

    Returns:
        Tuple ``("acu", accuracy, True)``; the trailing ``True`` is the
        higher-is-better flag of LightGBM's ``feval`` convention.
    """
    labels = dataset.get_label()
    hard_preds = np.round(y_pred)
    accuracy = accuracy_score(y_true=labels, y_pred=hard_preds)
    return "acu", accuracy, True

def custom_multi_logloss(y_pred, dataset):
    """Custom evaluation function reporting the log loss of the raw predictions.

    Args:
        y_pred: Predicted scores, passed to ``log_loss`` without rounding.
        dataset: Object exposing ``get_label()``, which supplies the true labels.

    Returns:
        Tuple ``("multi_logloss", loss, False)``; the trailing ``False`` marks
        the metric as lower-is-better in LightGBM's ``feval`` convention.
    """
    labels = dataset.get_label()
    loss = log_loss(y_true=labels, y_pred=y_pred)
    return "multi_logloss", loss, False

In [11]:
import lightgbm as lgb

# Number of boosting rounds passed to lgb.train in `objective`.
num_boost_round = 500

# Build the train/test data for the selected season.
select_game_year = [2022]
X_train, y_train, X_test, y_test = make_train_test_data(select_game_year=select_game_year)

# Carve a validation set off the tail of the training rows:
# the first 80% stays as train, the last 20% becomes valid.
split_index = int(len(X_train) * 0.8)
X_train, X_valid = X_train.iloc[:split_index].copy(), X_train.iloc[split_index:].copy()
y_train, y_valid = y_train.iloc[:split_index].copy(), y_train.iloc[split_index:].copy()

def objective(trial):
    """Optuna objective: train a binary LightGBM model and score it on the validation split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid``, ``y_valid``
    and ``num_boost_round``.

    Args:
        trial: Optuna trial. ``learning_rate`` is sampled from [0.01, 0.02];
            every other parameter is fixed.

    Returns:
        Macro-averaged F1 of the rounded validation predictions.

    Side effects:
        Stores ``best_iteration`` (1-based round with the highest validation
        F1) and ``best_valid_f1`` (the F1 at that round) as trial user
        attributes so the per-round history is not thrown away.
    """
    params = {
        "boosting": "gbdt",
        "objective": "binary",
        # Built-in metrics are disabled; only the custom feval functions run.
        "metric": "None",
        "seed": 42,
        "verbose": -1,
        "num_threads": 2,

        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.02),
        "is_unbalance": True,
        "extra_trees": True,
    }

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # Per-round metric history, filled in by the record_evaluation callback.
    evals_result = {}

    model = lgb.train(
        params=params,
        train_set=lgb_train,
        num_boost_round=num_boost_round,
        valid_sets=[lgb_train, lgb_eval],
        valid_names=["train", "valid"],
        feval=[custom_f1, custom_accuracy, custom_multi_logloss],
        callbacks=[lgb.record_evaluation(evals_result)],
    )

    # Keep the best round on the trial instead of computing and discarding it.
    valid_f1_history = np.array(evals_result["valid"]["f1"])
    best_iteration = int(valid_f1_history.argmax()) + 1
    trial.set_user_attr("best_iteration", best_iteration)
    trial.set_user_attr("best_valid_f1", float(valid_f1_history.max()))

    # The trial score deliberately stays as it was: predictions come from
    # model.best_iteration, not from the round picked above on the same
    # validation data.
    # NOTE(review): no early-stopping callback is registered, so
    # model.best_iteration should be unset and predict should fall back to
    # all trained rounds — confirm against the installed LightGBM version.
    y_valid_pred_proba = model.predict(X_valid, num_iteration=model.best_iteration)
    y_valid_pred = np.round(y_valid_pred_proba)

    return f1_score(y_pred=y_valid_pred, y_true=y_valid, average="macro")


In [12]:
import optuna

# Search for the learning rate that maximizes the value returned by `objective`.
# The TPE sampler is given a fixed seed (42).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=200)

[I 2024-05-09 11:38:42,925] A new study created in memory with name: no-name-76100877-3d1e-4a91-af4d-47d10f3b6646
[I 2024-05-09 11:38:45,645] Trial 0 finished with value: 0.5405685600480166 and parameters: {'learning_rate': 0.013745401188473625}. Best is trial 0 with value: 0.5405685600480166.
[I 2024-05-09 11:38:48,194] Trial 1 finished with value: 0.5632570058465068 and parameters: {'learning_rate': 0.01950714306409916}. Best is trial 1 with value: 0.5632570058465068.
[I 2024-05-09 11:38:50,864] Trial 2 finished with value: 0.542344374935904 and parameters: {'learning_rate': 0.01731993941811405}. Best is trial 1 with value: 0.5632570058465068.
[I 2024-05-09 11:38:53,326] Trial 3 finished with value: 0.563766025315427 and parameters: {'learning_rate': 0.015986584841970367}. Best is trial 3 with value: 0.563766025315427.
[I 2024-05-09 11:38:55,862] Trial 4 finished with value: 0.5476859504132231 and parameters: {'learning_rate': 0.011560186404424366}. Best is trial 3 with value: 0.5637

In [13]:
trial = study.best_trial
trial.params

{'learning_rate': 0.015467674362525642}

In [14]:
trial.value

0.586511704043215

In [15]:
import plotly
import nbformat
optuna.visualization.plot_param_importances(study).show()

In [16]:
optuna.visualization.plot_slice(study, params=["learning_rate"])

In [18]:
optuna.visualization.plot_optimization_history(study)