# インポート

In [1]:
import create_population
import preprocessing
from feature_engineering import FeatureCreator
from feature_engineering_prediction import PredictionFeatureCreator
from train import Trainer
import prediction
from evaluation import Evaluator
%load_ext autoreload

In [13]:
# Manually reload the already-imported modules right now (no mode argument is
# given, so this is a one-shot reload rather than enabling automatic reloading).
%autoreload

# 学習母集団の作成

In [5]:
# Build the training population for the date window given by from_ / to_.
# NOTE(review): whether both end dates are inclusive is decided inside
# create_population.create — confirm there.
population = create_population.create(from_="2018-01-01", to_="2023-12-31")

# データ加工

In [3]:
# Preprocess the race results table.
results_preprocessed = preprocessing.process_results()

In [7]:
# Duplicate check: count rows that repeat an earlier (race_id, horse_id) pair.
results_preprocessed.duplicated(subset=["race_id", "horse_id"]).sum()

0

In [4]:
# Missing-value check: number of null entries per column.
results_preprocessed.isnull().sum()

race_id        0
horse_id       0
jockey_id      0
trainer_id     0
owner_id       0
rank           0
umaban         0
wakuban        0
tansho_odds    0
popularity     0
impost         0
sex            0
age            0
weight         0
weight_diff    0
n_horses       0
dtype: int64

In [2]:
# Process the horses' past-performance table.
horse_results_preprocessed = preprocessing.process_horse_results()

In [3]:
# Missing-value check: number of null entries per column.
horse_results_preprocessed.isnull().sum()

horse_id             0
date                 0
rank                 0
prize                0
rank_diff          735
weather            616
race_type            0
course_len           0
ground_state         0
race_class      358211
n_horses            10
time               385
win                  0
rentai               0
show                 0
place           395008
dtype: int64

In [4]:
# Duplicate check: count rows that repeat an earlier (horse_id, date) pair.
horse_results_preprocessed.duplicated(subset=["horse_id", "date"]).sum()

0

In [2]:
# Preprocess the race information table.
race_info_preprocessed = preprocessing.process_race_info()

In [9]:
# Preprocess the jockey leading (rankings) table.
jockey_leading_preprocessed = preprocessing.process_jockey_leading()

In [10]:
# Preprocess the trainer leading (rankings) table.
trainer_leading_preprocessed = preprocessing.process_trainer_leading()

In [32]:
# Preprocess the pedigree table.
peds_preprocessed = preprocessing.process_peds()

In [33]:
# Preprocess the sire leading (rankings) table.
sire_leading_preprocessed = preprocessing.process_sire_leading()

In [2]:
# Preprocess the payout (return) tables.
return_tables_preprocessed = preprocessing.process_return_tables()

# 特徴量作成

In [5]:
# Create the feature table with FeatureCreator; each aggregation step reports
# its progress in the cell output.
fc = FeatureCreator()
features = fc.create_features()

agg_horse_n_races_relative:   0%|          | 0/5 [00:00<?, ?it/s]

running agg_interval()...
running agg_jockey()...
running agg_trainer()...


agg_horse_per_course_len:   0%|          | 0/6 [00:00<?, ?it/s]

agg_horse_per_ground_state_race_type:   0%|          | 0/6 [00:00<?, ?it/s]

agg_horse_per_race_class:   0%|          | 0/6 [00:00<?, ?it/s]

agg_horse_per_race_type:   0%|          | 0/6 [00:00<?, ?it/s]

running agg_sire()...
running cross_feature()...
merging all features...


In [6]:
# Duplicate check: count feature rows that repeat an earlier (race_id, horse_id) pair.
features.duplicated(subset=["race_id", "horse_id"]).sum()

0

# 学習

In [7]:
# Train with the default Trainer configuration and keep the returned
# evaluation frame. The two dates mark where the validation and test periods start.
# NOTE(review): presumably rows dated before valid_start_date form the training
# split — confirm in train.Trainer.
trainer = Trainer()
evaluation_df = trainer.run(
    valid_start_date="2022-10-01",
    test_start_date="2023-01-01"
)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.21432
[200]	valid_0's binary_logloss: 0.205778
[300]	valid_0's binary_logloss: 0.20358
[400]	valid_0's binary_logloss: 0.202893
[500]	valid_0's binary_logloss: 0.202671
[600]	valid_0's binary_logloss: 0.202533
[700]	valid_0's binary_logloss: 0.20245
[800]	valid_0's binary_logloss: 0.202412
[900]	valid_0's binary_logloss: 0.202424
Early stopping, best iteration is:
[861]	valid_0's binary_logloss: 0.202384
-------------------- result --------------------
test_df's binary_logloss: 0.20518699585201705


In [8]:
# Variant with odds and popularity removed from the features.
# The importance / model / evaluation outputs get their own file names,
# presumably so they do not overwrite the default run's outputs — confirm in train.Trainer.
# NOTE(review): this rebinds `trainer` and `evaluation_df` from the default run above.
trainer = Trainer(config_filepath="config_odds_removed.yaml")
evaluation_df = trainer.run(
    valid_start_date="2022-10-01",
    test_start_date="2023-01-01",
    importance_filename="importance_odds_removed",
    model_filename="model_odds_removed.pkl",
    evaluation_filename="evaluation_odds_removed.csv"
)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.233648
[200]	valid_0's binary_logloss: 0.226911
[300]	valid_0's binary_logloss: 0.224526
[400]	valid_0's binary_logloss: 0.223321
[500]	valid_0's binary_logloss: 0.222674
[600]	valid_0's binary_logloss: 0.222304
[700]	valid_0's binary_logloss: 0.221998
[800]	valid_0's binary_logloss: 0.221903
[900]	valid_0's binary_logloss: 0.221755
[1000]	valid_0's binary_logloss: 0.221588
[1100]	valid_0's binary_logloss: 0.221476
[1200]	valid_0's binary_logloss: 0.221327
[1300]	valid_0's binary_logloss: 0.221258
[1400]	valid_0's binary_logloss: 0.221205
[1500]	valid_0's binary_logloss: 0.221191
[1600]	valid_0's binary_logloss: 0.221169
[1700]	valid_0's binary_logloss: 0.221107
[1800]	valid_0's binary_logloss: 0.221077
[1900]	valid_0's binary_logloss: 0.221051
[2000]	valid_0's binary_logloss: 0.22101
[2100]	valid_0's binary_logloss: 0.221063
Early stopping, best iteration is:
[2024]	valid_0's binary_logloss

# 精度評価

In [9]:
# Summarize the box-bet results of the default model and display the summary.
# exp_name shows up as the suffix of the hitrate_* / returnrate_* columns.
evaluator = Evaluator()
summary_df = evaluator.summarize_box_exp(exp_name="v3_0_7")
summary_df

Unnamed: 0,topn,bet_type,hitrate_v3_0_7,returnrate_v3_0_7
0,1,単勝,0.33912,0.827315
1,1,複勝,0.64294,0.854369
0,2,ワイド,0.302951,0.787297
1,2,単勝,0.530671,0.811039
2,2,複勝,0.847222,0.841421
3,2,馬単,0.148148,0.742332
4,2,馬連,0.148148,0.770718
0,3,ワイド,0.5489,0.811246
1,3,三連単,0.08941,0.700516
2,3,三連複,0.08941,0.799624


In [10]:
# Variant with odds and popularity removed from the features.
# NOTE(review): exp_name is the same "v3_0_7" as the default-model summary, so
# both summaries carry identical column names — consider a distinct exp_name if
# the two are ever compared side by side or saved under that name.
evaluator = Evaluator(evaluation_filename="evaluation_odds_removed.csv")
summary_df = evaluator.summarize_box_exp(exp_name="v3_0_7")
summary_df

Unnamed: 0,topn,bet_type,hitrate_v3_0_7,returnrate_v3_0_7
0,1,単勝,0.28588,0.795255
1,1,複勝,0.591435,0.834462
0,2,ワイド,0.262442,0.766435
1,2,単勝,0.463542,0.767549
2,2,複勝,0.812789,0.823655
3,2,馬単,0.119502,0.732856
4,2,馬連,0.119502,0.75463
0,3,ワイド,0.485532,0.77202
1,3,三連単,0.069155,0.652011
2,3,三連複,0.069155,0.727286


# 予測

## 事前準備
**当日出走馬が確定した時点**で実行できる

In [16]:
# Preprocess the past-performance table of the horses entered on race day.
# NOTE(review): input and output file names are identical; presumably they are
# resolved against different directories inside preprocessing — confirm, otherwise
# the input would be overwritten.
# NOTE(review): this rebinds `horse_results_preprocessed`, which held the
# training-data result earlier in the notebook.
horse_results_preprocessed = preprocessing.process_horse_results(
    input_filename="horse_results_prediction.csv",
    output_filename="horse_results_prediction.csv"
)

In [11]:
# Preprocess the pedigree table of the horses entered on race day.
# NOTE(review): input and output file names are identical; presumably they are
# resolved against different directories inside preprocessing — confirm.
# NOTE(review): this rebinds `peds_preprocessed`, which held the training-data
# result earlier in the notebook.
peds_preprocessed = preprocessing.process_peds(
    input_filename="peds_prediction.csv",
    output_filename="peds_prediction.csv"
)

In [16]:
# Manually reload the already-imported modules right now before building the
# prediction features (one-shot reload; no automatic mode is set here).
%autoreload

In [17]:
pfc = PredictionFeatureCreator()
# Aggregations over past results can be computed ahead of time.
# NOTE(review): the call order below is kept as written; the later steps
# presumably depend on the base log built by create_baselog() — confirm.
pfc.create_baselog()
pfc.agg_horse_n_races()
pfc.agg_horse_n_races_relative()
pfc.agg_interval()

agg_horse_n_races_relative:   0%|          | 0/5 [00:00<?, ?it/s]

running agg_interval()...


## 当日の予測処理
レースの出走直前に実行する

In [18]:
# Update the features for the race to predict.
# NOTE(review): this rebinds `features`, which held the training feature table
# earlier in the notebook.
features = pfc.create_features(
    race_id="202405030101",  # id of the race to predict
    skip_agg_horse=True  # can be skipped when the horse aggregations were already run in advance
)

fetching shubuta page html...


  df = pd.read_html(html)[0]


agg_horse_per_course_len:   0%|          | 0/6 [00:00<?, ?it/s]

agg_horse_per_ground_state_race_type:   0%|          | 0/6 [00:00<?, ?it/s]

agg_horse_per_race_class:   0%|          | 0/6 [00:00<?, ?it/s]

agg_horse_per_race_type:   0%|          | 0/6 [00:00<?, ?it/s]

running agg_jockey()...
running agg_trainer()...
running agg_sire()...
running cross_feature()...
merging all features...


In [19]:
# Run the prediction on the features built above, using predict()'s default
# model and configuration.
prediction.predict(features)

Unnamed: 0,race_id,umaban,tansho_odds,popularity,pred
0,202405030101,1,3.1,1,0.305443
3,202405030101,4,3.5,2,0.227912
1,202405030101,2,5.1,3,0.154929
6,202405030101,7,6.4,4,0.123869
8,202405030101,9,7.5,5,0.110189
4,202405030101,5,10.0,6,0.085866
9,202405030101,10,19.9,7,0.04738
2,202405030101,3,121.9,8,0.005812
5,202405030101,6,146.9,9,0.005297
7,202405030101,8,163.3,10,0.004196


In [20]:
# Prediction with the odds-removed model: pass the model file and the config
# file that were used for the odds-removed training run.
prediction.predict(
    features,
    model_filename="model_odds_removed.pkl",
    config_filepath="config_odds_removed.yaml"
)

Unnamed: 0,race_id,umaban,tansho_odds,popularity,pred
8,202405030101,9,7.5,5,0.116449
3,202405030101,4,3.5,2,0.080595
4,202405030101,5,10.0,6,0.068667
0,202405030101,1,3.1,1,0.067486
2,202405030101,3,121.9,8,0.056033
1,202405030101,2,5.1,3,0.054856
5,202405030101,6,146.9,9,0.047228
6,202405030101,7,6.4,4,0.044341
9,202405030101,10,19.9,7,0.037462
7,202405030101,8,163.3,10,0.037417
