In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import os

# The CSV inputs are read from the parent of the current working directory.
OWNDIR = os.getcwd()
PARENTDIR = os.path.dirname(OWNDIR)

# Load the historical data set (used further down as the training data).
merged_data = pd.read_csv(PARENTDIR + "/merge_df.csv", low_memory=False)

# Ask the user for the date that selects which per-day CSV to load.
# Surrounding whitespace is stripped and the value is checked to be exactly
# eight ASCII digits before it is embedded in a file path; a typo is reported
# as an explicit ValueError instead of an opaque FileNotFoundError.
date_str = input("日付を8桁（YYYYMMDD）で入力してください: ").strip()
if len(date_str) != 8 or any(ch not in "0123456789" for ch in date_str):
    raise ValueError(f"日付は8桁（YYYYMMDD）で入力してください: {date_str!r}")

# Build the path of the per-day file and load it.
csv_path = f"{PARENTDIR}/{date_str}_with_frame_type.csv"
real_data = pd.read_csv(csv_path, low_memory=False)
# Drop rows that contain garbled text (text that cannot be encoded/decoded as UTF-8)
import chardet

def is_garbled(row):
    """Return True when any value in *row* cannot be represented as UTF-8.

    Each value is converted with ``str()`` and encoded as UTF-8. A string that
    contains lone surrogate code points makes ``encode`` raise
    ``UnicodeEncodeError``; such a row is reported as garbled.

    Args:
        row: An iterable of cell values, e.g. a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        bool: True if at least one value fails to encode, otherwise False.
    """
    for val in row:
        try:
            # Decoding bytes that were just produced by encode() can never
            # fail, so the only possible failure is on the encode side.
            str(val).encode('utf-8')
        except UnicodeError:
            # UnicodeError is the common base of UnicodeEncodeError and
            # UnicodeDecodeError.
            return True
    return False

# Keep only the rows that is_garbled does not flag. The explicit copy makes
# the column assignments below write to an independent frame rather than to
# a slice of the original one.
merged_data = merged_data[~merged_data.apply(is_garbled, axis=1)].copy()

# Convert rank to a number; values that cannot be parsed become NaN.
merged_data['rank'] = pd.to_numeric(merged_data['rank'], errors='coerce')

# target: 1 when the rank is 3 or better, otherwise 0. A NaN rank compares
# as False and therefore yields 0.
merged_data['target'] = (merged_data['rank'] <= 3).astype(int)


def _parse_yyyymmdd(dates):
    """Parse a column of numeric YYYYMMDD values into datetimes.

    Values that are missing or not numeric are dropped before parsing; when
    the returned Series is assigned back to the frame, index alignment leaves
    those rows as NaT.
    """
    numeric = pd.to_numeric(dates, errors='coerce').dropna()
    return pd.to_datetime(
        numeric.astype(int).astype(str), format='%Y%m%d', errors='coerce')


# The date is stored as a float-like YYYYMMDD number; convert it to datetime.
merged_data['date'] = _parse_yyyymmdd(merged_data['date'])

# 'date' is one of the model features, so the prediction data must go through
# the same conversion. Otherwise the training column holds datetimes while the
# prediction column holds raw YYYYMMDD numbers, and the two end up on
# completely different numeric scales once converted for the model.
# NOTE(review): assumes real_data stores dates in the same YYYYMMDD numeric
# form as merged_data - confirm against the per-day CSV.
real_data['date'] = _parse_yyyymmdd(real_data['date'])


print("test_data 件数:", len(real_data))

# Explicit list of feature columns.
feature_cols = ['date','race_course_id','race_number','surface','distance','direction',
                'headcount','frame_number','horse_number',
                #'horse_weight',
                #'odds','popular',
                'age',
                'is_senba','is_mesu','is_osu','horse_id','rider_id','tamer_id','weather','ground_status','is_obstacle',
                'frame_type',
                'analy_horse_type',
                'parent_ml_id','parent_ml_ml_id','parent_ml_fml_id','parent_fml_fml_id']

# Lookup table from two-digit racecourse code to racecourse name.
racecourse_dict = {
    "01": "札幌", "02": "函館", "03": "福島", "04": "新潟", "05": "東京",
    "06": "中山", "07": "中京", "08": "京都", "09": "阪神", "10": "小倉"
}


def _course_code(value):
    """Return *value* as a zero-padded two-character racecourse code.

    Whole-number values are normalised through ``int`` first, so a float such
    as ``5.0`` becomes ``"05"`` instead of ``"5.0"``. Anything that is not a
    whole number is stringified and zero-padded unchanged.
    """
    try:
        as_float = float(value)
    except (TypeError, ValueError):
        return str(value).zfill(2)
    if as_float.is_integer():
        return str(int(as_float)).zfill(2)
    return str(value).zfill(2)


# Formatting helper for the output table.
def extract_course_and_number(row):
    """Return ``(course_name, race_number)`` for one row.

    Args:
        row: A mapping-like object with ``'race_course_id'`` and
            ``'race_number'`` entries (for example a DataFrame row).

    Returns:
        tuple: The racecourse name looked up in ``racecourse_dict`` (or
        ``"未知(<code>)"`` when the code is not in the table) and the race
        number as an ``int``.

    Raises:
        ValueError: If ``row['race_number']`` cannot be converted by ``int``.
    """
    course_code = _course_code(row['race_course_id'])
    race_number = int(row['race_number'])
    course_name = racecourse_dict.get(course_code, f"未知({course_code})")
    return course_name, race_number

from IPython.display import display

# Selection rule for the output table: horses whose predicted probability is
# at least PROB_THRESHOLD, at most MAX_PICKS_PER_RACE per race.
PROB_THRESHOLD = 0.6
MAX_PICKS_PER_RACE = 7


def _encode_features(train_frame, test_frame, columns):
    """Build numeric feature matrices for training and prediction.

    A column is converted with ``pd.to_numeric`` only when that conversion
    loses no non-missing value in either frame. Any other column is treated as
    categorical and label-encoded on its string form. The encoder is fitted on
    the union of training and prediction values so that labels appearing only
    in the prediction data do not raise. Remaining missing values are filled
    with 0.

    Returns:
        tuple: ``(X_train, X_test)`` DataFrames restricted to *columns*.
    """
    encoded_train = train_frame[columns].copy()
    encoded_test = test_frame[columns].copy()

    for col in columns:
        raw_train, raw_test = encoded_train[col], encoded_test[col]
        num_train = pd.to_numeric(raw_train, errors='coerce')
        num_test = pd.to_numeric(raw_test, errors='coerce')

        # Coercion "loses" a value when a non-missing entry turns into NaN.
        loses_values = (
            (num_train.isna() & raw_train.notna()).any()
            or (num_test.isna() & raw_test.notna()).any()
        )
        if not loses_values:
            encoded_train[col], encoded_test[col] = num_train, num_test
            continue

        # Not cleanly numeric: coercing would wipe the column out, so encode
        # the values as categories instead.
        text_train = raw_train.astype(str)
        text_test = raw_test.astype(str)
        encoder = LabelEncoder()
        encoder.fit(pd.concat([text_train, text_test], ignore_index=True))
        encoded_train[col] = encoder.transform(text_train)
        encoded_test[col] = encoder.transform(text_test)

    return encoded_train.fillna(0), encoded_test.fillna(0)


surface_list = merged_data['surface'].dropna().unique()
output_all = []

# One model is trained per (surface, is_obstacle) combination.
for surf in surface_list:
    for obst in [0, 1]:
        train_data = merged_data[(merged_data['surface'] == surf) & (merged_data['is_obstacle'] == obst)]
        test_data = real_data[(real_data['surface'] == surf) & (real_data['is_obstacle'] == obst)]
        if train_data.empty or test_data.empty:
            continue

        # Numeric conversion / label encoding of the feature columns.
        X_train, X_test = _encode_features(train_data, test_data, feature_cols)
        y_train = train_data['target']
        y_test = test_data['target'] if 'target' in test_data.columns else None

        # Train LightGBM and predict.
        model = lgb.LGBMClassifier(verbosity=-1)
        model.fit(X_train, y_train)

        # Feature importances, most important first.
        importance = model.feature_importances_
        feature_names = X_train.columns
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importance
        }).sort_values(by='importance', ascending=False)

        preds = model.predict(X_test)
        probs = model.predict_proba(X_test)[:, 1]

        # Attach the predictions to a copy of the prediction rows.
        test_data = test_data.copy()
        test_data['pred'] = preds
        test_data['prob'] = probs

        # Per race, keep the highest-probability horses at or above the
        # threshold, capped at MAX_PICKS_PER_RACE.
        top_n = (
            test_data[test_data['prob'] >= PROB_THRESHOLD]
            .sort_values(['race_id', 'prob'], ascending=[True, False])
            .groupby('race_id')
            .head(MAX_PICKS_PER_RACE)
        )

        # One output row per race: course, race number, then alternating
        # horse number / probability pairs.
        for _, group in top_n.groupby('race_id'):
            course, number = extract_course_and_number(group.iloc[0])
            group_sorted = group.sort_values('prob', ascending=False)
            horses = group_sorted['horse_number'].astype('Int64').tolist()  # Int64 tolerates missing values
            probs_list = group_sorted['prob'].tolist()
            row = [course, number]
            for horse, prob in zip(horses, probs_list):
                row.append(horse)
                row.append(round(prob, 4))
            # Pad with blanks so that every row has the same number of cells.
            row.extend([""] * (2 * (MAX_PICKS_PER_RACE - len(horses))))
            output_all.append(row)

# Build and show the table once, after every combination has been processed.
# Doing this inside the loop re-displayed the cumulative table after each
# (surface, is_obstacle) pair.
output_columns = ["開催場所", "レース番号"]
for pick in range(1, MAX_PICKS_PER_RACE + 1):
    output_columns.append(f"予想{pick}頭目")
    output_columns.append(f"予想{pick}頭目期待値")
output_df = pd.DataFrame(output_all, columns=output_columns)

# Display the result.
print(date_str,"のレースの予測をを表示します")
display(output_df)

test_data 件数: 476
20250525 のレースの予測をを表示します


Unnamed: 0,開催場所,レース番号,予想1頭目,予想1頭目期待値,予想2頭目,予想2頭目期待値,予想3頭目,予想3頭目期待値,予想4頭目,予想4頭目期待値,予想5頭目,予想5頭目期待値,予想6頭目,予想6頭目期待値,予想7頭目,予想7頭目期待値
0,新潟,1,10,0.7391,1.0,0.7275,15.0,0.6674,4.0,0.6478,7.0,0.6465,16.0,0.6374,2.0,0.6279
1,新潟,4,12,0.7817,11.0,0.7748,9.0,0.7708,5.0,0.7306,13.0,0.7281,8.0,0.7107,3.0,0.6956
2,新潟,5,14,0.7303,10.0,0.7173,16.0,0.6976,6.0,0.6829,2.0,0.6524,3.0,0.6343,12.0,0.6258
3,新潟,6,15,0.6862,3.0,0.6362,5.0,0.6234,1.0,0.6184,2.0,0.6013,,,,
4,新潟,8,13,0.62,,,,,,,,,,,,
5,新潟,9,16,0.6317,,,,,,,,,,,,
6,新潟,11,13,0.6404,15.0,0.6093,12.0,0.6086,,,,,,,,
7,東京,7,8,0.7725,6.0,0.7618,4.0,0.7577,7.0,0.7292,1.0,0.706,3.0,0.6884,5.0,0.6747
8,東京,9,6,0.775,8.0,0.7416,5.0,0.6597,9.0,0.617,,,,,,
9,東京,10,5,0.7237,4.0,0.6994,3.0,0.6712,7.0,0.6666,1.0,0.6616,8.0,0.6535,,


20250525 のレースの予測をを表示します


Unnamed: 0,開催場所,レース番号,予想1頭目,予想1頭目期待値,予想2頭目,予想2頭目期待値,予想3頭目,予想3頭目期待値,予想4頭目,予想4頭目期待値,予想5頭目,予想5頭目期待値,予想6頭目,予想6頭目期待値,予想7頭目,予想7頭目期待値
0,新潟,1,10,0.7391,1.0,0.7275,15.0,0.6674,4.0,0.6478,7.0,0.6465,16.0,0.6374,2.0,0.6279
1,新潟,4,12,0.7817,11.0,0.7748,9.0,0.7708,5.0,0.7306,13.0,0.7281,8.0,0.7107,3.0,0.6956
2,新潟,5,14,0.7303,10.0,0.7173,16.0,0.6976,6.0,0.6829,2.0,0.6524,3.0,0.6343,12.0,0.6258
3,新潟,6,15,0.6862,3.0,0.6362,5.0,0.6234,1.0,0.6184,2.0,0.6013,,,,
4,新潟,8,13,0.62,,,,,,,,,,,,
5,新潟,9,16,0.6317,,,,,,,,,,,,
6,新潟,11,13,0.6404,15.0,0.6093,12.0,0.6086,,,,,,,,
7,東京,7,8,0.7725,6.0,0.7618,4.0,0.7577,7.0,0.7292,1.0,0.706,3.0,0.6884,5.0,0.6747
8,東京,9,6,0.775,8.0,0.7416,5.0,0.6597,9.0,0.617,,,,,,
9,東京,10,5,0.7237,4.0,0.6994,3.0,0.6712,7.0,0.6666,1.0,0.6616,8.0,0.6535,,
