In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import os

OWNDIR = os.getcwd()
PARENTDIR = os.path.dirname(OWNDIR)

# ファイルの読み込み
merged_data = pd.read_csv(PARENTDIR + "/merge_df.csv", low_memory=False)

# ユーザーから日付入力を受け取る
date_str = input("日付を8桁（YYYYMMDD）で入力してください: ")

# ファイルパスを組み立てて読み込む
csv_path = f"{PARENTDIR}/{date_str}_with_frame_type.csv"
real_data = pd.read_csv(csv_path, low_memory=False)

def is_garbled(row):
    for val in row.astype(str):
        try:
            val.encode('utf-8').decode('utf-8')
        except UnicodeDecodeError:
            return True
    return False

merged_data = merged_data[~merged_data.apply(is_garbled, axis=1)]

# rank を数値に変換（済み）
merged_data['rank'] = pd.to_numeric(merged_data['rank'], errors='coerce')

# target: 3着以内なら1、その他は0（明示的に定義し直す）
merged_data['target'] = merged_data['rank'].apply(lambda x: 1 if x <= 3 else 0)

# float形式になっている日付を文字列にしてから日付型に変換
merged_data['date'] = pd.to_datetime(
    merged_data['date'].dropna().astype(int).astype(str), format='%Y%m%d', errors='coerce')


print("test_data 件数:", len(real_data))

# 明示的に特徴量カラムを指定
feature_cols = ['date','race_course_id','race_number','surface','distance','direction',
                'headcount','frame_number','horse_number',
                #'horse_weight',
                #'odds','popular',
                'age',
                'is_senba','is_mesu','is_osu',
                'horse_id',
                'rider_id',
                'tamer_id',
                'weather','ground_status','is_obstacle',
                'frame_type',
                'analy_horse_type',
                'parent_ml_id','parent_ml_ml_id','parent_ml_fml_id','parent_fml_fml_id']

# 開催場所マッピング用の辞書
racecourse_dict = {
    "01": "札幌", "02": "函館", "03": "福島", "04": "新潟", "05": "東京",
    "06": "中山", "07": "中京", "08": "京都", "09": "阪神", "10": "小倉"
}

# 出力用の整形
def extract_course_and_number(row):
    course_code = str(row['race_course_id']).zfill(2)
    race_number = int(row['race_number'])
    course_name = racecourse_dict.get(course_code, f"未知({course_code})")
    return course_name, race_number

surface_list = merged_data['surface'].dropna().unique()
output_all = []

for surf in surface_list:
    for obst in [0, 1]:
        train_data = merged_data[(merged_data['surface'] == surf) & (merged_data['is_obstacle'] == obst)]
        test_data = real_data[(real_data['surface'] == surf) & (real_data['is_obstacle'] == obst)]
        if train_data.empty or test_data.empty:
            continue

        # ラベルエンコーディング
        X_train = train_data[feature_cols].copy()
        X_test = test_data[feature_cols].copy()

        # 数値に変換できる列はすべて数値型に変換（エラーがあればNaNに）
        for col in X_train.columns:
            X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
            X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

        for col in X_train.select_dtypes(include='object').columns:
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col].astype(str))
            X_test[col] = le.transform(X_test[col].astype(str))

        X_train = X_train.fillna(0)
        X_test = X_test.fillna(0)
        y_train = train_data['target']
        y_test = test_data['target'] if 'target' in test_data.columns else None

        # LightGBMで学習・予測
        model = lgb.LGBMClassifier(verbosity=-1)
        model.fit(X_train, y_train)

        # 特徴量重要度の取得
        importance = model.feature_importances_
        feature_names = X_train.columns
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importance
        }).sort_values(by='importance', ascending=False)

        preds = model.predict(X_test)
        probs = model.predict_proba(X_test)[:, 1]

        # test_data に予測結果を追加
        test_data = test_data.copy()
        test_data['pred'] = preds
        test_data['prob'] = probs

        # 上位6頭をレースごとに選出
        top6 = (
            test_data.sort_values(['race_id', 'prob'], ascending=[True, False])
            .groupby('race_id')
            .head(6)
        )

        # レースごとに馬をまとめる（馬番と期待値で）
        for _, group in top6.groupby('race_id'):
            course, number = extract_course_and_number(group.iloc[0])
            group_sorted = group.sort_values('prob', ascending=False)
            horses = group_sorted['horse_number'].tolist()
            probs_list = group_sorted['prob'].tolist()
            row = [course, number]
            for i in range(6):
                row.append(horses[i])
                row.append(round(probs_list[i], 4))
            output_all.append(row)

# 開催場所・レース番号でソート
output_all = sorted(output_all, key=lambda x: (x[0], x[1]))
output_df = pd.DataFrame(output_all, columns=[
    "開催場所", "レース番号",
    "予想1頭目", "予想1頭目期待値",
    "予想2頭目", "予想2頭目期待値",
    "予想3頭目", "予想3頭目期待値",
    "予想4頭目", "予想4頭目期待値",
    "予想5頭目", "予想5頭目期待値",
    "予想6頭目", "予想6頭目期待値"
])
from IPython.display import display
print(date_str)
display(output_df)


test_data 件数: 323
20250531


Unnamed: 0,開催場所,レース番号,予想1頭目,予想1頭目期待値,予想2頭目,予想2頭目期待値,予想3頭目,予想3頭目期待値,予想4頭目,予想4頭目期待値,予想5頭目,予想5頭目期待値,予想6頭目,予想6頭目期待値
0,京都,2,11,0.8745,1,0.8563,8,0.855,16,0.8547,5,0.8194,3,0.7953
1,京都,3,9,0.8638,10,0.8314,5,0.7902,12,0.7523,15,0.5076,2,0.4463
2,京都,4,8,0.7308,14,0.6012,7,0.5782,12,0.5747,11,0.5725,13,0.5223
3,京都,5,18,0.668,14,0.6602,16,0.6277,5,0.6143,11,0.5648,3,0.2763
4,京都,6,10,0.782,12,0.7775,7,0.7018,15,0.6913,5,0.6669,6,0.6468
5,京都,7,4,0.8167,5,0.8125,3,0.7986,1,0.7951,6,0.7008,2,0.6823
6,京都,9,9,0.9085,3,0.9025,8,0.8982,5,0.8932,6,0.8889,7,0.8848
7,京都,10,8,0.8263,11,0.7991,15,0.7672,9,0.7653,5,0.7558,7,0.744
8,京都,11,5,0.6432,11,0.6168,2,0.6081,9,0.5853,8,0.5684,14,0.5576
9,京都,12,10,0.6175,2,0.6072,11,0.5996,7,0.5744,6,0.5619,4,0.526
