In [1]:
### インポート ###

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from datetime import datetime
import time
from selenium.webdriver.common.keys import Keys
import re
import csv
import pandas as pd
from datetime import datetime, timedelta
from dataclasses import dataclass, field
from typing import List, TypeVar, Generic
import json
import os
import logging
import functools
from pathlib import Path
import sys

In [2]:
### 定数 ###

# ログ
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=f"C:\\keiba\\tool\\log\\{datetime.now().strftime('%Y%m%d_app.log')}",
    encoding='utf-8',
    force=True
)
# 設定ファイル読み込み
df_config_racecourse = pd.read_excel("C:\\keiba\\tool\\config.xlsx", sheet_name="racecourse", header=0)
df_config_style = pd.read_excel("C:\\keiba\\tool\\config.xlsx", sheet_name="style", header=0)
df_config_scrape = pd.read_excel("C:\\keiba\\tool\\config.xlsx", sheet_name="scrape", header=None, index_col=0)
df_config_netkeiba = pd.read_excel("C:\\keiba\\tool\\config.xlsx", sheet_name="netkeiba", header=None, index_col=0)
# 対象競馬場とレース
PLACE_MAP = df_config_racecourse.set_index('key')['value'].to_dict()
print(f"競馬場: {PLACE_MAP}")
# 脚質
STYLE_MAP = df_config_style.set_index('key')['value'].to_dict()
print(f"脚質: {STYLE_MAP}")
# レース番号
RACE_NUMBERS = [f"{i:02d}" for i in range(1, 13)]
# スクレイピング
PATH_CHROME_DRIVER = df_config_scrape.loc["PATH_CHROME_DRIVER"].iloc[0]
# netkeiba
LOGIN_URL = df_config_netkeiba.loc["LOGIN_URL"].iloc[0]
LOGIN_ID = df_config_netkeiba.loc["LOGIN_ID"].iloc[0]
LOGIN_PASSWORD = df_config_netkeiba.loc["LOGIN_PASSWORD"].iloc[0]
# 天気
LIST_WEATHER = ["小雨", "小雪", "晴", "雨", "曇", "雪"]
# 馬場
LIST_TRACK_CONDITION = ["重", "不", "良", "稍"]

競馬場: {35: '盛岡', 45: '川崎', 55: '佐賀', 65: '帯広(ば)', 46: '金沢', 54: '高知', 44: '大井', 47: '笠松', 50: '園田', 30: '門別', 36: '水沢', 43: '船橋', 48: '名古屋', 42: '浦和'}
脚質: {1: '逃げ', 2: '先行', 3: '差し', 4: '追込'}


In [3]:
### クラス ###

# 馬データ（Shutuba）
@dataclass
class Horse_Shutuba:
    """出走馬のデータを保持するデータクラス"""
    horse_id: str
    horse_name: str
    jockey_id: str
    jockey_name: str
    popularity: str
    odds: str
    sex_and_age: str
    weight: str
    horse_number: str
    frame_number: str
    position_1_top_pred: str
    position_1_left_pred: str
    position_2_top_pred: str
    position_2_left_pred: str
    position_3_top_pred: str
    position_3_left_pred: str
    position_4_top_pred: str
    position_4_left_pred: str 
    position_1_top_pred_jockey_tendency: str
    position_1_left_pred_jockey_tendency: str
    position_2_top_pred_jockey_tendency: str
    position_2_left_pred_jockey_tendency: str
    position_3_top_pred_jockey_tendency: str
    position_3_left_pred_jockey_tendency: str
    position_4_top_pred_jockey_tendency: str
    position_4_left_pred_jockey_tendency: str
    style_pred: str
    impost: str
    last_3_furlongs_pred: str

# 馬データ（Result）
@dataclass
class Horse_Result:
    """出走馬のデータを保持するデータクラス"""
    finish_rank: str
    frame_number: str
    horse_number: str
    horse_id: str
    horse_name: str
    sex_and_age: str
    impost: str    
    jockey_id: str
    jockey_name: str
    time: str
    diff: str    
    popularity: str
    odds: str
    last_3_furlongs: str
    weight: str

# レースデータ（Shutuba）
T = TypeVar('T')
@dataclass
class Race:
    """レースの基本情報と出走馬リストを保持するデータクラス"""
    race_id: str
#    race_name: str
    race_date: str
    race_time: str
    num_horses: str
    race_number: int
#    weather_name: str
#    track_condition_name: str
    racecourse: str
    ground: str
    distance: str
    direction:  str
    reliability: str
    opinion: str
    rank_1_corner: str
    rank_2_corner: str
    rank_3_corner: str
    rank_4_corner: str    
    horses: List[T] = field(default_factory=list)

In [4]:
### 関数 ###
def log_step(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # メソッド開始のログ
        logging.info(f"[開始] {func.__name__} を実行します。")
        
        try:
            # 本来のメソッド（関数）を実行
            result = func(*args, **kwargs)
            
            # メソッド終了のログ
            logging.info(f"[完了] {func.__name__} が正常に終了しました。")
            return result
            
        except Exception as e:
            # エラー発生時のログ
            logging.error(f"[失敗] {func.__name__} でエラーが発生しました: {e}")
            raise  # エラーをそのまま上に投げる
            
    return wrapper

def is_xpath_present(driver, xpath):
    return len(driver.find_elements(By.XPATH, xpath)) > 0

def extract_style_values(style_str):
    """
    style文字列からtopとleftの数値を抽出する関数
    """
    # 正規表現の解説:
    # -?    : マイナス記号があってもなくても良い
    # [\d\.]+ : 数字またはドットが1回以上続く
    # ()    : このカッコ内の部分を抽出する
    top_match = re.search(r"top:\s*(-?[\d\.]+)%", style_str)
    left_match = re.search(r"left:\s*(-?[\d\.]+)%", style_str)
    # 抽出した文字列をfloatに変換
    top_val = float(top_match.group(1)) if top_match else None
    left_val = float(left_match.group(1)) if left_match else None
    # 戻り値
    return top_val, left_val

@log_step
def login_netkeiba(driver):
    try:
        driver.get(LOGIN_URL)
    except:
        print("タイムアウトしたが処理を継続")
        driver.execute_script("window.stop();")
    driver.find_element(By.NAME, "login_id").send_keys(LOGIN_ID)
    pw_field = driver.find_element(By.NAME, "pswd")
    pw_field.send_keys(LOGIN_PASSWORD)
    pw_field.send_keys(Keys.ENTER)
    time.sleep(3)

@log_step
def get_wether_and_track_condition(condition_data):

    weather = next((k for k in LIST_WEATHER if k in condition_data), "")
    track_condition = next((k for k in LIST_TRACK_CONDITION if k in condition_data), "")

    return weather, track_condition

@log_step
def get_data_from_database(driver, list_race_id):

    list_horse_data = []
    for race_id in list_race_id:
        # データベースの当該レースのページに遷移
        url_race = f"https://db.netkeiba.com/race/{race_id}/"
        try:
            driver.get(url_race)
        except:
            print("タイムアウトしたが処理を継続")
            driver.execute_script("window.stop();")
        # レース単位のデータを取得
        condition_data = driver.find_element(By.XPATH, "//*[@id='main']/div/div/div/diary_snap/div/div/dl/dd/p/diary_snap_cut/span").text
        weather_name, track_condition_name = get_wether_and_track_condition(condition_data)
        # 馬単位のデータを取得
        for i in range(19):
            if is_xpath_present(driver, f"//*[@id='contents_liquid']/table/tbody/tr[{str(i + 2)}]/td[3]"):
                horse_number = driver.find_element(By.XPATH, f"//*[@id='contents_liquid']/table/tbody/tr[{str(i + 2)}]/td[3]").text
                time_index = driver.find_element(By.XPATH, f"//*[@id='contents_liquid']/table/tbody/tr[{str(i + 2)}]/td[10]").text
                position = driver.find_element(By.XPATH, f"//*[@id='contents_liquid']/table/tbody/tr[{str(i + 2)}]/td[11]").text
                position_parts = position.split('-')
                position_parts = (position_parts + [""] * 4)[:4]
                position_1, position_2, position_3, position_4 = position_parts
                horse_data = {
                    "race_id": race_id,
                    "horse_number": horse_number,
                    "time_index": time_index,
                    "position_1": position_1,
                    "position_2": position_2,
                    "position_3": position_3,
                    "position_4": position_4,
                    "weather_name": weather_name,
                    "track_condition_name": track_condition_name
                }
                list_horse_data.append(horse_data)
            else:
                break

    return pd.DataFrame(list_horse_data)

In [5]:
#########################
# historyファイルを開く
#########################

df_history = pd.read_csv(f"C:\\keiba\\tool\\history\\history.csv", encoding="cp932")

  df_history = pd.read_csv(f"C:\\keiba\\tool\\history\\history.csv", encoding="cp932")


In [7]:
###################################################
# netkeibaから最新データを取得してhistoryを更新する
###################################################

# 更新対象のレコードを切り出す
last_idx = df_history['time_index'].last_valid_index()
df_to_update = df_history.loc[last_idx + 1:].copy()
# レースID
list_race_id = df_to_update['race_id'].unique().tolist()
# 更新すべきrace_idのリストが空でない場合はnetkeibaからデータ取得して更新する
if list_race_id:
    # スクレイピング準備
    service = Service(PATH_CHROME_DRIVER)
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')
    driver = webdriver.Chrome(service=service, options=options)
    # タイムアウト設定
    driver.set_page_load_timeout(30)
    # ログイン
    login_netkeiba(driver)
    # データベースからデータ取得
    df_new_data = get_data_from_database(driver, list_race_id)
    # スクレイピング終了
    driver.quit()
    # 型変換
    df_history["race_id"] = df_history["race_id"].astype(str)
    df_history["horse_number"] = df_history["horse_number"].astype(str)
    df_new_data["race_id"] = df_new_data["race_id"].astype(str)
    df_new_data["horse_number"] = df_new_data["horse_number"].astype(str)
    df_new_data['time_index'] = pd.to_numeric(df_new_data['time_index'], errors='coerce')
    # タイム指数更新
    df_history.set_index(['race_id', 'horse_number'], inplace=True)
    df_new_data.set_index(['race_id', 'horse_number'], inplace=True)
    df_history.update(df_new_data[['time_index', 'position_1', 'position_2', 'position_3', 'position_4', 'weather_name', 'track_condition_name']])
    df_history.reset_index(inplace=True)
    # CSV出力
    df_history.to_csv("C:\\keiba\\tool\\history\\history.csv", index=False, encoding="cp932")

  df_history.update(df_new_data[['time_index', 'position_1', 'position_2', 'position_3', 'position_4', 'weather_name', 'track_condition_name']])
  df_history.update(df_new_data[['time_index', 'position_1', 'position_2', 'position_3', 'position_4', 'weather_name', 'track_condition_name']])
  df_history.update(df_new_data[['time_index', 'position_1', 'position_2', 'position_3', 'position_4', 'weather_name', 'track_condition_name']])
  df_history.update(df_new_data[['time_index', 'position_1', 'position_2', 'position_3', 'position_4', 'weather_name', 'track_condition_name']])
  df_history.update(df_new_data[['time_index', 'position_1', 'position_2', 'position_3', 'position_4', 'weather_name', 'track_condition_name']])
  df_history.update(df_new_data[['time_index', 'position_1', 'position_2', 'position_3', 'position_4', 'weather_name', 'track_condition_name']])
  df_history.update(df_new_data[['time_index', 'position_1', 'position_2', 'position_3', 'position_4', 'weather_name', 'track_cond