### netkeiba.comからのスクレイピング

In [2]:
import pandas as pd
import time

In [3]:
# Scrape race result tables from netkeiba
def scrape_race_results(race_id_list, pre_race_results=None):
    """Fetch the first HTML table of each race page on db.netkeiba.com.

    Args:
        race_id_list: Iterable of race ID strings. Each ID is appended to
            'https://db.netkeiba.com/race/' to form the page URL.
        pre_race_results: Optional dict mapping race ID to an
            already-scraped result. IDs found here are not fetched again.
            The dict itself is never modified; ``None`` (the default) means
            "start from an empty result set".

    Returns:
        A new dict mapping race ID to the first table that
        ``pd.read_html`` returned for that page, merged with
        ``pre_race_results``. If scraping stops early (see below), the
        dict holds whatever was collected up to that point.
    """
    # Work on a copy so the caller's dict is left untouched.
    race_results = {} if pre_race_results is None else pre_race_results.copy()
    for race_id in race_id_list:
        if race_id in race_results:
            continue
        try:
            # Pause one second before every request (presumably to avoid
            # hammering the server).
            time.sleep(1)
            url = 'https://db.netkeiba.com/race/' + race_id
            race_results[race_id] = pd.read_html(url)[0]
        except IndexError:
            # No table at index 0 for this ID: skip it and keep going.
            continue
        except Exception as e:
            # Any other error: report it and stop, keeping partial results.
            print(e)
            break
        except BaseException:
            # E.g. KeyboardInterrupt (also during the sleep): stop quietly
            # and still return what has been collected so far.
            break
    return race_results

In [4]:
# Build the list of race IDs: "2019" followed by four zero-padded two-digit
# fields (place 1-10, kai 1-5, day 1-12, r 1-12), in nested-loop order.
race_id_list = []
for place in range(1, 11):
    for kai in range(1, 6):
        for day in range(1, 13):
            for r in range(1, 13):
                race_id = f"2019{place:02d}{kai:02d}{day:02d}{r:02d}"
                race_id_list.append(race_id)

In [5]:
# Scrape the races, label each table's rows with its race ID, and save the
# concatenated result to disk.
data = scrape_race_results(race_id_list)
for key, table in data.items():
    # Every row of a race's table gets that race's ID as its index label.
    table.index = [key] * len(table)
results = pd.concat(list(data.values()), sort=False)
results.to_pickle('results.pickle')

### データ整形・前処理

In [6]:
# Reload the scraped results from 'results.pickle' (the file written by the
# scraping step earlier in this file).
# NOTE(review): read_pickle unpickles whatever the file contains; only load
# pickle files from a trusted source.
results = pd.read_pickle('results.pickle')

In [68]:
def preprocessing(results):
    """Clean the raw scraped race results and return a new DataFrame.

    The input frame is not modified. Steps applied, in order:

    * drop rows whose '着順' contains any non-digit character, then cast
      the column to int;
    * split '性齢' into '性' (its first character) and '年齢' (the
      remaining characters, cast to int);
    * split '馬体重' at '(' into '体重' (the part before it) and
      '体重変化' (the part after it without the trailing character),
      both cast to int;
    * cast '単勝' to float;
    * drop the columns 'タイム', '着差', '調教師', '性齢', '馬体重'.

    Args:
        results: DataFrame containing at least the columns named above.

    Returns:
        A new DataFrame with the cleaned and derived columns.

    Any conversion error raised by ``astype`` on the remaining rows
    propagates to the caller.
    """
    # Remove rows whose '着順' contains non-digit characters. The explicit
    # copy makes the filtered frame independent, so the column assignments
    # below neither touch `results` nor raise SettingWithCopyWarning.
    has_non_digit = results['着順'].astype(str).str.contains(r'\D')
    df = results[~has_non_digit].copy()
    df['着順'] = df['着順'].astype(int)

    # Split '性齢' into sex ('性') and age ('年齢').
    sex_age = df['性齢'].astype(str)
    df['性'] = sex_age.str[0]
    df['年齢'] = sex_age.str[1:].astype(int)

    # Split '馬体重' into weight ('体重') and weight change ('体重変化').
    # The split is done once; str[:-1] strips the closing ')'.
    weight_parts = df['馬体重'].str.split('(', expand=True)
    df['体重'] = weight_parts[0].astype(int)
    df['体重変化'] = weight_parts[1].str[:-1].astype(int)

    # Convert '単勝' to float.
    df['単勝'] = df['単勝'].astype(float)

    # Drop the columns that are no longer needed.
    df = df.drop(columns=['タイム', '着差', '調教師', '性齢', '馬体重'])

    return df

In [73]:
# Run the preprocessing on the loaded results; the bare name on the last
# line makes the notebook display the resulting DataFrame.
test = preprocessing(results)
test

Unnamed: 0,着順,枠番,馬番,馬名,斤量,騎手,単勝,人気,性,年齢,体重,体重変化
201901010101,1,1,1,ゴルコンダ,54.0,ルメール,1.4,1.0,牡,2,518,-16
201901010101,2,3,3,プントファイヤー,54.0,岩田康誠,3.5,2.0,牡,2,496,-8
201901010101,3,4,4,ラグリマスネグラス,51.0,団野大成,46.6,6.0,牡,2,546,6
201901010101,4,8,9,キタノコドウ,51.0,菅原明良,56.8,7.0,牡,2,458,-8
201901010101,5,5,5,ネモフィラブルー,54.0,川島信二,140.3,9.0,牡,2,436,0
...,...,...,...,...,...,...,...,...,...,...,...,...
201910021212,12,6,11,スリープレッピー,56.0,森裕太朗,120.3,15.0,セ,6,458,8
201910021212,13,1,1,バリオラージュ,54.0,斎藤新,7.5,4.0,牡,5,460,2
201910021212,14,2,3,サンライズアミーゴ,54.0,亀田温心,99.2,12.0,牡,4,478,14
201910021212,15,6,12,トロハ,52.0,武豊,17.5,8.0,牝,3,468,2
