In [1]:
import requests
import pandas as pd
import numpy as np
import re
import time
import math
from datetime import datetime, timedelta
from bs4 import BeautifulSoup

In [2]:
df_race = pd.read_csv('race_data.csv')
df_race

Unnamed: 0,race_id,date,place,track,length,direction,weather,condition,rank,horse_num,horse_id,jockey_id,popularity,weight
0,202002020410,2020-07-12,函館,芝,2600,右,曇,良,1,6,2017101695,1015,10.0,
1,202002020410,2020-07-12,函館,芝,2600,右,曇,良,2,8,2015100243,1091,4.0,
2,202002020410,2020-07-12,函館,芝,2600,右,曇,良,3,11,2016101207,5203,5.0,
3,202002020410,2020-07-12,函館,芝,2600,右,曇,良,4,13,2016104887,1032,2.0,
4,202002020410,2020-07-12,函館,芝,2600,右,曇,良,5,9,2016103294,1127,3.0,
5,202002020410,2020-07-12,函館,芝,2600,右,曇,良,6,3,2013105912,1116,7.0,
6,202002020410,2020-07-12,函館,芝,2600,右,曇,良,7,2,2016104452,1163,6.0,
7,202002020410,2020-07-12,函館,芝,2600,右,曇,良,8,4,2016102311,1095,12.0,
8,202002020410,2020-07-12,函館,芝,2600,右,曇,良,9,7,2016102005,1176,8.0,
9,202002020410,2020-07-12,函館,芝,2600,右,曇,良,10,12,2017104925,5339,1.0,


In [4]:
horse = df_race['horse_id'].unique().tolist()
jockey = df_race['jockey_id'].unique().tolist()

print(len(horse), len(jockey))

8194 265


In [5]:
############## Helper #################
def convert_num(value:str, flg:bool):
    '''
    str型を数値int/float型に変更
    null、空文字、変更不可の場合Nanを返却
    '''
    
    if value is None or value == '':
        return np.nan
    
    try:
        if flg:
            ret = int(value)
        else:
            ret = float(value)
        return ret

    except ValueError:
        return np.nan

In [18]:
def get_horse(horse_id):
    '''
    馬情報取得
    '''
    time.sleep(3)

    res = requests.get(f'https://db.netkeiba.com/horse/{str(horse_id).zfill(10)}/')
    res.encoding = res.apparent_encoding

    bs = BeautifulSoup(res.content, 'lxml')
    # sex
    sex = bs.find('p', class_='txt_01').get_text().split('　')[1][0]

    trs = bs.find('table', class_='db_prof_table').find_all('tr')
    title = []
    for tr in trs:
        title.append(tr.find('th').get_text())
    # birthday
    if title.index('生年月日'):
        birthday = trs[title.index('生年月日')].find('td').get_text()
    else:
        birthday = '1900/1/1'
    # trainer
    if title.index('調教師'):
        trainer = trs[title.index('調教師')].find('td').find('a').get('href').split('/')[2]
    else:
        trainer = 0

    # prize
    def conv_prize(val):
        ret = 0
        if '億' in val:
            ret += int(val[:val.find('億')]) * 10000
        ret += int(val[(val.find('億') + 1):].replace(',', ''))

        return ret
    
    prize_center = 0
    prize_region = 0
    if title.index('獲得賞金'):
        for prize in trs[title.index('獲得賞金')].find('td').get_text().split(' /'):
            if '中央' in prize:
                prize_center = conv_prize(prize.replace('万円 (中央)', ''))
            elif '地方' in prize:
                prize_region = conv_prize(prize.replace('万円 (地方)', ''))

    ret = [sex, birthday, trainer, prize_center, prize_region]

    # attribute items
    table = bs.find('table', class_='tekisei_table')
    for tr in table.find_all('tr'):
        ret.append(int(tr.find_all('img')[1].get('width')))

    return ret

In [7]:
def get_parents(horse_id):
    time.sleep(3)

    res = requests.get(f'https://db.netkeiba.com/horse/ped/{str(horse_id).zfill(10)}')
    res.encoding = res.apparent_encoding

    bs = BeautifulSoup(res.content, 'lxml')

    table = bs.find('table', class_='blood_table detail')

    if not table:
        return []
    
    tr = table.find_all('tr')

    parents_ids = []
    # 父
    parents_ids.append(tr[0].find_all('td')[0].find('a').get('href').split('/')[2])
    # 母父
    parents_ids.append(tr[16].find_all('td')[1].find('a').get('href').split('/')[2])

    ret = []
    for parent_id in parents_ids:
        res = requests.get(f'https://db.netkeiba.com/horse/result/{str(parent_id).zfill(10)}/')
        res.encoding = res.apparent_encoding

        ret = ret + get_result_parents(res.content)

    return ret

def get_result_parents(text):
    '''
    過去戦績取得
    '''

    bs = BeautifulSoup(text, 'lxml')

    table = bs.find('table', class_='db_h_race_results nk_tb_common')

    # 過去戦績がない場合
    if not table:
        return [0, 0, 0, 0]
    
    # tableタイトル取得
    title = []
    for th in table.find('thead').find_all('th'):
        title.append(th.get_text())
    
    data = []
    for tr in table.find('tbody').find_all('tr'):
        if '着順' in title:
            rank = tr.find_all('td')[title.index('着順')]

            try:
                rank = int(rank.get_text())
            except ValueError:
                rank = 0
            data.append(rank)

        else:
            data.append(0)
    # return -> [着順1のレース数, 着順2のレース数, 着順3のレース数, 着順4以降のレース数]
    return [data.count(1), data.count(2), data.count(3), len(data)-(data.count(1)+data.count(2)+data.count(3))]

In [8]:
def get_weight(horse_id):
    time.sleep(3)

    res = requests.get(f'https://db.netkeiba.com/horse/result/{str(horse_id).zfill(10)}/')
    res.encoding = res.apparent_encoding

    bs = BeautifulSoup(res.content, 'lxml')
    table = bs.find('table', class_='db_h_race_results nk_tb_common')
    trs = table.find_all('tr')

    for i, th in enumerate(trs[0].find_all('th')):
        if th.get_text() == 'レース名':
            index_race = i
        if th.get_text() == '馬体重':
            index_weight = i

    data = ''
    for i in range(1, len(trs)):
        race = trs[i].find_all('td')[index_race].find('a').get('href').split('/')[2]
        if i == len(trs[1:]):
            data += race + ',' + trs[i].find_all('td')[index_weight].get_text().split('(')[0]
        else:
            data += race + ',' + trs[i+1].find_all('td')[index_weight].get_text().split('(')[0] + ':'
    
    return [data]

In [19]:
horse_info = []
for i, horse_url in enumerate(horse):
    start = time.time()
    horse_info.append([horse_url] + get_horse(horse_url) + get_parents(horse_url) + get_weight(horse_url))
    print(f'{i} / {len(horse)} : horse_id={horse_url} : {round((time.time()-start)/60, 3)}min')

cols = ['horse_id', 'sex', 'birthday', 'trainer', 'prize_center', 'prize_region', 'attr_course', 'attr_length', 'attr_leg', 'attr_growth', 'attr_condition',
        'f_res_1st', 'f_res_2nd', 'f_res_3rd', 'f_res_lose', 'mf_res_1st', 'mf_res_2nd', 'mf_res_3rd', 'mf_res_lose', 'weight']
df = pd.DataFrame(horse_info, columns=cols)
df.to_csv('horse_data.csv', header=True, index=False)

0 / 8194 : horse_id=2017101695 : 0.244min


Unnamed: 0,horse_id,sex,birthday,trainer,prize_center,prize_region,attr_course,attr_length,attr_leg,attr_growth,attr_condition,f_res_1st,f_res_2nd,f_res_3rd,f_res_lose,mf_res_1st,mf_res_2nd,mf_res_3rd,mf_res_lose,weight
0,2017101695,牡,1900/1/1,5660,1527,1031,58,1,116,58,116,0,0,0,0,12,1,0,1,"202002020410,454:202030061812,458:202030051412..."


In [36]:
def get_jockey(jockey_id):
    time.sleep(3)

    res = requests.get(f'https://db.netkeiba.com/jockey/profile/{str(jockey_id).zfill(5)}/')
    res.encoding = res.apparent_encoding

    bs = BeautifulSoup(res.content, 'lxml')

    # birthday
    birthday = None
    try:
        birthday = bs.find('p', class_='txt_01').get_text().split('\n')[1]
    except Exception as e:
        print(e)

    # weight
    weight = None
    table = bs.find('table', class_='nk_tb_common race_table_01')
    title = []
    if table:
        for tr in table.find_all('tr'):
            title.append(tr.find('th').get_text())
        if '体重' in title:
            weight =table.find_all('tr')[title.index('体重')].find('td').get_text()[:-2]
        else:
            print(jockey_id, ': weight=0 : no column')
    else:
        print(jockey_id, ': weight=0 : no table')

    return [weight, birthday]

In [37]:
jockey_info = []
for i, jockey_url in enumerate(jockey):
    start = time.time()
    jockey_info.append([jockey_url] + get_jockey(jockey_url))
    print(f'{i} / {len(jockey)} : horse_id={jockey_url} : {round((time.time()-start)/60, 3)}min')

cols = ['jockey_id', 'weight', 'birthday']
df = pd.DataFrame(jockey_info, columns=cols)
df.to_csv('horse_data.csv', header=True, index=False)
df

0 / 265 : horse_id=1015 : 0.056min
1 / 265 : horse_id=1091 : 0.055min
2 / 265 : horse_id=5203 : 0.055min
3 / 265 : horse_id=1032 : 0.055min
4 / 265 : horse_id=1127 : 0.055min
5 / 265 : horse_id=1116 : 0.055min
6 / 265 : horse_id=1163 : 0.055min
7 / 265 : horse_id=1095 : 0.055min
8 / 265 : horse_id=1176 : 0.055min
9 / 265 : horse_id=5339 : 0.055min
10 / 265 : horse_id=1093 : 0.055min
11 / 265 : horse_id=1096 : 0.055min
12 / 265 : horse_id=1109 : 0.055min
13 / 265 : horse_id=1030 : 0.055min
14 / 265 : horse_id=1112 : 0.055min
15 / 265 : horse_id=1025 : 0.055min
16 / 265 : horse_id=1124 : 0.055min
17 / 265 : horse_id=1144 : 0.054min
18 / 265 : horse_id=660 : 0.055min
19 / 265 : horse_id=5585 : 0.055min
20 / 265 : horse_id=1122 : 0.055min
21 / 265 : horse_id=1043 : 0.055min
22 / 265 : horse_id=1014 : 0.055min
23 / 265 : horse_id=1161 : 0.055min
24 / 265 : horse_id=1179 : 0.055min
25 / 265 : horse_id=1140 : 0.055min
26 / 265 : horse_id=663 : 0.055min
27 / 265 : horse_id=1125 : 0.055min
28 /

Unnamed: 0,jockey_id,weight,birthday
0,1015,50,1977/09/26
1,1091,47,1985/11/05
2,5203,50,1974/03/12
3,1032,45,1979/07/23
4,1127,47,1990/10/03
...,...,...,...
260,1111,47,1986/05/12
261,711,49,1974/03/19
262,5498,53,1983/02/07
263,635,50,1965/07/22
