馬のスピード分析

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib

Using matplotlib backend: MacOSX


In [5]:
def pickup_data(year, JyoCD=None, race_type=0):
    """
    Load one year of per-horse results joined with their race records.

    Rows of ``n_uma_race`` are joined to ``n_race`` on
    (Year, MonthDay, JyoCD, RaceNum). ``Kyori`` is cast to float, ``TrackCD``
    to int, and a ``Seconds`` column is derived from the ``Time`` string.
    Rows with ``Seconds`` <= 0 and rows with ``TrackCD`` >= 50 are dropped.

    Parameters
    ----------
    year : int
        Value matched against ``n_uma_race.Year`` (sent as a bound query
        parameter rather than being formatted into the SQL text).
    JyoCD : str or None
        When not None, keep only rows whose ``JyoCD`` equals this value.
    race_type : int
        0: turf and dirt
        1: dirt only (TrackCD >= 23)
        2: turf only (10 <= TrackCD <= 22)

    Returns
    -------
    pandas.DataFrame
        The selected columns (Year, MonthDay, JyoCD, RaceNum, Bamei,
        KakuteiJyuni, Time, TimeDiff, Kyori, TrackCD) plus ``Seconds``.

    Notes
    -----
    The connection URL is read from the ``UMA_DB_URL`` environment variable
    when it is set; otherwise the original local URL is used.
    """
    import os

    # Convert an "MSSd" time string (1 digit of minutes, 2 of seconds,
    # 1 of tenths of a second) into float seconds.
    def convert_time_string(s):
        minutes = float(s[0])
        seconds = float(s[1:3])
        tenths = float(s[3])
        return minutes * 60 + seconds + (tenths / 10)

    # NOTE(review): the fallback URL embeds a password in the notebook.
    # Prefer setting UMA_DB_URL and removing the literal once every
    # environment provides it.
    db_url = os.environ.get(
        'UMA_DB_URL',
        'mysql+pymysql://uma:UmaUma123!@localhost/everydb2?charset=utf8')
    engine = create_engine(db_url)

    # Fetch every uma_race record of the year together with its race's
    # distance and track code. %s is the driver-level placeholder for `year`.
    sql = (
        "select a.Year,a.MonthDay,a.JyoCD,a.RaceNum,a.Bamei,a.KakuteiJyuni,a.Time, "
        "a.TimeDiff, "
        "b.Kyori,b.TrackCD from n_uma_race as a, n_race as b "
        "where a.Year = %s and a.Year = b.Year "
        "and a.MonthDay = b.MonthDay and a.JyoCD = b.JyoCD and a.RaceNum = b.RaceNum"
    )
    try:
        all_uma_race = pd.read_sql_query(sql, engine, params=(year,))
    finally:
        # Release pooled connections; the engine is not reused after this call.
        engine.dispose()

    # Convert string columns to numbers
    all_uma_race = all_uma_race.astype({'Kyori': float, 'TrackCD': int})
    all_uma_race['Seconds'] = all_uma_race['Time'].map(convert_time_string)

    track = all_uma_race['TrackCD']

    # Drop rows whose time is 0 and drop steeplechase races (TrackCD >= 50)
    keep = (all_uma_race['Seconds'] > 0) & (track < 50)

    if race_type == 1:
        # Dirt races only
        keep &= track >= 23
    elif race_type == 2:
        # Turf races only
        keep &= (track >= 10) & (track <= 22)

    # Disabled option: restrict to winners only by also requiring
    # all_uma_race['KakuteiJyuni'] == '01'.

    # Restrict to a single racecourse
    if JyoCD is not None:
        keep &= all_uma_race['JyoCD'] == JyoCD

    return all_uma_race[keep]

トラックコード（TrackCD）: 10〜22 は芝、23〜29 はダート。
51 以上は障害レース。

JyoCD（競馬場コード）
01 札幌競馬場
02 函館競馬場
03 福島競馬場
04 新潟競馬場
05 東京競馬場
06 中山競馬場
07 中京競馬場
08 京都競馬場
09 阪神競馬場
10 小倉競馬場

In [4]:
# Solve the polynomial fit by least squares
def resolve(dataset, m):
    """
    Fit a degree-``m`` polynomial to ``dataset.y`` as a function of ``dataset.x``.

    The coefficients minimise the sum of squared residuals. They are obtained
    with ``np.linalg.lstsq`` on the Vandermonde design matrix instead of
    explicitly inverting ``phi.T @ phi``: the explicit inverse squares the
    condition number and breaks down (exception or meaningless coefficients)
    when the data has fewer than ``m + 1`` distinct x values. In that
    rank-deficient case ``lstsq`` returns the minimum-norm least-squares
    solution.

    Parameters
    ----------
    dataset : object with ``x`` and ``y`` attributes (e.g. a DataFrame)
        Sample points and target values.
    m : int
        Polynomial degree; ``m + 1`` coefficients are fitted.

    Returns
    -------
    (f, ws) : tuple
        ``ws`` is the coefficient array ordered from the constant term upward
        (``ws[i]`` multiplies ``x**i``); ``f(x)`` evaluates the fitted
        polynomial and accepts scalars or NumPy arrays.
    """
    xs = np.asarray(dataset.x, dtype=float)
    t = np.asarray(dataset.y, dtype=float)

    # Design matrix with columns x**0, x**1, ..., x**m
    phi = np.vander(xs, m + 1, increasing=True)
    ws = np.linalg.lstsq(phi, t, rcond=None)[0]

    def f(x):
        y = 0
        for i, w in enumerate(ws):
            y += w * (x ** i)
        return y

    return (f,ws)

# Compute the root mean square error
def rms_error(dataset, f):
    """
    Return the root mean square error of ``f`` over ``dataset``.

    Computes ``sqrt(mean((y - f(x))**2))`` over the rows of ``dataset``.

    Parameters
    ----------
    dataset : DataFrame-like with ``x`` and ``y`` columns
        Sample points and observed values.
    f : callable
        Model to evaluate. It is called once per x value, so it does not
        need to support array arguments.

    Returns
    -------
    float
        The RMS error, or NaN when ``dataset`` has no rows (the error is
        undefined there).
    """
    if len(dataset) == 0:
        return float('nan')

    # Work on the raw arrays: iterating with iterrows() builds a Series per
    # row, which is very slow on large datasets.
    predicted = np.array([f(x) for x in dataset.x.values], dtype=float)
    residual = np.asarray(dataset.y.values, dtype=float) - predicted
    return np.sqrt(np.mean(residual ** 2))

def show_result(subplot, train_set, m, year, race_type, jyo_name):
    """
    Fit a degree-``m`` polynomial to ``train_set``, draw it, and save the figure.

    Prints the fitted coefficients, scatters the training points on
    ``subplot``, overlays the fitted curve (labelled with its RMS error),
    then writes the figure that owns ``subplot`` to
    ``umarace-<year>-<jyo_name>-<race_type>.png`` and closes that figure.

    Parameters
    ----------
    subplot : matplotlib Axes
        Axes to draw on.
    train_set : DataFrame with ``x`` and ``y`` columns
        Data passed to ``resolve`` and ``rms_error``.
    m : int
        Polynomial degree; ``ws[m]`` is shown in the title.
    year : int
        Used in the title and the file name.
    race_type : int
        Used in the title and the file name.
    jyo_name : str
        Used in the title and the file name.
    """
    (f,ws) = resolve(train_set, m)
    print("WS:",ws)

    # Show the training set
    subplot.scatter(train_set.x, train_set.y, marker='o',
                    color='blue', label=None)

    # Show the polynomial approximation curve, extended 5% beyond the data
    # range on both sides
    xmin = train_set.x.min()
    xmax = train_set.x.max()
    linex = np.linspace(xmin*0.95,xmax*1.05,100)
    liney = f(linex)
    label = "E(RMS)=%.2f" % rms_error(train_set, f)
    subplot.plot(linex, liney, color='red', label=label)
    subplot.legend(loc=1)
    subplot.set_xlabel("Distance(m)")
    subplot.set_ylabel("Time(sec)")
    subplot.set_title("YEAR=%d Data Count=%d, Jyo=%s Race type=%d ws=%8.3e" % 
                      (year, train_set.x.count(),jyo_name, race_type,ws[m]))

    # Save and close the figure this subplot belongs to, not whatever
    # pyplot currently considers the "current" figure.
    fig = subplot.figure
    fname='umarace-%d-%s-%d.png' % ( year, jyo_name, race_type)
    fig.savefig(fname)
    plt.close(fig)

# Racecourse code (JyoCD) -> romanized racecourse name.
# Codes are the zero-padded positions '01'..'10' of the names listed below;
# the extra None key stands for "no racecourse filter".
jyo_cd_dict = dict(
    [('%02d' % code, name) for code, name in enumerate([
        'Sapporo',
        'Hakodate',
        'Fukushima',
        'Niigata',
        'Tokyo',
        'Nakayama',
        'Chukyo',
        'Kyoto',
        'Hanshin',
        'Kokura',
    ], start=1)]
    + [(None, 'ALL')]
)
        
def process_distance_sec(year, JyoCD=None, race_type=0):
    """
    Load the races for one year / racecourse / race type and plot time vs. distance.

    Prints a header line, fetches the rows with ``pickup_data``, and, unless
    no rows came back, hands a two-column (x = Kyori, y = Seconds) frame to
    ``show_result`` for a degree-2 fit on a new 16x10 figure.

    Parameters
    ----------
    year : int
        Year passed to ``pickup_data``.
    JyoCD : str or None
        Racecourse code; must be a key of ``jyo_cd_dict`` (None means all).
    race_type : int
        Race type passed to ``pickup_data``.
    """
    jyo_name = jyo_cd_dict[JyoCD]
    print('\nYear:%d, Jyo:%s RaceType:%d' % (year,jyo_name,race_type))

    races = pickup_data(year=year, JyoCD=JyoCD, race_type=race_type)
    if not races.Kyori.count():
        print('*** NO Data ***')
        return

    # Distance is the explanatory variable, finishing time the target
    dataset = DataFrame(
        {'x': races['Kyori'].values, 'y': races['Seconds'].values},
        columns=['x', 'y'])

    fig = plt.figure(figsize=(16, 10))
    degrees = [2]
    for position, degree in enumerate(degrees, start=1):
        axes = fig.add_subplot(1, 1, position)
        show_result(axes, dataset, degree, year, race_type, jyo_name)
    

In [26]:
# For each year, fit and plot all racecourses combined (JyoCD is left at its
# default of None), once for dirt (race_type=1) and once for turf (race_type=2).
for year in [2015,2014,2013,2012]:
    for t in [1,2]:
        process_distance_sec(year=year, race_type=t)


Year:2015, Jyo:ALL RaceType:1
WS: [ -1.76950382e+01   8.44727699e-02  -5.48879493e-06]

Year:2015, Jyo:ALL RaceType:2
WS: [ -9.67190479e+00   6.69564243e-02  -3.58593265e-07]

Year:2014, Jyo:ALL RaceType:1
WS: [ -1.77426300e+01   8.43556065e-02  -5.37774734e-06]

Year:2014, Jyo:ALL RaceType:2
WS: [ -1.06999100e+01   6.82056970e-02  -6.90993215e-07]

Year:2013, Jyo:ALL RaceType:1
WS: [ -1.69108187e+01   8.35460477e-02  -5.14722869e-06]

Year:2013, Jyo:ALL RaceType:2
WS: [ -9.58683603e+00   6.68080711e-02  -2.82304039e-07]

Year:2012, Jyo:ALL RaceType:1
WS: [ -1.43003250e+01   8.07056437e-02  -4.29081544e-06]

Year:2012, Jyo:ALL RaceType:2
WS: [ -9.77968131e+00   6.69645740e-02  -2.77943146e-07]


In [27]:
# For each year, fit and plot every racecourse ('01'..'10') plus the combined
# set (None), once for dirt (race_type=1) and once for turf (race_type=2).
# The None entries repeat the combinations of the all-racecourse loop for 2015-2012.
for year in [2015,2014,2013,2012,2011,2010]:
    for jyo in [None, '01','02','03','04','05','06','07','08','09','10']:
        for t in [1,2]:
            process_distance_sec(year=year, race_type=t, JyoCD=jyo)


Year:2015, Jyo:ALL RaceType:1
WS: [ -1.76950382e+01   8.44727699e-02  -5.48879493e-06]

Year:2015, Jyo:ALL RaceType:2
WS: [ -9.67190479e+00   6.69564243e-02  -3.58593265e-07]

Year:2015, Jyo:Sapporo RaceType:1
WS: [ -3.57529117e+00   6.14040919e-02   2.22355780e-06]

Year:2015, Jyo:Sapporo RaceType:2
WS: [ -1.21588731e+01   7.01984995e-02  -1.21542430e-06]

Year:2015, Jyo:Hakodate RaceType:1
WS: [ -2.14856685e+00   5.98956108e-02   2.65995194e-06]

Year:2015, Jyo:Hakodate RaceType:2
WS: [ -5.17806231e+00   6.19217882e-02   1.31432031e-06]

Year:2015, Jyo:Fukushima RaceType:1
WS: [ -8.44940658e+00   6.84829381e-02   9.81519280e-08]

Year:2015, Jyo:Fukushima RaceType:2
WS: [ -7.02602087e+00   6.42090934e-02   4.04207910e-07]

Year:2015, Jyo:Niigata RaceType:1
WS: [ -1.65537109e+00   3.08823204e-01  -1.10409688e-05]

Year:2015, Jyo:Niigata RaceType:2
WS: [ -7.14042360e+00   6.32015644e-02   7.28574785e-07]

Year:2015, Jyo:Tokyo RaceType:1
WS: [  1.29010230e+01   4.28070632e-02   7.141206

In [None]:
# Single run for 2012 with the defaults: all racecourses, race_type=0 (turf and dirt).
process_distance_sec(year=2012)

In [None]:
# Scatter plot of Seconds against Kyori for `dirt_race`.
# NOTE(review): `dirt_race` is not defined in any visible cell, so this cell
# presumably relies on leftover kernel state -- confirm or define it first.
plt.scatter(dirt_race['Kyori'],dirt_race['Seconds'],marker='x', color='blue')

In [None]:
# Count rows per distance (Kyori), ordered by distance.
# NOTE(review): in the visible cells `all_uma_race` exists only as a local
# inside pickup_data(); a notebook-level definition is needed -- confirm.
all_uma_race['Kyori'].value_counts().sort_index()

In [None]:
# Count rows per TrackCD, ordered by track code.
# NOTE(review): relies on a notebook-level `all_uma_race` that no visible cell defines.
all_uma_race['TrackCD'].value_counts().sort_index()

In [None]:
# Count rows per racecourse (JyoCD), ordered by racecourse code.
# NOTE(review): relies on a notebook-level `all_uma_race` that no visible cell defines.
all_uma_race['JyoCD'].value_counts().sort_index()

In [None]:
# Keep only rows whose JyoCD compares <= '10' (a string comparison), then
# count rows per distance within each racecourse.
# NOTE(review): this reassigns `all_uma_race`, so cells run afterwards see the
# filtered frame; the variable itself is not defined in any visible cell.
all_uma_race = all_uma_race[all_uma_race['JyoCD'] <= '10']
kyori_counts = all_uma_race['Kyori'].groupby(all_uma_race['JyoCD'])
kyori_counts.value_counts()
# model_ols = pd.ols(y=turf_race['Seconds'], x=turf_race['Kyori'], intercept = True)
# print(model_ols)

In [13]:
# Load the 2014 rows for JyoCD '09' (mapped to 'Hanshin' in jyo_cd_dict),
# turf and dirt together (race_type defaults to 0).
data = pickup_data(year=2014,JyoCD='09')


In [14]:
# Narrow to the rows whose MonthDay is '0629'.
data1=data[data['MonthDay'] == '0629']

In [15]:
# Display the rows of race number '11' on that day.
data1[data1['RaceNum'] == '11']

Unnamed: 0,Year,MonthDay,JyoCD,RaceNum,Bamei,KakuteiJyuni,Time,TimeDiff,Kyori,TrackCD,Seconds
43981,2014,629,9,11,トーセンジョーダン,10,2153,14,2200,17,135.3
43982,2014,629,9,11,ホッコーブレーヴ,8,2148,9,2200,17,134.8
43983,2014,629,9,11,カレンミロティック,2,2144,5,2200,17,134.4
43984,2014,629,9,11,ウインバリアシオン,7,2148,9,2200,17,134.8
43985,2014,629,9,11,ヴェルデグリーン,12,2161,22,2200,17,136.1
43986,2014,629,9,11,ヒットザターゲット,4,2146,7,2200,17,134.6
43987,2014,629,9,11,ゴールドシップ,1,2139,-5,2200,17,133.9
43988,2014,629,9,11,ジェンティルドンナ,9,2151,12,2200,17,135.1
43989,2014,629,9,11,ヴィルシーナ,3,2146,7,2200,17,134.6
43990,2014,629,9,11,メイショウマンボ,11,2154,15,2200,17,135.4


In [50]:
# Scratch check at a distance of 2000, using the coefficients printed for
# "Year:2015, Jyo:ALL RaceType:1".
ws = np.array([-1.76950382e+01, 8.44727699e-02,-5.48879493e-06])
x = 2000
l = np.array([x**0,x**1,x**2])
# Full polynomial value; not the last expression of the cell, so it is not displayed.
ws.dot(l)
# Contribution of the quadratic term alone; this is the value the cell outputs.
x**2 * ws[2]

-21.95517972