In [426]:
import pandas as pd

# Keep rendered DataFrames compact: at most 5 rows, up to 100 columns,
# and cell text truncated to 10 characters.
pd.set_option("display.max_rows", 5)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_colwidth", 10)

In [427]:
# Per-horse past race records (date, venue, finishing position, prize money, ...).
horse_vs_df = pd.read_csv('horse_vs_data.csv')
# (disabled) keep only the first 30 columns:
# horse_vs_df = horse_vs_df.iloc[:,:30]
# Race results with horse_id / jockey_id columns already attached.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files you produced yourself.
df = pd.read_pickle('df_add_id.pickle')

In [428]:
# Preview the raw per-horse race records (truncated by the display options).
horse_vs_df

Unnamed: 0,日付,開催,天気,R,レース名,映像,頭数,枠番,馬番,オッズ,人気,着順,騎手,斤量,距離,馬場,馬場指数,タイム,着差,ﾀｲﾑ指数,通過,ペース,上り,馬体重,厩舎ｺﾒﾝﾄ,備考,勝ち馬(2着馬),賞金
201810...,2022/0...,1小倉2,曇,10,小郡特別(2...,,12,7,10,3.5,2,10,秋山稔樹,54,芝1200,良,**,1:09.8,1.6,**,5-6,32.8-35.4,36.5,474(+2),,,メメントモリ,
201810...,2021/1...,6阪神8,曇,12,猪名川特別(...,,15,2,2,8.2,4,6,亀田温心,54,芝1400,良,**,1:21.9,0.2,**,2-3,35.3-34.8,34.8,472(-6),,,エルカスティージョ,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201810...,2020/0...,2新潟3,曇,2,2歳未勝利,,17,2,3,48.0,9,6,菅原明良,52,芝1600,稍,**,1:35.4,1.3,**,1-1,35.3-33.7,35.0,452(+2),,,フォティノース,
201810...,2020/0...,2福島3,曇,6,2歳新馬,,15,5,8,12.0,6,15,木幡巧也,54,芝1200,稍,**,1:14.7,3.1,**,4-9,35.5-36.1,38.6,450(0),,,ブルーバード,


In [429]:
# Preview the raw race-results frame before any preprocessing.
df

Unnamed: 0,着順,枠,馬番,馬名,性齢,斤量,騎手,タイム,着差,人気,単勝オッズ,後3F,コーナー通過順,厩舎,馬体重(増減),date,race_type,course_ren,weather,ground_state,horse_id,jockey_id
202102010101,1,4,4,ナムラリコリス,牝2,52.0,△泉谷,1:09.3,,1,1.4,35.7,2-2,栗東大橋,466(+2),2021年7月3日,芝,1200,晴,良,201910...,01182
202102010101,2,1,1,プラソン,牝2,54.0,藤岡佑,1:09.4,1/2,2,5.5,35.9,1-1,栗東寺島,430(+2),2021年7月3日,芝,1200,晴,良,201910...,01093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202103010112,15,7,14,トランザクト,牡3,52.0,◇藤田菜,1:10.6,1.3/4,5,14.8,39.3,4-7,美浦武藤,496(-4),2021年7月3日,ダ,1150,曇,稍,201810...,01164
202103010112,16,4,8,リゼ,牝5,55.0,石川,1:12.4,大,14,107.5,39.5,15-15,美浦田中清,446(-4),2021年7月3日,ダ,1150,曇,稍,201610...,01150


## メインデータの前処理

In [430]:
def preprocessing(df):
  """Clean the scraped race-results frame and return a new, typed frame.

  The caller's frame is left untouched; all work is done on a copy.

  Steps:
    * "性齢" is split into "性" (first character) and "齢" (remainder, int).
    * The "-" placeholder is replaced by a sentinel in each affected column:
      "人気" -> 99, "タイム" -> "9:99.9", "着差" -> "除外",
      "単勝オッズ" -> 0.0, "後3F" -> 9.9, "コーナー通過順" -> "0-0-0-0".
    * Missing "着差" values are filled with "１着".
    * Rows whose "馬体重(増減)" contains a "." are dropped; the remaining
      values such as "466(+2)" are split into "体重" and "体重変化" (ints).
    * "date" is parsed from the "%Y年%m月%d日" format into datetimes.
    * The source columns "馬体重(増減)" and "性齢" are removed.

  Args:
    df: DataFrame holding at least the columns "性齢", "人気", "タイム",
      "着差", "単勝オッズ", "後3F", "コーナー通過順", "馬体重(増減)", "date".

  Returns:
    A new DataFrame with the cleaned columns; the row index of the kept
    rows is preserved.

  Raises:
    KeyError: if one of the required columns is missing.
    ValueError: if a value cannot be converted to its numeric/date type.
  """
  # Copy first: writing columns straight into `df` would silently change the
  # frame the caller passed in (and anything that already displayed it).
  df = df.copy()

  # 性齢 (sex + age), e.g. "牝2" -> "牝", 2
  df["性"] = df["性齢"].map(lambda x: str(x)[0])
  df["齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

  # 人気 (popularity rank): "-" becomes the sentinel 99; values that were
  # read as floats ("3.0") lose their ".0" before the int conversion.
  # Done in one map so no int is written into a string-typed column.
  df["人気"] = df["人気"].map(
    lambda x: "99" if x == "-" else str(x).replace('.0', '')
  ).astype(int)

  # タイム (finish time)
  df.loc[df['タイム'] == "-", 'タイム'] = "9:99.9"

  # 着差 (margin): "-" -> "除外"; a missing margin is labelled "１着"
  # (presumably the winner's row has no margin -- confirm against the scraper).
  df.loc[df['着差'] == "-", '着差'] = "除外"
  df["着差"] = df["着差"].fillna("１着")

  # 単勝オッズ (win odds)
  df.loc[df['単勝オッズ'] == "-", '単勝オッズ'] = "0.0"
  df["単勝オッズ"] = df["単勝オッズ"].astype(float)

  # 後3F (last three furlongs)
  df.loc[df['後3F'] == "-", '後3F'] = "9.9"
  df["後3F"] = df["後3F"].astype(float)

  # コーナー通過順 (corner passing order)
  df.loc[df['コーナー通過順'] == "-", 'コーナー通過順'] = "0-0-0-0"

  # 馬体重 (body weight): drop rows whose weight field contains a literal ".".
  # NOTE(review): presumably those rows are malformed -- confirm.
  # regex=False matches the dot literally (the old "\." was an invalid escape
  # sequence); .copy() makes the filtered frame independent so the
  # assignments below do not hit SettingWithCopyWarning.
  has_dot = df["馬体重(増減)"].astype(str).str.contains(".", regex=False)
  df = df[~has_dot].copy()

  # "466(+2)" -> ["466", "+2)"]; split once and reuse both halves.
  weight_parts = df["馬体重(増減)"].str.split('(', expand=True)
  df["体重"] = weight_parts[0].astype(int)
  df["体重変化"] = weight_parts[1].str[:-1].astype(int)
  df = df.drop(columns=["馬体重(増減)", '性齢'])

  # 日付 (race date)
  df["date"] = pd.to_datetime(df["date"], format="%Y年%m月%d日")

  return df

## HorseResultsクラス

In [431]:
class HorseResults:
  """Holds the date / finishing-position / prize-money columns of a horse's past races."""

  def __init__(self,horse_vs_df):
    """Keep only the "日付", "着順" and "賞金" columns of `horse_vs_df`.

    Args:
      horse_vs_df: DataFrame containing at least those three columns.
    """
    self.horse_results = horse_vs_df[["日付","着順","賞金"]]

  def preprocessing(self):
    """Convert "着順" to int and replace "日付" with a datetime "date" column.

    The result is stored back on `self.horse_results` and also returned.
    Calling this method again is safe: once "日付" has been converted the
    date step is skipped instead of raising KeyError.

    Returns:
      The preprocessed DataFrame with columns "着順", "賞金", "date".

    Raises:
      ValueError: if a "着順" value cannot be converted to int.
    """
    df = self.horse_results.copy()

    df["着順"] = df["着順"].astype(int)
    # "日付" is dropped after the first run; guard so a re-run is a no-op.
    if "日付" in df.columns:
      df["date"] = pd.to_datetime(df["日付"])
      df = df.drop(columns=["日付"])

    self.horse_results = df

    return df

In [432]:
# Build the horse-history helper from the raw per-horse records.
hr = HorseResults(horse_vs_df)
# Converts 着順 to int and 日付 to a datetime `date` column; the result is stored on `hr`.
hr.preprocessing()
# Show the stored, preprocessed frame.
hr.horse_results

Unnamed: 0,着順,賞金,date
201810...,10,,2022-01-16
201810...,6,,2021-12-26
...,...,...,...
201810...,6,,2020-08-01
201810...,15,,2020-07-11


In [433]:
# Clean the main race-results frame with the module-level preprocessing() defined above.
results_p = preprocessing(df)

In [434]:
# First rows of the raw per-horse records, for comparison with the cleaned results below.
horse_vs_df.head()

Unnamed: 0,日付,開催,天気,R,レース名,映像,頭数,枠番,馬番,オッズ,人気,着順,騎手,斤量,距離,馬場,馬場指数,タイム,着差,ﾀｲﾑ指数,通過,ペース,上り,馬体重,厩舎ｺﾒﾝﾄ,備考,勝ち馬(2着馬),賞金
201810...,2022/0...,1小倉2,曇,10,小郡特別(2...,,12,7,10,3.5,2,10,秋山稔樹,54,芝1200,良,**,1:09.8,1.6,**,5-6,32.8-35.4,36.5,474(+2),,,メメントモリ,
201810...,2021/1...,6阪神8,曇,12,猪名川特別(...,,15,2,2,8.2,4,6,亀田温心,54,芝1400,良,**,1:21.9,0.2,**,2-3,35.3-34.8,34.8,472(-6),,,エルカスティージョ,
201810...,2021/1...,4阪神7,晴,8,3歳以上2勝クラス,,11,2,2,5.8,4,9,池添謙一,53,芝1400,良,**,1:22.5,0.7,**,3-4,35.1-34.9,35.3,478(+2),,,セウラサーリ,
201810...,2021/1...,4東京1,曇,12,3歳以上2勝クラス,,13,5,7,13.8,7,4,三浦皇成,53,芝1400,良,**,1:22.3,0.4,**,5-8,36.3-34.1,33.9,476(+2),,,レガトゥス,170.0
201810...,2021/0...,1函館2,曇,9,3歳以上1勝クラス,,16,3,5,7.3,3,1,秋山稔樹,50,芝1200,良,**,1:08.4,-0.4,**,5-4,33.2-35.2,34.7,474(0),,,(メイショウ...,760.0


In [435]:
# First rows of the cleaned race results (性/齢/体重/体重変化 added, date parsed).
results_p.head()

Unnamed: 0,着順,枠,馬番,馬名,斤量,騎手,タイム,着差,人気,単勝オッズ,後3F,コーナー通過順,厩舎,date,race_type,course_ren,weather,ground_state,horse_id,jockey_id,性,齢,体重,体重変化
202102010101,1,4,4,ナムラリコリス,52.0,△泉谷,1:09.3,１着,1,1.4,35.7,2-2,栗東大橋,2021-07-03,芝,1200,晴,良,201910...,1182,牝,2,466,2
202102010101,2,1,1,プラソン,54.0,藤岡佑,1:09.4,1/2,2,5.5,35.9,1-1,栗東寺島,2021-07-03,芝,1200,晴,良,201910...,1093,牝,2,430,2
202102010101,3,8,9,ニシノタマユラ,54.0,黛,1:09.8,2,6,29.6,35.8,5-5,美浦深山,2021-07-03,芝,1200,晴,良,201910...,1109,牝,2,450,-6
202102010101,4,3,3,コスモツカサ,54.0,丹内,1:09.8,クビ,4,13.9,36.0,4-3,美浦伊藤大,2021-07-03,芝,1200,晴,良,201910...,1091,牝,2,430,2
202102010101,5,5,5,カルネアサーダ,51.0,▲小沢,1:09.8,クビ,3,6.0,36.1,3-3,栗東加用,2021-07-03,芝,1200,晴,良,201910...,1185,牝,2,444,2


In [436]:
# Take the race date of the first row as the sample date.
# .iloc makes the positional lookup explicit; a bare integer key on a Series
# with a non-integer index relies on a deprecated positional fallback.
sample_data = results_p["date"].iloc[0]
sample_data

Timestamp('2021-07-03 00:00:00')

sample_dataの日付に走った馬のリスト


In [437]:
# Distinct horse ids among the rows whose race date equals the sample date.
horse_id_list = results_p.loc[results_p["date"] == sample_data, "horse_id"].unique()
len(horse_id_list)

313

In [438]:
# Number of rows in the preprocessed horse-history frame.
len(hr.horse_results)

48