# データ加工・前処理



In [9]:
import pandas as pd
import numpy as np
import datetime
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import requests
from bs4 import BeautifulSoup
import time
from tqdm.notebook import tqdm
import re
from urllib.request import urlopen
!pip3 install pickle5
import pickle5 as pickle



## DataProcessorクラス

訓練データと出馬表データを加工する抽象クラスです。DataProcessorクラスを、この次に定義するResultsクラスとShutubaTableクラスで継承して使います。

In [10]:
class DataProcessor:
    """Shared pipeline for turning race data into model features.

    Base class for the training data and the entry-table data. Subclasses
    in this file populate ``data`` / ``data_p`` and then call the merge and
    encoding steps below; each step stores its result on the instance.

    Attributes:
    ----------
    data : pd.DataFrame
        Raw data.
    data_p : pd.DataFrame
        Data after preprocessing.
    data_h : pd.DataFrame
        Data after merge_horse_results.
    data_pe : pd.DataFrame
        Data after merge_peds.
    data_c : pd.DataFrame
        Data after process_categorical.
    no_peds : numpy.ndarray
        horse_id values that had no pedigree data when merge_peds ran.
    """

    # Columns that process_categorical turns into dummy variables.
    _DUMMY_COLUMNS = ['weather', 'race_type', 'ground_state', '性']

    def __init__(self):
        self.data = pd.DataFrame()
        self.data_p = pd.DataFrame()
        self.data_h = pd.DataFrame()
        self.data_pe = pd.DataFrame()
        self.data_c = pd.DataFrame()
        # Created here so the attribute exists before merge_peds has run.
        self.no_peds = np.array([], dtype=object)

    def merge_horse_results(self, hr, n_samples_list=None):
        """
        Add past-performance averages to data_p and store the result in data_h.

        Parameters:
        ----------
        hr : HorseResults
            Past race records of the horses; its ``merge_all`` is called once
            per entry of ``n_samples_list``.
        n_samples_list : list, default [5, 9, 'all']
            How many past races to aggregate for each set of added columns.
        """
        if n_samples_list is None:
            n_samples_list = [5, 9, 'all']

        merged = self.data_p.copy()
        for n_samples in n_samples_list:
            merged = hr.merge_all(merged, n_samples=n_samples)
        # '開催' is only needed as a merge key inside hr.merge_all.
        self.data_h = merged.drop(['開催'], axis=1)

    def merge_peds(self, peds):
        """
        Add pedigree columns to data_h and store the result in data_pe.

        Horses without a pedigree row are collected in ``no_peds`` and a
        reminder to scrape them is printed.

        Parameters:
        ----------
        peds : Peds.peds_e
            Pedigree data processed by the Peds class, indexed by horse_id.
        """
        self.data_pe = self.data_h.merge(
            peds, left_on='horse_id', right_index=True, how='left')
        missing = self.data_pe['peds_0'].isnull()
        self.no_peds = self.data_pe.loc[missing, 'horse_id'].unique()
        if len(self.no_peds) > 0:
            print('scrape peds at horse_id_list "no_peds"')

    @staticmethod
    def _encode_ids(encoder, ids):
        """Label-encode ``ids``, first appending unseen ids to the encoder.

        ``encoder.classes_`` is extended in place, so the caller's encoder
        also knows the new ids afterwards.
        """
        known = ids.isin(encoder.classes_)
        unseen = ids[~known].dropna().unique()
        encoder.classes_ = np.concatenate([encoder.classes_, unseen])
        return encoder.transform(ids)

    def process_categorical(self, le_horse, le_jockey, results_m):
        """
        Encode the categorical columns of data_pe and store the result in data_c.

        Parameters:
        ----------
        le_horse : sklearn.preprocessing.LabelEncoder
            Encoder that maps horse_id to integers starting at 0.
        le_jockey : sklearn.preprocessing.LabelEncoder
            Encoder that maps jockey_id to integers starting at 0.
        results_m : Results.data_pe
            Reference frame whose observed values define the dummy columns,
            so Results and ShutubaTable end up with the same columns.
        """
        df = self.data_pe.copy()

        # Label encoding: horse_id / jockey_id become integers starting at 0.
        df['horse_id'] = self._encode_ids(le_horse, df['horse_id'])
        df['jockey_id'] = self._encode_ids(le_jockey, df['jockey_id'])

        # Store the encoded ids as pandas category dtype.
        df['horse_id'] = df['horse_id'].astype('category')
        df['jockey_id'] = df['jockey_id'].astype('category')

        # Fix the category set from results_m before making dummies so the
        # resulting columns do not depend on which values occur in this frame.
        for column in self._DUMMY_COLUMNS:
            # NaN is not a valid category, so drop it from the reference set.
            categories = results_m[column].dropna().unique()
            df[column] = pd.Categorical(df[column], categories)
        df = pd.get_dummies(df, columns=self._DUMMY_COLUMNS)

        self.data_c = df

## Resultsクラス

訓練に使うレース結果データを加工するクラスです。


In [11]:
class Results(DataProcessor):
    """Race-result data used for training.

    Keeps the raw result table in ``data`` and adds the scraping,
    preprocessing and encoder-fitting steps specific to finished races.
    """

    def __init__(self, results):
        """Store the raw race-result DataFrame in ``data``."""
        super().__init__()
        self.data = results

    @classmethod
    def read_pickle(cls, path_list):
        """Build a Results object from one or more pickled DataFrames.

        The first file is loaded as-is; each further file is folded in with
        ``update_data`` (defined outside this class).
        """
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)

    @staticmethod
    def scrape(race_id_list):
        """
        Scrape race-result tables from db.netkeiba.com.

        Parameters:
        ----------
        race_id_list : list
            List of race ids (strings).

        Returns:
        ----------
        race_results_df : pandas.DataFrame
            All scraped race results concatenated, indexed by race_id.
            Empty when nothing could be scraped.
        """
        # One DataFrame per race, keyed by race_id.
        race_results = {}
        for race_id in tqdm(race_id_list):
            try:
                url = "https://db.netkeiba.com/race/" + race_id
                # Main result table.
                df = pd.read_html(url)[0]

                html = requests.get(url)
                html.encoding = "EUC-JP"
                soup = BeautifulSoup(html.text, "html.parser")

                # Weather, race type, course length, ground state and date
                # are all found in the first two <p> tags of the intro block.
                intro = soup.find("div", attrs={"class": "data_intro"}).find_all("p")
                texts = intro[0].text + intro[1].text
                for text in re.findall(r'\w+', texts):
                    if text in ["芝", "ダート"]:
                        df["race_type"] = [text] * len(df)
                    if "障" in text:
                        df["race_type"] = ["障害"] * len(df)
                    if "m" in text:
                        df["course_len"] = [int(re.findall(r"\d+", text)[0])] * len(df)
                    if text in ["良", "稍重", "重", "不良"]:
                        df["ground_state"] = [text] * len(df)
                    if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                        df["weather"] = [text] * len(df)
                    if "年" in text:
                        df["date"] = [text] * len(df)

                # Horse ids and jockey ids come from the links in the table.
                result_table = soup.find("table", attrs={"summary": "レース結果"})
                horse_links = result_table.find_all(
                    "a", attrs={"href": re.compile("^/horse")}
                )
                jockey_links = result_table.find_all(
                    "a", attrs={"href": re.compile("^/jockey")}
                )
                df["horse_id"] = [re.findall(r"\d+", a["href"])[0] for a in horse_links]
                df["jockey_id"] = [re.findall(r"\d+", a["href"])[0] for a in jockey_links]

                # Use race_id as the index of every row.
                df.index = [race_id] * len(df)

                race_results[race_id] = df
                time.sleep(1)
            # Skip race ids that do not exist.
            except IndexError:
                continue
            # e.g. a dropped connection: stop, but keep what was collected.
            except Exception as e:
                print(e)
                break
            # Stop button pressed in Jupyter: also keep what was collected.
            except KeyboardInterrupt:
                break

        # pd.concat raises on an empty list, so handle "nothing scraped".
        if not race_results:
            return pd.DataFrame()

        return pd.concat(list(race_results.values()))

    def preprocessing(self):
        """Clean ``data`` and store the result in ``data_p``.

        Adds the binary target ``rank`` (1 when the finishing position is
        below 4, else 0), splits the combined sex/age and body-weight columns,
        converts types, drops columns that are not used as features and adds
        the venue code ``開催`` taken from the race_id index.
        """
        df = self.data.copy()

        # Drop rows whose finishing position is not a plain number.
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)
        df['rank'] = df['着順'].map(lambda x: 1 if x < 4 else 0)

        # Split 性齢 into sex (first character) and age (the rest).
        sex_age = df["性齢"].map(str)
        df["性"] = sex_age.str[0]
        df["年齢"] = sex_age.str[1:].astype(int)

        # Split 馬体重 "weight(change)" into weight and weight change.
        # errors='coerce' turns unparsable values such as "計不" into NaN.
        weight = df["馬体重"].str.split("(", expand=True)
        df['体重'] = pd.to_numeric(weight[0], errors='coerce')
        df['体重変化'] = pd.to_numeric(weight[1].str[:-1], errors='coerce')

        # Win odds as float.
        df["単勝"] = df["単勝"].astype(float)
        # Course length in units of 100 (floor division drops the remainder).
        df["course_len"] = df["course_len"].astype(float) // 100

        # Drop columns that are not used.
        df.drop(["タイム", "着差", "調教師", "性齢", "馬体重", '馬名', '騎手', '人気', '着順'],
                axis=1, inplace=True)

        df["date"] = pd.to_datetime(df["date"], format="%Y年%m月%d日")

        # Venue code: characters 5-6 of the race_id index.
        df['開催'] = df.index.map(lambda x: str(x)[4:6])

        self.data_p = df

    def process_categorical(self):
        """Fit the id encoders on ``data_pe`` and build ``data_c``.

        The fitted encoders are kept as ``le_horse`` / ``le_jockey`` so they
        can be passed to another DataProcessor's process_categorical.
        """
        self.le_horse = LabelEncoder().fit(self.data_pe['horse_id'])
        self.le_jockey = LabelEncoder().fit(self.data_pe['jockey_id'])
        super().process_categorical(self.le_horse, self.le_jockey, self.data_pe)

## ShutubaTableクラス

予測に使う出馬表データを加工するクラスです。
出馬表のスクレイピング（取得）もこのクラスで行います。

In [12]:
class ShutubaTable(DataProcessor):
    """Entry-table data used for prediction.

    Keeps the scraped entry tables in ``data`` and adds the scraping and
    preprocessing steps specific to races that have not been run yet.
    """

    def __init__(self, shutuba_tables):
        """Store the raw entry-table DataFrame in ``data``."""
        super().__init__()
        self.data = shutuba_tables

    @classmethod
    def scrape(cls, race_id_list, date):
        """Scrape entry tables from race.netkeiba.com.

        Parameters:
        ----------
        race_id_list : list
            List of race ids (strings).
        date : str
            Race date written to the ``date`` column of every row.

        Returns:
        ----------
        ShutubaTable
            Object whose ``data`` holds all scraped tables, indexed by
            race_id (an empty DataFrame when ``race_id_list`` is empty).
        """
        # Collect per-race frames and concatenate once at the end;
        # DataFrame.append no longer exists in pandas 2.x.
        frames = []
        for race_id in tqdm(race_id_list):
            url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id
            df = pd.read_html(url)[0]
            # Drop the first level of the column header.
            df = df.T.reset_index(level=0, drop=True).T

            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")

            texts = soup.find('div', attrs={'class': 'RaceData01'}).text
            for text in re.findall(r'\w+', texts):
                if 'm' in text:
                    df['course_len'] = [int(re.findall(r'\d+', text)[0])] * len(df)
                if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:
                    df["weather"] = [text] * len(df)
                if text in ["良", "稍重", "重"]:
                    df["ground_state"] = [text] * len(df)
                if '不' in text:
                    df["ground_state"] = ['不良'] * len(df)
                # Added 2020/12/13
                if '稍' in text:
                    df["ground_state"] = ['稍重'] * len(df)
                if '芝' in text:
                    df['race_type'] = ['芝'] * len(df)
                if '障' in text:
                    df['race_type'] = ['障害'] * len(df)
                if 'ダ' in text:
                    df['race_type'] = ['ダート'] * len(df)
            df['date'] = [date] * len(df)

            # horse_id / jockey_id: first number in the link of each cell.
            horse_cells = soup.find_all("td", attrs={'class': 'HorseInfo'})
            jockey_cells = soup.find_all("td", attrs={'class': 'Jockey'})
            df['horse_id'] = [
                re.findall(r'\d+', td.find('a')['href'])[0] for td in horse_cells
            ]
            df['jockey_id'] = [
                re.findall(r'\d+', td.find('a')['href'])[0] for td in jockey_cells
            ]

            df.index = [race_id] * len(df)
            frames.append(df)
            time.sleep(1)

        data = pd.concat(frames) if frames else pd.DataFrame()
        return cls(data)

    def preprocessing(self):
        """Clean ``data`` and store the result in ``data_p``.

        Splits the combined sex/age and body-weight columns, drops rows
        whose body weight is '--', converts types, adds the venue code
        ``開催`` from the race_id index and keeps only the columns used
        downstream (with ``枠`` renamed to ``枠番``).
        """
        df = self.data.copy()

        # Split 性齢 into sex (first character) and age (the rest).
        sex_age = df["性齢"].map(str)
        df["性"] = sex_age.str[0]
        df["年齢"] = sex_age.str[1:].astype(int)

        # Split "weight(change)" into weight and weight change. Rows whose
        # weight is '--' are dropped; copy() so the column assignments below
        # do not write into a slice of the previous frame.
        df = df[df["馬体重(増減)"] != '--'].copy()
        weight = df["馬体重(増減)"].str.split("(", expand=True)
        df["体重"] = weight[0].astype(int)
        # Added 2020/12/13: a change such as "前計不" becomes NaN.
        df['体重変化'] = pd.to_numeric(weight[1].str[:-1], errors='coerce')

        df["date"] = pd.to_datetime(df["date"])

        df['枠'] = df['枠'].astype(int)
        df['馬番'] = df['馬番'].astype(int)
        df['斤量'] = df['斤量'].astype(int)

        # Venue code: characters 5-6 of the race_id index.
        df['開催'] = df.index.map(lambda x: str(x)[4:6])

        # Select the columns that are used.
        df = df[['枠', '馬番', '斤量', 'course_len', 'weather', 'race_type',
                 'ground_state', 'date', 'horse_id', 'jockey_id', '性', '年齢',
                 '体重', '体重変化', '開催']]

        self.data_p = df.rename(columns={'枠': '枠番'})

## HorseResultsクラス

馬の過去成績データを保持するクラスです。


In [13]:
class HorseResults:
    """Past race records of horses, indexed by horse_id.

    The constructor keeps only the columns it needs and immediately runs
    ``preprocessing``; ``merge_all`` then adds per-horse averages of those
    records to a race table.
    """

    def __init__(self, horse_results):
        """Select the used columns of ``horse_results`` and preprocess them."""
        self.horse_results = horse_results[['日付', '着順', '賞金', '着差', '通過', '開催', '距離']]
        self.preprocessing()

    @classmethod
    def read_pickle(cls, path_list):
        """Build a HorseResults object from one or more pickled DataFrames.

        The first file is loaded as-is; each further file is folded in with
        ``update_data`` (defined outside this class).
        """
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)

    @staticmethod
    def scrape(horse_id_list):
        """
        Scrape the past race records of horses from db.netkeiba.com.

        Parameters:
        ----------
        horse_id_list : list
            List of horse ids (strings).

        Returns:
        ----------
        horse_results_df : pandas.DataFrame
            Past records of all horses concatenated, indexed by horse_id.
            Empty when nothing could be scraped.
        """
        # One DataFrame per horse, keyed by horse_id.
        horse_results = {}
        for horse_id in tqdm(horse_id_list):
            try:
                url = 'https://db.netkeiba.com/horse/' + horse_id
                # Parse the page once and pick the table from the result.
                tables = pd.read_html(url)
                df = tables[3]
                # For horses with awards, the awards table sits at this
                # position and the records table is the next one.
                if df.columns[0] == '受賞歴':
                    df = tables[4]
                df.index = [horse_id] * len(df)
                horse_results[horse_id] = df
                time.sleep(1)
            # Skip pages that do not have the expected table.
            except IndexError:
                continue
            # e.g. a dropped connection: stop, but keep what was collected.
            except Exception as e:
                print(e)
                break
            # Stop button pressed in Jupyter: also keep what was collected.
            except KeyboardInterrupt:
                break

        # pd.concat raises on an empty list, so handle "nothing scraped".
        if not horse_results:
            return pd.DataFrame()

        return pd.concat(list(horse_results.values()))

    def preprocessing(self):
        """Clean ``horse_results`` in place and define ``target_list``.

        Drops rows without a numeric finishing position, parses the date,
        derives corner-position features, maps venue and race type through
        ``place_dict`` / ``race_type_dict`` and converts the distance to
        units of 100.
        """
        df = self.horse_results.copy()

        # Drop rows whose finishing position is not a plain number.
        df['着順'] = pd.to_numeric(df['着順'], errors='coerce')
        df.dropna(subset=['着順'], inplace=True)
        df['着順'] = df['着順'].astype(int)

        df["date"] = pd.to_datetime(df["日付"])
        df.drop(['日付'], axis=1, inplace=True)

        # Missing prize money counts as 0. Assign the result back: an
        # inplace fillna on a selected column may not update the frame.
        df['賞金'] = df['賞金'].fillna(0)

        # The winner's margin is negative in the data; clamp it to 0.
        df['着差'] = df['着差'].map(lambda x: 0 if x < 0 else x)

        # Running-position data.
        # n=1: position at the first corner, n=4: position at the last corner.
        def corner(x, n):
            if not isinstance(x, str):
                return x
            elif n == 4:
                return int(re.findall(r'\d+', x)[-1])
            elif n == 1:
                return int(re.findall(r'\d+', x)[0])
        df['first_corner'] = df['通過'].map(lambda x: corner(x, 1))
        df['final_corner'] = df['通過'].map(lambda x: corner(x, 4))

        df['final_to_rank'] = df['final_corner'] - df['着順']
        df['first_to_rank'] = df['first_corner'] - df['着順']
        df['first_to_final'] = df['first_corner'] - df['final_corner']

        # Venue: names missing from place_dict get the code '11'.
        df['開催'] = df['開催'].str.extract(r'(\D+)')[0].map(place_dict).fillna('11')
        # Race type, normalised to the labels used in the race-result data.
        df['race_type'] = df['距離'].str.extract(r'(\D+)')[0].map(race_type_dict)
        # Distance in units of 100 (floor division drops the remainder).
        df['course_len'] = df['距離'].str.extract(r'(\d+)').astype(int) // 100
        df.drop(['距離'], axis=1, inplace=True)
        # Name the index.
        df.index.name = 'horse_id'

        self.horse_results = df
        self.target_list = ['着順', '賞金', '着差', 'first_corner', 'final_corner',
                            'first_to_rank', 'first_to_final','final_to_rank']

    def average(self, horse_id_list, date, n_samples='all'):
        """Average ``target_list`` per horse over races before ``date``.

        The result is stored in ``self.average_dict``: key 'non_category'
        holds the plain per-horse means; keys 'course_len', 'race_type' and
        '開催' hold means grouped by horse and that column.

        Parameters:
        ----------
        horse_id_list : list-like
            horse_id values to aggregate.
        date : datetime-like
            Only races strictly before this date are used.
        n_samples : int or 'all', default 'all'
            Number of most recent races per horse to use.

        Raises:
        ----------
        ValueError
            If ``n_samples`` is a number that is not positive.
        """
        if n_samples != 'all' and not n_samples > 0:
            raise ValueError('n_samples must be >0')

        records = self.horse_results
        target_df = records[records.index.isin(horse_id_list)]

        # Choose how many past races to keep.
        filtered_df = target_df[target_df['date'] < date]
        if n_samples != 'all':
            filtered_df = filtered_df.sort_values('date', ascending=False)\
                .groupby(level=0).head(n_samples)

        # Aggregate and collect the results in a dict.
        self.average_dict = {}
        self.average_dict['non_category'] = filtered_df.groupby(level=0)[self.target_list].mean()\
            .add_suffix('_{}R'.format(n_samples))
        for column in ['course_len', 'race_type', '開催']:
            self.average_dict[column] = filtered_df.groupby(['horse_id', column])\
                [self.target_list].mean().add_suffix('_{}_{}R'.format(column, n_samples))

    def merge(self, results, date, n_samples='all'):
        """Return the rows of ``results`` raced on ``date`` with averages added.

        Calls ``average`` for the horses running on that date and left-joins
        each aggregate table onto those rows.
        """
        df = results[results['date'] == date]
        horse_id_list = df['horse_id']
        self.average(horse_id_list, date, n_samples)
        merged_df = df.merge(self.average_dict['non_category'], left_on='horse_id',
                             right_index=True, how='left')
        for column in ['course_len', 'race_type', '開催']:
            merged_df = merged_df.merge(self.average_dict[column],
                                        left_on=['horse_id', column],
                                        right_index=True, how='left')
        return merged_df

    def merge_all(self, results, n_samples='all'):
        """Apply ``merge`` for every distinct date in ``results`` and concatenate."""
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results, date, n_samples) for date in tqdm(date_list)])
        return merged_df

# Lookup from racecourse name to its two-digit venue id; the ids are simply
# the 1-based position of the name in this list.
place_dict = {
    name: '{:02d}'.format(number)
    for number, name in enumerate(
        ['札幌', '函館', '福島', '新潟', '東京',
         '中山', '中京', '京都', '阪神', '小倉'],
        start=1,
    )
}

# Lookup from the abbreviated race type to the label used in the
# race-result data.
race_type_dict = dict([
    ('芝', '芝'),
    ('ダ', 'ダート'),
    ('障', '障害'),
])

## Pedsクラス

血統データを保持するクラスです。

In [15]:
class Peds:
    """Pedigree data of horses, indexed by horse_id.

    ``peds`` holds the raw pedigree table; ``encode`` fills ``peds_e`` with
    a label-encoded, category-typed copy.
    """

    def __init__(self, peds):
        """Store the raw pedigree DataFrame; ``peds_e`` starts empty."""
        self.peds = peds
        # Filled by encode(): label-encoded and converted to category dtype.
        self.peds_e = pd.DataFrame()

    @classmethod
    def read_pickle(cls, path_list):
        """Build a Peds object from one or more pickled DataFrames.

        The first file is loaded as-is; each further file is folded in with
        ``update_data`` (defined outside this class).
        """
        df = pd.read_pickle(path_list[0])
        for path in path_list[1:]:
            df = update_data(df, pd.read_pickle(path))
        return cls(df)

    @staticmethod
    def scrape(horse_id_list):
        """
        Scrape pedigree tables from db.netkeiba.com.

        Parameters:
        ----------
        horse_id_list : list
            List of horse ids (strings).

        Returns:
        ----------
        peds_df : pandas.DataFrame
            One row per horse with columns prefixed 'peds_'.
            Empty when nothing could be scraped.
        """
        peds_dict = {}
        for horse_id in tqdm(horse_id_list):
            try:
                url = "https://db.netkeiba.com/horse/ped/" + horse_id
                df = pd.read_html(url)[0]

                # Flatten the table into one Series without repeated cells:
                # take the columns from last to first, dropping the column
                # and the then-duplicated rows after each step.
                generations = {}
                for i in reversed(range(5)):
                    generations[i] = df[i]
                    df.drop([i], axis=1, inplace=True)
                    df = df.drop_duplicates()
                ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)

                peds_dict[horse_id] = ped.reset_index(drop=True)
                time.sleep(1)
            # Skip pages that do not have the expected table.
            except IndexError:
                continue
            # e.g. a dropped connection: stop, but keep what was collected.
            except Exception as e:
                print(e)
                break
            # Stop button pressed in Jupyter: also keep what was collected.
            except KeyboardInterrupt:
                break

        # pd.concat raises on an empty list, so handle "nothing scraped".
        if not peds_dict:
            return pd.DataFrame()

        # One row per horse; columns become peds_0, peds_1, ...
        peds_df = pd.concat(list(peds_dict.values()), axis=1).T.add_prefix('peds_')

        return peds_df

    def encode(self):
        """Label-encode every column of ``peds`` and store it in ``peds_e``.

        Missing values are replaced by the string 'Na' before encoding; each
        column gets its own encoder and the result is cast to category dtype.
        """
        df = self.peds.copy()
        for column in df.columns:
            df[column] = LabelEncoder().fit_transform(df[column].fillna('Na'))
        self.peds_e = df.astype('category')

## データ加工の流れ

まずはResultsクラス内のpreprocessing関数で、着順を「3着以内なら1、それ以外なら0」のrank列に変換したり、不要な列を削除したりしています。

In [16]:
# Fix the pickle version: load the file with `pickle` (imported above as
# pickle5) and write it back in place with pandas.
# NOTE(review): pickle.load can execute code embedded in the file -- only
# use it on files from a trusted source.
with open('results.pickle', "rb") as fh:
  data = pickle.load(fh)
data.to_pickle('results.pickle')
# Read the pickle and run the preprocessing step.
r = Results.read_pickle(['results.pickle'])
r.preprocessing()
r.data_p.head() #shown as the Jupyter cell output

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,jockey_id,rank,性,年齢,体重,体重変化,開催
202001010101,6,6,54.0,16.0,18.0,曇,芝,良,2020-07-25,2018101626,1170,1,牝,2,438,4,1
202001010101,2,2,54.0,1.9,18.0,曇,芝,良,2020-07-25,2018105193,5339,1,牡,2,510,0,1
202001010101,3,3,54.0,1.8,18.0,曇,芝,良,2020-07-25,2018104800,1032,1,牡,2,482,-6,1
202001010101,1,1,52.0,22.2,18.0,曇,芝,良,2020-07-25,2018102410,1176,0,牝,2,442,0,1
202001010101,4,4,54.0,55.7,18.0,曇,芝,良,2020-07-25,2018100828,1116,0,牡,2,426,-8,1


次に、馬の直近5レース、9レース、全レースの過去成績の平均を列に加えます。DataProcessorクラスから継承したmerge_horse_results関数を使います。引数にはHorseResultsクラスのオブジェクトと何走分を考えるかのリストを入れます。

In [17]:
!unzip horse_results.zip

Archive:  horse_results.zip
  inflating: horse_results.pickle    


In [20]:
# Fix the pickle version: load the file with `pickle` (imported above as
# pickle5) and write it back in place with pandas.
# NOTE(review): pickle.load can execute code embedded in the file -- only
# use it on files from a trusted source.
with open('horse_results.pickle', "rb") as fh:
  data = pickle.load(fh)
data.to_pickle('horse_results.pickle')
# Read the pickle, then add the 5-race, 9-race and all-race averages to r.
hr = HorseResults.read_pickle(['horse_results.pickle'])
r.merge_horse_results(hr, n_samples_list=[5, 9, 'all'])
r.data_h.head() #shown as the Jupyter cell output

HBox(children=(FloatProgress(value=0.0, max=109.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=109.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=109.0), HTML(value='')))




Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,jockey_id,rank,性,年齢,体重,体重変化,着順_5R,賞金_5R,着差_5R,first_corner_5R,final_corner_5R,first_to_rank_5R,first_to_final_5R,final_to_rank_5R,着順_course_len_5R,賞金_course_len_5R,着差_course_len_5R,first_corner_course_len_5R,final_corner_course_len_5R,first_to_rank_course_len_5R,first_to_final_course_len_5R,final_to_rank_course_len_5R,着順_race_type_5R,賞金_race_type_5R,着差_race_type_5R,first_corner_race_type_5R,final_corner_race_type_5R,first_to_rank_race_type_5R,first_to_final_race_type_5R,final_to_rank_race_type_5R,...,着順_開催_9R,賞金_開催_9R,着差_開催_9R,first_corner_開催_9R,final_corner_開催_9R,first_to_rank_開催_9R,first_to_final_開催_9R,final_to_rank_開催_9R,着順_allR,賞金_allR,着差_allR,first_corner_allR,final_corner_allR,first_to_rank_allR,first_to_final_allR,final_to_rank_allR,着順_course_len_allR,賞金_course_len_allR,着差_course_len_allR,first_corner_course_len_allR,final_corner_course_len_allR,first_to_rank_course_len_allR,first_to_final_course_len_allR,final_to_rank_course_len_allR,着順_race_type_allR,賞金_race_type_allR,着差_race_type_allR,first_corner_race_type_allR,final_corner_race_type_allR,first_to_rank_race_type_allR,first_to_final_race_type_allR,final_to_rank_race_type_allR,着順_開催_allR,賞金_開催_allR,着差_開催_allR,first_corner_開催_allR,final_corner_開催_allR,first_to_rank_開催_allR,first_to_final_開催_allR,final_to_rank_開催_allR
202001010101,6,6,54.0,16.0,18.0,曇,芝,良,2020-07-25,2018101626,1170,1,牝,2,438,4,11.0,0.0,1.5,9.0,9.0,-2.0,0.0,-2.0,,,,,,,,,11.0,0.0,1.5,9.0,9.0,-2.0,0.0,-2.0,...,,,,,,,,,11.0,0.0,1.5,9.0,9.0,-2.0,0.0,-2.0,,,,,,,,,11.0,0.0,1.5,9.0,9.0,-2.0,0.0,-2.0,,,,,,,,
202001010101,2,2,54.0,1.9,18.0,曇,芝,良,2020-07-25,2018105193,5339,1,牡,2,510,0,2.0,280.0,0.0,3.0,1.0,1.0,2.0,-1.0,2.0,280.0,0.0,3.0,1.0,1.0,2.0,-1.0,2.0,280.0,0.0,3.0,1.0,1.0,2.0,-1.0,...,,,,,,,,,2.0,280.0,0.0,3.0,1.0,1.0,2.0,-1.0,2.0,280.0,0.0,3.0,1.0,1.0,2.0,-1.0,2.0,280.0,0.0,3.0,1.0,1.0,2.0,-1.0,,,,,,,,
202001010101,3,3,54.0,1.8,18.0,曇,芝,良,2020-07-25,2018104800,1032,1,牡,2,482,-6,2.0,280.0,0.4,3.0,3.0,1.0,0.0,1.0,2.0,280.0,0.4,3.0,3.0,1.0,0.0,1.0,2.0,280.0,0.4,3.0,3.0,1.0,0.0,1.0,...,,,,,,,,,2.0,280.0,0.4,3.0,3.0,1.0,0.0,1.0,2.0,280.0,0.4,3.0,3.0,1.0,0.0,1.0,2.0,280.0,0.4,3.0,3.0,1.0,0.0,1.0,,,,,,,,
202001010101,1,1,52.0,22.2,18.0,曇,芝,良,2020-07-25,2018102410,1176,0,牝,2,442,0,5.0,70.0,1.4,1.0,1.0,-4.0,0.0,-4.0,5.0,70.0,1.4,1.0,1.0,-4.0,0.0,-4.0,5.0,70.0,1.4,1.0,1.0,-4.0,0.0,-4.0,...,,,,,,,,,5.0,70.0,1.4,1.0,1.0,-4.0,0.0,-4.0,5.0,70.0,1.4,1.0,1.0,-4.0,0.0,-4.0,5.0,70.0,1.4,1.0,1.0,-4.0,0.0,-4.0,,,,,,,,
202001010101,4,4,54.0,55.7,18.0,曇,芝,良,2020-07-25,2018100828,1116,0,牡,2,426,-8,12.0,0.0,4.4,9.0,10.0,-3.0,-1.0,-2.0,12.0,0.0,4.4,9.0,10.0,-3.0,-1.0,-2.0,12.0,0.0,4.4,9.0,10.0,-3.0,-1.0,-2.0,...,,,,,,,,,12.0,0.0,4.4,9.0,10.0,-3.0,-1.0,-2.0,12.0,0.0,4.4,9.0,10.0,-3.0,-1.0,-2.0,12.0,0.0,4.4,9.0,10.0,-3.0,-1.0,-2.0,,,,,,,,


In [21]:
# NOTE(review): this cell raises NameError because `self` does not exist
# outside a method (see the error output below). HorseResults.preprocessing
# already sets target_list, with the last two entries in the opposite order;
# confirm whether this cell is still needed before keeping it.
self.target_list = ['着順', '賞金', '着差', 'first_corner', 'final_corner',
                    'first_to_rank', 'final_to_rank', 'first_to_final']

NameError: ignored

次に5世代分の血統データを追加するのですが、merge_horse_resultsと同様に、継承したDataProcessorクラスのmerge_peds関数を使います。

In [28]:
# Fix the pickle version: load the file with `pickle` (imported above as
# pickle5) and write it back in place with pandas.
# NOTE(review): pickle.load can execute code embedded in the file -- only
# use it on files from a trusted source.
with open('peds_results.pickle', "rb") as fh:
  data = pickle.load(fh)
data.to_pickle('peds_results.pickle')
# Read the pickle, label-encode the pedigree table and merge it into r.
p = Peds.read_pickle(['peds_results.pickle'])
p.encode()
print(p.peds_e)
r.merge_peds(p.peds_e)
r.data_pe.head() #shown as the Jupyter cell output

           peds_0 peds_1 peds_2 peds_3  ... peds_58 peds_59 peds_60 peds_61
2018101626    256   2162    166    373  ...      87     827    1321    3146
2018105193    341   4892    154    239  ...     247     673     931     306
2018104800    292   6072    145    399  ...     391     543     562     930
2018102410    227   7440    176    259  ...      19      91     607    2439
2018100828    449   3302    141    294  ...     154     937     204     316
...           ...    ...    ...    ...  ...     ...     ...     ...     ...
2018109169    293   2342     75    107  ...     486    1140    1056    2816
2017104830    285   3964    183    285  ...     175     668     414     663
2017101519    348   5414    141    328  ...     147     202     966    1512
2018106621    438   6351    139    378  ...     288     910     581    2443
2014102720    189   2535    145    405  ...     244     611    1093    2660

[11702 rows x 62 columns]


Unnamed: 0,枠番,馬番,斤量,単勝,course_len,weather,race_type,ground_state,date,horse_id,jockey_id,rank,性,年齢,体重,体重変化,着順_5R,賞金_5R,着差_5R,first_corner_5R,final_corner_5R,first_to_rank_5R,first_to_final_5R,final_to_rank_5R,着順_course_len_5R,賞金_course_len_5R,着差_course_len_5R,first_corner_course_len_5R,final_corner_course_len_5R,first_to_rank_course_len_5R,first_to_final_course_len_5R,final_to_rank_course_len_5R,着順_race_type_5R,賞金_race_type_5R,着差_race_type_5R,first_corner_race_type_5R,final_corner_race_type_5R,first_to_rank_race_type_5R,first_to_final_race_type_5R,final_to_rank_race_type_5R,...,peds_22,peds_23,peds_24,peds_25,peds_26,peds_27,peds_28,peds_29,peds_30,peds_31,peds_32,peds_33,peds_34,peds_35,peds_36,peds_37,peds_38,peds_39,peds_40,peds_41,peds_42,peds_43,peds_44,peds_45,peds_46,peds_47,peds_48,peds_49,peds_50,peds_51,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61
202001010101,6,6,54.0,16.0,18.0,曇,芝,良,2020-07-25,2018101626,1170,1,牝,2,438,4,11.0,0.0,1.5,9.0,9.0,-2.0,0.0,-2.0,,,,,,,,,11.0,0.0,1.5,9.0,9.0,-2.0,0.0,-2.0,...,87,236,280,567,354,976,1130,3629,8,6,80,56,66,30,149,186,85,129,182,226,135,33,266,348,48,89,66,264,144,124,57,335,64,374,73,488,87,827,1321,3146
202001010101,2,2,54.0,1.9,18.0,曇,芝,良,2020-07-25,2018105193,5339,1,牡,2,510,0,2.0,280.0,0.0,3.0,1.0,1.0,2.0,-1.0,2.0,280.0,0.0,3.0,1.0,1.0,2.0,-1.0,2.0,280.0,0.0,3.0,1.0,1.0,2.0,-1.0,...,87,66,286,335,345,708,555,1589,37,40,20,3,57,91,81,38,48,37,137,142,23,193,175,131,48,89,26,239,153,265,184,417,79,334,181,620,247,673,931,306
202001010101,3,3,54.0,1.8,18.0,曇,芝,良,2020-07-25,2018104800,1032,1,牡,2,482,-6,2.0,280.0,0.4,3.0,3.0,1.0,0.0,1.0,2.0,280.0,0.4,3.0,3.0,1.0,0.0,1.0,2.0,280.0,0.4,3.0,3.0,1.0,0.0,1.0,...,147,146,346,637,125,678,265,1550,8,2,6,50,51,88,54,120,24,21,157,123,87,124,285,376,50,119,116,113,148,327,376,164,156,192,90,675,391,543,562,930
202001010101,1,1,52.0,22.2,18.0,曇,芝,良,2020-07-25,2018102410,1176,0,牝,2,442,0,5.0,70.0,1.4,1.0,1.0,-4.0,0.0,-4.0,5.0,70.0,1.4,1.0,1.0,-4.0,0.0,-4.0,5.0,70.0,1.4,1.0,1.0,-4.0,0.0,-4.0,...,88,141,73,571,120,578,77,1252,8,6,80,56,40,68,17,68,48,105,131,44,130,115,0,252,50,126,92,160,28,273,52,652,36,107,9,51,19,91,607,2439
202001010101,4,4,54.0,55.7,18.0,曇,芝,良,2020-07-25,2018100828,1116,0,牡,2,426,-8,12.0,0.0,4.4,9.0,10.0,-3.0,-1.0,-2.0,12.0,0.0,4.4,9.0,10.0,-3.0,-1.0,-2.0,12.0,0.0,4.4,9.0,10.0,-3.0,-1.0,-2.0,...,87,66,286,335,207,435,83,973,25,21,56,60,92,74,10,162,25,123,180,228,87,68,86,236,48,89,26,239,153,265,184,417,110,232,454,420,154,937,204,316


最後にカテゴリ変数の処理として、次の2つのことをします。

1. horse_idとjockey_idをラベルエンコーディングで整数に変換したうえで、血統データと同様にPandasのcategory型に変更する
2. 天気、レース種別、馬場の状態、馬の性別をダミー変数化

レース結果データについては、Resultsクラス内でprocess_categorical関数をオーバーライドします。この時、出馬表データのhorse_idとjockey_idを処理する時のために、Resultsクラス内のle_horseとle_jockeyという変数に、LabelEncoderオブジェクトを保持しておきます。ダミー変数化のところがなぜ複雑になっているかは、次の出馬表データの加工のところで説明します。

In [29]:
r.process_categorical() #the fitted encoders are kept in r.le_horse and r.le_jockey

In [30]:
# Display the result of process_categorical for the training data.
r.data_c

Unnamed: 0,枠番,馬番,斤量,単勝,course_len,date,horse_id,jockey_id,rank,年齢,体重,体重変化,着順_5R,賞金_5R,着差_5R,first_corner_5R,final_corner_5R,first_to_rank_5R,first_to_final_5R,final_to_rank_5R,着順_course_len_5R,賞金_course_len_5R,着差_course_len_5R,first_corner_course_len_5R,final_corner_course_len_5R,first_to_rank_course_len_5R,first_to_final_course_len_5R,final_to_rank_course_len_5R,着順_race_type_5R,賞金_race_type_5R,着差_race_type_5R,first_corner_race_type_5R,final_corner_race_type_5R,first_to_rank_race_type_5R,first_to_final_race_type_5R,final_to_rank_race_type_5R,着順_開催_5R,賞金_開催_5R,着差_開催_5R,first_corner_開催_5R,...,peds_38,peds_39,peds_40,peds_41,peds_42,peds_43,peds_44,peds_45,peds_46,peds_47,peds_48,peds_49,peds_50,peds_51,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61,weather_曇,weather_晴,weather_雨,weather_小雨,weather_雪,weather_小雪,race_type_芝,race_type_ダート,race_type_障害,ground_state_良,ground_state_稍重,ground_state_重,ground_state_不良,性_牝,性_牡,性_セ
202001010101,6,6,54.0,16.0,18.0,2020-07-25,8884,120,1,2,438,4,11.0,0.00,1.50,9.0,9.0,-2.0,0.0,-2.0,,,,,,,,,11.0,0.00,1.50,9.0,9.0,-2.0,0.0,-2.0,,,,,...,85,129,182,226,135,33,266,348,48,89,66,264,144,124,57,335,64,374,73,488,87,827,1321,3146,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0
202001010101,2,2,54.0,1.9,18.0,2020-07-25,10659,143,1,2,510,0,2.0,280.00,0.00,3.0,1.0,1.0,2.0,-1.0,2.000000,280.000000,0.000000,3.000000,1.000000,1.00,2.00,-1.0,2.0,280.00,0.00,3.0,1.0,1.0,2.0,-1.0,,,,,...,48,37,137,142,23,193,175,131,48,89,26,239,153,265,184,417,79,334,181,620,247,673,931,306,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0
202001010101,3,3,54.0,1.8,18.0,2020-07-25,10393,30,1,2,482,-6,2.0,280.00,0.40,3.0,3.0,1.0,0.0,1.0,2.000000,280.000000,0.400000,3.000000,3.000000,1.00,0.00,1.0,2.0,280.00,0.40,3.0,3.0,1.0,0.0,1.0,,,,,...,24,21,157,123,87,124,285,376,50,119,116,113,148,327,376,164,156,192,90,675,391,543,562,930,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0
202001010101,1,1,52.0,22.2,18.0,2020-07-25,9269,125,0,2,442,0,5.0,70.00,1.40,1.0,1.0,-4.0,0.0,-4.0,5.000000,70.000000,1.400000,1.000000,1.000000,-4.00,0.00,-4.0,5.0,70.00,1.40,1.0,1.0,-4.0,0.0,-4.0,,,,,...,48,105,131,44,130,115,0,252,50,126,92,160,28,273,52,652,36,107,9,51,19,91,607,2439,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0
202001010101,4,4,54.0,55.7,18.0,2020-07-25,8587,73,0,2,426,-8,12.0,0.00,4.40,9.0,10.0,-3.0,-1.0,-2.0,12.000000,0.000000,4.400000,9.000000,10.000000,-3.00,-1.00,-2.0,12.0,0.00,4.40,9.0,10.0,-3.0,-1.0,-2.0,,,,,...,25,123,180,228,87,68,86,236,48,89,26,239,153,265,184,417,110,232,454,420,154,937,204,316,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202009060812,4,5,57.0,7.2,14.0,2020-12-27,1111,83,0,6,478,-2,6.8,507.24,0.48,11.4,11.4,4.6,0.0,4.6,2.000000,739.600000,0.000000,11.000000,11.000000,9.00,0.00,9.0,6.8,507.24,0.48,11.4,11.4,4.6,0.0,4.6,9.666667,246.533333,0.566667,12.333333,...,19,96,169,53,87,124,170,266,50,34,169,254,65,60,134,387,130,120,167,654,59,929,497,2095,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0
202009060812,3,4,57.0,22.6,14.0,2020-12-27,2987,154,0,4,456,2,8.2,449.32,0.78,2.6,3.0,-5.6,-0.4,-5.2,5.666667,748.866667,0.366667,1.666667,1.666667,-4.00,0.00,-4.0,8.2,449.32,0.78,2.6,3.0,-5.6,-0.4,-5.2,,,,,...,45,70,164,100,77,179,203,228,4,52,97,242,41,311,6,519,102,97,111,894,261,425,482,345,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0
202009060812,6,9,55.0,6.9,14.0,2020-12-27,3662,101,0,4,424,-6,3.4,461.52,0.30,3.2,3.2,-0.2,0.0,-0.2,2.000000,740.500000,0.150000,4.000000,3.500000,2.00,0.50,1.5,3.4,461.52,0.30,3.2,3.2,-0.2,0.0,-0.2,,,,,...,24,21,157,123,87,124,285,376,56,47,191,51,109,176,371,305,63,62,447,529,261,538,499,2972,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0
202009060812,5,8,57.0,17.2,14.0,2020-12-27,2842,22,0,4,468,-2,7.2,305.18,0.44,8.0,8.4,0.8,-0.4,1.2,6.750000,381.475000,0.400000,7.500000,7.750000,0.75,-0.25,1.0,7.2,305.18,0.44,8.0,8.4,0.8,-0.4,1.2,5.500000,762.950000,0.250000,4.000000,...,45,70,14,184,52,170,148,35,56,47,30,112,50,139,332,460,107,186,105,863,154,936,1159,3179,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0


## 出馬表の加工

次に、予測で使う出馬表データの加工です。出馬表は他のデータと違って、レース直前にその都度スクレイピングすることになります。

In [31]:
# Race ids 01-12 under the common prefix, then scrape their entry tables
# for the given race date.
race_id_list = ['2020010106{:02d}'.format(race_no) for race_no in range(1, 13)]
st = ShutubaTable.scrape(race_id_list, '2020/08/09')

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [35]:
# Preprocessing
st.preprocessing()

# Add each horse's past-results data. Debut horses get NaN.
st.merge_horse_results(hr)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




<__main__.ShutubaTable at 0x7feae5cfb0d0>

In [36]:
# Add five generations of pedigree data
st.merge_peds(p.peds_e)

# If the message 'scrape peds at horse_id_list "no_peds"' is displayed,
# run the steps below:
#peds_new = Peds.scrape(st.no_peds)
#peds.to_pickle('peds_h.pickle') # back up peds before updating it
#peds = update_data(peds, peds_new)
#peds.to_pickle('peds.pickle')
#p = Peds.read_pickle(['peds.pickle'])
#p.encode()
#st.merge_peds(p.peds_e)

In [37]:
# Run categorical processing on the entry table, passing r.le_horse,
# r.le_jockey and r.data_pe from the results object `r`
# (presumably so the entry table is encoded consistently with the
# training data -- confirm against process_categorical).
st.process_categorical(r.le_horse, r.le_jockey, r.data_pe)

In [38]:
# Display the entry-table data after process_categorical
st.data_c

Unnamed: 0,枠番,馬番,斤量,course_len,date,horse_id,jockey_id,年齢,体重,体重変化,着順_5R,賞金_5R,着差_5R,first_corner_5R,final_corner_5R,first_to_rank_5R,first_to_final_5R,final_to_rank_5R,着順_course_len_5R,賞金_course_len_5R,着差_course_len_5R,first_corner_course_len_5R,final_corner_course_len_5R,first_to_rank_course_len_5R,first_to_final_course_len_5R,final_to_rank_course_len_5R,着順_race_type_5R,賞金_race_type_5R,着差_race_type_5R,first_corner_race_type_5R,final_corner_race_type_5R,first_to_rank_race_type_5R,first_to_final_race_type_5R,final_to_rank_race_type_5R,着順_開催_5R,賞金_開催_5R,着差_開催_5R,first_corner_開催_5R,final_corner_開催_5R,first_to_rank_開催_5R,...,peds_38,peds_39,peds_40,peds_41,peds_42,peds_43,peds_44,peds_45,peds_46,peds_47,peds_48,peds_49,peds_50,peds_51,peds_52,peds_53,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61,weather_曇,weather_晴,weather_雨,weather_小雨,weather_雪,weather_小雪,race_type_芝,race_type_ダート,race_type_障害,ground_state_良,ground_state_稍重,ground_state_重,ground_state_不良,性_牝,性_牡,性_セ
202001010601,1,1,54,1200,2020-08-09,8817,62,2,464,0,10.0,0.00,3.00,8.0,9.0,-2.0,-1.0,-1.0,,,,,,,,,10.000000,0.000000,3.000000,8.000000,9.000000,-2.000000,-1.000000,-1.000000,,,,,,,...,11,87,105,166,142,142,59,256,50,48,58,172,65,60,312,569,130,120,433,176,44,788,290,278,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0
202001010601,2,2,54,1200,2020-08-09,10681,120,2,448,4,4.0,110.00,0.30,3.0,3.0,-1.0,0.0,-1.0,,,,,,,,,4.000000,110.000000,0.300000,3.000000,3.000000,-1.000000,0.000000,-1.000000,,,,,,,...,32,108,69,147,87,86,29,235,12,122,72,145,101,177,349,577,100,130,0,401,250,436,616,1931,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0
202001010601,3,3,53,1200,2020-08-09,9972,129,2,402,0,5.0,70.00,1.00,5.0,5.0,0.0,0.0,0.0,,,,,,,,,5.000000,70.000000,1.000000,5.000000,5.000000,0.000000,0.000000,0.000000,5.0,70.0,1.0,5.0,5.0,0.0,...,13,85,11,151,87,212,4,200,21,8,14,150,118,241,137,455,79,334,181,620,322,248,1209,2761,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0
202001010601,4,4,54,1200,2020-08-09,10007,50,2,408,2,5.0,65.00,1.25,4.0,4.5,-1.0,-0.5,-0.5,,,,,,,,,5.000000,65.000000,1.250000,4.000000,4.500000,-1.000000,-0.500000,-0.500000,,,,,,,...,76,89,119,181,119,12,156,57,83,81,82,132,123,54,146,665,215,386,548,969,211,1184,1278,2951,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0
202001010601,5,5,54,1200,2020-08-09,10837,43,2,418,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,45,100,76,46,47,254,210,26,21,8,14,150,118,241,137,455,112,291,211,115,233,489,602,1860,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202001010612,6,12,57,1200,2020-08-09,1936,30,5,474,2,7.2,97.20,0.76,2.2,2.4,-5.0,-0.2,-4.8,,,,,,,,,7.200000,97.200000,0.760000,2.200000,2.400000,-5.000000,-0.200000,-4.800000,,,,,,,...,48,105,131,44,51,209,102,4,21,17,198,163,115,384,406,700,168,171,148,323,183,583,261,618,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0
202001010612,7,13,57,1200,2020-08-09,804,61,6,494,6,7.2,32.00,0.98,10.4,9.4,3.2,1.0,2.2,,,,,,,,,4.000000,160.000000,0.600000,8.000000,8.000000,4.000000,0.000000,4.000000,10.0,0.0,1.0,6.0,7.0,-4.0,...,45,70,51,201,84,65,240,279,81,93,122,202,142,97,132,588,75,229,428,741,46,463,10,1009,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0
202001010612,7,14,57,1200,2020-08-09,3804,26,4,486,-4,7.8,22.00,1.98,2.8,3.8,-5.0,-1.0,-4.0,,,,,,,,,9.000000,0.000000,2.566667,2.333333,3.666667,-6.666667,-1.333333,-5.333333,11.0,0.0,3.9,1.0,5.0,-10.0,...,46,88,103,108,64,127,132,370,21,17,198,163,118,95,168,412,115,256,165,55,311,168,278,270,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0
202001010612,8,15,52,1200,2020-08-09,6471,80,3,460,4,9.2,15.40,1.56,8.8,9.2,-0.4,-0.4,0.0,,,,,,,,,9.333333,25.666667,1.366667,9.000000,9.333333,-0.333333,-0.333333,0.000000,,,,,,,...,76,89,119,181,119,12,156,57,21,8,142,33,153,262,352,252,64,374,504,905,233,209,607,2237,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0


# モデルの作成

### データの分割
時系列に沿って分割するため、日付順にソートしてから訓練データとテストデータに分割します。

In [None]:
# Split a DataFrame into training and test data in chronological order.
def split_data(df, test_size=0.3):
    """Split ``df`` into train and test parts following the ``date`` order.

    The rows are sorted by the ``date`` column and the unique index labels
    are taken in that order. The leading ``1 - test_size`` fraction of those
    labels (rounded with ``round``) selects the training rows; the remaining
    labels select the test rows. All rows sharing an index label end up on
    the same side of the split.

    Parameters:
    ----------
    df : pd.DataFrame
        Data with a ``date`` column.
    test_size : float, default 0.3
        Fraction of the unique index labels assigned to the test part.

    Returns:
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(train, test)``, each selected from ``df`` with ``.loc``.
    """
    ordered_ids = df.sort_values("date").index.unique()
    # Position in the ordered labels where the test part begins.
    cut = round(len(ordered_ids) * (1 - test_size))
    train_ids, test_ids = ordered_ids[:cut], ordered_ids[cut:]
    return df.loc[train_ids], df.loc[test_ids]

# Split the processed results data (r.data_c) chronologically into
# training and test sets, using split_data's default test_size.
train, test = split_data(r.data_c)