# French Trot Racing

In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras import layers
import warnings

warnings.filterwarnings("ignore")

In [2]:
a = pd.read_parquet("./trots_2013-2022.parquet")
df = a
print(df.shape)
df.head()

(1200412, 43)


Unnamed: 0,AgeRestriction,Barrier,BeatenMargin,ClassRestriction,CourseIndicator,DamID,Disqualified,Distance,FinishPosition,FoalingCountry,...,StartType,StartingLine,Surface,TrackID,TrainerID,NoFrontCover,PositionInRunning,WideOffRail,WeightCarried,WetnessScale
0,6yo,5,1.55,NW$101 CD,,1491946,False,2150.0,2,FR,...,M,1,S,951,38190,-9,-9,-9,0.0,3
1,6yo,6,3.55,NW$101 CD,,1509392,False,2150.0,4,FR,...,M,1,S,951,38432,-9,-9,-9,0.0,3
2,6yo,7,5.55,NW$101 CD,,1507967,False,2150.0,6,FR,...,M,1,S,951,37826,-9,-9,-9,0.0,3
3,6yo,8,999.0,NW$101 CD,,1508536,False,2150.0,BS,FR,...,M,1,S,951,38293,-9,-9,-9,0.0,3
4,6yo,9,999.0,NW$101 CD,,1514055,False,2150.0,BS,FR,...,M,2,S,951,38945,-9,-9,-9,0.0,3


# Feature Engineering

In [3]:
df.drop(columns = ['AgeRestriction', 'Barrier', 'CourseIndicator', 'FoalingCountry', 
                    'Gender', 'GoingAbbrev', 'GoingID', 'HandicapType', 'PIRPosition', 'NoFrontCover',
                    'RaceGroup', 'RacePrizemoney', 'Saddlecloth', 'SexRestriction', 'StartingLine', 'WideOffRail'], inplace=True)
df.head()

Unnamed: 0,BeatenMargin,ClassRestriction,DamID,Disqualified,Distance,FinishPosition,FoalingDate,FrontShoes,HandicapDistance,HindShoes,...,RaceStartTime,RacingSubType,SireID,StartType,Surface,TrackID,TrainerID,PositionInRunning,WeightCarried,WetnessScale
0,1.55,NW$101 CD,1491946,False,2150.0,2,2009-04-13,0,0.0,0,...,2015-04-18 14:01:00,T,1474781,M,S,951,38190,-9,0.0,3
1,3.55,NW$101 CD,1509392,False,2150.0,4,2009-05-24,0,0.0,0,...,2015-04-18 14:01:00,T,1474692,M,S,951,38432,-9,0.0,3
2,5.55,NW$101 CD,1507967,False,2150.0,6,2009-04-07,0,0.0,0,...,2015-04-18 14:01:00,T,1507583,M,S,951,37826,-9,0.0,3
3,999.0,NW$101 CD,1508536,False,2150.0,BS,2009-05-07,0,0.0,0,...,2015-04-18 14:01:00,T,1499267,M,S,951,38293,-9,0.0,3
4,999.0,NW$101 CD,1514055,False,2150.0,BS,2009-04-26,0,0.0,0,...,2015-04-18 14:01:00,T,1475423,M,S,951,38945,-9,0.0,3


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200412 entries, 0 to 1200411
Data columns (total 27 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   BeatenMargin       1200412 non-null  float64       
 1   ClassRestriction   1200412 non-null  object        
 2   DamID              1200412 non-null  int64         
 3   Disqualified       1200412 non-null  bool          
 4   Distance           1200412 non-null  float64       
 5   FinishPosition     1200412 non-null  object        
 6   FoalingDate        1200412 non-null  datetime64[ns]
 7   FrontShoes         1200412 non-null  int64         
 8   HandicapDistance   1200412 non-null  float64       
 9   HindShoes          1200412 non-null  int64         
 10  HorseAge           1200412 non-null  int64         
 11  HorseID            1200412 non-null  int64         
 12  JockeyID           1200412 non-null  int64         
 13  PriceSP            1200412 

In [5]:
df.loc[df['BeatenMargin'] == 999, 'BeatenMargin'] = df.loc[df['BeatenMargin'] == 999, 'RaceOverallTime']

In [6]:
#df = df.sort_values(by=['HorseID', 'RaceStartTime'])
#df['RaceStartTime'] = pd.to_datetime(df['RaceStartTime'])
#df['days_since_last_race'] = df.groupby('HorseID')['RaceStartTime'].diff().dt.days.fillna(0)
#df[['HorseID','RaceStartTime','days_since_last_race']]

In [7]:
df = df.sort_values(by=['HorseID', 'RaceStartTime'])
df['cum_prize_money'] = df.groupby('HorseID')['Prizemoney'].cumsum()

In [8]:
df['normalized_beaten_margin'] = df['BeatenMargin'] / df['Distance']
df['avg_normalized_beaten_margin'] = df.groupby('HorseID')['normalized_beaten_margin'].transform('mean')

In [9]:
df = df.sort_values(by=['TrainerID', 'RaceStartTime'])
df['trainer_cum_prize_money'] = df.groupby('TrainerID')['Prizemoney'].cumsum()

In [10]:
df['trainer_beaten_margin'] = df['BeatenMargin'] / df['Distance']
df['avg_trainer_beaten_margin'] = df.groupby('TrainerID')['trainer_beaten_margin'].transform('mean')

In [11]:
df = df.sort_values(by=['SireID', 'RaceStartTime'])
df['sire_cum_prize_money'] = df.groupby('SireID')['Prizemoney'].cumsum()

In [12]:
df['sire_beaten_margin'] = df['BeatenMargin'] / df['Distance']
df['avg_sire_beaten_margin'] = df.groupby('SireID')['sire_beaten_margin'].transform('mean')

In [13]:
df = df.sort_values(by=['JockeyID', 'RaceStartTime'])
df['jockey_cum_prize_money'] = df.groupby('JockeyID')['Prizemoney'].cumsum()

In [14]:
df['jockey_beaten_margin'] = df['BeatenMargin'] / df['Distance']
df['avg_jockey_beaten_margin'] = df.groupby('JockeyID')['sire_beaten_margin'].transform('mean')

In [15]:
def map_to_win(value):
    if value.strip() == '1':
        return 1
    else:
        return 0

df['win'] = df['FinishPosition'].apply(map_to_win)

In [16]:
df['HandicapDistance'] = np.sqrt(df.HandicapDistance**2)

# Preprocessing & Model Fitting

In [17]:
df['FrontShoes'] = df['FrontShoes'].astype(object)
df['HindShoes'] = df['HindShoes'].astype(object)
train_df = df[df['RaceStartTime'] < pd.Timestamp(2021, 11, 1)]
test_df = df[df['RaceStartTime'] >= pd.Timestamp(2021, 11, 1)]
train_df.drop(columns=['BeatenMargin', 'DamID', 'Disqualified', 'Distance', 'FinishPosition', 'FoalingDate',
                'JockeyID', 'Prizemoney','RaceOverallTime', 'RaceStartTime',
                'SireID', 'TrackID', 'TrainerID', 'PositionInRunning'],inplace=True)
test_df.drop(columns=['BeatenMargin', 'DamID', 'Disqualified', 'Distance', 'FinishPosition', 'FoalingDate',
                'JockeyID', 'Prizemoney','RaceOverallTime', 'RaceStartTime',
                'SireID', 'TrackID', 'TrainerID', 'PositionInRunning'],inplace=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1172292 entries, 91893 to 680819
Data columns (total 26 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   ClassRestriction              1172292 non-null  object 
 1   FrontShoes                    1172292 non-null  object 
 2   HandicapDistance              1172292 non-null  float64
 3   HindShoes                     1172292 non-null  object 
 4   HorseAge                      1172292 non-null  int64  
 5   HorseID                       1172292 non-null  int64  
 6   PriceSP                       1172292 non-null  float64
 7   RaceID                        1172292 non-null  int64  
 8   RacingSubType                 1172292 non-null  object 
 9   StartType                     1172292 non-null  object 
 10  Surface                       1172292 non-null  object 
 11  WeightCarried                 1172292 non-null  float64
 12  WetnessScale             

In [18]:
X_train = train_df.drop(columns=['win'])
y_train = train_df['win'].values

X_test = test_df.drop(columns=['win'])
y_test = test_df['win'].values

categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
passthrough_cols = ['RaceID','HorseID']

categorical_cols = [col for col in categorical_cols if col not in passthrough_cols]
numerical = [col for col in numerical_cols if col not in passthrough_cols]


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
 #       ('pass','passthrough',passthrough_cols)
    ])
preprocessor

In [19]:
def create_model():
    model = tf.keras.Sequential([
        layers.Dense(10, activation='relu', input_shape=[X_train_processed.shape[1]], kernel_initializer = 'he_normal'),
        layers.Dropout(0.5),
        layers.Dense(5, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
    
    return model

In [20]:
preprocessor.fit(X_train)
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [21]:
model = create_model()

In [22]:
history = model.fit(X_train_processed, y_train, epochs=2, validation_data=(X_test_processed, y_test))

Epoch 1/2
Epoch 2/2


# Evaluating & Predictions

In [23]:
loss, accuracy, auc = model.evaluate(X_test_processed, y_test)
print(f'Test accuracy: {accuracy}, AUC: {auc}')

Test accuracy: 0.9240753650665283, AUC: 0.9744850397109985


In [24]:
raw_predictions = model.predict(X_test_processed)

predictions_df = pd.DataFrame(raw_predictions, columns=['raw_score'])
predictions_df['RaceID'] = X_test['RaceID'].values
predictions_df['HorseID'] = X_test['HorseID'].values

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

probabilities_df = predictions_df.groupby('RaceID').raw_score.apply(softmax).reset_index(name='winprobability')
predictions_df['winprobability'] = probabilities_df['winprobability'].values
#predictions_df[predictions_df['RaceID']==1681810]



In [26]:
predictions_df.to_parquet('probabilities.parquet')