In [1]:
import sys
# Add ../src to the module search path. This must run before the project-local
# imports further down (presumably `functions` and `data_cleaning` live there —
# confirm). The path is relative, so it depends on the kernel's working directory.
sys.path.append('../src')

# NOTE(review): mlflow, StandardScaler, LogisticRegression, GridSearchCV and
# xgboost are not referenced in the cells visible in this part of the notebook.
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler # type: ignore
from sklearn.linear_model import LogisticRegression # type: ignore
from sklearn.model_selection import GridSearchCV # type: ignore
import pandas as pd
import xgboost as xgb

# Wildcard import: every public name exported by `functions` is pulled into the
# notebook namespace, so the origin of helper names used later is not visible here.
from functions import *

from data_cleaning import DataCleaning

# Display every column when a DataFrame is rendered (None removes the column limit).
pd.set_option('display.max_columns', None)

In [2]:
# Read the modelling dataset (path is relative to the notebook's working
# directory) and preview its first rows.
csv_path = '../data/modelling_data.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0.1,Unnamed: 0,Race Time,Course,SP Odds Decimal,"Won (1=Won, 0=Lost)","Place (1=Placed, 0=UnPlaced)",Pace Rating Rank,Trainer/Jky Stats Rank,Proform Speed Rating,LTO Speed Rating Rank,MR Career Speed Rating Rank,WON SR Before,Won P/L Before,Plc SR Before,Official Rating LTO,Position LTO,distance_bucket,evening_morning_price,breakfast_morning_price,weight,sp_odds_rank
0,0,01/04/2022 13:00:00,Leicester,4,0,0,2,3,23,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,0.833333,0.909091,131,2
1,1,01/04/2022 13:00:00,Leicester,3,0,0,2,7,24,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,0.75,0.75,131,1
2,2,01/04/2022 13:00:00,Leicester,13,0,0,2,2,42,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,1.0,1.0,131,6
3,3,01/04/2022 13:00:00,Leicester,5,1,0,2,11,46,1,1,0.0,-1.0,100.0,0.0,3.0,1000-1200,1.5,1.5,131,3
4,4,01/04/2022 13:00:00,Leicester,67,0,0,2,6,35,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,1.313725,1.313725,121,9


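Create a DataFrame `df` containing the five independent features (`sp_odds_rank`, `WON SR Before`, `Proform Speed Rating`, `Trainer/Jky Stats Rank`, `Position LTO`) and the dependent variable `Won (1=Won, 0=Lost)`. `Race Time` and `Course` are kept as well, because they are used below to group the rows.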

In [3]:
# Narrow `data` down to the grouping columns, the target and the five model
# features, keeping that column order.
grouping_cols = ['Race Time', 'Course']
target_col = 'Won (1=Won, 0=Lost)'
feature_cols = ['sp_odds_rank', 'WON SR Before', 'Proform Speed Rating',
                'Trainer/Jky Stats Rank', 'Position LTO']

df = data[grouping_cols + [target_col] + feature_cols]

In [10]:
# Group rows by each distinct ('Race Time', 'Course') combination
# (presumably one group per race — confirm).
# NOTE(review): `grouped` is not used in the cells visible here.
race_keys = ['Race Time', 'Course']
grouped = df.groupby(by=race_keys)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x17e0ecdd0>


In [5]:
# Preview the first five rows of the selected columns (head() defaults to n=5).
df.head()

Unnamed: 0,Race Time,Course,"Won (1=Won, 0=Lost)",sp_odds_rank,WON SR Before,Proform Speed Rating,Trainer/Jky Stats Rank,Position LTO
0,01/04/2022 13:00:00,Leicester,0,2,0.0,23,3,0.0
1,01/04/2022 13:00:00,Leicester,0,1,0.0,24,7,0.0
2,01/04/2022 13:00:00,Leicester,0,6,0.0,42,2,0.0
3,01/04/2022 13:00:00,Leicester,1,3,0.0,46,11,3.0
4,01/04/2022 13:00:00,Leicester,0,9,0.0,35,6,0.0


In [6]:
# Rescale the listed columns with the project helper and bind the result to
# `normalized_df`.
# NOTE(review): this runs before the train/test split further down, so any
# scaling statistics presumably include test rows — confirm inside DataCleaning.
columns_to_normalize = ['Proform Speed Rating', 'WON SR Before']
normalized_df = DataCleaning.normalize_columns(df, columns_to_normalize)

normalized_df.head()

Unnamed: 0,Race Time,Course,"Won (1=Won, 0=Lost)",sp_odds_rank,WON SR Before,Proform Speed Rating,Trainer/Jky Stats Rank,Position LTO
0,01/04/2022 13:00:00,Leicester,0,2,0.0,0.264368,3,0.0
1,01/04/2022 13:00:00,Leicester,0,1,0.0,0.275862,7,0.0
2,01/04/2022 13:00:00,Leicester,0,6,0.0,0.482759,2,0.0
3,01/04/2022 13:00:00,Leicester,1,3,0.0,0.528736,11,3.0
4,01/04/2022 13:00:00,Leicester,0,9,0.0,0.402299,6,0.0


In [16]:
# Split the normalised frame with the project helper, then carve out the model
# inputs (X) and the win/lose target (y) for each partition. The feature list
# is defined once so train and test always use the same columns.
model_features = ['sp_odds_rank', 'WON SR Before', 'Proform Speed Rating',
                  'Trainer/Jky Stats Rank', 'Position LTO']
model_target = 'Won (1=Won, 0=Lost)'

train_data, test_data = DataCleaning.split_data(df=normalized_df)

X_train, y_train = train_data[model_features], train_data[model_target]
X_test, y_test = test_data[model_features], test_data[model_target]


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [17]:
# Instantiate and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94     11524
           1       0.34      0.21      0.26      1096

    accuracy                           0.90     12620
   macro avg       0.64      0.59      0.60     12620
weighted avg       0.88      0.90      0.89     12620

