In [1]:
import sys
sys.path.append('../src')

import mlflow
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression # type: ignore
from sklearn.model_selection import GridSearchCV # type: ignore
import pandas as pd
import xgboost as xgb
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler

from functions import *

from data_cleaning import DataCleaning

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [2]:
# Load the modelling dataset from the project's data directory.
# The preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns in the file;
# they are not selected by the feature list used later in this notebook.
modelling_data_path = '../data/modelling_data.csv'
data = pd.read_csv(modelling_data_path)
data.head(5)

Unnamed: 0.1,Unnamed: 0,Race Time,Course,SP Odds Decimal,"Won (1=Won, 0=Lost)","Place (1=Placed, 0=UnPlaced)",Pace Rating Rank,Trainer/Jky Stats Rank,Proform Speed Rating,LTO Speed Rating Rank,MR Career Speed Rating Rank,WON SR Before,Won P/L Before,Plc SR Before,Official Rating LTO,Position LTO,distance_bucket,evening_morning_price,breakfast_morning_price,weight,sp_odds_rank
0,0,01/04/2022 13:00:00,Leicester,4,0,0,2,3,23,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,0.833333,0.909091,131,2
1,1,01/04/2022 13:00:00,Leicester,3,0,0,2,7,24,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,0.75,0.75,131,1
2,2,01/04/2022 13:00:00,Leicester,13,0,0,2,2,42,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,1.0,1.0,131,6
3,3,01/04/2022 13:00:00,Leicester,5,1,0,2,11,46,1,1,0.0,-1.0,100.0,0.0,3.0,1000-1200,1.5,1.5,131,3
4,4,01/04/2022 13:00:00,Leicester,67,0,0,2,6,35,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,1.313725,1.313725,121,9


Create a DataFrame with five independent features and 'Won (1=Won, 0=Lost)' as the dependent variable, keeping 'Race Time', 'Course' and 'distance_bucket' as context columns.

In [3]:
# Columns kept for modelling, grouped by role. The concatenation below
# preserves the original column order: context, target, then features.
context_columns = ['Race Time', 'Course', 'distance_bucket']
target_column = 'Won (1=Won, 0=Lost)'
feature_columns = ['SP Odds Decimal', 'weight', 'Proform Speed Rating',
                   'Won P/L Before', 'evening_morning_price']

df = data[context_columns + [target_column] + feature_columns]

In [4]:
# Preview the first five rows of the selected columns (head() defaults to n=5).
df.head()

Unnamed: 0,Race Time,Course,distance_bucket,"Won (1=Won, 0=Lost)",SP Odds Decimal,weight,Proform Speed Rating,Won P/L Before,evening_morning_price
0,01/04/2022 13:00:00,Leicester,1000-1200,0,4,131,23,0.0,0.833333
1,01/04/2022 13:00:00,Leicester,1000-1200,0,3,131,24,0.0,0.75
2,01/04/2022 13:00:00,Leicester,1000-1200,0,13,131,42,0.0,1.0
3,01/04/2022 13:00:00,Leicester,1000-1200,1,5,131,46,-1.0,1.5
4,01/04/2022 13:00:00,Leicester,1000-1200,0,67,121,35,0.0,1.313725


In [5]:
# Numeric feature columns handed to the project's normalisation helper.
columns_to_normalize = [
    'SP Odds Decimal',
    'weight',
    'Proform Speed Rating',
    'Won P/L Before',
    'evening_morning_price',
]

# NOTE(review): this runs on the full frame before the train/test split in the
# following cell. If normalize_columns derives its scaling statistics from the
# data it is given, test rows influence them -- confirm against DataCleaning.
normalized_df = DataCleaning.normalize_columns(df, columns_to_normalize)

normalized_df.head()

Unnamed: 0,Race Time,Course,distance_bucket,"Won (1=Won, 0=Lost)",SP Odds Decimal,weight,Proform Speed Rating,Won P/L Before,evening_morning_price
0,01/04/2022 13:00:00,Leicester,1000-1200,0,0.006,0.647059,0.264368,0.129663,0.072046
1,01/04/2022 13:00:00,Leicester,1000-1200,0,0.004,0.647059,0.275862,0.129663,0.063279
2,01/04/2022 13:00:00,Leicester,1000-1200,0,0.024,0.647059,0.482759,0.129663,0.08958
3,01/04/2022 13:00:00,Leicester,1000-1200,1,0.008,0.647059,0.528736,0.12611,0.142182
4,01/04/2022 13:00:00,Leicester,1000-1200,0,0.132,0.45098,0.402299,0.129663,0.122585


In [6]:
# DataCleaning.split_data returns two frames, bound here as the train and test sets.
train_data, test_data = DataCleaning.split_data(df=normalized_df)

# Shared column definitions so the train and test selections cannot drift apart.
feature_cols = ['SP Odds Decimal', 'weight', 'Proform Speed Rating', 'Won P/L Before', 'evening_morning_price']
target_col = 'Won (1=Won, 0=Lost)'

# Feature matrix and target vector for each split.
X_train, y_train = train_data[feature_cols], train_data[target_col]
X_test, y_test = test_data[feature_cols], test_data[target_col]


Run the data through a random forest classifier.

In [7]:
# Baseline model: fit a Random Forest on the training features and score it
# on the held-out test split.
params = {
    'n_estimators': 100,
    'random_state': 42,  # fixed seed so the fitted forest is reproducible
}

rf_classifier = RandomForestClassifier(**params)
rf_classifier.fit(X_train, y_train)

# Hard class predictions for the test set
y_pred = rf_classifier.predict(X_test)

# Attach the predictions with assign(), which returns a new frame, instead of
# writing a column into the object returned by split_data. This avoids
# chained-assignment warnings if that object is a slice of normalized_df, and
# re-running the cell simply overwrites the column. X_test is a separate
# column selection, so it still holds only the feature columns.
test_data = test_data.assign(model_preds=y_pred)

# Evaluate the model.
# NOTE(review): print_metrics only receives the hard labels, so the ROC AUC it
# reports is presumably computed from 0/1 predictions rather than predicted
# probabilities -- confirm in functions.py.
print_metrics(y_test, y_pred)


Confusion Matrix: 
[[11257   267]
 [  905   191]]
Accuracy: 0.9071315372424723
Precision: 0.4170305676855895
Recall: 0.17427007299270073
F1 score: 0.24581724581724582
ROC AUC score: 0.575550517232206


In [9]:
# Write the test frame to a CSV in the notebook's working directory.
# to_csv writes the index by default, so it becomes an unnamed first column.
test_predictions_path = 'test_data.csv'
test_data.to_csv(test_predictions_path)

In [8]:
# Gradient Boosting baseline trained on the original (non-resampled) training
# data, with hyperparameters in a dict to mirror the Random Forest cell.
gb_params = {
    'n_estimators': 100,
    'random_state': 42,
}

gb_classifier = GradientBoostingClassifier(**gb_params)
gb_classifier.fit(X_train, y_train)

# Predict on the held-out features and report the evaluation metrics
y_pred_gb = gb_classifier.predict(X_test)
print_metrics(y_test, y_pred_gb)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- model_preds


Apply random oversampling to the training set to compensate for the imbalanced target column.

In [None]:
# Resample the training split only; the test split is left untouched.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Report the training-set sizes before and after resampling.
size_before = f"Length of  X_train: {len(X_train)}\nLength of y_train: {len(y_train)}"
size_after = f"\nLength of  X_resampled: {len(X_resampled)}\nLength of y_resampled: {len(y_resampled)}"
print(size_before)
print(size_after)

Length of  X_train: 38910
Length of y_train: 38910

Length of  X_resampled: 68504
Length of y_resampled: 68504


In [None]:
# Gradient Boosting again, this time fitted on the oversampled training data.
# fit() returns the estimator itself, so construction and training are chained.
gb_classifier = GradientBoostingClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_resampled, y_resampled)

# Score on the original test split
y_pred_gb = gb_classifier.predict(X_test)
print_metrics(y_test, y_pred_gb)


Confusion Matrix: 
[[9124 2400]
 [ 273  823]]
Accuracy: 0.7881933438985737
Precision: 0.25535215637604713
Recall: 0.7509124087591241
F1 score: 0.3811067376707571
ROC AUC score: 0.7713256941400619
