In [42]:
import sys
sys.path.append('../src')

import mlflow
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression # type: ignore
from sklearn.model_selection import GridSearchCV # type: ignore

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler, SMOTE
import pickle

from functions import *

from data_cleaning import DataCleaning

import os

pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv('../data/modelling_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Race Time,Course,Horse,Distance (y),SP Odds Decimal,"Won (1=Won, 0=Lost)","Place (1=Placed, 0=UnPlaced)",Pace Rating Rank,Trainer/Jky Stats Rank,Proform Speed Rating,LTO Speed Rating Rank,MR Career Speed Rating Rank,WON SR Before,Won P/L Before,Plc SR Before,Official Rating LTO,Position LTO,distance_bucket,evening_morning_price,breakfast_morning_price,weight,sp_odds_rank
0,0,01/04/2022 13:00:00,Leicester,Camacho Star,1100,4,0,0,2,3,23,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,0.833333,0.909091,131,2
1,1,01/04/2022 13:00:00,Leicester,Cheeky Maxi,1100,3,0,0,2,7,24,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,0.75,0.75,131,1
2,2,01/04/2022 13:00:00,Leicester,Global Effort,1100,13,0,0,2,2,42,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,1.0,1.0,131,6
3,3,01/04/2022 13:00:00,Leicester,Jiffy Boy,1100,5,1,0,2,11,46,1,1,0.0,-1.0,100.0,0.0,3.0,1000-1200,1.5,1.5,131,3
4,4,01/04/2022 13:00:00,Leicester,Man Made Of Smoke,1100,67,0,0,2,6,35,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,1.313725,1.313725,121,9


In [3]:
data['distance_bucket'].unique()

array(['1000-1200', '1200-1400', '2400-2600', '2200-2400', '1400-1600',
       '1600-1800', '2000-2200', '1800-2000', '2600-2800', '3000-3200',
       '2800-3000', '3400-3600'], dtype=object)

Create a df with 5 independent features and 'Won (1=Won, 0=Lost)' as the dependent variable

In [4]:
df = data[['Race Time', 'Course', 'Horse', 'Distance (y)','distance_bucket', 'Won (1=Won, 0=Lost)', 'SP Odds Decimal', 'weight', 
           'Proform Speed Rating', 'Won P/L Before', 'evening_morning_price']]

In [5]:
df.head(5)

Unnamed: 0,Race Time,Course,Horse,Distance (y),distance_bucket,"Won (1=Won, 0=Lost)",SP Odds Decimal,weight,Proform Speed Rating,Won P/L Before,evening_morning_price
0,01/04/2022 13:00:00,Leicester,Camacho Star,1100,1000-1200,0,4,131,23,0.0,0.833333
1,01/04/2022 13:00:00,Leicester,Cheeky Maxi,1100,1000-1200,0,3,131,24,0.0,0.75
2,01/04/2022 13:00:00,Leicester,Global Effort,1100,1000-1200,0,13,131,42,0.0,1.0
3,01/04/2022 13:00:00,Leicester,Jiffy Boy,1100,1000-1200,1,5,131,46,-1.0,1.5
4,01/04/2022 13:00:00,Leicester,Man Made Of Smoke,1100,1000-1200,0,67,121,35,0.0,1.313725


In [6]:
df['SP Odds Decimal1'] = df['SP Odds Decimal']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SP Odds Decimal1'] = df['SP Odds Decimal']


In [7]:
normalized_df = DataCleaning.normalize_columns(df, ['SP Odds Decimal', 'weight', 
           'Proform Speed Rating', 'Won P/L Before', 'evening_morning_price'])

normalized_df.head(5)

Unnamed: 0,Race Time,Course,Horse,Distance (y),distance_bucket,"Won (1=Won, 0=Lost)",SP Odds Decimal,weight,Proform Speed Rating,Won P/L Before,evening_morning_price,SP Odds Decimal1
0,01/04/2022 13:00:00,Leicester,Camacho Star,1100,1000-1200,0,0.006,0.647059,0.264368,0.129663,0.072046,4
1,01/04/2022 13:00:00,Leicester,Cheeky Maxi,1100,1000-1200,0,0.004,0.647059,0.275862,0.129663,0.063279,3
2,01/04/2022 13:00:00,Leicester,Global Effort,1100,1000-1200,0,0.024,0.647059,0.482759,0.129663,0.08958,13
3,01/04/2022 13:00:00,Leicester,Jiffy Boy,1100,1000-1200,1,0.008,0.647059,0.528736,0.12611,0.142182,5
4,01/04/2022 13:00:00,Leicester,Man Made Of Smoke,1100,1000-1200,0,0.132,0.45098,0.402299,0.129663,0.122585,67


In [8]:
train_data, test_data = DataCleaning.split_data(df = normalized_df)

X_train = train_data[['SP Odds Decimal', 'weight', 'Proform Speed Rating', 'Won P/L Before', 'evening_morning_price']]
y_train = train_data['Won (1=Won, 0=Lost)']

X_test = test_data[['SP Odds Decimal', 'weight', 'Proform Speed Rating', 'Won P/L Before', 'evening_morning_price']]
y_test = test_data['Won (1=Won, 0=Lost)']


In [58]:
# count of unique races
unique_race_count = test_data['Race Time'].nunique()

print(f'Count of unique races in the test data is {unique_race_count}')

Count of unique races in the test data is 1149


# Use grid search to idenify the best hyperparameters for the model

In [63]:
test_data_ = test_data.copy()

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

y_pred1 = best_model.predict(X_test)
test_data_['model_preds'] = y_pred
print_metrics(y_test, y_pred1)

"\nparam_grid = {\n    'n_estimators': [100, 200],\n    'max_depth': [None, 10, 20],\n    'min_samples_split': [2, 5],\n    'min_samples_leaf': [1, 2]\n}\n\ngrid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='f1')\ngrid_search.fit(X_train, y_train)\nbest_model = grid_search.best_estimator_\n\n"

# Run the data through a random forest classifier.

In [9]:
test_data1 = test_data.copy()

# Instantiate and train the Random Forest classifier
params = {
    'n_estimators' : 100,
    'random_state' : 42
}
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    rf_classifier = RandomForestClassifier(**params)
    rf_classifier.fit(X_train, y_train)

    # Log the trained model
    mlflow.sklearn.log_model(rf_classifier, "random_forest_model")

    # Make predictions on the test set
    y_pred = rf_classifier.predict(X_test)

    test_data1['model_preds'] = y_pred

    # Evaluate the model
    # use the eval method
    print_metrics(y_test, y_pred)
    mlflow.end_run()

with open('base_model.pkl', 'wb') as f:
    pickle.dump(rf_classifier, f)


Confusion Matrix: 
[[8714  302]
 [ 916  235]]
Accuracy: 0.880200649159044
Precision: 0.4376163873370577
Recall: 0.20417028670721113
F1 score: 0.278436018957346
ROC AUC score: 0.5853371398043598




In [10]:
profit_calculation(test_data1)

Total return from betting £1 on each prediction where model_preds == 1: £166.00


In [17]:
test_data2 = test_data.copy()


# Example class weights for an imbalanced dataset
class_weights = {0: 1, 1: 5}

# Initialize and fit the RandomForest model
model = RandomForestClassifier(class_weight=class_weights, max_depth=10, min_samples_leaf=5)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred1 = model.predict(X_test)

test_data2['model_preds'] = y_pred

print_metrics(y_test, y_pred)



Confusion Matrix: 
[[8707  309]
 [ 919  232]]
Accuracy: 0.8792170748500049
Precision: 0.4288354898336414
Recall: 0.20156385751520417
F1 score: 0.27423167848699764
ROC AUC score: 0.5836457264505923


In [18]:
profit_calculation(test_data2)

Total return from betting £1 on each prediction where model_preds == 1: £114.00


In [71]:
test_data1.to_csv('test_data1.csv')

# Instantiate and train the Gradient Boosting classifier

In [59]:
test_data_gbm = test_data.copy()

# Instantiate the Gradient Boosting Classifier
gbm = GradientBoostingClassifier()

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate the GridSearchCV object
grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=2)

# Fit Grid Search
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred_gbm = best_model.predict(X_test)

test_data_gbm['model_preds'] = y_pred_gbm

print_metrics(y_test, y_pred_gbm)


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.2s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.2s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.2s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.2s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.2s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; 

In [60]:
profit_calculation(test_data_gbm)

Total return from betting £1 on each prediction where model_preds == 1: £230.00


In [23]:
# Instantiate and train the Gradient Boosting classifier

test_data3 = test_data.copy()

gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_gb = gb_classifier.predict(X_test)

test_data3['model_preds'] = y_pred_gb

print_metrics(y_test, y_pred_gb)


Confusion Matrix: 
[[8873  143]
 [ 965  186]]
Accuracy: 0.8910199665584735
Precision: 0.5653495440729484
Recall: 0.16159860990443092
F1 score: 0.2513513513513514
ROC AUC score: 0.5728689589007514


In [24]:
profit_calculation(test_data3)

Total return from betting £1 on each prediction where model_preds == 1: £78.00


In [26]:
test_data4 = test_data.copy()

model = XGBClassifier(reg_lambda=1, reg_alpha=0)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = model.predict(X_test)

test_data4['model_preds'] = y_pred_xgb

print_metrics(y_test, y_pred_gb)


Confusion Matrix: 
[[8873  143]
 [ 965  186]]
Accuracy: 0.8910199665584735
Precision: 0.5653495440729484
Recall: 0.16159860990443092
F1 score: 0.2513513513513514
ROC AUC score: 0.5728689589007514


In [27]:
profit_calculation(test_data4)

Total return from betting £1 on each prediction where model_preds == 1: £178.00


# Look at utilizing Random Oversampling on the dataset. This is due to the imbalanced nature of the target column.

In [29]:
# Apply Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

print(f"Length of  X_train: {len(X_train)}\nLength of y_train: {len(y_train)}")
print(f"\nLength of  X_resampled: {len(X_resampled)}\nLength of y_resampled: {len(y_resampled)}")

Length of  X_train: 41363
Length of y_train: 41363

Length of  X_resampled: 73520
Length of y_resampled: 73520


In [32]:
# Instantiate and train the Gradient Boosting classifier
test_data5 = test_data.copy()

gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_classifier.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred_gbx = gb_classifier.predict(X_test)

test_data5['model_preds'] = y_pred_gbx

print_metrics(y_test, y_pred_gbx)


Confusion Matrix: 
[[6643 2373]
 [ 214  937]]
Accuracy: 0.7455493262515983
Precision: 0.28308157099697884
Recall: 0.8140747176368376
F1 score: 0.4200851826944631
ROC AUC score: 0.7754379799364313


In [33]:
profit_calculation(test_data5)

Total return from betting £1 on each prediction where model_preds == 1: £1359.00


# Try an approach using smote

In [35]:
test_data6 = test_data.copy()

smote= SMOTE(random_state=42)

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_classifier.fit(X_train_smote, y_train_smote)

# Make predictions on the test set
y_pred_gbs = gb_classifier.predict(X_test)

test_data6['model_preds'] = y_pred_gbs

# Evaluate the model
# use the eval method
print_metrics(y_test, y_pred)


Confusion Matrix: 
[[8707  309]
 [ 919  232]]
Accuracy: 0.8792170748500049
Precision: 0.4288354898336414
Recall: 0.20156385751520417
F1 score: 0.27423167848699764
ROC AUC score: 0.5836457264505923


In [36]:
profit_calculation(test_data6)

Total return from betting £1 on each prediction where model_preds == 1: £656.00


In [37]:
test_data7 = test_data.copy()

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

rf_classifier = RandomForestClassifier(**params)
rf_classifier.fit(X_train_smote, y_train_smote)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

test_data7['model_preds'] = y_pred

# Evaluate the model
# use the eval method
print_metrics(y_test, y_pred)


Confusion Matrix: 
[[8497  519]
 [ 830  321]]
Accuracy: 0.8673158257106325
Precision: 0.3821428571428571
Recall: 0.2788879235447437
F1 score: 0.32245102963335004
ROC AUC score: 0.6106617967324428


In [38]:
profit_calculation(test_data7)

Total return from betting £1 on each prediction where model_preds == 1: £252.00


In [39]:
test_data8 = test_data.copy()

# Assume X_train, X_test, y_train, y_test are already defined
# Setting class_weight to 'balanced' makes the algorithm cost-sensitive
model = RandomForestClassifier(class_weight='balanced')

# Training the model
model.fit(X_train, y_train)

# Predictions
y_predx = model.predict(X_test)

test_data8['model_preds'] = y_predx

# Evaluation
print_metrics(y_test, y_predx)


Confusion Matrix: 
[[8694  322]
 [ 929  222]]
Accuracy: 0.8769548539392151
Precision: 0.40808823529411764
Recall: 0.19287576020851432
F1 score: 0.2619469026548672
ROC AUC score: 0.5785807372471143


In [40]:
profit_calculation(test_data8)

Total return from betting £1 on each prediction where model_preds == 1: £83.00


In [45]:
test_data9 = test_data.copy()

# Define cost matrix
C_FN = 5  # Cost of false negative
C_FP = 1  # Cost of false positive

# Assuming y_train has values 0 and 1
weights = np.ones(y_train.shape[0])
weights[y_train == 1] = C_FN
weights[y_train == 0] = C_FP

# Train logistic regression model with custom weights
model = LogisticRegression()
model.fit(X_train, y_train, sample_weight=weights)

# Predictions
y_predz = model.predict(X_test)

test_data9['model_preds'] = y_predz

# Evaluation
print_metrics(y_test, y_predz)



Confusion Matrix: 
[[7353 1663]
 [ 393  758]]
Accuracy: 0.7977771220615717
Precision: 0.3130937629078893
Recall: 0.6585577758470895
F1 score: 0.4244120940649496
ROC AUC score: 0.7370539544719032


In [46]:
profit_calculation(test_data9)

Total return from betting £1 on each prediction where model_preds == 1: £1442.00


# Try a tensorflow model

In [48]:
test_data10 = test_data.copy()

model = Sequential([
    Dense(64, activation='relu', input_shape = (X_train.shape[1], )), # input layer
    Dense(32, activation='relu'), # hidden layer
    Dense(1, activation='sigmoid') # output layer for binary classification
])

# compile the model
model.compile(optimizer='adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Get model predictions as probabilities
predictions = model.predict(X_test)

# Convert probabilities to binary outcomes (0 or 1)
predicted_classes = (predictions > 0.5).astype("int32").flatten()

test_data10['model_preds'] = predicted_classes
test_data10['probability'] = predictions

# Evaluation
print_metrics(y_test, predicted_classes)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1293/1293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 669us/step - accuracy: 0.8671 - loss: 0.3670 - val_accuracy: 0.8875 - val_loss: 0.2790
Epoch 2/10
[1m1293/1293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 438us/step - accuracy: 0.8905 - loss: 0.2731 - val_accuracy: 0.8876 - val_loss: 0.2761
Epoch 3/10
[1m1293/1293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 445us/step - accuracy: 0.8888 - loss: 0.2764 - val_accuracy: 0.8877 - val_loss: 0.2737
Epoch 4/10
[1m1293/1293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 439us/step - accuracy: 0.8909 - loss: 0.2734 - val_accuracy: 0.8869 - val_loss: 0.2731
Epoch 5/10
[1m1293/1293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 440us/step - accuracy: 0.8894 - loss: 0.2679 - val_accuracy: 0.8886 - val_loss: 0.2714
Epoch 6/10
[1m1293/1293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 437us/step - accuracy: 0.8930 - loss: 0.2648 - val_accuracy: 0.8875 - val_loss: 0.2735
Epoch 7/10
[1m

In [49]:
profit_calculation(test_data10)

Total return from betting £1 on each prediction where model_preds == 1: £94.00


In [57]:
test_data10.head(5)

Unnamed: 0,Race Time,Course,Horse,Distance (y),distance_bucket,"Won (1=Won, 0=Lost)",SP Odds Decimal,weight,Proform Speed Rating,Won P/L Before,evening_morning_price,SP Odds Decimal1,model_preds,probability
72,01/05/2021 15:40:00,Newmarket,Battleground,1760,1600-1800,0,0.01,0.54902,0.747126,0.14167,0.129031,6,0,0.703784
73,01/05/2021 15:40:00,Newmarket,Chindit,1760,1600-1800,0,0.024,0.54902,0.931034,0.172753,0.077203,13,0,0.73965
74,01/05/2021 15:40:00,Newmarket,Devilwala,1760,1600-1800,0,0.2,0.54902,0.758621,0.127886,0.054744,101,0,0.179047
75,01/05/2021 15:40:00,Newmarket,Legion Of Honour,1760,1600-1800,0,0.16,0.54902,0.873563,0.124938,0.068951,81,0,0.293623
76,01/05/2021 15:40:00,Newmarket,Lucky Vega,1760,1600-1800,0,0.024,0.54902,0.977011,0.149201,0.064826,13,0,0.792556


In [54]:
test_data11 = test_data.copy()

# Train the model
history = model.fit(X_resampled, y_resampled, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Get model predictions as probabilities
predictions1 = model.predict(X_test)

# Convert probabilities to binary outcomes (0 or 1)
predicted_classes1 = (predictions > 0.5).astype("int32").flatten()

test_data11['model_preds'] = predicted_classes1
test_data11['probability'] = predictions1

# Evaluation
print_metrics(y_test, predicted_classes1)

Epoch 1/10
[1m2298/2298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 442us/step - accuracy: 0.7715 - loss: 0.4697 - val_accuracy: 0.7152 - val_loss: 0.5253
Epoch 2/10
[1m2298/2298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 407us/step - accuracy: 0.7676 - loss: 0.4733 - val_accuracy: 0.7444 - val_loss: 0.4477
Epoch 3/10
[1m2298/2298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 407us/step - accuracy: 0.7673 - loss: 0.4721 - val_accuracy: 0.7124 - val_loss: 0.5044
Epoch 4/10
[1m2298/2298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 407us/step - accuracy: 0.7695 - loss: 0.4706 - val_accuracy: 0.7165 - val_loss: 0.5045
Epoch 5/10
[1m2298/2298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 418us/step - accuracy: 0.7704 - loss: 0.4684 - val_accuracy: 0.7340 - val_loss: 0.4868
Epoch 6/10
[1m2298/2298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 405us/step - accuracy: 0.7735 - loss: 0.4689 - val_accuracy: 0.7242 - val_loss: 0.4821
Epoc

In [55]:
profit_calculation(test_data11)

Total return from betting £1 on each prediction where model_preds == 1: £94.00
