- take the best performing model and do some clustering to identify the races it performs well on.

In [1]:
import sys
sys.path.append('../src')

import mlflow
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression # type: ignore
from sklearn.model_selection import GridSearchCV # type: ignore

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler, SMOTE
import pickle

from functions import *

from data_cleaning import DataCleaning

import os

pd.set_option('display.max_columns', None)


* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
data = pd.read_csv('../data/modelling_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Race Time,Course,Horse,Distance (y),SP Odds Decimal,"Won (1=Won, 0=Lost)","Place (1=Placed, 0=UnPlaced)",Pace Rating Rank,Trainer/Jky Stats Rank,LTO Speed Rating Rank,MR Career Speed Rating Rank,WON SR Before,Won P/L Before,Plc SR Before,Official Rating LTO,Position LTO,distance_bucket,Morning Price,evening_morning_price,breakfast_morning_price,weight,PFR,sp_odds_rank
0,0,01/04/2022 13:00:00,Leicester,Camacho Star,1100,4,0,0,2,3,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,10,0.833333,0.909091,131,3,2
1,1,01/04/2022 13:00:00,Leicester,Cheeky Maxi,1100,3,0,0,2,7,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,3,0.75,0.75,131,3,1
2,2,01/04/2022 13:00:00,Leicester,Global Effort,1100,13,0,0,2,2,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,9,1.0,1.0,131,7,6
3,3,01/04/2022 13:00:00,Leicester,Jiffy Boy,1100,5,1,0,2,11,1,1,0.0,-1.0,100.0,0.0,3.0,1000-1200,3,1.5,1.5,131,30,3
4,4,01/04/2022 13:00:00,Leicester,Man Made Of Smoke,1100,67,0,0,2,6,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,67,1.313725,1.313725,121,10,9


In [3]:
data['distance_bucket'].unique()

array(['1000-1200', '1200-1400', '2400-2600', '2200-2400', '1400-1600',
       '1600-1800', '2000-2200', '1800-2000', '2600-2800', '3000-3200',
       '2800-3000', '3400-3600'], dtype=object)

Create a df with 5 independent features and 'Won (1=Won, 0=Lost)' as the dependent variable

In [4]:
df = data[['Race Time', 'Course', 'Horse', 'Distance (y)','distance_bucket', 'Won (1=Won, 0=Lost)', 'SP Odds Decimal', 'weight', 
           'PFR', 'Won P/L Before', 'evening_morning_price']]

In [5]:
df.head(5)

Unnamed: 0,Race Time,Course,Horse,Distance (y),distance_bucket,"Won (1=Won, 0=Lost)",SP Odds Decimal,weight,PFR,Won P/L Before,evening_morning_price
0,01/04/2022 13:00:00,Leicester,Camacho Star,1100,1000-1200,0,4,131,3,0.0,0.833333
1,01/04/2022 13:00:00,Leicester,Cheeky Maxi,1100,1000-1200,0,3,131,3,0.0,0.75
2,01/04/2022 13:00:00,Leicester,Global Effort,1100,1000-1200,0,13,131,7,0.0,1.0
3,01/04/2022 13:00:00,Leicester,Jiffy Boy,1100,1000-1200,1,5,131,30,-1.0,1.5
4,01/04/2022 13:00:00,Leicester,Man Made Of Smoke,1100,1000-1200,0,67,121,10,0.0,1.313725


In [6]:
df['SP Odds Decimal1'] = df['SP Odds Decimal']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SP Odds Decimal1'] = df['SP Odds Decimal']


In [7]:
normalized_df = DataCleaning.normalize_columns(df, ['SP Odds Decimal', 'weight', 
           'PFR', 'Won P/L Before', 'evening_morning_price'])

normalized_df.head(5)

Unnamed: 0,Race Time,Course,Horse,Distance (y),distance_bucket,"Won (1=Won, 0=Lost)",SP Odds Decimal,weight,PFR,Won P/L Before,evening_morning_price,SP Odds Decimal1
0,01/04/2022 13:00:00,Leicester,Camacho Star,1100,1000-1200,0,0.006,0.647059,0.07438,0.129663,0.072046,4
1,01/04/2022 13:00:00,Leicester,Cheeky Maxi,1100,1000-1200,0,0.004,0.647059,0.07438,0.129663,0.063279,3
2,01/04/2022 13:00:00,Leicester,Global Effort,1100,1000-1200,0,0.024,0.647059,0.107438,0.129663,0.08958,13
3,01/04/2022 13:00:00,Leicester,Jiffy Boy,1100,1000-1200,1,0.008,0.647059,0.297521,0.12611,0.142182,5
4,01/04/2022 13:00:00,Leicester,Man Made Of Smoke,1100,1000-1200,0,0.132,0.45098,0.132231,0.129663,0.122585,67


In [8]:
train_data, test_data = DataCleaning.split_data(df = normalized_df)

X_train = train_data[['SP Odds Decimal', 'weight', 'PFR', 'Won P/L Before', 'evening_morning_price']]
y_train = train_data['Won (1=Won, 0=Lost)']

X_test = test_data[['SP Odds Decimal', 'weight', 'PFR', 'Won P/L Before', 'evening_morning_price']]
y_test = test_data['Won (1=Won, 0=Lost)']


In [9]:
# count of unique races
unique_race_count = test_data['Race Time'].nunique()

print(f'Count of unique races in the test data is {unique_race_count}')

Count of unique races in the test data is 1149


# Use grid search to idenify the best hyperparameters for the model

In [10]:
%%script false --no-raise-error

est_data_ = test_data.copy()

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

y_pred1 = best_model.predict(X_test)
est_data_['model_preds'] = y_pred1
print_metrics(y_test, y_pred1)

# Run the data through a random forest classifier.

In [11]:
test_data1 = test_data.copy()

# Instantiate and train the Random Forest classifier
params = {
    'n_estimators' : 100,
    'random_state' : 42
}
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    rf_classifier = RandomForestClassifier(**params)
    rf_classifier.fit(X_train, y_train)

    # Log the trained model
    mlflow.sklearn.log_model(rf_classifier, "random_forest_model")

    # Make predictions on the test set
    y_pred = rf_classifier.predict(X_test)

    test_data1['model_preds'] = y_pred

    # Evaluate the model
    # use the eval method
    print_metrics(y_test, y_pred)
    mlflow.end_run()

with open('../models/base_model.pkl', 'wb') as f:
    pickle.dump(rf_classifier, f)




Confusion Matrix: 
[[8762  254]
 [1008  143]]
Accuracy: 0.8758729221992722
Precision: 0.3602015113350126
Recall: 0.12423979148566464
F1 score: 0.1847545219638243
ROC AUC score: 0.5480338265325394


In [12]:
profit_calculation(test_data1)

Total number of bets: 397
Total return from betting £1.00 on each prediction where model_preds == 1: £-64.00
Return per pound invested: £-0.16
Model accuracy: 36.02%


In [13]:
test_data2 = test_data.copy()


# Example class weights for an imbalanced dataset
class_weights = {0: 1, 1: 5}

# Initialize and fit the RandomForest model
model = RandomForestClassifier(class_weight=class_weights, max_depth=10, min_samples_leaf=5)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred1 = model.predict(X_test)

test_data2['model_preds'] = y_pred1

print_metrics(y_test, y_pred1)



Confusion Matrix: 
[[7568 1448]
 [ 535  616]]
Accuracy: 0.8049572145175568
Precision: 0.29844961240310075
Recall: 0.5351867940920938
F1 score: 0.383203732503888
ROC AUC score: 0.6872917111542989


In [14]:
profit_calculation(test_data2)

Total number of bets: 2064
Total return from betting £1.00 on each prediction where model_preds == 1: £-205.00
Return per pound invested: £-0.10
Model accuracy: 29.84%


# Instantiate and train the Gradient Boosting classifier

In [43]:
test_data_gbm = test_data.copy()

# Instantiate the Gradient Boosting Classifier
gbm = GradientBoostingClassifier()

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate the GridSearchCV object
grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=2)

# Fit Grid Search
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred_gbm = best_model.predict(X_test)

test_data_gbm['model_preds'] = y_pred_gbm

print_metrics(y_test, y_pred_gbm)


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.2s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.2s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; 

In [44]:
profit_calculation(test_data_gbm)

Total number of bets: 386
Total return from betting £1.00 on each prediction where model_preds == 1: £-77.00
Return per pound invested: £-0.20
Model accuracy: 34.46%


In [17]:
# Instantiate and train the Gradient Boosting classifier

test_data3 = test_data.copy()

gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_gb = gb_classifier.predict(X_test)

test_data3['model_preds'] = y_pred_gb

print_metrics(y_test, y_pred_gb)


Confusion Matrix: 
[[8962   54]
 [1090   61]]
Accuracy: 0.8874790990459329
Precision: 0.5304347826086957
Recall: 0.052997393570807995
F1 score: 0.09636650868878358
ROC AUC score: 0.5235040206540819


In [18]:
profit_calculation(test_data3)

Total number of bets: 115
Total return from betting £1.00 on each prediction where model_preds == 1: £-28.00
Return per pound invested: £-0.24
Model accuracy: 53.04%


In [19]:
test_data4 = test_data.copy()

model = XGBClassifier(reg_lambda=1, reg_alpha=0)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = model.predict(X_test)

test_data4['model_preds'] = y_pred_xgb

print_metrics(y_test, y_pred_gb)


Confusion Matrix: 
[[8962   54]
 [1090   61]]
Accuracy: 0.8874790990459329
Precision: 0.5304347826086957
Recall: 0.052997393570807995
F1 score: 0.09636650868878358
ROC AUC score: 0.5235040206540819


In [20]:
profit_calculation(test_data4)

Total number of bets: 244
Total return from betting £1.00 on each prediction where model_preds == 1: £-23.00
Return per pound invested: £-0.09
Model accuracy: 48.77%


# Look at utilizing Random Oversampling on the dataset. This is due to the imbalanced nature of the target column.

In [21]:
# Apply Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

print(f"Length of  X_train: {len(X_train)}\nLength of y_train: {len(y_train)}")
print(f"\nLength of  X_resampled: {len(X_resampled)}\nLength of y_resampled: {len(y_resampled)}")

Length of  X_train: 41363
Length of y_train: 41363

Length of  X_resampled: 73520
Length of y_resampled: 73520


In [22]:
# Instantiate and train the Gradient Boosting classifier
test_data5 = test_data.copy()

gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_classifier.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred_gbx = gb_classifier.predict(X_test)

test_data5['model_preds'] = y_pred_gbx

print_metrics(y_test, y_pred_gbx)


Confusion Matrix: 
[[6225 2791]
 [ 307  844]]
Accuracy: 0.6952886790597029
Precision: 0.23218707015130674
Recall: 0.733275412684622
F1 score: 0.3526953614709569
ROC AUC score: 0.7118573159252746


In [23]:
profit_calculation(test_data5)

Total number of bets: 3635
Total return from betting £1.00 on each prediction where model_preds == 1: £-353.00
Return per pound invested: £-0.10
Model accuracy: 23.22%


# Try an approach using smote

In [24]:
test_data6 = test_data.copy()

smote= SMOTE(random_state=42)

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_classifier.fit(X_train_smote, y_train_smote)

# Make predictions on the test set
y_pred_gbs = gb_classifier.predict(X_test)

test_data6['model_preds'] = y_pred_gbs

# Evaluate the model
# use the eval method
print_metrics(y_test, y_pred)


Confusion Matrix: 
[[8762  254]
 [1008  143]]
Accuracy: 0.8758729221992722
Precision: 0.3602015113350126
Recall: 0.12423979148566464
F1 score: 0.1847545219638243
ROC AUC score: 0.5480338265325394


In [25]:
profit_calculation(test_data6)

Total number of bets: 1371
Total return from betting £1.00 on each prediction where model_preds == 1: £-141.00
Return per pound invested: £-0.10
Model accuracy: 32.31%


In [26]:
test_data7 = test_data.copy()

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

rf_classifier = RandomForestClassifier(**params)
rf_classifier.fit(X_train_smote, y_train_smote)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

test_data7['model_preds'] = y_pred

# Evaluate the model
# use the eval method
print_metrics(y_test, y_pred)


Confusion Matrix: 
[[8576  440]
 [ 930  221]]
Accuracy: 0.8652503196616504
Precision: 0.33434190620272314
Recall: 0.19200695047784536
F1 score: 0.24392935982339956
ROC AUC score: 0.5716024104651871


In [27]:
profit_calculation(test_data7)

Total number of bets: 661
Total return from betting £1.00 on each prediction where model_preds == 1: £-84.00
Return per pound invested: £-0.13
Model accuracy: 33.43%


In [28]:
test_data8 = test_data.copy()

# Assume X_train, X_test, y_train, y_test are already defined
# Setting class_weight to 'balanced' makes the algorithm cost-sensitive
model = RandomForestClassifier(class_weight='balanced')

# Training the model
model.fit(X_train, y_train)

# Predictions
y_predx = model.predict(X_test)

test_data8['model_preds'] = y_predx

# Evaluation
print_metrics(y_test, y_predx)


Confusion Matrix: 
[[8741  275]
 [ 998  153]]
Accuracy: 0.8747909904593292
Precision: 0.3574766355140187
Recall: 0.13292788879235448
F1 score: 0.19379354021532616
ROC AUC score: 0.5512132789125924


In [29]:
profit_calculation(test_data8)

Total number of bets: 428
Total return from betting £1.00 on each prediction where model_preds == 1: £-65.00
Return per pound invested: £-0.15
Model accuracy: 35.75%


In [30]:
test_data9 = test_data.copy()

# Define cost matrix
C_FN = 5  # Cost of false negative
C_FP = 1  # Cost of false positive

# Assuming y_train has values 0 and 1
weights = np.ones(y_train.shape[0])
weights[y_train == 1] = C_FN
weights[y_train == 0] = C_FP

# Train logistic regression model with custom weights
model = LogisticRegression()
model.fit(X_train, y_train, sample_weight=weights)

# Predictions
y_predz = model.predict(X_test)

test_data9['model_preds'] = y_predz

# Evaluation
print_metrics(y_test, y_predz)



Confusion Matrix: 
[[7228 1788]
 [ 523  628]]
Accuracy: 0.7726959771810761
Precision: 0.2599337748344371
Recall: 0.5456125108601216
F1 score: 0.3521166246145219
ROC AUC score: 0.6736492013040625


In [31]:
profit_calculation(test_data9)

Total number of bets: 2416
Total return from betting £1.00 on each prediction where model_preds == 1: £-280.00
Return per pound invested: £-0.12
Model accuracy: 25.99%


# Try a tensorflow model

In [32]:
test_data10 = test_data.copy()

model = Sequential([
    Dense(64, activation='relu', input_shape = (X_train.shape[1], )), # input layer
    Dense(32, activation='relu'), # hidden layer
    Dense(1, activation='sigmoid') # output layer for binary classification
])

# compile the model
model.compile(optimizer='adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Get model predictions as probabilities
predictions = model.predict(X_test)

# Convert probabilities to binary outcomes (0 or 1)
predicted_classes = (predictions > 0.5).astype("int32").flatten()

test_data10['model_preds'] = predicted_classes
test_data10['probability'] = predictions

# Evaluation
print_metrics(y_test, predicted_classes)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1293/1293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 524us/step - accuracy: 0.8879 - loss: 0.3567 - val_accuracy: 0.8868 - val_loss: 0.3092
Epoch 2/10
[1m1293/1293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 444us/step - accuracy: 0.8867 - loss: 0.3082 - val_accuracy: 0.8868 - val_loss: 0.3056
Epoch 3/10
[1m1293/1293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 429us/step - accuracy: 0.8883 - loss: 0.3009 - val_accuracy: 0.8868 - val_loss: 0.3017
Epoch 4/10
[1m1293/1293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 441us/step - accuracy: 0.8882 - loss: 0.2977 - val_accuracy: 0.8868 - val_loss: 0.3013
Epoch 5/10
[1m1293/1293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 459us/step - accuracy: 0.8907 - loss: 0.2918 - val_accuracy: 0.8868 - val_loss: 0.2995
Epoch 6/10
[1m1293/1293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 453us/step - accuracy: 0.8893 - loss: 0.2980 - val_accuracy: 0.8868 - val_loss: 0.2977
Epoch 7/10
[1m

In [33]:
profit_calculation(test_data10)

Total number of bets: 16
Total return from betting £1.00 on each prediction where model_preds == 1: £-5.00
Return per pound invested: £-0.31
Model accuracy: 68.75%


In [34]:
test_data10.head(5)

Unnamed: 0,Race Time,Course,Horse,Distance (y),distance_bucket,"Won (1=Won, 0=Lost)",SP Odds Decimal,weight,PFR,Won P/L Before,evening_morning_price,SP Odds Decimal1,model_preds,probability
72,01/05/2021 15:40:00,Newmarket,Battleground,1760,1600-1800,0,0.01,0.54902,0.520661,0.14167,0.129031,6,0,0.162927
73,01/05/2021 15:40:00,Newmarket,Chindit,1760,1600-1800,0,0.024,0.54902,0.471074,0.172753,0.077203,13,0,0.059246
74,01/05/2021 15:40:00,Newmarket,Devilwala,1760,1600-1800,0,0.2,0.54902,0.181818,0.127886,0.054744,101,0,0.002369
75,01/05/2021 15:40:00,Newmarket,Legion Of Honour,1760,1600-1800,0,0.16,0.54902,0.355372,0.124938,0.068951,81,0,0.002589
76,01/05/2021 15:40:00,Newmarket,Lucky Vega,1760,1600-1800,0,0.024,0.54902,0.438017,0.149201,0.064826,13,0,0.06085


In [35]:
test_data11 = test_data.copy()

# Train the model
history = model.fit(X_resampled, y_resampled, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Get model predictions as probabilities
predictions1 = model.predict(X_test)

# Convert probabilities to binary outcomes (0 or 1)
predicted_classes1 = (predictions > 0.5).astype("int32").flatten()

test_data11['model_preds'] = predicted_classes1
test_data11['probability'] = predictions1

# Evaluation
print_metrics(y_test, predicted_classes1)

Epoch 1/10
[1m2298/2298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 418us/step - accuracy: 0.7084 - loss: 0.5579 - val_accuracy: 0.6640 - val_loss: 0.5946
Epoch 2/10
[1m2298/2298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 427us/step - accuracy: 0.7138 - loss: 0.5489 - val_accuracy: 0.6452 - val_loss: 0.5915
Epoch 3/10
[1m2298/2298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 422us/step - accuracy: 0.7144 - loss: 0.5455 - val_accuracy: 0.7111 - val_loss: 0.5311
Epoch 4/10
[1m2298/2298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 414us/step - accuracy: 0.7121 - loss: 0.5497 - val_accuracy: 0.7528 - val_loss: 0.4860
Epoch 5/10
[1m2298/2298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 454us/step - accuracy: 0.7145 - loss: 0.5460 - val_accuracy: 0.6657 - val_loss: 0.5972
Epoch 6/10
[1m2298/2298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 395us/step - accuracy: 0.7158 - loss: 0.5448 - val_accuracy: 0.6334 - val_loss: 0.6349
Epoc

In [36]:
profit_calculation(test_data11)

Total number of bets: 16
Total return from betting £1.00 on each prediction where model_preds == 1: £-5.00
Return per pound invested: £-0.31
Model accuracy: 68.75%


# Play around ith the logistic regression model as that is showing the best profit
- finetune it by playing with penalties and subsets of the data
    - distance buckets, classes
- also try running balanced datasets throught the model.

In [37]:
test_data9 = test_data.copy()

# Define cost matrix
C_FN = 5  # Cost of false negative
C_FP = 1  # Cost of false positive

# Assuming y_train has values 0 and 1
weights = np.ones(y_train.shape[0])
weights[y_train == 1] = C_FN
weights[y_train == 0] = C_FP

# Train logistic regression model with custom weights
model = LogisticRegression()
model.fit(X_train, y_train, sample_weight=weights)

# Predictions
y_predz = model.predict(X_test)

test_data9['model_preds'] = y_predz

# Evaluation
print_metrics(y_test, y_predz)

profit_calculation(test_data9)


Confusion Matrix: 
[[7228 1788]
 [ 523  628]]
Accuracy: 0.7726959771810761
Precision: 0.2599337748344371
Recall: 0.5456125108601216
F1 score: 0.3521166246145219
ROC AUC score: 0.6736492013040625
Total number of bets: 2416
Total return from betting £1.00 on each prediction where model_preds == 1: £-280.00
Return per pound invested: £-0.12
Model accuracy: 25.99%


# firstly lets try the smote dataset

In [38]:
test_data_smote = test_data.copy()

smote= SMOTE(random_state=42)

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Define cost matrix
C_FN = 5  # Cost of false negative
C_FP = 1  # Cost of false positive

# Assuming y_train has values 0 and 1
weights = np.ones(y_train_smote.shape[0])
weights[y_train_smote == 1] = C_FN
weights[y_train_smote == 0] = C_FP

# Train logistic regression model with custom weights
model = LogisticRegression()
model.fit(X_train_smote, y_train_smote, sample_weight=weights)

# Predictions
y_pred_smote = model.predict(X_test)

test_data_smote['model_preds'] = y_pred_smote

# Evaluation
print_metrics(y_test, y_pred_smote)

profit_calculation(test_data_smote)


Confusion Matrix: 
[[1925 7091]
 [  16 1135]]
Accuracy: 0.30097373856594867
Precision: 0.13797714563578897
Recall: 0.9860990443092963
F1 score: 0.2420816892396289
ROC AUC score: 0.5998041805397414
Total number of bets: 8226
Total return from betting £1.00 on each prediction where model_preds == 1: £-1043.00
Return per pound invested: £-0.13
Model accuracy: 13.80%


# try random oversampling

In [39]:
# Apply Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

test_data_ros = test_data.copy()

# Define cost matrix
C_FN = 5  # Cost of false negative
C_FP = 1  # Cost of false positive

# Assuming y_train has values 0 and 1
weights = np.ones(y_resampled.shape[0])
weights[y_resampled == 1] = C_FN
weights[y_resampled == 0] = C_FP

# Train logistic regression model with custom weights
model = LogisticRegression()
model.fit(X_resampled, y_resampled, sample_weight=weights)

# Predictions
y_pred_ros = model.predict(X_test)

test_data_ros['model_preds'] = y_pred_ros

# Evaluation
print_metrics(y_test, y_pred_ros)

profit_calculation(test_data_ros)


Confusion Matrix: 
[[1709 7307]
 [  12 1139]]
Accuracy: 0.28012196321432087
Precision: 0.13485673691688374
Recall: 0.9895742832319722
F1 score: 0.23736584349275816
ROC AUC score: 0.5895630954757909
Total number of bets: 8446
Total return from betting £1.00 on each prediction where model_preds == 1: £-1118.00
Return per pound invested: £-0.13
Model accuracy: 13.49%


# play around with some different penalties

In [40]:
%%script false --no-raise-error

# Copy test data
test_data91 = test_data.copy()

# Define grid of penalty values to test
C_FN_values = range(1,30)  # Range of false negative penalties
C_FP_values = range(1,30)  # Range of false positive penalties

# Initialize variables to store the best results
best_C_FN = None
best_C_FP = None
best_profit = -np.inf  # Start with negative infinity to ensure any positive profit is better

# Iterate over each combination of C_FN and C_FP
for C_FN in C_FN_values:
    for C_FP in C_FP_values:
        # Assign weights based on the current C_FN and C_FP
        weights = np.ones(y_train.shape[0])
        weights[y_train == 1] = C_FN
        weights[y_train == 0] = C_FP

        # Train the logistic regression model
        model = LogisticRegression()
        model.fit(X_train, y_train, sample_weight=weights)

        # Make predictions on the test data
        y_predz = model.predict(X_test)

        # Add predictions to the test data
        test_data91['model_preds'] = y_predz

        # Calculate the profit or loss (assuming profit_calculation function is defined)
        profit_calculation(test_data91)
        

        # Optional: Print progress (for larger grids)
        print(f"C_FN: {C_FN}, C_FP: {C_FP}\n")


# this is the model moving forward

In [41]:
test_data1 = test_data.copy()

# Instantiate and train the Random Forest classifier
params = {
    'n_estimators' : 100,
    'random_state' : 42
}
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    rf_classifier = RandomForestClassifier(**params)
    rf_classifier.fit(X_train, y_train)

    # Log the trained model
    mlflow.sklearn.log_model(rf_classifier, "random_forest_model")

    # Make predictions on the test set
    y_pred = rf_classifier.predict(X_test)

    test_data1['model_preds'] = y_pred

    # Evaluate the model
    # use the eval method
    print_metrics(y_test, y_pred)
    mlflow.end_run()

with open('../models/base_model.pkl', 'wb') as f:
    pickle.dump(rf_classifier, f)


Confusion Matrix: 
[[8762  254]
 [1008  143]]
Accuracy: 0.8758729221992722
Precision: 0.3602015113350126
Recall: 0.12423979148566464
F1 score: 0.1847545219638243
ROC AUC score: 0.5480338265325394


In [42]:
test_data_best = test_data.copy()

# Define cost matrix
# we are assigning a greater cost to false negatives.
C_FN = 6  # Cost of false negative
C_FP = 1  # Cost of false positive

# Assuming y_train has values 0 and 1

# creating a numpy array of one's where the length of the array == len(y_train)
# initially every sampple in the dataset is given a value of one
weights = np.ones(y_train.shape[0])

# here we are modifying the weights of all the samples in the training set that belong to class 1
# therefore the weights for samples where the label is 1 are updated to C_FN
weights[y_train == 1] = C_FN

# here we are modifying the weights of all the samples in the training set that belong to class 0
# therefore the weights for samples where the label is 1 are updated to C_FP
weights[y_train == 0] = C_FP

# Train logistic regression model with custom weights
model = LogisticRegression()
model.fit(X_train, y_train, sample_weight=weights)

# Save the model to a file using pickle
with open('../models/logistic_regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)

test_data_best['model_preds'] = y_predz

# Evaluation
print_metrics(y_test, y_predz)
print()
profit_calculation(test_data_best, stake = 1)



Confusion Matrix: 
[[7228 1788]
 [ 523  628]]
Accuracy: 0.7726959771810761
Precision: 0.2599337748344371
Recall: 0.5456125108601216
F1 score: 0.3521166246145219
ROC AUC score: 0.6736492013040625

Total number of bets: 2416
Total return from betting £1.00 on each prediction where model_preds == 1: £-280.00
Return per pound invested: £-0.12
Model accuracy: 25.99%
