In [17]:
import sys
sys.path.append('../src')

import mlflow
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression # type: ignore
from sklearn.model_selection import GridSearchCV # type: ignore
import pandas as pd
import xgboost as xgb
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler, SMOTE
import pickle

from functions import *

from data_cleaning import DataCleaning

import os

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [18]:
# Load the modelling dataset (path is relative to the notebook's directory).
# NOTE(review): the preview below shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, which
# look like row indexes written out by earlier to_csv calls — confirm and drop upstream.
data = pd.read_csv('../data/modelling_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Race Time,Course,Horse,Distance (y),SP Odds Decimal,"Won (1=Won, 0=Lost)","Place (1=Placed, 0=UnPlaced)",Pace Rating Rank,Trainer/Jky Stats Rank,Proform Speed Rating,LTO Speed Rating Rank,MR Career Speed Rating Rank,WON SR Before,Won P/L Before,Plc SR Before,Official Rating LTO,Position LTO,distance_bucket,evening_morning_price,breakfast_morning_price,weight,sp_odds_rank
0,0,01/04/2022 13:00:00,Leicester,Camacho Star,1100,4,0,0,2,3,23,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,0.833333,0.909091,131,2
1,1,01/04/2022 13:00:00,Leicester,Cheeky Maxi,1100,3,0,0,2,7,24,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,0.75,0.75,131,1
2,2,01/04/2022 13:00:00,Leicester,Global Effort,1100,13,0,0,2,2,42,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,1.0,1.0,131,6
3,3,01/04/2022 13:00:00,Leicester,Jiffy Boy,1100,5,1,0,2,11,46,1,1,0.0,-1.0,100.0,0.0,3.0,1000-1200,1.5,1.5,131,3
4,4,01/04/2022 13:00:00,Leicester,Man Made Of Smoke,1100,67,0,0,2,6,35,3,3,0.0,0.0,0.0,0.0,0.0,1000-1200,1.313725,1.313725,121,9


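Create a DataFrame with five independent features (`SP Odds Decimal`, `weight`, `Proform Speed Rating`, `Won P/L Before`, `evening_morning_price`) and `Won (1=Won, 0=Lost)` as the dependent variable. The race identifier columns (`Race Time`, `Course`, `Horse`, `Distance (y)`, `distance_bucket`) are kept for reference and are not used as model inputs.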

In [19]:
# Keep the race identifiers, the target and the five model features.
# .copy() makes `df` an independent frame rather than a slice of `data`, so any later
# column assignment on it (e.g. inside DataCleaning.normalize_columns, whose body is not
# visible here) cannot trigger SettingWithCopyWarning or touch `data`.
df = data[['Race Time', 'Course', 'Horse', 'Distance (y)', 'distance_bucket', 'Won (1=Won, 0=Lost)',
           'SP Odds Decimal', 'weight', 'Proform Speed Rating', 'Won P/L Before',
           'evening_morning_price']].copy()

In [20]:
# Preview the first five rows of the selected columns.
df.head(5)

Unnamed: 0,Race Time,Course,Horse,Distance (y),distance_bucket,"Won (1=Won, 0=Lost)",SP Odds Decimal,weight,Proform Speed Rating,Won P/L Before,evening_morning_price
0,01/04/2022 13:00:00,Leicester,Camacho Star,1100,1000-1200,0,4,131,23,0.0,0.833333
1,01/04/2022 13:00:00,Leicester,Cheeky Maxi,1100,1000-1200,0,3,131,24,0.0,0.75
2,01/04/2022 13:00:00,Leicester,Global Effort,1100,1000-1200,0,13,131,42,0.0,1.0
3,01/04/2022 13:00:00,Leicester,Jiffy Boy,1100,1000-1200,1,5,131,46,-1.0,1.5
4,01/04/2022 13:00:00,Leicester,Man Made Of Smoke,1100,1000-1200,0,67,121,35,0.0,1.313725


In [21]:
# The five numeric feature columns handed to the project's normalisation helper;
# identifier and target columns are passed through untouched (see the preview below).
# NOTE(review): this runs before the train/test split. If normalize_columns derives its
# scaling statistics from the whole frame, test-set information leaks into training — confirm.
columns_to_normalize = [
    'SP Odds Decimal',
    'weight',
    'Proform Speed Rating',
    'Won P/L Before',
    'evening_morning_price',
]
normalized_df = DataCleaning.normalize_columns(df, columns_to_normalize)

normalized_df.head()

Unnamed: 0,Race Time,Course,Horse,Distance (y),distance_bucket,"Won (1=Won, 0=Lost)",SP Odds Decimal,weight,Proform Speed Rating,Won P/L Before,evening_morning_price
0,01/04/2022 13:00:00,Leicester,Camacho Star,1100,1000-1200,0,0.006,0.647059,0.264368,0.129663,0.072046
1,01/04/2022 13:00:00,Leicester,Cheeky Maxi,1100,1000-1200,0,0.004,0.647059,0.275862,0.129663,0.063279
2,01/04/2022 13:00:00,Leicester,Global Effort,1100,1000-1200,0,0.024,0.647059,0.482759,0.129663,0.08958
3,01/04/2022 13:00:00,Leicester,Jiffy Boy,1100,1000-1200,1,0.008,0.647059,0.528736,0.12611,0.142182
4,01/04/2022 13:00:00,Leicester,Man Made Of Smoke,1100,1000-1200,0,0.132,0.45098,0.402299,0.129663,0.122585


In [22]:
# Single source of truth for the model inputs and the label, shared by both splits.
feature_columns = [
    'SP Odds Decimal',
    'weight',
    'Proform Speed Rating',
    'Won P/L Before',
    'evening_morning_price',
]
target_column = 'Won (1=Won, 0=Lost)'

# Split with the project helper, then separate features from the label in each part.
train_data, test_data = DataCleaning.split_data(df=normalized_df)

X_train, y_train = train_data[feature_columns], train_data[target_column]
X_test, y_test = test_data[feature_columns], test_data[target_column]


# Use grid search to identify the best hyperparameters for the model

In [23]:
# Random-forest search space: 2 * 3 * 2 * 2 = 24 candidate settings.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# 24 candidates x 5 folds = 120 forest fits. n_jobs=-1 spreads them over all CPU cores;
# the estimator is seeded, so the selected model is the same as with a serial search.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='f1',  # the target is imbalanced, so rank candidates by F1 rather than accuracy
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best-scoring candidate, refit on the full training set (GridSearchCV's default refit=True).
best_model = grid_search.best_estimator_

In [24]:
# Score the best estimator from the grid search on the held-out test split.
# print_metrics (from the project's `functions` module) prints the confusion matrix,
# accuracy, precision, recall, F1 and ROC AUC, as shown in the output below.
y_pred1 = best_model.predict(X_test)
print_metrics(y_test, y_pred1)


Confusion Matrix: 
[[8714  302]
 [ 916  235]]
Accuracy: 0.880200649159044
Precision: 0.4376163873370577
Recall: 0.20417028670721113
F1 score: 0.278436018957346
ROC AUC score: 0.5853371398043598


# Run the data through a random forest classifier.

In [25]:
# Baseline random forest: train, evaluate on the test split and track the run in MLflow.
# `params` is reused by a later cell (SMOTE experiment), so it stays a module-level name.
params = {
    'n_estimators' : 100,
    'random_state' : 42
}
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    rf_classifier = RandomForestClassifier(**params)
    rf_classifier.fit(X_train, y_train)

    # Log the trained model as an artifact of this run
    mlflow.sklearn.log_model(rf_classifier, "random_forest_model")

    # Make predictions on the test set
    y_pred = rf_classifier.predict(X_test)

    # Keep the predictions next to the test rows; the next cell writes this frame to CSV.
    # NOTE(review): this mutates test_data in place — re-running is safe (the column is
    # simply overwritten), but earlier previews of test_data become stale.
    test_data['model_preds'] = y_pred

    # Evaluate the model
    print_metrics(y_test, y_pred)
    # No explicit mlflow.end_run() here: leaving the `with` block ends the run, and also
    # marks it as failed if anything above raises.

# Persist the fitted baseline outside MLflow as well.
with open('base_model.pkl', 'wb') as f:
    pickle.dump(rf_classifier, f)


Confusion Matrix: 
[[8714  302]
 [ 916  235]]
Accuracy: 0.880200649159044
Precision: 0.4376163873370577
Recall: 0.20417028670721113
F1 score: 0.278436018957346
ROC AUC score: 0.5853371398043598




In [26]:
# Save the test split, which now carries the `model_preds` column added in the baseline
# random-forest cell. NOTE(review): to_csv also writes the row index as an unnamed first
# column by default — confirm that downstream readers expect it.
test_data.to_csv('test_data.csv')

# Instantiate and train the Gradient Boosting classifier

In [27]:
# Gradient boosting baseline, trained on the original (un-resampled) training split.
# fit() returns the estimator itself, so construction and training are chained.
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard class predictions for the held-out rows, then the standard metric report.
y_pred_gb = gb_classifier.predict(X_test)
print_metrics(y_test, y_pred_gb)


Confusion Matrix: 
[[8873  143]
 [ 965  186]]
Accuracy: 0.8910199665584735
Precision: 0.5653495440729484
Recall: 0.16159860990443092
F1 score: 0.2513513513513514
ROC AUC score: 0.5728689589007514


# Look at utilizing Random Oversampling on the dataset. This is due to the imbalanced nature of the target column.

In [28]:
# Apply Random Oversampling
# Only the training split is resampled; X_test / y_test are left untouched so that
# evaluation still reflects the original class balance.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Report the row counts before and after resampling as a sanity check.
print(f"Length of  X_train: {len(X_train)}\nLength of y_train: {len(y_train)}")
print(f"\nLength of  X_resampled: {len(X_resampled)}\nLength of y_resampled: {len(y_resampled)}")

Length of  X_train: 41363
Length of y_train: 41363

Length of  X_resampled: 73520
Length of y_resampled: 73520


In [29]:
# Same gradient boosting configuration as before, now fitted on the randomly
# oversampled training data. This rebinds gb_classifier / y_pred_gb from the previous
# gradient boosting cell.
gb_settings = {'n_estimators': 100, 'random_state': 42}
gb_classifier = GradientBoostingClassifier(**gb_settings)
gb_classifier.fit(X_resampled, y_resampled)

# Evaluation is still done on the untouched test split.
y_pred_gb = gb_classifier.predict(X_test)
print_metrics(y_test, y_pred_gb)


Confusion Matrix: 
[[6643 2373]
 [ 214  937]]
Accuracy: 0.7455493262515983
Precision: 0.28308157099697884
Recall: 0.8140747176368376
F1 score: 0.4200851826944631
ROC AUC score: 0.7754379799364313


# Try an approach using SMOTE (Synthetic Minority Over-sampling Technique)

In [30]:
# Oversample the training split with SMOTE; the test split is left as-is.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Reuses `params` defined in the baseline random-forest cell, and rebinds rf_classifier
# and y_pred from that cell.
rf_classifier = RandomForestClassifier(**params).fit(X_train_smote, y_train_smote)

# Predict on the test set and print the standard metric report.
y_pred = rf_classifier.predict(X_test)
print_metrics(y_test, y_pred)


Confusion Matrix: 
[[8497  519]
 [ 830  321]]
Accuracy: 0.8673158257106325
Precision: 0.3821428571428571
Recall: 0.2788879235447437
F1 score: 0.32245102963335004
ROC AUC score: 0.6106617967324428


In [32]:
# These names are re-imported here as in the original cell so anything later in the
# notebook that relies on them keeps working.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Cost-sensitive random forest. X_train, X_test, y_train, y_test come from the split cell.
# class_weight='balanced' re-weights the classes inversely to their frequency in y_train.
# random_state is fixed, like every other model in this notebook, so the reported
# metrics are reproducible across runs.
model = RandomForestClassifier(class_weight='balanced', random_state=42)

# Training the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation — print_metrics already prints the confusion matrix, so it is not
# printed separately a second time.
print_metrics(y_test, y_pred)

[[8710  306]
 [ 925  226]]

Confusion Matrix: 
[[8710  306]
 [ 925  226]]
Accuracy: 0.8789220025572932
Precision: 0.424812030075188
Recall: 0.19635099913119028
F1 score: 0.26856803327391565
ROC AUC score: 0.5812056681547699


In [33]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# Misclassification costs: missing a winner (false negative) is penalised five times
# as heavily as a false alarm (false positive).
C_FN = 5  # Cost of false negative
C_FP = 1  # Cost of false positive

# Per-row sample weights: start from 1 everywhere, then give each class its cost.
# Assumes y_train holds only the labels 0 and 1.
weights = np.ones(len(y_train))
for label, cost in ((1, C_FN), (0, C_FP)):
    weights[y_train == label] = cost

# Logistic regression fitted with the cost-derived sample weights.
model = LogisticRegression().fit(X_train, y_train, sample_weight=weights)

# Predict on the test set and print the standard metric report.
y_pred = model.predict(X_test)
print_metrics(y_test, y_pred)


[[7353 1663]
 [ 393  758]]

Confusion Matrix: 
[[7353 1663]
 [ 393  758]]
Accuracy: 0.7977771220615717
Precision: 0.3130937629078893
Recall: 0.6585577758470895
F1 score: 0.4244120940649496
ROC AUC score: 0.7370539544719032
