In [9]:
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import pandas as pd
import xgboost as xgb

from src.functions import *

In [2]:
from src.db_connection import DatabaseConnector

db_connector = DatabaseConnector('horse')

db_connector.connect()

query = "select * from df1 where Course = 'Ascot'"

df = db_connector.fetch_data(query)

db_connector.disconnect()

df.head()

Connected to the database
Disconnected from the database


Unnamed: 0,Race Time,Course,Distance (y),Draw,Weight Rank,Breakfast Price,Morning Price,SP Odds Decimal,"Won (1=Won, 0=Lost)","Place (1=Placed, 0=UnPlaced)",Trainer/Jky Stats Rank,WON SR Before,Position LTO,Horse Stats Rank,Proform Speed Rating,WON SR Before_[0],unique_id,unique_id_int
0,2020-10-02 13:55:00,Ascot,1760,1,1,6.5,7.5,19.0,0,0,6,21.88,1.0,3,63,21.88,2020-10-02 13:55:00_Ascot,16
1,2020-10-02 13:55:00,Ascot,1760,2,1,5.0,6.5,6.0,0,0,5,7.69,4.0,5,62,7.69,2020-10-02 13:55:00_Ascot,16
2,2020-10-02 13:55:00,Ascot,1760,6,1,11.0,15.0,19.0,0,0,1,25.0,8.0,2,49,25.0,2020-10-02 13:55:00_Ascot,16
3,2020-10-02 13:55:00,Ascot,1760,5,1,4.5,4.5,3.75,0,0,3,0.0,14.0,6,60,0.0,2020-10-02 13:55:00_Ascot,16
4,2020-10-02 13:55:00,Ascot,1760,3,5,6.5,5.0,4.0,0,1,4,33.33,3.0,4,68,33.33,2020-10-02 13:55:00_Ascot,16


In [3]:
data = df

# Convert date column to Timestamp if it's not already
data['Race Time'] = pd.to_datetime(data['Race Time'])

train_data, test_data = split_data(data, 'unique_id_int', test_size=0.2)

# Extract features and target variable for training and testing sets
train_features = train_data[['Draw', 'Weight Rank', 'SP Odds Decimal', 'Horse Stats Rank']]
train_target = train_data['Won (1=Won, 0=Lost)']

test_features = test_data[['Draw', 'Weight Rank', 'SP Odds Decimal', 'Horse Stats Rank']]
test_target = test_data['Won (1=Won, 0=Lost)']

# Initialize StandardScaler()
scaler = StandardScaler()

# Fit and and transform
scaled_train_features = scaler.fit_transform(train_features)
scaled_test_features = scaler.transform(test_features)

In [10]:
run_name = 'rf_loop_est=200_ascot'
with mlflow.start_run(run_name=run_name):
    scaler = StandardScaler()

    clf = RandomForestClassifier(n_estimators=200, random_state=42)

    # Train the model using the training data
    clf.fit(scaled_train_features, train_target)

    # Make predictions on the test set
    predictions = clf.predict(scaled_test_features)

    # Evaluate the model
    eval_print_log(test_target, predictions)

    mlflow.end_run()

Accuracy: 0.8595317725752508
Precision: 0.23809523809523808
Recall: 0.16129032258064516
F1 score: 0.1923076923076923
ROC AUC score: 0.5507944150216659


In [5]:
run_name = 'rf_loop_est=100_ascot'
with mlflow.start_run(run_name=run_name):
    scaler = StandardScaler()

    clf = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the model using the training data
    clf.fit(scaled_train_features, train_target)

    # Make predictions on the test set
    predictions = clf.predict(scaled_test_features)

    # Evaluate the model
    eval_print_log(test_target, predictions)

    mlflow.end_run()

    
    

Accuracy: 0.862876254180602
Precision: 0.25
Recall: 0.16129032258064516
F1 score: 0.19607843137254902
ROC AUC score: 0.5526600866634569


In [6]:
run_name = 'rf_loop_est=50_ascot'
with mlflow.start_run(run_name=run_name):
    scaler = StandardScaler()

    clf = RandomForestClassifier(n_estimators=50, random_state=42)

    # Train the model using the training data
    clf.fit(scaled_train_features, train_target)

    # Make predictions on the test set
    predictions = clf.predict(scaled_test_features)

    # Evaluate the model
    eval_print_log(test_target, predictions)

    mlflow.end_run()

Accuracy: 0.862876254180602
Precision: 0.25
Recall: 0.16129032258064516
F1 score: 0.19607843137254902
ROC AUC score: 0.5526600866634569


In [7]:
run_name = 'rfc_ascot'
with mlflow.start_run(run_name=run_name):

    # Initialize the Random Forest Classifier
    clf = RandomForestClassifier()

    # Train the model using the training data
    clf.fit(scaled_train_features, train_target)

    # Make predictions on the test set
    predictions = clf.predict(scaled_test_features)

    # Evaluate the model
    eval_print_log(test_target, predictions)

    mlflow.end_run()

Accuracy: 0.8662207357859532
Precision: 0.2631578947368421
Recall: 0.16129032258064516
F1 score: 0.19999999999999998
ROC AUC score: 0.554525758305248


In [8]:


with mlflow.start_run(run_name='xgb_ascot'):
    # Initialize XGBoost Classifier
    xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)

    # Fit the model using the scaled training features and target
    xgb_model.fit(scaled_train_features, train_target)

    # Make predictions on the scaled test features
    y_pred = xgb_model.predict(scaled_test_features)

    # Evaluate accuracy
    eval_print_log(test_target, y_pred)

Accuracy: 0.862876254180602
Precision: 0.25
Recall: 0.16129032258064516
F1 score: 0.19607843137254902
ROC AUC score: 0.5526600866634569


In [6]:
best_model_uri = get_best_model()
loaded_best_model = mlflow.sklearn.load_model(best_model_uri)

OSError: No such file or directory: '/Users/andrewbarwise/Desktop/horse_trading/mlruns/0/51b333be2a2b4dea8ff87db1be422821/artifacts/mlruns'