In [1]:
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd
import xgboost as xgb

from src.functions import *

In [2]:
from src.db_connection import DatabaseConnector

# Load every row of table `df1` through the project's DatabaseConnector.
db_connector = DatabaseConnector('horse')

query = "select * from df1"

db_connector.connect()
try:
    df = db_connector.fetch_data(query)
finally:
    # Always release the connection, even when the query raises, so a failed
    # fetch does not leave a dangling connection in the kernel.
    db_connector.disconnect()

# Preview the first rows as the cell's rich output.
df.head()

Connected to the database
Disconnected from the database


Unnamed: 0,Race Time,Course,Distance (y),Draw,Weight Rank,Breakfast Price,Morning Price,SP Odds Decimal,"Won (1=Won, 0=Lost)","Place (1=Placed, 0=UnPlaced)",Trainer/Jky Stats Rank,WON SR Before,Position LTO,Horse Stats Rank,Proform Speed Rating,WON SR Before_[0],unique_id,unique_id_int
0,2020-10-01 12:20:00,Salisbury,1760,2,1,10.0,11.0,13.0,0,1,1,0.0,0.0,1,61,0.0,2020-10-01 12:20:00_Salisbury,0
1,2020-10-01 12:20:00,Salisbury,1760,9,1,13.0,10.0,12.0,1,0,12,0.0,0.0,1,61,0.0,2020-10-01 12:20:00_Salisbury,0
2,2020-10-01 12:20:00,Salisbury,1760,12,1,11.0,11.0,17.0,0,0,8,0.0,4.0,9,51,0.0,2020-10-01 12:20:00_Salisbury,0
3,2020-10-01 12:20:00,Salisbury,1760,8,1,67.0,67.0,126.0,0,0,9,0.0,11.0,7,41,0.0,2020-10-01 12:20:00_Salisbury,0
4,2020-10-01 12:20:00,Salisbury,1760,10,1,21.0,19.0,21.0,0,0,3,0.0,0.0,1,37,0.0,2020-10-01 12:20:00_Salisbury,0


In [3]:
# Chronological train/test split followed by feature standardization.

# Column names are declared once so the train and test matrices are always
# built from exactly the same features and target.
FEATURE_COLUMNS = ['Draw', 'Weight Rank', 'SP Odds Decimal', 'Horse Stats Rank']
TARGET_COLUMN = 'Won (1=Won, 0=Lost)'

# Work on a copy: a plain `data = df` would alias the loaded frame, so the
# datetime conversion below would silently rewrite `df` after it was displayed.
data = df.copy()

# Convert the race timestamp to datetime if it is not already
data['Race Time'] = pd.to_datetime(data['Race Time'])

# Rows strictly before this date are used for training, the rest for testing
split_date = pd.to_datetime('2021-10-01')  # Replace with your desired date

# Split the data into training and testing based on the date.
# Both comparisons are kept explicit: a row whose 'Race Time' is missing (NaT)
# satisfies neither condition and therefore lands in neither split.
train_data = data[data['Race Time'] < split_date]
test_data = data[data['Race Time'] >= split_date]

# Extract features and target variable for training and testing sets
train_features = train_data[FEATURE_COLUMNS]
train_target = train_data[TARGET_COLUMN]

test_features = test_data[FEATURE_COLUMNS]
test_target = test_data[TARGET_COLUMN]

# Standardize features. The scaler is fitted on the training rows only and
# then applied unchanged to the test rows, so no test-set statistics leak
# into training.
scaler = StandardScaler()
scaled_train_features = scaler.fit_transform(train_features)
scaled_test_features = scaler.transform(test_features)

In [7]:
# Train and evaluate a random-forest baseline inside its own MLflow run.
run_name = 'rfc'
with mlflow.start_run(run_name=run_name):

    # Initialize the Random Forest Classifier.
    # A fixed seed makes the fitted forest (and the reported metrics)
    # reproducible across re-runs, matching the XGBoost cell's random_state=42.
    clf = RandomForestClassifier(random_state=42)

    # Train the model using the standardized training data
    clf.fit(scaled_train_features, train_target)

    # Predict hard class labels for the test set
    predictions = clf.predict(scaled_test_features)

    # Print the evaluation metrics (the helper's name suggests it also logs
    # them to the active MLflow run -- confirm in src.functions).
    eval_print_log(test_target, predictions)

    # No explicit mlflow.end_run() here: leaving the `with` block ends the run.

Accuracy: 0.8812825431625103
Precision: 0.28437132784958874
Recall: 0.13451917732073373
F1 score: 0.18264150943396226
ROC AUC score: 0.5487444482006806


In [6]:


# Train and evaluate an XGBoost baseline inside its own MLflow run named 'xgb'.
with mlflow.start_run(run_name='xgb'):
    # Seeded gradient-boosted classifier with a binary logistic objective,
    # using log-loss as its evaluation metric.
    xgb_model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42,
    )

    # Learn from the standardized training matrix and its win/loss labels
    xgb_model.fit(scaled_train_features, train_target)

    # Predict class labels for the standardized test matrix
    y_pred = xgb_model.predict(scaled_test_features)

    # Report the full set of evaluation metrics for this run
    eval_print_log(test_target, y_pred)

Accuracy: 0.9026034530008221
Precision: 0.5544554455445545
Recall: 0.0622568093385214
F1 score: 0.111944027986007
ROC AUC score: 0.5283921770151199
