In [1]:
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd

In [2]:
from src.db_connection import DatabaseConnector

db_connector = DatabaseConnector('horse')

db_connector.connect()

query = "select * from df1"

df = db_connector.fetch_data(query)

db_connector.disconnect()

if df is not None:
    print(df.head())

Connected to the database
Disconnected from the database
   MyUnknownColumn            Race Time     Course  Distance (y)  Draw  \
0                0  2020-10-01 12:20:00  Salisbury          1760     2   
1                1  2020-10-01 12:20:00  Salisbury          1760     9   
2                2  2020-10-01 12:20:00  Salisbury          1760    12   
3                3  2020-10-01 12:20:00  Salisbury          1760     8   
4                4  2020-10-01 12:20:00  Salisbury          1760    10   

   Weight Rank  Breakfast Price  Morning Price  SP Odds Decimal  \
0            1             10.0           11.0             13.0   
1            1             13.0           10.0             12.0   
2            1             11.0           11.0             17.0   
3            1             67.0           67.0            126.0   
4            1             21.0           19.0             21.0   

   Won (1=Won, 0=Lost)  Place (1=Placed, 0=UnPlaced)  Trainer/Jky Stats Rank  \
0              

In [3]:
data = df

# Convert date column to Timestamp if it's not already
data['Race Time'] = pd.to_datetime(data['Race Time'])

# Determine the splitting point based on a specific date
split_date = pd.to_datetime('2021-10-01')  # Replace with your desired date

# Split the data into training and testing based on the date
train_data = data[data['Race Time'] < split_date]
test_data = data[data['Race Time'] >= split_date]

# Extract features and target variable for training and testing sets
train_features = train_data[['Draw', 'Weight Rank', 'SP Odds Decimal', 'Horse Stats Rank']]
train_target = train_data['Won (1=Won, 0=Lost)']

test_features = test_data[['Draw', 'Weight Rank', 'SP Odds Decimal', 'Horse Stats Rank']]
test_target = test_data['Won (1=Won, 0=Lost)']

# Initialize StandardScaler()
scaler = StandardScaler()

# Fit and and transform
scaled_train_features = scaler.fit_transform(train_features)
scaled_test_features = scaler.transform(test_features)

In [4]:
# Initialize the Random Forest Classifier (you can choose other classifiers as well)
clf = RandomForestClassifier()

# Train the model using the training data
clf.fit(scaled_train_features, train_target)

# Make predictions on the test set
predictions = clf.predict(scaled_test_features)

# Evaluate the model
accuracy = accuracy_score(test_target, predictions)
precision = precision_score(test_target, predictions)
recall = recall_score(test_target, predictions)
f1 = f1_score(test_target, predictions)
roc_auc = roc_auc_score(test_target, predictions)

print(f"Model Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC Score: {roc_auc}")

Model Accuracy: 0.8806796382570568
Precision: 0.2802325581395349
Recall: 0.1339633129516398
F1 Score: 0.18127115456938697
ROC AUC Score: 0.5481624907212291


In [5]:
import xgboost as xgb
from sklearn.metrics import accuracy_score


# Initialize XGBoost Classifier
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)

# Fit the model using the scaled training features and target
xgb_model.fit(scaled_train_features, train_target)

# Make predictions on the scaled test features
y_pred = xgb_model.predict(scaled_test_features)

# Evaluate accuracy
accuracy = accuracy_score(test_target, y_pred)
precision = precision_score(test_target, y_pred)
recall = recall_score(test_target, y_pred)
f1 = f1_score(test_target, y_pred)
roc_auc = roc_auc_score(test_target, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC Score: {roc_auc}")


Accuracy: 0.9026034530008221
Precision: 0.5544554455445545
Recall: 0.0622568093385214
F1 Score: 0.111944027986007
ROC AUC Score: 0.5283921770151199
