# Part 4: Horse Race Prediction
## Regression Modelling
- In this section, we predict each horse's finishing time in a race and then use those predicted times to pick the winner.
- We evaluate the regression with RMSE; then, taking the horse with the fastest predicted time in each race as the predicted winner, we measure the accuracy of those picks.

In [None]:
!pip install lightgbm

In [None]:
!pip install xgboost

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.svm import SVC

from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score, roc_curve, auc, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve

from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

import time
import joblib

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# Load the train, test and unseen CSV files from the hurdle data folder (in that order)
df_train1, df_test1, df_unseen1 = (
    pd.read_csv('D:\\documentos\\IA Caballos\\hurdle_data\\' + csv_name)
    for csv_name in ('df_train.csv', 'df_test.csv', 'df_unseen.csv')
)

In [None]:
# df_train is bound to the same object as df_train1 (no copy is made). The row-dropping
# alternative (.dropna()) is disabled, so rows with missing values are kept.
df_train = df_train1
df_train

In [None]:
# df_test is bound to the same object as df_test1 (no copy is made). The row-dropping
# alternative (.dropna()) is disabled, so rows with missing values are kept.
df_test = df_test1
df_test

In [None]:
# df_unseen is bound to the same object as df_unseen1 (no copy is made). The row-dropping
# alternative (.dropna()) is disabled, so rows with missing values are kept.
df_unseen = df_unseen1
df_unseen

In [None]:
# Replace every missing value, in every column, with 0. The fill is done in place on the
# object df_train refers to, so any other name bound to that object sees the change too.
df_train.fillna(0, inplace=True)
df_train

In [None]:
# Replace every missing value, in every column, with 0. The fill is done in place on the
# object df_test refers to, so any other name bound to that object sees the change too.
df_test.fillna(0, inplace=True)
df_test

In [None]:
# Replace every missing value, in every column, with 0. The fill is done in place on the
# object df_unseen refers to, so any other name bound to that object sees the change too.
df_unseen.fillna(0, inplace=True)
df_unseen

In [None]:
# Report (rows, columns) of the train, test and unseen frames, one per line
print(df_train.shape, df_test.shape, df_unseen.shape, sep='\n')

In [None]:
# Preview the first 2 rows of the training frame
df_train.head(2)

In [None]:
# Preview the first 2 rows of the test frame
df_test.head(2)

In [None]:
# Preview the first 2 rows of the unseen frame
df_unseen.head(2)

### Preprocessing of Train and Test Data

In [None]:
# Turn the percentage text in win_odds (a number followed by '%') into a fraction.
# - astype(str) comes first because missing odds were zero-filled earlier, and the .str
#   methods return NaN for non-string entries; without it NaN would silently re-enter
#   the feature column.
# - the dtype guard makes the cell safe to re-run: a column that is already numeric is
#   left untouched instead of raising on the .str accessor.
for odds_frame in (df_test, df_train, df_unseen):
    if not pd.api.types.is_numeric_dtype(odds_frame['win_odds']):
        odds_frame['win_odds'] = (
            odds_frame['win_odds'].astype(str).str.rstrip('%').astype('float') / 100.0
        )

In [None]:
import re
from numba import jit, cuda

# Plain Python on purpose: the body is regex and string handling, which numba cannot
# compile, so a @jit decorator adds nothing here and can fail at the first call.
def convert_to_km(distance):
    '''
    Convert a race-distance string made of <number><unit> pairs into one number.

    Despite the function name, the result is expressed in the unit of the 'm' part:
    'f' values are multiplied by 0.125 (1/8) and 'y' values by 0.0005681818 (1/1760),
    which matches furlongs and yards converted to miles.

    Parameters
    ----------
    distance : str or number
        Text such as '2m', '2m4f' or '2m 4f 110y'. Units other than 'm', 'f', 'y'
        (lower case) are ignored, as is a unit with no number in front of it.
        If a unit appears more than once, its last occurrence is used.
        A non-string value (e.g. a zero-filled missing distance, or a value that was
        already converted by an earlier run of the cell) is returned as a float.

    Returns
    -------
    float
        Sum of the 'm', 'f' and 'y' contributions; 0.0 when nothing matches.
    '''
    # Already numeric: nothing to parse (also makes re-running the conversion harmless)
    if not isinstance(distance, str):
        return float(distance)

    # Multiplier applied to the number in front of each recognised unit
    factors = {'m': 1.0, 'f': 0.125, 'y': 0.0005681818}
    parts = dict.fromkeys(factors, 0.0)

    # Each match is a (number, unit) tuple; number is '' when the unit stands alone
    for number, unit in re.findall(r'([\d\.]+)?([a-zA-Z]+)', distance, re.U):
        if unit in factors and number:
            parts[unit] = float(number) * factors[unit]

    return parts['m'] + parts['f'] + parts['y']

In [None]:
# Convert the textual race distance to a number. Mapping over the single column calls
# convert_to_km once per value without building a row object for every record.
df_train['race_distance'] = df_train['race_distance'].map(convert_to_km)

df_train.head(2)

In [None]:
# Convert the textual race distance to a number. Mapping over the single column calls
# convert_to_km once per value without building a row object for every record.
df_test['race_distance'] = df_test['race_distance'].map(convert_to_km)

df_test.head(2)

In [None]:
# Convert the textual race distance to a number. Mapping over the single column calls
# convert_to_km once per value without building a row object for every record.
df_unseen['race_distance'] = df_unseen['race_distance'].map(convert_to_km)

df_unseen.head(2)

In [None]:
# Define the training features
X_train = df_train[[  # 'actual_weight' is currently excluded (commented out)
                    'declared_horse_weight',
                    'draw', 
                    'win_odds',
                    'jockey_ave_rank','trainer_ave_rank',
                    'recent_ave_rank','race_distance', 
                     'jockey_hurdle_rate','trainers_hurdle_rate']]

# Define the target
# NOTE(review): 'jockey_ave_rank' is also one of the feature columns selected above, so the
# model receives its own target as an input. The section introduction describes predicting
# finishing times; confirm which column is the intended target.
y_train = df_train['jockey_ave_rank']

# Disabled: converts a dot-separated time string (minutes.seconds.hundredths) to seconds.
# Presumably written for a finishing-time target; it is not applied to the target above.
#y_train = y_train.apply(lambda x: x.split('.'))
#y_train = y_train.apply(lambda x: int(x[0])*60 + int(x[1]) + int(x[2])/100)

In [None]:
y_train.head()

In [None]:
# Define the testing features (same column list, in the same order, as the training features)
X_test = df_test[[  # 'actual_weight' is currently excluded (commented out)
                  'declared_horse_weight',
                    'draw', 
                     'win_odds', 
                    'jockey_ave_rank', 'trainer_ave_rank',
                    'recent_ave_rank', 'race_distance', 
                     'jockey_hurdle_rate','trainers_hurdle_rate']]

In [None]:
# Define the target
# NOTE(review): 'jockey_ave_rank' is also one of the X_test feature columns, so the target
# is available to the model as an input. Confirm which column is the intended target.
y_test = df_test['jockey_ave_rank']

# Disabled: converts a dot-separated time string (minutes.seconds.hundredths) to seconds.
# Presumably written for a finishing-time target; it is not applied to the target above.
#y_test = y_test.apply(lambda x: x.split('.'))
#y_test = y_test.apply(lambda x: int(x[0])*60 + int(x[1]) + int(x[2])/100)

In [None]:
# Define the unseen features (same column list, in the same order, as the training features).
# No target is defined for the unseen set.
X_unseen = df_unseen[[  # 'actual_weight' is currently excluded (commented out)
                      'declared_horse_weight',
                    'draw', 
                     'win_odds',
                    'jockey_ave_rank', 'trainer_ave_rank',
                    'recent_ave_rank', 'race_distance', 
                     'jockey_hurdle_rate','trainers_hurdle_rate']]

### Define functions to run and evaluate models

In [None]:
# This function finds the accuracy of the model for predicting the Top and Top 3 finishers
def find_prob(y_pred, df=None):
    """
    Score how often the lowest predictions in each race match the real top finishers.

    Within every race the rows are ranked by ascending predicted value; the first row
    is the predicted winner and the first three are the predicted top 3.

    Parameters
    ----------
    y_pred : DataFrame (or anything pd.DataFrame accepts)
        Predictions, one row per row of `df`, in the same row order. Only the first
        column is used.
    df : DataFrame, optional
        Frame holding 'race_id' and 'finishing_position' for the predicted rows.
        Defaults to the notebook-level `df_test`, which keeps existing
        `find_prob(y_pred)` calls working. Pass the matching frame explicitly when
        scoring predictions made on another set.

    Returns
    -------
    (top_accuracy, top3_accuracy) : tuple of float
        Share of actual winners that were predicted first, and share of actual
        top-3 finishers that were among the predicted top 3, each rounded to 3
        decimals. A value is NaN when the frame contains no such finishers.
    """
    if df is None:
        df = df_test

    # Work with row positions rather than index labels, so the pairing between frame rows
    # and prediction rows does not depend on the frame having a 0..n-1 index.
    predicted = pd.DataFrame(y_pred).iloc[:, 0].to_numpy()
    race_ids = df['race_id'].to_numpy()
    finishing = df['finishing_position'].to_numpy()

    count_top_winners = 0
    count_top_correct = 0
    count_top3_winners = 0
    count_top3_correct = 0

    for race in pd.unique(race_ids):
        rows = np.flatnonzero(race_ids == race)

        # Rows of this race ordered from lowest to highest predicted value
        ranked = rows[np.argsort(predicted[rows], kind='stable')]

        race_positions = finishing[rows]
        winners = set(rows[race_positions == 1])
        # Lower bound of 1: positions that were zero-filled because they were missing
        # must not be counted as top-3 finishes.
        placed = set(rows[(race_positions >= 1) & (race_positions <= 3)])

        count_top_winners += len(winners)
        count_top_correct += len(winners & set(ranked[:1]))
        count_top3_winners += len(placed)
        count_top3_correct += len(placed & set(ranked[:3]))

    # Calculate the accuracy (undefined, hence NaN, when there is nothing to find)
    if count_top_winners:
        top_accuracy = round(count_top_correct/count_top_winners, 3)
    else:
        top_accuracy = float('nan')

    if count_top3_winners:
        top3_accuracy = round(count_top3_correct/count_top3_winners, 3)
    else:
        top3_accuracy = float('nan')

    return top_accuracy, top3_accuracy

In [None]:
# Create an empty dataframe that collects one row of metrics per fitted model
# (run_model appends to it). Re-running this cell discards the rows collected so far.
results = pd.DataFrame(columns=['Model', 'RMSE_train', 'RMSE_test', 
                                'Generalization', 'Top1_Train_Accuracy', 'Top1_Test_Accuracy',
                                'Top3_Train_Accuracy', 'Top3_Test_Accuracy'])

In [None]:
results

In [None]:
# Define function to run the model
def run_model(model, X_train, y_train, X_test, y_test, X_unseen):
    """
    Fit `model` on standardised features, report its metrics and predict the unseen set.

    Side effects: prints the metrics and appends one row to the notebook-level
    `results` dataframe.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X)
    X_train, y_train : training features and target
    X_test, y_test : testing features and target
    X_unseen : features of the unseen set (no target)

    Returns
    -------
    DataFrame
        Single-column frame with the model's predictions for `X_unseen`.
    """
    # Store model name
    model_name = model.__class__.__name__

    # Scale the data: the scaler is fitted on the training features only and then
    # reused unchanged for the test and unseen features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X_unseen = scaler.transform(X_unseen)

    # Fit the model
    model.fit(X_train, y_train)

    # Predict on the training set
    y_train_pred = pd.DataFrame(model.predict(X_train))

    # Predict on the testing set
    y_test_pred = pd.DataFrame(model.predict(X_test))

    # Calculate the RMSE
    train_rmse = round(math.sqrt(mean_squared_error(y_train, y_train_pred)), 3)
    test_rmse = round(math.sqrt(mean_squared_error(y_test, y_test_pred)), 3)

    # Calculate the accuracy
    # NOTE(review): find_prob receives only the predictions, so the frame that supplies the
    # races and finishing positions is whatever find_prob uses internally. Confirm that the
    # train predictions are really scored against the training races.
    train_accuracy, train_accuracy_top3 = find_prob(y_train_pred)
    test_accuracy, test_accuracy_top3 = find_prob(y_test_pred)

    # Calculate generalization error percentage. When the rounded train RMSE is 0 the
    # ratio is undefined, so report NaN instead of raising ZeroDivisionError.
    if train_rmse:
        generalization_error = round((test_rmse - train_rmse)/train_rmse*100, 3)
    else:
        generalization_error = float('nan')

    # Print the results
    print('Model results for', model_name, ':')
    print('Train RMSE: ', train_rmse)
    print('Test RMSE: ', test_rmse)
    print('Generalization Error: ', generalization_error, '%', '\n')

    print('Train Accuracy for finding Top position: ', train_accuracy)
    print('Test Accuracy for finding Top position: ', test_accuracy, '\n')

    print('Train Accuracy for finding Top 3 positions: ', train_accuracy_top3)
    print('Test Accuracy for finding Top 3 positions: ', test_accuracy_top3)

    # Append the results to the dataframe (column order matches the `results` header)
    results.loc[len(results)] = [model_name, train_rmse, test_rmse, generalization_error,
                                 train_accuracy, test_accuracy, train_accuracy_top3, test_accuracy_top3]

    # predict on unseen data
    y_unseen_pred = pd.DataFrame(model.predict(X_unseen))

    return y_unseen_pred
        


### Model 1: Ridge Regression

In [None]:
X_train

In [None]:
y_train

In [None]:
X_test

In [None]:
# Configure the ridge model (this cell does not fit it; it only prints the training size
# and builds the estimator)
number = len(X_train)
print(number)
# NOTE(review): alpha_range is not used anywhere in the visible notebook; presumably a
# leftover from a hyper-parameter search.
alpha_range= np.arange(0,1000)
# NOTE(review): alpha=60396 is a hard-coded regularisation strength; how it was chosen is
# not shown here.
ridge = Ridge(alpha=60396, solver='cholesky')
# Default-parameter alternative, kept disabled:
#ridge = Ridge()

In [None]:
# Fit ridge, print and record its metrics in `results`, and keep its unseen-set predictions
ridge_pred = run_model(ridge, X_train, y_train, X_test, y_test, X_unseen)

### Model 2: K-Nearest Neighbors Regressor

In [None]:
# KNN regressor using 500 neighbours per prediction
knn = KNeighborsRegressor(n_neighbors=500)

# Fit it, print and record its metrics in `results`, and keep its unseen-set predictions
knn_pred = run_model(knn, X_train, y_train, X_test, y_test, X_unseen)


### Model 3: Random Forest Regressor

In [None]:
# Random forest: 30 trees, depth <= 4, 5 candidate features per split, at least 200
# samples per leaf, fixed seed. With min_samples_leaf=200 a node needs at least 400 samples
# to be split, so min_samples_split=20 never becomes the limiting constraint.
rf = RandomForestRegressor(n_estimators=30, max_depth=4, random_state=42, max_features=5,
                            min_samples_split=20, min_samples_leaf=200, n_jobs=-1)

# Fit it, print and record its metrics in `results`, and keep its unseen-set predictions
rf_pred = run_model(rf, X_train, y_train, X_test, y_test, X_unseen)

### Model 4: Light Gradient Boosting Machine (LightGBM)

In [None]:
# LightGBM: 20 boosting rounds, fixed seed. A tree limited to max_depth=5 has at most
# 2**5 = 32 leaves, so num_leaves=100 is never reached.
lgbm = LGBMRegressor(n_estimators=20, max_depth=5, random_state=42, num_leaves=100,
                     min_child_samples=10, min_child_weight=10, n_jobs=-1)

# Fit it, print and record its metrics in `results`, and keep its unseen-set predictions
lgbm_pred = run_model(lgbm, X_train, y_train, X_test, y_test, X_unseen)

### View results of our 4 regression models

In [None]:
# View the results
results

Since our objective is to have low RMSE, good generalisation, and good training accuracy, the LGBMRegressor meets all the criteria and we will choose it as our final model for backtesting.

In [None]:
# Save each model's unseen-set predictions
# NOTE(review): these files are written under 'flat_data', while the input CSVs of this
# notebook were read from 'hurdle_data'. Confirm the target folder is intended.
ridge_pred.to_csv('D:\\documentos\\IA Caballos\\flat_data\\ridge_pred.csv')
knn_pred.to_csv('D:\\documentos\\IA Caballos\\flat_data\\knn_pred.csv')
rf_pred.to_csv('D:\\documentos\\IA Caballos\\flat_data\\rf_pred.csv')
lgbm_pred.to_csv('D:\\documentos\\IA Caballos\\flat_data\\lgbm_pred.csv')

In [None]:
# Save the fitted LightGBM estimator.
# NOTE(review): run_model fits on standardised features but the scaler is created inside
# run_model and is not saved here; new data must be scaled the same way before predicting.
# NOTE(review): written under 'flat_data' although the inputs came from 'hurdle_data'.
joblib.dump(lgbm, 'D:\\documentos\\IA Caballos\\flat_data\\lgbm_model.pkl')

In [None]:
# Save the per-model metrics table
# NOTE(review): written under 'flat_data' although the inputs came from 'hurdle_data'.
results.to_csv('D:\\documentos\\IA Caballos\\flat_data\\reg_results.csv')