# Part 5: Horse Race Prediction
## Evaluation of Model

In [None]:
!pip install lightgbm

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC

from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score, roc_curve, auc, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve

from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import time
import joblib
import pickle

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# Read the train and test files.
# The data directory is defined once so the notebook can be pointed at another
# machine or dataset location by editing a single value.
from pathlib import Path

DATA_DIR = Path('D:\\documentos\\IA Caballos\\flat_data')

df_train = pd.read_csv(DATA_DIR / 'df_train.csv')
df_test = pd.read_csv(DATA_DIR / 'df_test.csv')

In [None]:
# Replace every missing value in the training frame with 0, rebinding the name
# to the filled copy, then display the frame.
df_train = df_train.fillna(0)
df_train

In [None]:
# Replace every missing value in the test frame with 0, rebinding the name to
# the filled copy, then display the frame.
df_test = df_test.fillna(0)
df_test

In [None]:
# Show (rows, columns) of the train frame and then of the test frame,
# one tuple per line.
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
import re
from numba import jit, cuda

def convert_to_km(distance):
    '''
    Convert a race-distance string made of miles/furlongs/yards parts into a
    single float.

    distance is a string of "<number><unit>" parts, where the recognised
    units are:
        m -> value taken as-is
        f -> value * 0.125
        y -> value * 0.0005681818
    e.g. '1m', '7f', '1m2f', '1m 4f 110y'

    NOTE: despite the function name, the conversion factors imply that the
    result is expressed in miles (a furlong is 1/8 mile, a yard is 1/1760
    mile), not kilometres. The name is kept so existing callers still work.

    Parts with an unrecognised unit, or with a unit but no number, are
    ignored. If a unit appears more than once, the last occurrence wins.
    Values that are not strings (for example the 0 written by fillna, or a
    float produced by an earlier run of the conversion cell) are returned
    unchanged as float.

    This is deliberately plain Python: the body relies on `re` and string
    handling, which numba cannot compile, so no @jit decorator is applied.
    '''
    # Nothing to parse when the cell is already numeric.
    if not isinstance(distance, str):
        return float(distance)

    miles = 0.0
    furlongs = 0.0
    yards = 0.0

    # Each match is a (number, unit) pair such as ('1', 'm'); the number group
    # is optional in the pattern and comes back as '' when absent.
    for number, unit in re.findall(r'([\d\.]+)?([a-zA-Z]+)', distance):
        if not number:
            # A bare unit (or a plain word) carries no distance.
            continue
        if unit == 'm':
            miles = float(number)
        elif unit == 'f':
            furlongs = float(number) * 0.125
        elif unit == 'y':
            yards = float(number) * 0.0005681818

    return miles + furlongs + yards

In [None]:
# Parse the distance strings column-wise: Series.map calls convert_to_km once
# per value, without materialising a full row object for every record as the
# row-wise DataFrame.apply(axis=1) did.
df_train['race_distance'] = df_train['race_distance'].map(convert_to_km)

df_train.head(2)

In [None]:
# Parse the distance strings column-wise: Series.map calls convert_to_km once
# per value, without materialising a full row object for every record as the
# row-wise DataFrame.apply(axis=1) did.
df_test['race_distance'] = df_test['race_distance'].map(convert_to_km)

df_test.head(2)

In [None]:
def _percent_to_fraction(column):
    """Strip trailing '%' characters from each string, cast to float and divide by 100."""
    stripped = column.str.rstrip('%')
    return stripped.astype('float') / 100.0


# Same order as before: test frame first, then train frame.
df_test['win_odds'] = _percent_to_fraction(df_test['win_odds'])
df_train['win_odds'] = _percent_to_fraction(df_train['win_odds'])

In [None]:
# Preview the first 2 rows of the training frame after the race_distance and
# win_odds conversions above
df_train.head(2)

In [None]:
# Preview the first 2 rows of the test frame after the race_distance and
# win_odds conversions above
df_test.head(2)

In [None]:
# Keep the features we want to train our model on.
# The list is defined once and used for both frames so the train and test
# matrices always have the same columns in the same order.
# ('actual_weight' was commented out of the original selection and stays excluded.)
FEATURE_COLUMNS = [
    'declared_horse_weight',
    'draw',
    'win_odds',
    'jockey_ave_rank',
    'trainer_ave_rank',
    'recent_ave_rank',
    'race_distance',
    'jockey_flat_aw_rate',
    'jockey_flat_turf_rate',
    'trainers_flat_aw_rate',
    'trainers_flat_turf_rate',
]

# Training inputs and the single training target
X_train = df_train[FEATURE_COLUMNS]
y_train = df_train['HorseWin']

# Test inputs; the test targets keep both outcome columns
X_test = df_test[FEATURE_COLUMNS]
y_test = df_test[['HorseWin', 'HorseRankTop3']]


In [None]:
# Show (rows, columns) of the selected training feature matrix
X_train.shape

# Find the feature importances

In [None]:
# Open the pickled models.
# NOTE: joblib.load unpickles arbitrary Python objects, so only load model
# files that come from a trusted source.
from pathlib import Path

# Defined once so the model location can be changed in a single place.
MODEL_DIR = Path('D:\\documentos\\IA Caballos\\flat_data')

smote_rfc = joblib.load(MODEL_DIR / 'smote_rfc_model.pkl')
lgbm = joblib.load(MODEL_DIR / 'lgbm_model.pkl')

In [None]:
# Refit a model and report/plot its feature importances
def feature_importance(model, X_train, y_train):
    """Fit ``model`` on the given data, then print and plot its feature importances.

    The importances are listed in ascending order (least important first), both
    in the printed summary and in the horizontal bar chart.

    Parameters
    ----------
    model : object supporting ``fit`` and nested integer indexing
        Refitted in place by this call.
    X_train : object with a ``columns`` attribute
        Training features; the column labels are used as feature names.
    y_train : training target passed straight to ``model.fit``.
    """
    model.fit(X_train, y_train)

    # NOTE(review): assumes model[1][1] resolves to the fitted estimator that
    # exposes feature_importances_ - confirm against how the model was built.
    scores = model[1][1].feature_importances_

    # Ascending order of importance, applied to both scores and names
    order = np.argsort(scores)
    ranked_scores = scores[order]
    ranked_names = X_train.columns[order]

    # Text summary, one line per feature
    for name, score in zip(ranked_names, ranked_scores):
        print('Feature: %s, Score: %.5f' % (name, score))

    # Horizontal bar chart of the same ranking
    plt.figure(figsize=(10, 6))
    plt.barh(ranked_names, ranked_scores)
    plt.title('Feature Importance')
    plt.show()

In [None]:
# Refit the loaded smote_rfc model on the training data, then print and plot
# its feature importances
feature_importance(smote_rfc, X_train, y_train)


## Plot SHAP values

In [None]:
!pip install shap

In [None]:
# SHAP values
import shap

# Refit the model loaded from lgbm_model.pkl on the selected training
# features and the HorseWin target
lgbm.fit(X_train, y_train)

# Build an explainer around the model's predict function, with X_test passed
# as the second argument (the data the explainer uses as its background).
# NOTE(review): if lgbm is a classifier, `predict` returns class labels rather
# than probabilities - confirm this is the output that should be explained.
explainer = shap.Explainer(lgbm.predict, X_test)

# Calculate SHAP values for the rows of X_test
shap_values = explainer(X_test)

In [None]:
# Beeswarm plot of the SHAP values computed for X_test (one dot per observation
# and feature)
shap.plots.beeswarm(shap_values)

The beeswarm plot is used to understand the importance, or contribution, of each feature across the whole dataset. Looking at the Recent Average Rank variable, low values (in blue) have a very high contribution towards the prediction. Each dot represents a single observation, and the horizontal axis represents the SHAP value.

All variables are shown in order of global feature importance, the first one being most important and last one being least important.

In [None]:

# Bar plot of the SHAP values: features ranked by the size of their effect on
# the prediction
shap.plots.bar(shap_values)


In the bar plot, the features are ordered from the highest to the lowest effect on the prediction. It takes into account the absolute SHAP value, so it does not matter whether a feature affects the prediction in a positive or a negative way.

In [None]:

# Waterfall plot of the SHAP values for a single observation (the first row of
# shap_values)
shap.plots.waterfall(shap_values[0])

This local plot (a waterfall plot) shows the main features affecting the prediction for a single observation, and the magnitude of the SHAP value for each feature. Here we can see how the sum of all SHAP values equals the difference between the prediction and the expected value.