# Part 2: Horse Race Prediction
## Exploratory Data Analysis

In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
%load_ext autotime

warnings.filterwarnings("ignore")

In [None]:
# Directory that holds the hurdle CSV exports. Change this single constant
# when the data lives elsewhere instead of editing every read_csv call.
DATA_DIR = Path(r"D:\documentos\IA Caballos\hurdle_data")

# Read the train data
df_train = pd.read_csv(DATA_DIR / "df_train.csv")

# Read the test data
df_test = pd.read_csv(DATA_DIR / "df_test.csv")

# Concatenate the train and test data. ignore_index=True rebuilds a unique
# RangeIndex; otherwise the row labels of the two files repeat, which makes
# label-based alignment on the combined frame ambiguous.
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# NOTE: from this point on df_train holds train AND test rows combined
df_train = df.copy()

# Preview the combined frame (last expression so the notebook renders it)
df_train.head()

In [None]:
# Summarise df_train, which at this point holds the train and test rows
# concatenated by the previous cell
df_train.info()

In [None]:
# Histogram of declared horse weight with the mean marked by a dashed red line
horse_weight = df_train['declared_horse_weight']

plt.figure(figsize=(12, 8))
# sns.distplot is deprecated; histplot draws the same count histogram
sns.histplot(horse_weight, bins=100)
plt.title("Distribution of a horse weight")
plt.xlabel("Weight")
plt.ylabel("Count")
plt.axvline(horse_weight.mean(), color='r', linestyle='dashed', linewidth=2)

plt.show()


In [None]:
# Count the horse_id entries of every race, then tally how many races
# share each field size
numHorsePerRace = (
    df_train
    .groupby('race_id')['horse_id']
    .count()
    .value_counts()
)

In [None]:
# Bar chart: how many races (bar height) were run with each field size (x)
plt.figure(figsize=(10, 5))
sns.barplot(x=numHorsePerRace.index, y=numHorsePerRace.values)
plt.xlabel('Number of Horses', fontsize=12)
# The original had a bare `plt.ylabel`, which is a no-op attribute access and
# left the y-axis unlabelled
plt.ylabel('Number of Races', fontsize=12)

# Set title
plt.title('Distribution of Number of Horses per Race', fontsize=15);


In [None]:
# Plot the distribution of declared horse weight.
# sns.distplot is deprecated, so histplot is used for the count histogram.
# The labels name the column that is actually plotted.
plt.figure(figsize=(10, 5))
sns.histplot(df_train['declared_horse_weight'])
plt.xlabel('Declared Horse Weight', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.title('Distribution of Declared Horse Weight', fontsize=15);


In [None]:
# Plot recent average rank against win odds.
# win_odds is only converted from percentage text to a float by a later cell
# (which strips a trailing '%' through the .str accessor). Plotting the raw
# text would give a categorical y-axis, so a numeric copy is built here
# without modifying df_train.
win_odds = df_train['win_odds']
if not pd.api.types.is_numeric_dtype(win_odds):
    win_odds = win_odds.str.rstrip('%').astype('float') / 100.0

plt.figure(figsize=(10, 5))
sns.scatterplot(x='recent_ave_rank', y='win_odds',
                data=df_train.assign(win_odds=win_odds), s=5, color='g')
plt.xlabel('Recent Average Rank', fontsize=12)
plt.ylabel('Win Odds', fontsize=12)
plt.title('Recent Average Rank vs Win Odds', fontsize=15);


In [None]:
# Plot mean of win odds against finishing position.

# Convert win_odds from percentage text to a fraction. The dtype guard makes
# the cell safe to re-run: once the column is numeric, `.str` would raise.
if not pd.api.types.is_numeric_dtype(df_train['win_odds']):
    df_train['win_odds'] = df_train['win_odds'].str.rstrip('%').astype('float') / 100.0

# Take x and y from the same grouped Series. The original used
# finishing_position.unique() for x, which is in order of first appearance,
# while the groupby means are sorted by position, so bars could be mislabelled.
meanOddsPerPos = df_train.groupby('finishing_position')['win_odds'].mean()

plt.figure(figsize=(10, 5))
sns.barplot(x=meanOddsPerPos.index, y=meanOddsPerPos.values, palette='Greens_d')
plt.xlabel('Finishing Position', fontsize=12)
plt.ylabel('Mean Win Odds', fontsize=12)
plt.title('Mean Win Odds against Finishing Position', fontsize=15);

In [None]:
# Scatter of declared horse weight against win odds.
# The labels name the plotted column (declared_horse_weight); the previous
# "Actual Weight" wording referred to a column that is not used here.
plt.figure(figsize=(10, 5))
sns.scatterplot(x='declared_horse_weight', y='win_odds', data=df_train)
plt.xlabel('Declared Horse Weight', fontsize=12)
plt.ylabel('Win Odds', fontsize=12)
plt.title('Declared Horse Weight against Win Odds', fontsize=15);


In [None]:
# Scatter of declared horse weight against horse rate.
# Title and axis labels now agree with the plotted columns; the title used to
# say "Horse Number" although the y-axis is horse_rate.
plt.figure(figsize=(10, 5))
sns.scatterplot(x='declared_horse_weight', y='horse_rate', data=df_train)
plt.xlabel('Declared Horse Weight', fontsize=12)
plt.ylabel('Horse Rate', fontsize=12)
plt.title('Declared Horse Weight against Horse Rate', fontsize=15);


Hong Kong horse racing uses the handicap system. Horse number #1 has the highest rating and carries the heaviest weight.

In [None]:
# Mean declared horse weight for each horse_rate value
meanWtPerHorse = df_train.groupby('horse_rate')['declared_horse_weight'].mean()

# Bar chart of that mean. x must be the group keys (the index); the original
# passed the Series itself, which put the mean values on both axes.
plt.figure(figsize=(10, 5))
sns.barplot(x=meanWtPerHorse.index, y=meanWtPerHorse.values, palette='Greens_d')
plt.xlabel('Horse Rate', fontsize=12)
plt.ylabel('Mean Declared Horse Weight', fontsize=12)
plt.title('Mean Declared Horse Weight for Each Horse Rate', fontsize=15);


In [None]:
# Scatter of finishing position (y) against draw (x)
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=df_train, x='draw', y='finishing_position', ax=ax)
ax.set_xlabel('Draw', fontsize=12)
ax.set_ylabel('Finishing Position', fontsize=12)
ax.set_title('Distribution of Finishing Position against Draw', fontsize=15)


The gate number for each horse is drawn 2 days before the races. Gate number 1 is the closest to the inside rail. Thus, we would expect better performance from horses with lower draw numbers.

In [None]:
# Average finishing position per draw, keeping only the first 14 draws
# (draw #15 is left out). .iloc makes the slice explicitly positional; a bare
# [:14] is ambiguous when the draw index is numeric.
avgPos_vs_Draw = df_train.groupby('draw')['finishing_position'].mean().iloc[:14]

# Plot the distribution of average finishing position against draw
plt.figure(figsize=(10, 5))
sns.barplot(x=avgPos_vs_Draw.index, y=avgPos_vs_Draw.values, palette='Greens_d')
plt.xlabel('Draw', fontsize=12)
plt.ylabel('Average Finishing Position', fontsize=12)
plt.title('Distribution of Average Finishing Position against Draw', fontsize=15);


In [None]:
# Average declared horse weight at each finishing position
meanHorseWtPerPos = (
    df_train
    .groupby('finishing_position')['declared_horse_weight']
    .mean()
)
mean = meanHorseWtPerPos.values
print(mean)

# Bar chart of those averages, one bar per finishing position
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=meanHorseWtPerPos.index, y=mean, palette='Greens_d', ax=ax)
ax.set_xlabel('Finishing Position', fontsize=12)
ax.set_ylabel('Mean Horse Weight', fontsize=12)
ax.set_title('Distribution of Mean Horse Weight for Each Finishing Position', fontsize=15)

# Optional zoom on the y-axis (left disabled): ax.set_ylim(1070, 1120)


The plot is not conclusive: it does not show a clear relationship between horse weight and finishing position.

In [None]:
# Preview the first five rows of df_train
df_train.head(n=5)


In [None]:
# Features to include in the correlation matrix
# ('actual_weight' is intentionally left out)
cols = ['finishing_position',  'declared_horse_weight', 'draw', 'recent_ave_rank',
        'jockey_ave_rank','trainer_ave_rank', 'race_distance', 'jockey_hurdle_rate','trainers_hurdle_rate']

# Pairwise correlation of the selected columns
corr = df_train[cols].corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` replaces `np.bool`, an alias removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Plot the masked heatmap on a fixed -1..1 colour scale centred at 0
plt.figure(figsize=(10, 5))
sns.heatmap(corr, mask=mask, cmap='coolwarm', vmax=1, vmin=-1,
            center=0, square=False, linewidths=.5,
            cbar_kws={"shrink": .8}, annot=True)
plt.title('Correlation Matrix', fontsize=15)

# Rotate the tick labels for easier reading
plt.xticks(rotation=45);



## Find the best jockey

In [None]:
# Report how many distinct values the jockey column holds
n_jockeys = len(df_train['jockey'].unique())
print("There are {} jockeys in the dataset.".format(n_jockeys))

In [None]:
# Group the rows once per jockey and reuse the grouping for both metrics
rows_by_jockey = df_train.groupby('jockey')

# Wins per jockey: number of rows whose finishing_position equals 1
jockeyWins = rows_by_jockey['finishing_position'].apply(lambda pos: (pos == 1).sum())

# Win rate per jockey: mean of the HorseWin column
jockeyWinRate = rows_by_jockey['HorseWin'].mean()

In [None]:
# Plot win rate against number of wins, one point per jockey
plt.figure(figsize=(10, 5))
sns.scatterplot(x=jockeyWinRate, y=jockeyWins)
plt.xlabel('Win Rate', fontsize=12)
plt.ylabel('Number of Wins', fontsize=12)
plt.title('Distribution of Win Rate against Number of Wins', fontsize=15)

# Label the 5 jockeys with the most wins and the 5 with the highest win rate.
# The union removes duplicates, so a jockey in both top-5 lists is labelled
# once instead of having the same text drawn twice on the same point.
top_by_wins = jockeyWins.sort_values(ascending=False).head(5).index
top_by_rate = jockeyWinRate.sort_values(ascending=False).head(5).index
for name in top_by_wins.union(top_by_rate):
    plt.annotate(name, (jockeyWinRate.loc[name], jockeyWins.loc[name]))
    



In [None]:
# Number of rows (rides) recorded for each jockey
jockeyFreq = df_train['jockey'].value_counts()

# Histogram of those per-jockey counts.
# sns.distplot is deprecated; histplot draws the same count histogram.
plt.figure(figsize=(10, 5))
sns.histplot(jockeyFreq, bins=100)
plt.xlabel('Jockey Frequency', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Distribution of Jockey Frequency', fontsize=15);

## Find the best trainer

In [None]:
# Report how many distinct values the trainer column holds
trainer_names = df_train['trainer'].unique()
print('Number of trainers: ', len(trainer_names))

In [None]:
# Group the rows once per trainer and reuse the grouping for both metrics
rows_by_trainer = df_train.groupby('trainer')

# Wins per trainer: number of rows whose finishing_position equals 1
trainerWins = rows_by_trainer['finishing_position'].apply(lambda pos: (pos == 1).sum())

# Win rate per trainer: mean of the HorseWin column
trainerWinRate = rows_by_trainer['HorseWin'].mean()

In [None]:
# Scatter of win rate (x) against number of wins (y), one point per trainer
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(x=trainerWinRate, y=trainerWins, ax=ax)
ax.set_xlabel('Win Rate', fontsize=12)
ax.set_ylabel('Number of Wins', fontsize=12)
ax.set_title('Distribution of Win Rate against Number of Wins', fontsize=15)

# Label the five trainers with the most wins
most_wins = trainerWins.sort_values(ascending=False)[:5]
for trainer_name in most_wins.index:
    ax.annotate(trainer_name, (trainerWinRate[trainer_name], trainerWins[trainer_name]))

In [None]:
# Number of rows (runners) recorded for each trainer
trainerFreq = df_train['trainer'].value_counts()

# Histogram of those per-trainer counts.
# sns.distplot is deprecated; histplot draws the same count histogram.
plt.figure(figsize=(10, 5))
sns.histplot(trainerFreq, bins=100)
plt.xlabel('Trainer Frequency', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.title('Distribution of Trainer Frequency', fontsize=15);

In [None]:
# The trainer counts are also heavily skewed; display the per-trainer
# row counts computed by value_counts() in the previous cell
trainerFreq

## Find the best horse

In [None]:
# Report how many distinct values the horse_id column holds
horse_ids = df_train['horse_id'].unique()
print('Number of horses: ', len(horse_ids))

In [None]:
# Group the rows once per horse name and reuse the grouping for both metrics.
# NOTE(review): grouping is by horse_name while other cells count horses by
# horse_id - confirm that names are unique per horse.
rows_by_horse = df_train.groupby('horse_name')

# Wins per horse: number of rows whose finishing_position equals 1
horseWins = rows_by_horse['finishing_position'].apply(lambda pos: (pos == 1).sum())

# Win rate per horse: mean of the HorseWin column
horseWinRate = rows_by_horse['HorseWin'].mean()

# Scatter of win rate (x) against number of wins (y), one point per horse
fig, ax = plt.subplots(figsize=(20, 10))
sns.scatterplot(x=horseWinRate, y=horseWins, ax=ax)
ax.set_xlabel('Win Rate', fontsize=12)
ax.set_ylabel('Number of Wins', fontsize=12)
ax.set_title('Distribution of Win Rate against Number of Wins', fontsize=15)

# Label the five horses with the most wins
most_wins = horseWins.sort_values(ascending=False)[:5]
for horse in most_wins.index:
    ax.annotate(horse, (horseWinRate[horse], horseWins[horse]))

In [None]:
# Five horses with the most wins, highest first
horseWins.sort_values(ascending=False).head(5)

In [None]:
# Twenty horses with the highest win rate, highest first
horseWinRate.sort_values(ascending=False).head(20)

In [None]:
# Number of rows (race entries) recorded for each horse_id
horseFreq = df_train['horse_id'].value_counts()

# Histogram of those per-horse counts.
# sns.distplot is deprecated; histplot draws the same count histogram.
plt.figure(figsize=(10, 5))
sns.histplot(horseFreq, bins=100)
plt.xlabel('Races a horse run', fontsize=12)
# Fixes the 'Frquency' typo in the axis label
plt.ylabel('Frequency', fontsize=12)
plt.title('Distribution of Number of Races a Horse runs', fontsize=15);


In [None]:
# Scatter of each row's recent average rank (x) against its finishing position (y)
recent_rank = df_train['recent_ave_rank']
finish_pos = df_train['finishing_position']

fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(x=recent_rank, y=finish_pos, ax=ax)
ax.set_xlabel('Recent Average Rank', fontsize=12)
ax.set_ylabel('Finishing Position', fontsize=12)
ax.set_title('Distribution of Recent Average Rank of Horse against Finishing Position', fontsize=15)
