# Part 7: Deployment
- This section packages the trained machine learning model into a simple, usable app that punters can use to decide whether or not to place a bet.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import f1_score, roc_curve, auc, confusion_matrix, average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

import joblib
import pickle
import time

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# Load the training split (the index is kept exactly as read; no reset_index is applied)
train_csv_path = 'D:\\documentos\\IA Caballos\\flat_data\\df_train.csv'
df_train = pd.read_csv(train_csv_path)

# Load the test split the same way
test_csv_path = 'D:\\documentos\\IA Caballos\\flat_data\\df_test.csv'
df_test = pd.read_csv(test_csv_path)

In [None]:
# Replace every missing value in the training frame with 0 (in place), then display it.
# NOTE(review): fillna(0) applies to all columns, so missing entries in text
# columns that later cells parse as strings ('win_odds', 'race_distance')
# become the integer 0 -- confirm that is intended.
df_train.fillna(0, inplace=True)
df_train

In [None]:
# Replace every missing value in the test frame with 0 (in place), then display it.
# NOTE(review): as with the training frame, this also writes the integer 0 into
# any text columns that contain missing values -- confirm that is intended.
df_test.fillna(0, inplace=True)
df_test

In [None]:
# Strip the trailing '%' from 'win_odds', cast to float and divide by 100,
# first for the test frame and then for the training frame.
for frame in (df_test, df_train):
    frame['win_odds'] = frame['win_odds'].str.rstrip('%').astype('float') / 100.0

In [None]:
import re
from numba import jit, cuda

# Plain Python on purpose: the body is built on `re`, which Numba cannot
# compile, so a @jit decorator adds no speed-up here.
def convert_to_km(distance):
    '''
    Convert a race-distance string into a single number of miles.

    The string is read as a sequence of <number><unit> parts, where the unit is
    'm' (miles), 'f' (furlongs, 1/8 mile) or 'y' (yards, 1/1760 mile),
    e.g. '1m2f' -> 1.25, '5f' -> 0.625, '7f100y' -> ~0.9318.

    NOTE: despite the function name, the result is in miles, not kilometres;
    the name is kept so existing callers keep working.

    Parts with any other unit, and bare units with no number in front of them,
    contribute nothing. If a unit appears more than once, the last occurrence
    is the one that counts.

    distance: str, the distance text to parse.
    Returns: float, the total distance in miles (0.0 if nothing is recognised).
    Raises: TypeError if distance is not a string; ValueError if a number part
            is malformed (e.g. '1.2.3m').
    '''
    # Miles contributed by one unit of each recognised suffix
    miles_per_unit = {'m': 1.0, 'f': 0.125, 'y': 0.0005681818}

    parts = dict.fromkeys(miles_per_unit, 0.0)
    for number, unit in re.findall(r'([\d.]+)?([a-zA-Z]+)', distance):
        # The number group is optional in the pattern, so it can be empty;
        # skip those parts instead of crashing on float('')
        if number and unit in miles_per_unit:
            parts[unit] = float(number) * miles_per_unit[unit]

    return parts['m'] + parts['f'] + parts['y']

In [None]:
# Parse the textual race distance of every training row into a number.
# Mapping over the single column gives the same values as a row-wise
# DataFrame.apply(axis=1) without building a row object for every record.
df_train['race_distance'] = df_train['race_distance'].map(convert_to_km)

df_train.head(2)

In [None]:
# Parse the textual race distance of every test row into a number.
# Mapping over the single column gives the same values as a row-wise
# DataFrame.apply(axis=1) without building a row object for every record.
df_test['race_distance'] = df_test['race_distance'].map(convert_to_km)

df_test.head(2)

In [None]:
# Summarise the training frame: column names, dtypes and non-null counts
df_train.info()

In [None]:
# Columns carried into the modelling frames, including the 'HorseWin' label.
# ('actual_weight' is not selected.)
selected_columns = [
    'horse_name', 'horse_rate', 'jockey',
    'declared_horse_weight',
    'draw', 'win_odds', 'jockey_ave_rank',
    'recent_ave_rank', 'race_distance', 'HorseWin', 'HorseRankTop3',
    'jockey_flat_aw_rate',
    'jockey_flat_turf_rate',
    'trainers_flat_aw_rate',
    'trainers_flat_turf_rate',
]

# Take the same column subset from the training and the test data
df = df_train[selected_columns]
dftest = df_test[selected_columns]

In [None]:
# Count the non-null entries in each selected column
df.notnull().sum()

In [None]:
# Show the dtypes and non-null counts of the selected columns
df.info()

In [None]:
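# Optional: uncomment the code below to label-encode the categorical features ('horse_name', 'jockey'); otherwise this cell can be skipped. (LabelEncoder must be imported from sklearn.preprocessing first.)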
# # instantiate labelencoder object
# le_horse_name = LabelEncoder()

# # apply le on categorical feature columns
# df['horse_name'] = le_horse_name.fit_transform(df['horse_name'])

# # instantiate labelencoder object
# le_jockey = LabelEncoder()

# # apply le on categorical feature columns
# df['jockey'] = le_jockey.fit_transform(df['jockey'])


In [None]:
# Re-inspect the column dtypes.
# NOTE: the label-encoding code in the previous cell is commented out, so
# 'horse_name' and 'jockey' are only integers here if that code is re-enabled.
df.info()

In [None]:
# Preview the first rows of the modelling frame
df.head()

In [None]:
# Columns that are not fed to the model: names, both outcome flags and the
# columns left out of this version of the feature set.
non_feature_columns = [
    'horse_name', 'jockey', 'horse_rate', 'HorseWin', 'HorseRankTop3', 'draw',
    'jockey_flat_aw_rate',
    'jockey_flat_turf_rate',
    'trainers_flat_aw_rate',
    'trainers_flat_turf_rate',
]

# Training features and win label
X_train = df.drop(columns=non_feature_columns)
y_train = df['HorseWin']

# Test features and win label, built with the same exclusions
X_test = dftest.drop(columns=non_feature_columns)
y_test = dftest['HorseWin']

In [None]:
# Classifier: random forest with depth and leaf-size limits, seeded for reproducibility
rfc = RandomForestClassifier(random_state=42, max_depth=20, min_samples_leaf=10)

# Oversampler placed in front of the classifier, seeded the same way
sm = SMOTE(random_state=42)

# Chain the two stages: oversample first, then classify
steps = [('smote', sm), ('rfc', rfc)]
smote_rfc = Pipeline(steps=steps)

In [None]:
# Fit the pipeline (SMOTE step followed by the random forest) to the training data
smote_rfc.fit(X_train, y_train)

### Checking if the model is ok

In [None]:
# Specify kfold cross validation
kfold = KFold(n_splits=5)

# Mean weighted-F1 over the folds, rounded for display
cv_score = cross_val_score(smote_rfc, X_train, y_train, cv=kfold, scoring='f1_weighted').mean()
cv_score = round(cv_score, 3)

# Create a dataframe to store the predictions, keyed by race and horse
df_pred = pd.DataFrame()
df_pred['RaceID'] = df_test['race_id']
df_pred['HorseID'] = df_test['horse_id']

# Hard 0/1 predictions (used for F1, the confusion matrix and the betting backtest)
y_test_pred = smote_rfc.predict(X_test)

# Predicted probability of the positive class (HorseWin == 1).
# Average precision is a ranking metric: it needs scores, not hard labels.
win_column = list(smote_rfc.named_steps['rfc'].classes_).index(1)
y_test_score = smote_rfc.predict_proba(X_test)[:, win_column]

# Store the predictions in the dataframe
df_pred['HorseWin'] = y_test_pred

# Calculate the f1 score
f1 = f1_score(y_test, y_test_pred, average='weighted')
f1 = round(f1, 3)

# Calculate PR AUC from the probability scores
pr_auc = average_precision_score(y_test, y_test_score)
pr_auc = round(pr_auc, 3)

# Calculate TPR; labels are pinned so the matrix is always 2x2, even if one
# class is missing from the test labels or from the predictions
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred, labels=[0, 1]).ravel()
tpr = tp / (tp + fn)
tpr = round(tpr, 3)

# Print the results
print('Cross Validation Score: ', cv_score)
print('F1 Score: ', f1)
print('PR AUC (Avg Precision): ', pr_auc)
print('TPR: ', tpr)


In [None]:
# Define a function to backtest the betting strategy
def simple_class_strategy(model_pred, graph=True, test_data=None):
    '''
    Backtest a flat-stake strategy that bets one unit on every predicted winner.

    For each race (in order of first appearance in model_pred) one unit is
    deducted per horse with a predicted HorseWin of 1, and 'win_odds' is
    credited for each of those horses that actually won.

    model_pred: DataFrame with columns 'RaceID', 'HorseID' and 'HorseWin'
                (the model's 0/1 prediction).
    graph:      when truthy, plot the running balance and the bets per race.
    test_data:  DataFrame with 'finishing_position', 'win_odds', 'race_id',
                'HorseWin' (actual result) and 'horse_id'. Defaults to the
                notebook-level df_test.

    Returns: (money, bets_made) -- the final balance and a list with the
             number of bets placed in each race.

    NOTE(review): the credit per winning bet is the raw 'win_odds' value;
    confirm that column holds the payout per unit staked.
    '''
    if test_data is None:
        # Fall back to the notebook-level test frame
        test_data = df_test

    df_test_results = test_data[['finishing_position', 'win_odds',
                                 'race_id',
                                 'HorseWin', 'horse_id']]

    # rename columns so the keys match model_pred and the actual result
    # does not collide with the predicted 'HorseWin'
    df_test_results = df_test_results.rename(columns={'race_id': 'RaceID',
                                                      'horse_id': 'HorseID',
                                                      'HorseWin': 'ActualWin'})

    # merge the prediction with the test data
    df_backtest = pd.merge(model_pred, df_test_results, on=['RaceID', 'HorseID'], how='left')

    money = 0
    bets_made = []
    cumulative_money = [0]

    # sort=False keeps races in order of first appearance; grouping once avoids
    # re-filtering the whole frame for every race
    for _, race_rows in df_backtest.groupby('RaceID', sort=False):

        # find out the bets we made
        bets = race_rows[race_rows['HorseWin'] == 1]

        # deduct one unit for each bet we made
        deduction = -len(bets)

        # amount won from bets: odds of the backed horses that actually won
        amount_won = sum(bets['win_odds'] * bets['ActualWin'])

        # add the net result of this race to the balance
        money += (amount_won + deduction)

        # record the running balance and the number of bets for this race
        cumulative_money.append(money)
        bets_made.append(len(bets))

    if graph:
        # plot the cumulative money
        plt.figure(figsize=(10, 6))
        plt.plot(cumulative_money)
        plt.title('Cumulative Money')
        plt.xlabel('Races')
        plt.show()

        # plot the bets made
        plt.figure(figsize=(10, 6))
        plt.plot(bets_made)
        plt.title('Bets Made')
        plt.show()

    # print the final money and bets made
    print('Final Money: ', round(money, 3))
    print('Total Bets Made: ', round(sum(bets_made), 3), '\n')

    return money, bets_made

In [None]:
# Check that the strategy works
# app_testing receives the (final money, bets-per-race list) tuple the function returns
app_testing = simple_class_strategy(df_pred, graph=True)

In [None]:
# Write the prediction table (index column included) to the data folder
df_pred.to_csv('D:\\documentos\\IA Caballos\\flat_data\\deploy_pred.csv')


In [None]:
# Display the test frame as it will be stored alongside the model
df_test

In [None]:
# pickle the model

# Bundle the fitted pipeline together with the train and test frames
data = {"model": smote_rfc, 'train_data': df_train, 'test_data': df_test}

# Write the bundle to disk. The context manager flushes and closes the file,
# so the pickle is complete before anything tries to load it.
with open('D:\\documentos\\IA Caballos\\flat_data\\saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

### For demo purpose

In [None]:
# We want to be able to show that our deployment app is working
# Row labels of the predictions where the model backs the horse (HorseWin == 1)
index = df_pred[df_pred['HorseWin']==1].index

# df_pred's index holds row labels taken from df_test, so select by label.
# (.iloc would only give the same rows while the index is the default 0..n-1 range.)
horsesPred1 = df_test.loc[index]

# Keep only the columns we need ('actual_weight' is not included)
horsesPredCol = ['horse_name', 'horse_rate', 'jockey', 'draw',
                 'win_odds', 'race_distance',
                 'declared_horse_weight', 'recent_ave_rank',
                 'finishing_position', 'HorseWin', 'jockey_ave_rank', 'trainer_ave_rank', 'HorseRankTop3',
                 'jockey_flat_aw_rate',
                 'jockey_flat_turf_rate',
                 'trainers_flat_aw_rate',
                 'trainers_flat_turf_rate']

In [None]:
# Show the first 20 predicted winners, restricted to the demo columns
horsesPred1[horsesPredCol].head(20)

In [None]:
import pickle
def load_model(path='saved_steps.pkl'):
    '''
    Load the pickled bundle saved by this notebook and return it.

    path: location of the pickle file. The default is relative to the current
          working directory.
          NOTE(review): the save cell writes to
          'D:\\documentos\\IA Caballos\\flat_data\\saved_steps.pkl'; pass that
          path (or run from that folder) so the same file is read back.

    Returns: the unpickled object (the notebook stores a dict with the keys
             'model', 'train_data' and 'test_data').
    Raises: FileNotFoundError if the file does not exist.
    '''
    # SECURITY: unpickling can execute arbitrary code -- only load files that
    # this notebook (or another trusted source) produced.
    with open(path, 'rb') as saved_file:
        return pickle.load(saved_file)
# Reload the saved bundle and rebind the notebook names from it
data = load_model()
df_train = data['train_data']
df_test = data['test_data']
model = data['model']

In [None]:
# Display the object reloaded from the 'model' entry of the pickle
model

In [None]:
# Display the test frame reloaded from the 'test_data' entry of the pickle
df_test

In [None]:
# Display the training frame reloaded from the 'train_data' entry of the pickle
df_train