In [93]:
import numpy as np 
import pandas as pd 

from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

import warnings
warnings.simplefilter(action='ignore')

In [94]:
data = pd.read_csv("https://raw.githubusercontent.com/abdullabasim/dataset/main/HorseRacing.csv",encoding= 'unicode_escape')

data.head() 

Unnamed: 0,UID,ID,Tipster,Date,Track,Horse,Bet Type,Odds,Result,TipsterActive
0,1,1,Tipster A,24/07/2015,Ascot,Fredricka,Win,8.0,Lose,True
1,2,2,Tipster A,24/07/2015,Thirsk,Spend A Penny,Win,4.5,Lose,True
2,3,3,Tipster A,24/07/2015,York,Straightothepoint,Win,7.0,Lose,True
3,4,4,Tipster A,24/07/2015,Newmarket,Miss Inga Sock,Win,5.0,Lose,True
4,5,5,Tipster A,25/07/2015,Ascot,Peril,Win,4.33,Win,True


In [95]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38248 entries, 0 to 38247
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   UID            38248 non-null  int64  
 1   ID             38248 non-null  int64  
 2   Tipster        38248 non-null  object 
 3   Date           38248 non-null  object 
 4   Track          38248 non-null  object 
 5   Horse          38248 non-null  object 
 6   Bet Type       38248 non-null  object 
 7   Odds           38248 non-null  float64
 8   Result         38248 non-null  object 
 9   TipsterActive  38248 non-null  bool   
dtypes: bool(1), float64(1), int64(2), object(6)
memory usage: 2.7+ MB


In [96]:
data['Tipster'].value_counts()

Tipster X     4383
Tipster E     3700
Tipster B1    2497
Tipster A1    2446
Tipster D1    2119
Tipster J     1937
Tipster R     1901
Tipster C     1738
Tipster Q     1552
Tipster Y     1469
Tipster L     1425
Tipster T     1332
Tipster A     1331
Tipster O     1094
Tipster W      994
Tipster M      957
Tipster Z      883
Tipster H      833
Tipster D      741
Tipster E1     609
Tipster P      581
Tipster B      503
Tipster C1     501
Tipster I      453
Tipster N      415
Tipster V      406
Tipster U      383
Tipster S      383
Tipster G      290
Tipster K      246
Tipster F      146
Name: Tipster, dtype: int64

In [97]:
data['Track'].value_counts()

Kempton           2197
Wolverhampton     2113
Lingfield         2058
Ascot             1355
SouthWell         1326
                  ... 
Dusseldorf           1
Kilarney             1
Gowran park          1
Gulfsteam Park       1
Wahash               1
Name: Track, Length: 116, dtype: int64

In [98]:
data['Horse'].value_counts()

Doctor Parkes        26
Chookie Royale       23
Oriental Relation    21
Sennockian Star      21
Barnet Fair          20
                     ..
Billy Biscuit         1
Black Grass           1
Le Chat D or          1
Krakatoa King         1
Dream Farr            1
Name: Horse, Length: 15791, dtype: int64

In [99]:
data['Bet Type'].value_counts()

Win         30417
Each Way     7830
win             1
Name: Bet Type, dtype: int64

In [100]:
data.loc[data["Bet Type"] == "win", "Bet Type"] = 'Win'
data['Bet Type'].unique()

array(['Win', 'Each Way'], dtype=object)

In [101]:
data['Result'].value_counts()

Lose    30565
Win      7683
Name: Result, dtype: int64

In [102]:
count_win = len(data[data['Result'] == 'Win'])
count_lose = len(data[data['Result']== 'Lose'])

percentage_win = count_win /(count_win + count_lose)

print('percentage_win ' , percentage_win * 100)

percentage_no_win = count_lose /(count_win + count_lose)

print('percentage_no_win ' , percentage_no_win * 100)

percentage_win  20.087324827441957
percentage_no_win  79.91267517255804


In [103]:
data['TipsterActive'].value_counts()

True     25186
False    13062
Name: TipsterActive, dtype: int64

In [91]:
data =data.drop(['ID','UID','Date'],axis=1)

In [104]:
# collecting categorical varibale names 
cat_var = data.dtypes.loc[data.dtypes=='object'].index


In [105]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for var in cat_var:
    data[var] = le.fit_transform(data[var])

In [106]:
data.head()

Unnamed: 0,UID,ID,Tipster,Date,Track,Horse,Bet Type,Odds,Result,TipsterActive
0,1,1,0,818,2,5158,1,8.0,0,True
1,2,2,0,818,96,13108,1,4.5,0,True
2,3,3,0,818,114,13411,1,7.0,0,True
3,4,4,0,818,74,8976,1,5.0,0,True
4,5,5,0,851,2,10554,1,4.33,1,True


In [107]:
X = data.drop(['Result'], axis=1)
y = data['Result']

In [113]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [114]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier( random_state = 14)

parameters = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train,y_train)

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

In [67]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()

# fit the model
classifier.fit(X_train,y_train)

In [115]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

classifers = [GaussianNB(), LogisticRegression(n_jobs=-1), DecisionTreeClassifier(min_samples_leaf=5,min_samples_split=17,random_state=1),RandomForestClassifier(criterion='gini', max_depth=8,max_features ='auto',n_estimators =200 , random_state= 3) , KNeighborsClassifier(n_neighbors=5, leaf_size=50, p=3)]
for cl in classifers:
    clf = cl
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)*100
    print('Accuracy of %r Classifier = %2f' % (cl, accuracy) + ' %')
    y_pred = clf.predict(X_test)
    
    
    print(accuracy_score(y_test, y_pred))



    print('\n')

Accuracy of GaussianNB() Classifier = 61.352855 %
0.6135285457222313


Accuracy of LogisticRegression(n_jobs=-1) Classifier = 65.017177 %
0.6501717650907901


Accuracy of DecisionTreeClassifier(min_samples_leaf=5, min_samples_split=17, random_state=1) Classifier = 81.040406 %
0.8104040569278587


Accuracy of RandomForestClassifier(max_depth=8, max_features='auto', n_estimators=200,
                       random_state=3) Classifier = 71.102568 %
0.7110256829707181


Accuracy of KNeighborsClassifier(leaf_size=50, p=3) Classifier = 71.249796 %
0.7124979551774906




In [117]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = DecisionTreeClassifier(min_samples_leaf=5, min_samples_split=17, random_state=1), X = X_train, y = y_train, cv = 10)

print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 80.74 %
Standard Deviation: 0.52 %
