In [46]:
import warnings
warnings.simplefilter(action='ignore')

from utils import data_handler,features_handler

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression,LogisticRegression

# Tunable constants for the notebook.
# TEST_SIZE is the share of rows held out by train_test_split further down.
TEST_SIZE = 0.25
# SWITCH_SIZE is not referenced by any of the cells visible in this notebook.
SWITCH_SIZE = 0.50

# Seed NumPy's global random generator once, up front.
np.random.seed(100)

# HANDLING DATA

In [47]:
# One workbook per season, 2015 through 2018. The 2019 workbook is left out of
# this list: its data after the Australian Open was deleted, and the Australian
# Open matches are used for validation.
DATASET_PATHS = [f"{year}.xlsx" for year in range(2015, 2019)]

In [48]:
# Raw workbook columns handed to data_handler; all other columns are not listed here.
relevant_columns = [
    # match context
    "Date", "Tournament", "Court", "Surface", "Round", "Best of",
    # players
    "Winner", "Loser",
    # rankings and ranking points
    "WRank", "LRank", "LPts", "WPts",
    # tournament series
    "Series",
    # average odds columns
    "AvgL", "AvgW",
]

In [49]:
# Load each yearly workbook, then stack them into one frame with a fresh
# 0..n-1 index and alphabetically sorted columns.
yearly_frames = [pd.read_excel(path) for path in DATASET_PATHS]
matches_data = pd.concat(yearly_frames, sort=True, ignore_index=True)
matches_data.head()

Unnamed: 0,ATP,AvgL,AvgW,B365L,B365W,Best of,Comment,Court,Date,EXL,...,Tournament,W1,W2,W3,W4,W5,WPts,WRank,Winner,Wsets
0,1,1.2,4.31,1.18,4.5,3,Completed,Outdoor,2015-01-05,1.2,...,Brisbane International,6.0,6.0,,,,430.0,125.0,Duckworth J.,2.0
1,1,1.47,2.62,1.44,2.62,3,Completed,Outdoor,2015-01-05,1.48,...,Brisbane International,6.0,6.0,,,,341.0,149.0,Kokkinakis T.,2.0
2,1,3.3,1.32,3.5,1.28,3,Completed,Outdoor,2015-01-05,3.2,...,Brisbane International,6.0,6.0,,,,1195.0,31.0,Chardy J.,2.0
3,1,2.25,1.61,2.25,1.57,3,Completed,Outdoor,2015-01-05,2.3,...,Brisbane International,7.0,7.0,,,,797.0,53.0,Tomic B.,2.0
4,1,2.53,1.5,2.37,1.53,3,Completed,Outdoor,2015-01-06,2.5,...,Brisbane International,6.0,4.0,6.0,,,705.0,69.0,Kukushkin M.,2.0


In [50]:
# Clean the raw matches and derive the model features using the project helpers
# imported from `utils`. This takes some time (4-5 min).
#
# NOTE(review): both statements rebind `matches_data`, so re-running this cell
# without first re-running the load cell feeds already-processed data back in.
#
# NOTE(review): comparing the raw head() with the processed head(), AvgP1 equals
# the raw AvgW and AvgP2 equals the raw AvgL even on rows where P1_won is False
# (e.g. Simon G. vs Duckworth J.). The odds columns therefore do not appear to be
# switched together with the players, which would leak the outcome into the
# features - confirm in utils.data_handler.
matches_data = data_handler(matches_data,relevant_columns)
# The processed frame is passed as both arguments here; presumably the second
# argument is the match history the features are computed from - TODO confirm.
matches_data = features_handler(matches_data,matches_data)

Finished renaming columns!
Finish switching columns!
Finish dealing with unwanted values!
Finish dealing with non numerical values!
Finish dealing with experience feature!
Finish dealing with W/L feature!


### Deal with the train/test data

In [51]:
# Preview the first rows of the processed frame before picking the training columns.
matches_data.head()

Unnamed: 0,Date,Tournament,Court,Surface,Round,Best of,Series,P1_won,P1,P2,P1Rank,P2Rank,P1Pts,P2Pts,AvgP2,AvgP1,P1_Experince,P2_Experince,P1_W/L,P2_W/L
0,2015-01-05,0,0,0,0,3,0,False,Simon G.,Duckworth J.,21.0,125.0,1730.0,430.0,1.2,4.31,0,0,100.0,100.0
1,2015-01-05,0,0,0,0,3,0,True,Kokkinakis T.,Benneteau J.,149.0,25.0,341.0,1365.0,1.47,2.62,0,0,100.0,100.0
2,2015-01-05,0,0,0,0,3,0,True,Chardy J.,Golubev A.,31.0,72.0,1195.0,691.0,3.3,1.32,0,0,100.0,100.0
3,2015-01-05,0,0,0,0,3,0,False,Querrey S.,Tomic B.,35.0,53.0,1090.0,797.0,2.25,1.61,0,0,100.0,100.0
4,2015-01-06,0,0,0,0,3,0,True,Kukushkin M.,Copil M.,69.0,201.0,705.0,242.0,2.53,1.5,0,0,100.0,100.0


In [52]:
# Columns of the processed frame that are fed to the models.
features = [
    # match context
    "Tournament", "Court", "Surface", "Round", "Best of", "Series",
    # per-player columns, always in P1/P2 pairs
    "P1Rank", "P2Rank",
    "P1_Experince", "P2_Experince",
    "P1_W/L", "P2_W/L",
    "P1Pts", "P2Pts",
    "AvgP1", "AvgP2",
]

In [53]:
# Split the processed matches into train and test sets: X holds the feature
# columns, y holds the "P1_won" label, and TEST_SIZE of the rows are held out.
#
# The split seed is passed explicitly. Without it the shuffle draws from NumPy's
# global generator, so the partition depends on which cells (and how much random
# work) ran before this one, and re-running this cell alone gives a different split.
SPLIT_SEED = 100
X_train, X_test, y_train, y_test = train_test_split(
    matches_data[features],
    matches_data["P1_won"],
    test_size=TEST_SIZE,
    random_state=SPLIT_SEED,
)
print(f"Nr of training data:{len(X_train)}")
print(f"Nr of testing data:{len(X_test)}")

Nr of training data:7894
Nr of testing data:2632


# RANDOM FOREST

In [54]:
# Random forest of 100 trees using the Gini criterion, trained on the training
# split with two parallel jobs. The fitted estimator is the cell's output.
forest_gini = RandomForestClassifier(
    criterion="gini",
    n_estimators=100,
    n_jobs=2,
)
forest_gini.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [55]:
# Same forest setup (100 trees, two parallel jobs) but using the entropy
# criterion, trained on the same training split for comparison.
forest_entropy = RandomForestClassifier(
    criterion="entropy",
    n_estimators=100,
    n_jobs=2,
)
forest_entropy.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [56]:
# Compare the two random forests on the held-out split and keep the better one
# in `forest` for the rest of the notebook.
#
# NOTE(review): the winner is picked using the same test split whose score is
# then reported, so the reported number is slightly optimistic.

score_gini = forest_gini.score(X_test, y_test)
score_entropy = forest_entropy.score(X_test, y_test)

print(f"The score for the RandomForestClassifier with gini: {score_gini}")
print(f"The score for the RandomForestClassifier with entropy: {score_entropy}")

# Gini wins only when strictly better; a tie keeps the entropy forest.
if score_gini > score_entropy:
    forest, best_label, best_score = forest_gini, "gini index", score_gini
else:
    forest, best_label, best_score = forest_entropy, "entropy", score_entropy

print("\n")
# The label follows the actual selection instead of always claiming gini won.
print(f"{best_label} seems to fit the best:")
# Reuse the score computed above rather than scoring the chosen forest again.
print(best_score)

The score for the RandomForestClassifier with gini: 0.8126899696048632
The score for the RandomForestClassifier with entropy: 0.8119300911854104


gini index seems to fit the best:
0.8126899696048632


In [57]:
# Cross-tabulate the true labels against the selected forest's predictions on
# the held-out split.
preds = forest.predict(X_test)
pd.crosstab(
    index=y_test,
    columns=preds,
    rownames=["Actual Wins"],
    colnames=["Predicted Wins"],
)

Predicted Wins,False,True
Actual Wins,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1058,229
True,264,1081


# LOGISTIC REGRESSION

In [58]:
# Logistic regression fitted on the same training split as the forests,
# with the iteration cap raised to 2000.
regr = LogisticRegression(max_iter=2000)
regr.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [59]:
# Accuracy of the logistic regression on the held-out split.
regr.score(X_test, y_test)

0.6546352583586627

In [60]:
# Cross-tabulate the true labels against the logistic regression's predictions
# on the held-out split (this rebinds `preds` from the forest cell).
preds = regr.predict(X_test)
pd.crosstab(
    index=y_test,
    columns=preds,
    rownames=["Actual Wins"],
    colnames=["Predicted Wins"],
)

Predicted Wins,False,True
Actual Wins,Unnamed: 1_level_1,Unnamed: 2_level_1
False,827,460
True,449,896


## Evaluating the classifiers on the 2019 data

In [61]:
# Workbook read as the 2019 evaluation data; it goes through the same
# single-frame concat (sorted columns, fresh index) as the training data.
TEST_PATH = "FullAO.xlsx"
test_df = pd.concat(
    [pd.read_excel(TEST_PATH)],
    sort=True,
    ignore_index=True,
)

In [62]:
# Run the 2019 data through the same two project helpers as the training data.
# NOTE(review): both statements rebind `test_df`, so re-running this cell without
# re-running the load cell feeds already-processed data back in.
test_df = data_handler(test_df,relevant_columns)
# The second argument is the processed 2015-2018 frame rather than test_df itself;
# presumably it serves as the history for the derived features - TODO confirm.
test_df = features_handler(test_df,matches_data)

Finished renaming columns!
Finish switching columns!
Finish dealing with unwanted values!
Finish dealing with non numerical values!
Finish dealing with experience feature!
Finish dealing with W/L feature!


In [63]:
# Score the 2019 data on exactly the columns the models were fitted on.
# Taking the list from the training frame, instead of retyping it, keeps the
# training and evaluation feature sets (and their order) from drifting apart.
features = list(X_train.columns)

In [64]:
# Score both fitted models on the processed 2019 frame.
ao_inputs = test_df[features]
ao_target = test_df["P1_won"]

logistic_ao_score = regr.score(ao_inputs, ao_target)
print(f"The score for testing on the 2019 AO with LogisticRegression is: {logistic_ao_score} ")

forest_ao_score = forest.score(ao_inputs, ao_target)
print(f"The score for testing on the 2019 AO with RandomForestClassifier is: {forest_ao_score} ")

The score for testing on the 2019 AO with LogisticRegression is: 0.7007874015748031 
The score for testing on the 2019 AO with RandomForestClassifier is: 0.8346456692913385 


## Choosing the best method

In [65]:
# Side-by-side held-out scores of the two candidates: the logistic regression
# and the selected random forest.
model_names = ["LogisticalRegression", "RandomForestClassifier"]
score_list = [model.score(X_test, y_test) for model in (regr, forest)]

pd.DataFrame({"Name": model_names, "Score": score_list})

Unnamed: 0,Name,Score
0,LogisticalRegression,0.654635
1,RandomForestClassifier,0.81269


The choice is RandomForestClassifier, since it has the best score on both the held-out split and the 2019 data.