In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import math

import warnings
warnings.filterwarnings('ignore')


from sklearn import metrics

from scipy.special import legendre
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import seaborn as sns

### Reading CSV

In [2]:
df = pd.read_csv('all_matches_final.csv')

In [3]:
df.describe()

Unnamed: 0,draw_size,tourney_date,match_num,Player1_id,Player1_seed,Player1_ht,Player1_age,Player1_rank,Player1_rank_points,Player2_id,...,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,y
count,91957.0,91957.0,91957.0,91957.0,29123.0,85269.0,91768.0,88574.0,88574.0,91957.0,...,82354.0,82354.0,82354.0,82354.0,82354.0,82354.0,82354.0,82354.0,82354.0,92020.0
mean,52.479866,20037200.0,44.643138,103514.236785,7.330907,185.164573,25.72798,100.478741,1212.180516,103642.756767,...,4.756587,3.513721,80.730808,47.684411,31.64156,15.024455,12.0463,4.81055,8.762027,0.5
std,37.929683,80631.31,69.700276,3579.987407,6.402888,6.706831,3.68599,158.240459,1680.863174,4637.426335,...,4.598634,2.628285,29.509406,19.369342,14.430257,7.259284,4.19923,3.27229,4.125754,0.500003
min,4.0,19901230.0,1.0,100282.0,1.0,160.0,14.762491,1.0,0.0,100284.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.0,0.0,0.0
25%,32.0,19970110.0,8.0,102000.0,3.0,180.0,23.019849,25.0,430.0,101990.0,...,2.0,2.0,59.0,34.0,21.0,10.0,9.0,2.0,6.0,0.0
50%,32.0,20030910.0,21.0,103252.0,6.0,185.0,25.538672,58.0,729.0,103176.0,...,4.0,3.0,75.0,44.0,29.0,14.0,11.0,4.0,8.0,0.5
75%,64.0,20110120.0,41.0,104468.0,9.0,190.0,28.221766,105.0,1272.0,104386.0,...,7.0,5.0,96.0,58.0,39.0,19.0,15.0,7.0,11.0,1.0
max,128.0,20180920.0,701.0,208029.0,35.0,208.0,43.655031,2101.0,16950.0,207973.0,...,103.0,26.0,489.0,328.0,284.0,101.0,91.0,28.0,35.0,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92020 entries, 0 to 92019
Data columns (total 50 columns):
tourney_id             91957 non-null object
tourney_name           91957 non-null object
surface                91839 non-null object
draw_size              91957 non-null float64
tourney_level          91957 non-null object
tourney_date           91957 non-null float64
match_num              91957 non-null float64
Player1_id             91957 non-null float64
Player1_seed           29123 non-null float64
Player1_entry          14376 non-null object
Player1_name           91957 non-null object
Player1_hand           91941 non-null object
Player1_ht             85269 non-null float64
Player1_ioc            91957 non-null object
Player1_age            91768 non-null float64
Player1_rank           88574 non-null float64
Player1_rank_points    88574 non-null float64
Player2_id             91957 non-null float64
Player2_seed           28172 non-null float64
Player2_entry          14

In [5]:
players = ['Player1_id','Player1_name','Player2_id','Player2_name']

to_drop = ['tourney_id','tourney_date','score']

cat_cols = ['tourney_name','surface','draw_size','tourney_level','Player1_entry','Player1_hand','Player1_ioc',
            'Player2_entry','Player2_hand','Player2_ioc','best_of','round']


num_cols = ['match_num','Player1_seed','Player1_ht','Player1_age','Player1_rank','Player1_rank_points',
            'Player2_seed','Player2_ht','Player2_age','Player2_rank','Player2_rank_points','minutes']

betting_cols = ['w_ace','w_df','w_svpt','w_1stIn','w_1stWon','w_2ndWon','w_SvGms','w_bpSaved',
                'w_bpFaced','l_ace','l_df','l_svpt','l_1stIn','l_1stWon','l_2ndWon','l_SvGms',
                'l_bpSaved','l_bpFaced']

### Dropping Irrelevant Features and Betting Data

In [6]:
df = df.drop(to_drop, axis=1)
df = df.drop(betting_cols, axis=1)
df.head()

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,match_num,Player1_id,Player1_seed,Player1_entry,Player1_name,Player1_hand,...,Player2_hand,Player2_ht,Player2_ioc,Player2_age,Player2_rank,Player2_rank_points,best_of,round,minutes,y
0,Umag,Clay,32.0,A,279.0,105413.0,,,Andrej Martin,R,...,R,188.0,UZB,30.858316,82.0,624.0,3.0,R32,83.0,1
1,US Open,Hard,128.0,G,103.0,104269.0,,,Fernando Verdasco,L,...,R,188.0,SRB,18.272416,97.0,431.0,5.0,R32,184.0,1
2,Sydney Outdoor,Hard,32.0,A,5.0,101733.0,4.0,,Jan Siemerink,L,...,R,185.0,SWE,24.84052,40.0,966.0,3.0,R32,83.0,0
3,Costa Do Sauipe,Hard,32.0,A,27.0,103103.0,7.0,,Dominik Hrbaty,R,...,R,175.0,ARG,20.654346,70.0,568.0,3.0,QF,73.0,0
4,Queen's Club,Grass,56.0,A,19.0,104379.0,,Q,Todd Reid,R,...,R,188.0,FRA,23.386721,88.0,476.0,3.0,R64,75.0,1


### Replacing Missing Values

In [7]:
for i in cat_cols:
    df[i] = df[i].replace(np.NaN, df[i].mode()[0])

In [8]:
for i in num_cols:
    print(i)
    df[i] = df[i].replace(np.NaN,df[i].mean())

match_num
Player1_seed
Player1_ht
Player1_age
Player1_rank
Player1_rank_points
Player2_seed
Player2_ht
Player2_age
Player2_rank
Player2_rank_points
minutes


### convert categorical data to numeric

In [9]:
def convertCatToNum(dff):
    dff_new = pd.get_dummies(dff, columns=cat_cols) 
    return dff_new

In [10]:
df = convertCatToNum(df)
df.head()

Unnamed: 0,match_num,Player1_id,Player1_seed,Player1_name,Player1_ht,Player1_age,Player1_rank,Player1_rank_points,Player2_id,Player2_seed,...,best_of_5.0,round_BR,round_F,round_QF,round_R128,round_R16,round_R32,round_R64,round_RR,round_SF
0,279.0,105413.0,7.330907,Andrej Martin,180.0,27.82204,149.0,372.0,104797.0,7.257774,...,0,0,0,0,0,0,1,0,0,0
1,103.0,104269.0,7.330907,Fernando Verdasco,188.0,21.787817,48.0,770.0,104925.0,7.257774,...,1,0,0,0,0,0,1,0,0,0
2,5.0,101733.0,4.0,Jan Siemerink,183.0,26.732375,17.0,1530.0,102025.0,7.257774,...,0,0,0,0,0,0,1,0,0,0
3,27.0,103103.0,7.0,Dominik Hrbaty,183.0,24.678987,44.0,772.0,103909.0,7.257774,...,0,0,0,1,0,0,0,0,0,0
4,19.0,104379.0,7.330907,Todd Reid,180.0,20.010951,129.0,315.0,103693.0,7.257774,...,0,0,0,0,0,0,0,1,0,0


### Normalize numeric features

In [11]:
def normalize(dff,col_name_list):
    result = dff.copy()
    for feature_name in col_name_list:
        max_value = dff[feature_name].max()
        min_value = dff[feature_name].min()
        result[feature_name] = (dff[feature_name] - min_value) / (max_value - min_value)
    return result


df = normalize(df,num_cols)

In [12]:
df

Unnamed: 0,match_num,Player1_id,Player1_seed,Player1_name,Player1_ht,Player1_age,Player1_rank,Player1_rank_points,Player2_id,Player2_seed,...,best_of_5.0,round_BR,round_F,round_QF,round_R128,round_R16,round_R32,round_R64,round_RR,round_SF
0,0.397143,105413.0,0.186203,Andrej Martin,0.416667,0.452004,0.070476,0.021947,104797.0,0.184052,...,0,0,0,0,0,0,1,0,0,0
1,0.145714,104269.0,0.186203,Fernando Verdasco,0.583333,0.243154,0.022381,0.045428,104925.0,0.184052,...,1,0,0,0,0,0,1,0,0,0
2,0.005714,101733.0,0.088235,Jan Siemerink,0.479167,0.414290,0.007619,0.090265,102025.0,0.184052,...,0,0,0,0,0,0,1,0,0,0
3,0.037143,103103.0,0.176471,Dominik Hrbaty,0.479167,0.343220,0.020476,0.045546,103909.0,0.184052,...,0,0,0,1,0,0,0,0,0,0
4,0.025714,104379.0,0.186203,Todd Reid,0.416667,0.181655,0.060952,0.018584,103693.0,0.184052,...,0,0,0,0,0,0,0,1,0,0
5,0.100000,102610.0,0.294118,Albert Costa,0.416667,0.241069,0.005238,0.109558,102446.0,0.184052,...,0,0,0,0,0,0,1,0,0,0
6,0.001429,105583.0,0.186203,Dusan Lajovic,0.416667,0.340946,0.036190,0.039292,104665.0,0.184052,...,0,0,0,0,0,0,1,0,0,0
7,0.031429,102035.0,0.186203,Jonas Bjorkman,0.479167,0.213778,0.118571,0.007198,101703.0,0.184052,...,0,0,0,0,0,1,0,0,0,0
8,0.385714,106000.0,0.186203,Damir Dzumhur,0.250000,0.351843,0.042381,0.035693,104327.0,0.184052,...,0,0,0,0,0,0,1,0,0,0
9,0.055714,102437.0,0.186203,Adrian Voinea,0.520833,0.482612,0.040476,0.027375,102845.0,0.029412,...,0,0,0,0,0,0,1,0,0,0


In [13]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

### Splitting Data into train and test set

In [14]:
Y = pd.DataFrame(df['y'])
df = df.drop(['y'], axis=1)
X = df
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [15]:
X_train = X_train.drop(['Player1_id'], axis=1)
X_train = X_train.drop(['Player1_name'], axis=1)
X_train = X_train.drop(['Player2_id'], axis=1)
X_train = X_train.drop(['Player2_name'], axis=1)

In [16]:
test_names = []
for index, row in X_test.iterrows():
    v = []
    v.append(row['Player1_name'])
    v.append(row['Player2_name'])
    test_names.append(v)

In [17]:
X_test = X_test.drop(['Player1_id'], axis=1)
X_test = X_test.drop(['Player1_name'], axis=1)
X_test = X_test.drop(['Player2_id'], axis=1)
X_test = X_test.drop(['Player2_name'], axis=1)

In [18]:
import operator
def getWinner(preds):
    dic = {}
    
    for i in range(len(preds)):
        w = ''
        if(preds[i]==0):
            w = test_names[i][0]
        else:
            w = test_names[i][1]
        if w in dic:
            dic[w] = dic[w]+1
        else:
            dic[w] = 1
    winner = max(dic.items(), key=operator.itemgetter(1))[0]
    return winner

In [19]:
from sklearn.ensemble import RandomForestClassifier
def randomForest(est,dep):
    print("Random Forest")
    rdforest = RandomForestClassifier(n_estimators=est,max_depth=dep)
    rdforest.fit(X_train, y_train)
    rdpreds = rdforest.predict(X_test)
    print ("\tMean Squared Error: ",metrics.mean_squared_error(y_test,rdpreds))
    print ("\tR2 Score: ",metrics.r2_score(y_test,rdpreds))
    print ("\tTraining Accuracy: ",rdforest.score(X_train, y_train)*100)
    print ("\tTest Accuracy: ",metrics.accuracy_score(np.round(y_test),np.round(rdpreds))*100)
    return rdpreds

In [20]:
rf_preds = randomForest(140,30)
print("Winner: ",getWinner(rf_preds))

Random Forest


  """


	Mean Squared Error:  0.12737511114038266
	R2 Score:  0.49046746303615074
	Training Accuracy:  93.75537281235302
	Test Accuracy:  87.26248888596173
Winner:  Feliciano Lopez


In [21]:
from sklearn.linear_model import LogisticRegression
def logisticRegression(c):
    print("Logistic Regression")
    clf = LogisticRegression(penalty = 'l2', C = c,random_state = 0,solver='sag')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)*100
    print ("\tMean Squared Error:",metrics.mean_squared_error(y_test,y_pred))
    print ("\tR2 Score:",metrics.r2_score(y_test,y_pred))
    print ("\tTraining Accuracy: ",clf.score(X_train, y_train)*100)
    print("\tTest Accuracy: ",acc)
    return y_pred
    
    

In [22]:
lr_preds = logisticRegression(95)
print("Winner: ",getWinner(lr_preds))

Logistic Regression


  y = column_or_1d(y, warn=True)


	Mean Squared Error: 0.11693614779201106
	R2 Score: 0.5322259465463731
	Training Accuracy:  88.64775436718408
	Test Accuracy:  88.30638522079889
Winner:  Fabrice Santoro


In [23]:
from sklearn.neural_network import MLPClassifier
def NeuralNetwork(hl,iterations,Alpha,optimizer):
    print("Neural Network")
    clfmlp = MLPClassifier(hidden_layer_sizes=hl, max_iter=iterations, alpha=Alpha,
                     solver=optimizer, verbose=10,  random_state=21,tol=0.000000001)
    clfmlp.fit(X_train, y_train)
    mlp_preds = clfmlp.predict(X_test)
    print("mlp_preds: ",mlp_preds)
    print ("\tMean Squared Error: ",metrics.mean_squared_error(y_test,mlp_preds))
    print ("\tR2 Score: ",metrics.r2_score(y_test,mlp_preds))
    print ("\tTraining Accuracy: ",clfmlp.score(X_train, y_train)*100)
    print ("\tTest Accuracy: ",metrics.accuracy_score(np.round(y_test),np.round(mlp_preds))*100)
    return mlp_preds
    

In [24]:
hidden_layer = (100,100,100)
iterations = 500
alpha = 0.0001
opt = 'adam'
mlpreds = NeuralNetwork(hidden_layer,iterations,alpha,opt)
print("Winner: ",getWinner(lr_preds))

Neural Network


  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.36534691
Iteration 2, loss = 0.25091898
Iteration 3, loss = 0.21848294
Iteration 4, loss = 0.20541023
Iteration 5, loss = 0.19605858
Iteration 6, loss = 0.18711371
Iteration 7, loss = 0.17874818
Iteration 8, loss = 0.17138330
Iteration 9, loss = 0.16343382
Iteration 10, loss = 0.15879995
Iteration 11, loss = 0.15176708
Iteration 12, loss = 0.14574597
Iteration 13, loss = 0.14008320
Iteration 14, loss = 0.13413112
Iteration 15, loss = 0.12945519
Iteration 16, loss = 0.12361367
Iteration 17, loss = 0.12185025
Iteration 18, loss = 0.11416956
Iteration 19, loss = 0.10947682
Iteration 20, loss = 0.10606440
Iteration 21, loss = 0.10275064
Iteration 22, loss = 0.09698319
Iteration 23, loss = 0.09690764
Iteration 24, loss = 0.08776128
Iteration 25, loss = 0.08926625
Iteration 26, loss = 0.08324452
Iteration 27, loss = 0.08215117
Iteration 28, loss = 0.07578902
Iteration 29, loss = 0.07142450
Iteration 30, loss = 0.07305341
Iteration 31, loss = 0.06764127
Iteration 32, los

In [25]:
from sklearn.naive_bayes import GaussianNB

def naiveBayes():
    print("Naive Bayes")
    gnb = GaussianNB()
    y_pred = gnb.fit(X_train, y_train).predict(X_test)
    print ("\tMean Squared Error: ",metrics.mean_squared_error(y_test,y_pred))
    print ("\tR2 Score: ",metrics.r2_score(y_test,y_pred))
    print ("\tTraining Accuracy: ",gnb.score(X_train, y_train)*100)
    print("\tTest Accuracy: ",metrics.accuracy_score(y_test, y_pred)*100)
    return y_pred

In [26]:
nb_preds = naiveBayes()
print("Winner: ",getWinner(nb_preds))

Naive Bayes


  y = column_or_1d(y, warn=True)


	Mean Squared Error:  0.33440906246912766
	R2 Score:  -0.33772050487794436
	Training Accuracy:  66.32767261933726
	Test Accuracy:  66.55909375308723
Winner:  Roger Federer
