## Import Libraries

In [70]:
# Data-related
import numpy as np
import pandas as pd
import requests, zipfile, io, seaborn
from tqdm import tqdm 

# Machine Learning
from sklearn import svm
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split

## Download Data

##### Data from the years specified below will be downloaded and stored as a Pandas DataFrame.

In [22]:
dfs = []

for year in tqdm(range(2013, 2019)):
    r = requests.get('http://www.tennis-data.co.uk/{0}/{0}.zip'.format(year))
    z = zipfile.ZipFile(io.BytesIO(r.content))
    df = pd.read_excel(z.open('{0}.xlsx'.format(year)))
    df['year'] = year
    dfs.append(df)

df = pd.concat(dfs, sort=True).reset_index(drop=True)

100%|██████████| 6/6 [00:04<00:00,  1.21it/s]


In [23]:
df.head()

Unnamed: 0,ATP,AvgL,AvgW,B365L,B365W,Best of,Comment,Court,Date,EXL,...,W1,W2,W3,W4,W5,WPts,WRank,Winner,Wsets,year
0,1,2.78,1.42,3.0,1.36,3,Completed,Outdoor,2012-12-31,2.65,...,6.0,6.0,,,,1215.0,28.0,Mayer F.,2.0,2013
1,1,2.05,1.73,2.2,1.61,3,Completed,Outdoor,2012-12-31,2.0,...,6.0,2.0,6.0,,,927.0,41.0,Nieminen J.,2.0,2013
2,1,3.58,1.28,3.75,1.25,3,Completed,Outdoor,2012-12-31,3.75,...,7.0,6.0,,,,1830.0,19.0,Nishikori K.,2.0,2013
3,1,7.76,1.08,9.0,1.07,3,Completed,Outdoor,2012-12-31,8.0,...,6.0,6.0,,,,1070.0,36.0,Baghdatis M.,2.0,2013
4,1,1.85,1.88,1.8,1.9,3,Completed,Outdoor,2013-01-01,1.87,...,6.0,6.0,,,,897.0,43.0,Istomin D.,2.0,2013


In [24]:
df.columns

Index(['ATP', 'AvgL', 'AvgW', 'B365L', 'B365W', 'Best of', 'Comment', 'Court',
       'Date', 'EXL', 'EXW', 'L1', 'L2', 'L3', 'L4', 'L5', 'LBL', 'LBW',
       'LPts', 'LRank', 'Location', 'Loser', 'Lsets', 'MaxL', 'MaxW', 'PSL',
       'PSW', 'Round', 'SJL', 'SJW', 'Series', 'Surface', 'Tournament', 'W1',
       'W2', 'W3', 'W4', 'W5', 'WPts', 'WRank', 'Winner', 'Wsets', 'year'],
      dtype='object')

## Code Variables and Predictors

##### We first need to code the outcomes of each match. For a logistic regression, one method would be to set it as 1 if Player A wins and 0 if Player A loses. Since the winners and losers are in separate columns, we should assign them as Player A in approximately equal numbers.

##### Eventually, what we want to do is maximize the return on investment (ROI) rather than simply predict the outcome of matches. Furthermore, prediction strategies should probably use a sliding window rather than the entire history of matches.

In [67]:
# Find a more ELEGANT way to do this

for index, match in tqdm(df.iterrows()) :
    if index in range(0, len(df), 2) :
        df.loc[index,'Player A'] = match['Winner']
        df.loc[index,'Player B'] = match['Loser']
        df.loc[index,'Outcome'] = 1
        df.loc[index,'B365 Diff'] = match['B365W'] - match['B365L']
        df.loc[index,'Rank Diff'] = match['WRank'] - match['LRank']
        df.loc[index,'Game Diff 2SETS'] = match[['W1','W2']].sum() - match[['L1','L2']].sum()
    elif index in range(1, len(df), 2):
        df.loc[index,'Player A'] = match['Loser']
        df.loc[index,'Player B'] = match['Winner']
        df.loc[index,'Outcome'] = 0
        df.loc[index,'B365 Diff'] = match['B365L'] - match['B365W']
        df.loc[index,'Rank Diff'] = match['LRank'] - match['WRank']
        df.loc[index,'Game Diff 2SETS'] = match[['L1','L2']].sum() - match[['W1','W2']].sum()

df = df.dropna(subset=['Rank Diff','B365 Diff','Game Diff 2SETS'])

15757it [01:30, 173.99it/s]


[Count] Elapsed: 90.57 seconds


## Machine Learning

##### Let's start with Logistic Regression.

In [99]:
clf = LogisticRegression(solver='lbfgs') # Silence Future Warning

XB = np.array(df['B365 Diff']).reshape(-1,1)
XB = scale(XB)
yB = df['Outcome']
scoresB = cross_val_score(clf,XB,yB,cv=5)

print('Accuracy using Betting Odds: %.3f' % scoresB.mean())

# 5-fold Cross Validated Scores
# X = df[['Rank Diff','B365 Diff','Game Diff 2SETS']]
X = df[['Rank Diff','B365 Diff']]
X = scale(X)
y = df['Outcome']
scores = cross_val_score(clf,X,y,cv=5)
print('Mean Accuracy: %.3f' % scores.mean())



Accuracy using Betting Odds: 0.696
Mean Accuracy: 0.661


##### Support Vector Machine.

In [88]:
clf = svm.SVC(kernel='linear', C=1)

X = df[['Rank Diff','B365 Diff','Game Diff 2SETS']]
X = scale(X)
y = df['Outcome']

# 5-fold Cross Validated Scores
scores = cross_val_score(clf,X,y,cv=5)
print('Mean Accuracy: %.3f' % scores.mean())

  


Mean Accuracy: 0.842
