## Import packages 

In [1]:
from utils.create_features_utils import *
import pandas as pd
import numpy as np
from keras import models, layers
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras import metrics
from keras.models import load_model
import matplotlib.pyplot as plt

Using TensorFlow backend.


## Read match data with features

In [2]:
df = pd.read_csv('data/wimbledon_matches_with_feature.csv')

df = df.dropna()

df['diff_rank'] = df['player_0_rank'] - df['player_1_rank']

## List of features

In [3]:
features_list = [
 'diff_rank',
 'diff_match_win_percent',
 'diff_games_win_percent',
 'diff_5_set_match_win_percent',
 'diff_close_sets_percent',
 'diff_match_win_percent_grass',
 'diff_games_win_percent_grass',
 'diff_5_set_match_win_percent_grass',
 'diff_close_sets_percent_grass',
 'diff_match_win_percent_52',
 'diff_games_win_percent_52',
 'diff_5_set_match_win_percent_52',
 'diff_close_sets_percent_52',
 'diff_match_win_percent_grass_60',
 'diff_games_win_percent_grass_60',
 'diff_5_set_match_win_percent_grass_60',
 'diff_close_sets_percent_grass_60',
 'diff_match_win_percent_hh',
 'diff_games_win_percent_hh',
 'diff_match_win_percent_grass_hh',
 'diff_games_win_percent_grass_hh']

## Split Data intro Train (80 %) and Test (20%)

In [4]:
target = df.outcome
features = df[features_list]

train_features, test_features, train_target, test_target = train_test_split(features, target, test_size=0.20, random_state=1)


## Build the neural network. 
### Details
    - Number of Layers: 3. (2 Hidden Layers)
    - Number of Neuros in each layer: 64->32->1
    - Activation relu->relu->sigmoid
    - Stop if validation loss does not improve for 500 epochs
    - Save the best model which gives the maximum validation accuracy. 

In [13]:
network = models.Sequential()
network.add(layers.Dense(units=64, activation='relu', input_shape=(len(features.columns),)))
network.add(layers.Dense(units=32, activation='relu'))
network.add(layers.Dense(units=1, activation='sigmoid'))

network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 

es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=500)
mc = ModelCheckpoint('data/best_model.h5', monitor='val_loss', mode='min', verbose=2, save_best_only=True)

history = network.fit(train_features, train_target, 
            epochs=1000, verbose=0, batch_size=128, 
            validation_data=(test_features, test_target), callbacks=[es, mc]) 

saved_model = load_model('data/best_model.h5')


Epoch 00001: val_loss improved from inf to 0.60132, saving model to data/best_model.h5

Epoch 00002: val_loss did not improve from 0.60132

Epoch 00003: val_loss did not improve from 0.60132

Epoch 00004: val_loss improved from 0.60132 to 0.57316, saving model to data/best_model.h5

Epoch 00005: val_loss improved from 0.57316 to 0.56665, saving model to data/best_model.h5

Epoch 00006: val_loss improved from 0.56665 to 0.55409, saving model to data/best_model.h5

Epoch 00007: val_loss did not improve from 0.55409

Epoch 00008: val_loss did not improve from 0.55409

Epoch 00009: val_loss improved from 0.55409 to 0.54758, saving model to data/best_model.h5

Epoch 00010: val_loss did not improve from 0.54758

Epoch 00011: val_loss improved from 0.54758 to 0.53290, saving model to data/best_model.h5

Epoch 00012: val_loss did not improve from 0.53290

Epoch 00013: val_loss improved from 0.53290 to 0.52931, saving model to data/best_model.h5

Epoch 00014: val_loss improved from 0.52931 to 


Epoch 00149: val_loss did not improve from 0.49654

Epoch 00150: val_loss did not improve from 0.49654

Epoch 00151: val_loss did not improve from 0.49654

Epoch 00152: val_loss did not improve from 0.49654

Epoch 00153: val_loss did not improve from 0.49654

Epoch 00154: val_loss did not improve from 0.49654

Epoch 00155: val_loss did not improve from 0.49654

Epoch 00156: val_loss did not improve from 0.49654

Epoch 00157: val_loss did not improve from 0.49654

Epoch 00158: val_loss did not improve from 0.49654

Epoch 00159: val_loss did not improve from 0.49654

Epoch 00160: val_loss did not improve from 0.49654

Epoch 00161: val_loss did not improve from 0.49654

Epoch 00162: val_loss did not improve from 0.49654

Epoch 00163: val_loss did not improve from 0.49654

Epoch 00164: val_loss did not improve from 0.49654

Epoch 00165: val_loss did not improve from 0.49654

Epoch 00166: val_loss did not improve from 0.49654

Epoch 00167: val_loss did not improve from 0.49654

Epoch 00168


Epoch 00317: val_loss did not improve from 0.49654

Epoch 00318: val_loss did not improve from 0.49654

Epoch 00319: val_loss did not improve from 0.49654

Epoch 00320: val_loss did not improve from 0.49654

Epoch 00321: val_loss did not improve from 0.49654

Epoch 00322: val_loss did not improve from 0.49654

Epoch 00323: val_loss did not improve from 0.49654

Epoch 00324: val_loss did not improve from 0.49654

Epoch 00325: val_loss did not improve from 0.49654

Epoch 00326: val_loss did not improve from 0.49654

Epoch 00327: val_loss did not improve from 0.49654

Epoch 00328: val_loss did not improve from 0.49654

Epoch 00329: val_loss did not improve from 0.49654

Epoch 00330: val_loss did not improve from 0.49654

Epoch 00331: val_loss did not improve from 0.49654

Epoch 00332: val_loss did not improve from 0.49654

Epoch 00333: val_loss did not improve from 0.49654

Epoch 00334: val_loss did not improve from 0.49654

Epoch 00335: val_loss did not improve from 0.49654

Epoch 00336


Epoch 00479: val_loss did not improve from 0.49654

Epoch 00480: val_loss did not improve from 0.49654

Epoch 00481: val_loss did not improve from 0.49654

Epoch 00482: val_loss did not improve from 0.49654

Epoch 00483: val_loss did not improve from 0.49654

Epoch 00484: val_loss did not improve from 0.49654

Epoch 00485: val_loss did not improve from 0.49654

Epoch 00486: val_loss did not improve from 0.49654

Epoch 00487: val_loss did not improve from 0.49654

Epoch 00488: val_loss did not improve from 0.49654

Epoch 00489: val_loss did not improve from 0.49654

Epoch 00490: val_loss did not improve from 0.49654

Epoch 00491: val_loss did not improve from 0.49654

Epoch 00492: val_loss did not improve from 0.49654

Epoch 00493: val_loss did not improve from 0.49654

Epoch 00494: val_loss did not improve from 0.49654

Epoch 00495: val_loss did not improve from 0.49654

Epoch 00496: val_loss did not improve from 0.49654

Epoch 00497: val_loss did not improve from 0.49654

Epoch 00498

## Accuracy of the best model

In [14]:
_, train_acc = saved_model.evaluate(train_features, train_target, verbose=0)
_, test_acc = saved_model.evaluate(test_features, test_target, verbose=0)

print('Train Accuracy: %.3f, Test Accuracy: %.3f' % (train_acc, test_acc))

Train Accuracy: 0.772, Test Accuracy: 0.773


## 2019 Wimbledon Matches

In [15]:
df_2019 = pd.read_csv('data/wimbledon_2019.csv')
df_raw = pd.read_csv('data/mens/combined_raw_data.csv')

df_2019['Date'] = '2019/07/07'
df_2019['Surface'] = 'Grass'
df_2019['diff_rank'] = df_2019['player_0_rank'] - df_2019['player_1_rank']

## Creating features to make prediction

In [16]:
df_2019 = create_features(df_2019, df_raw)

Creating Player Career Stats All Surface
Creating Player Career Stats on Grass/Clay/Hard
Creating Player Career Stats All Surface Last 52 Weeks
Creating Player Career Stats on Grass/Clay/Hard Last 60 Weeks
Creating Player Head to Head Career Stats All Surface
Creating Player Head to Head Career Stats On Grass
Creating Difference Variables


## Model Predictions
    - Outcome 0 indicates player_0 will win and outcome 1 indicates player_1 will win

In [17]:
features_16 = df_2019[features_list]

df_2019['prediction'] = saved_model.predict_classes(features_16)
df_2019['probability'] = 1 - np.abs(df_2019.prediction - saved_model.predict_proba(features_16).flatten())

df_2019[['Round', 'player_0', 'player_1', 'prediction', 'probability']]

Unnamed: 0,Round,player_0,player_1,prediction,probability
0,Round of 16,Djokovic N.,Humbert U.,0,0.898167
1,Round of 16,Goffin D.,Verdasco F.,0,0.599459
2,Round of 16,Raonic M.,Pella G.,0,0.821116
3,Round of 16,Bautista Agut R.,Paire B.,0,0.752442
4,Round of 16,Querrey S.,Sandgren T.,0,0.748146
5,Round of 16,Nadal R.,Sousa J.,0,0.914172
6,Round of 16,Nishikori K.,Kukushkin M.,0,0.865524
7,Round of 16,Federer R.,Berrettini M.,0,0.818423
8,Quarter,Djokovic N.,Goffin D.,0,0.825125
9,Quarter,Bautista Agut R.,Pella G.,0,0.527465
