In [1]:
import Parser
import Process_Data
import Feature_Engineering as fe

import zipfile
import datetime
import pandas as pd
import os

## DOWNLOAD DATA

In [2]:
## SELECT THE CHROMEDRIVER BINARY FOR THE CURRENT PLATFORM
## macOS driver:   chromedriver_mac  (Chrome version 90.0.4430.212)
## Windows driver: chromedriver.exe  (Chrome version 91.0.4472.77)
# os.name is 'nt' on Windows; every other platform keeps the previous
# default (the macOS driver), so behaviour on macOS is unchanged.
driver_name = 'chromedriver.exe' if os.name == 'nt' else 'chromedriver_mac'

# Resolve the driver relative to the current working directory and keep both
# the containing directory and the full path available to later cells.
script_dir = os.path.dirname(os.path.realpath(driver_name))
webdriver_path = os.path.join(script_dir, driver_name)

In [None]:
## FOR WINDOWS (manual override: uncomment the two assignments below to point at chromedriver.exe)
## Chrome version 91.0.4472.77
# script_dir = os.path.dirname(os.path.realpath('chromedriver.exe'))
# webdriver_path = os.path.join(script_dir, 'chromedriver.exe')
##

In [38]:
## UNPACK DATA
# Extract every member of the bundled archive. The target path is the empty
# string, so members are written relative to the current working directory.
with zipfile.ZipFile("Data/data.zip", "r") as data_archive:
    data_archive.extractall(path="")

In [4]:
def _run_processing_steps(match_list):
    """Apply the Process_Data clean-up steps to ``match_list`` in a fixed order.

    Each step is called with ``match_list`` as its only argument and its
    return value is ignored, so the steps are relied on for their effect on
    the list that is passed in.
    """
    processing_steps = (
        Process_Data.delete_trash,
        Process_Data.split_sets_and_dur,
        Process_Data.fix_set_score,
        Process_Data.split_stats,
        Process_Data.create_srv_games,
        Process_Data.create_bp_stat,
    )
    for step in processing_steps:
        step(match_list)


# CSV files that are both read and written by this cell
OLD_MATCHES_CSV = 'Data/old_matches.csv'
NEW_MATCHES_CSV = 'Data/new_matches.csv'


## PARSE
# `matches` and `hrefs` are handed to the parser as containers
# (presumably filled in place by Parser.parse -- verify against Parser).
hrefs = set()
matches = []

# Date window: starts tomorrow, ends two days after the start date.
startdate = datetime.date.today() + datetime.timedelta(1)
enddate = startdate + datetime.timedelta(2)
Parser.parse(startdate, enddate, matches, hrefs, webdriver_path)


## UPDATE OLD MATCHES
filename = OLD_MATCHES_CSV
old_matches = Process_Data.csv_to_list(filename)

# Round-trip the freshly parsed matches through the new-matches CSV, then
# rewrite the old-matches CSV with the stored rows followed by the new ones.
matches = Process_Data.unpack_and_change(matches)
Process_Data.make_csv(matches, NEW_MATCHES_CSV)
matches_after_parse = Process_Data.csv_to_list(NEW_MATCHES_CSV)
Process_Data.make_csv(old_matches + matches_after_parse, OLD_MATCHES_CSV)


## PROCESS OLD MATCHES
# Note: this is the list loaded before the CSV was rewritten above.
_run_processing_steps(old_matches)


## PROCESS NEW MATCHES
filename = NEW_MATCHES_CSV
new_matches = Process_Data.csv_to_list(filename)
_run_processing_steps(new_matches)


## UNION MATCHES --> CSV
filename = 'Data/matches_final.csv'
matches = old_matches + new_matches
Process_Data.to_csv(matches, filename)


## CSV --> PICKLE
data = pd.read_csv(filename)
filename = 'Data/matches_final.pkl'
data.to_pickle(filename)


Parsed matches on date: 2021-05-28 7


  interactivity=interactivity, compiler=compiler, result=result)


## FEATURE ENGINEERING

In [5]:
## READ DATA
PLAYERS_PKL = 'Data/players.pkl'

df = pd.read_pickle('Data/matches_final.pkl')
# The players table is loaded twice so that each copy can receive its own
# name fix (fix_names1 / fix_names2) below.
players1 = pd.read_pickle(PLAYERS_PKL)
players2 = pd.read_pickle(PLAYERS_PKL)


## CHANGE TIME TYPE
fe.fix_time(df)


## FIX PLAYERS' NAMES SO STATS CAN BE TRANSFERRED, THEN INDEX BY NAME
fe.fix_names1(players1)
fe.fix_names2(players2)
players1 = players1.set_index(keys=['name'])
players2 = players2.set_index(keys=['name'])


## TRANSFER STATS: PLAYERS --> DATAFRAME
fe.add_stats(df, players1)
fe.add_stats(df, players2, 0)


## FIX LETTERS IN DATA
fe.fix_letters(df)


## CREATE RETIREMENT STAT
fe.create_retirement_stat(df)


## CONVERT AGE TO DATETIME (once per player slot)
for player_no in (1, 2):
    fe.fix_age(df, player_no)


## CONVERT DATE-LIKE COLUMNS TO DATETIME FOR THE NEXT STEPS
for date_col in ('date', 'p1_birth', 'p2_birth'):
    df[date_col] = pd.to_datetime(df[date_col])


## PERCENTAGE OF POINTS WON ON RETURN
fe.create_feature_WRP(df)


## NORMALIZE TO PER-GAME STATS
fe.create_aces_per_game(df)
fe.create_double_faults_per_game(df)


## CREATE STATS BASED ON COMMON OPPONENTS
stats = [
    'first_serve_prc_match_player1',
    'first_serve_points_prc_match_player1',
    'second_serve_points_prc_match_player1',
    'winning_on_return_prc_player1',
    'aces_per_game_player1',
    'df_per_game_player1',
    'break_points_prc_match_player1',
]
fe.create_common_stats(df, stats)


## CREATE NEW FEATURES
fe.create_feature_WSP(df)
fe.create_feature_COMPLETE(df)
fe.create_feature_SERVEADV(df)


## ADD INVERTED MATCHES BEFORE STARTDATE
# NOTE: this rebinds `startdate` to a string; the parse cell had bound it to
# a datetime.date. Later cells receive this string value.
startdate = '2021-01-01'
df = fe.invert(df, startdate)


## DROP ROWS WHERE k1 OR k2 IS NaN
df.dropna(subset=['k1', 'k2'], inplace=True)


## FIX SOME STATS' FORMAT
df = fe.fix_format(df)


## CREATE LAG FEATURES USING MAIN STATS
df = fe.build_lag_features(df)


## SHORTEN COLUMN NAMES AND INDEX BY (date, p1, p2)
column_aliases = {"player1": "p1", "player2": "p2", "player1_win": "p1_win"}
df = df.rename(columns=column_aliases).set_index(["date", "p1", "p2"])

df.head(3)

FIXING PLAYERS' NAMES...


IntProgress(value=0, max=1357)

IntProgress(value=0, max=1357)

PLAYERS' NAMES FIXED

IMPORTING STATIC STATS...


IntProgress(value=0, max=27623)

IntProgress(value=0, max=27623)

STATIC STATS IMPORTED

FIXING SURNAMES...


IntProgress(value=0, max=27623)

SURNAMES FIXED

CREATING STAT RETIREMENT...


IntProgress(value=0, max=1180)

STAT RETIREMENT CREATED

CREATING STAT first_serve_prc_match__common...


IntProgress(value=0, max=12657)

STAT first_serve_prc_match_common CREATED
CREATING STAT first_serve_points_prc_match__common...


IntProgress(value=0, max=12657)

STAT first_serve_points_prc_match_common CREATED
ADDING INVERT MATCHES...
INVERT MATCHES ADDED

CREATING LAG FEATURES...


  interpolation=interpolation)


LAG FEATURES CREATED



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,index,match_url,status,Surface,k1,k2,p1_win,round,match_dur,Location,sets_stat,duration_set1,duration_set2,duration_set3,duration_set4,duration_set5,aces_match_player1,aces_match_player2,double_faults_match_player1,double_faults_match_player2,first_serve_prc_match_player1,first_serve_prc_match_player2,first_serve_cnt_match_player1,first_serve_cnt_match_player2,second_serve_prc_match_player1,...,p1_filter_lag_1_match_dur_pctl10_7,p2_lag_1_match_dur_mean_1,p2_lag_1_match_dur_median_1,p2_lag_1_match_dur_pctl90_1,p2_lag_1_match_dur_pctl10_1,p2_lag_1_match_dur_mean_3,p2_lag_1_match_dur_median_3,p2_lag_1_match_dur_pctl90_3,p2_lag_1_match_dur_pctl10_3,p2_lag_1_match_dur_mean_7,p2_lag_1_match_dur_median_7,p2_lag_1_match_dur_pctl90_7,p2_lag_1_match_dur_pctl10_7,p2_filter_lag_1_match_dur_mean_1,p2_filter_lag_1_match_dur_median_1,p2_filter_lag_1_match_dur_pctl90_1,p2_filter_lag_1_match_dur_pctl10_1,p2_filter_lag_1_match_dur_mean_3,p2_filter_lag_1_match_dur_median_3,p2_filter_lag_1_match_dur_pctl90_3,p2_filter_lag_1_match_dur_pctl10_3,p2_filter_lag_1_match_dur_mean_7,p2_filter_lag_1_match_dur_median_7,p2_filter_lag_1_match_dur_pctl90_7,p2_filter_lag_1_match_dur_pctl10_7
date,p1,p2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1
2015-06-01,Chardy J.,Murray A.,0,https://www.sofascore.com/chardy-murray/BcgsEqg,Ended,0.0,8.0,1.08,0,1.0,173,,,39.0,50.0,41.0,41.0,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
2015-06-01,Djokovic N.,Gasquet R.,1,https://www.sofascore.com/djokovic-gasquet/oNf...,Ended,0.0,1.0,19.0,1,1.0,121,,,38.0,39.0,43.0,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
2015-06-01,Gasquet R.,Djokovic N.,2,https://www.sofascore.com/djokovic-gasquet/oNf...,Ended,0.0,19.0,1.0,0,1.0,121,,,38.0,39.0,43.0,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,


## CREATE MODELS

In [3]:
import Create_Models as cm

In [5]:
# One classifier is fitted per data view; the views are built in this order.
# `startdate` is whatever the notebook namespace currently holds -- when the
# cells run top to bottom that is the '2021-01-01' string assigned in the
# feature-engineering cell.
data_builders = (cm.create_data1, cm.create_data2, cm.create_data3)

predictions = []
for build_data in data_builders:
    data = build_data(df)
    predictions.append(cm.build_model_classifier(data, startdate=startdate))

# Keep the individual results addressable by name as well
y_pred1, y_pred2, y_pred3 = predictions

## BUILD COMPOSITION

In [6]:
import Composition

# Combine the per-model predictions collected in `predictions` into `agg`.
agg = Composition.aggregate(
    df,
    startdate,
    predictions,
)

In [39]:
# Display the object returned by Composition.aggregate as this cell's output.
agg

Unnamed: 0_level_0,Unnamed: 1_level_0,p1_win_prob,p2_win_prob
p1,p2,Unnamed: 2_level_1,Unnamed: 3_level_1
Barrios Vera T.,Harrison C.,0.000000,0.000000
Escobar G.,Quiroz R.,0.425430,0.574570
Kozlov S.,Haerteis J.,0.597077,0.402923
Sakamoto P.,Young D.,0.360787,0.639213
Basic M.,Kuzmanov D.,0.493065,0.506935
...,...,...,...
Monfils G.,Nishioka Y.,0.653291,0.346709
Paul T.,Tsitsipas S.,0.204358,0.795642
Rinderknech A.,Sinner J.,0.176604,0.823396
Schwartzman D.,Gasquet R.,0.745482,0.254518
