In [1]:
import Parser
import Process_Data
import Feature_Engineering as fe

import datetime
import pandas as pd
from selenium import webdriver
import os


## DOWNLOAD DATA

In [10]:
## PARSE
# Containers handed to the parser; `matches` is read afterwards, so the parser
# is relied upon to fill them in place (its return value is not used).
matches = []
hrefs = set()

# Build the chromedriver path from the resolved location of 'chromedriver'
# (a relative name, so it is resolved against the current working directory).
script_dir = os.path.dirname(os.path.realpath('chromedriver'))
webdriver_path = os.path.join(script_dir, "chromedriver")

# Date range to parse. The end bound must be bound to `enddate`, the name
# that is passed to Parser.parse below; binding it to any other name makes
# this cell fail with NameError on a fresh kernel.
startdate = datetime.date(2021, 5, 22)
enddate = datetime.date(2021, 5, 23)
Parser.parse(startdate, enddate, matches, hrefs, webdriver_path)


def process_matches(match_rows):
    """Apply the Process_Data cleaning steps to `match_rows`, in a fixed order.

    The results of the individual steps are not captured, so every step is
    relied upon to modify `match_rows` in place. The same sequence is used
    for both the previously stored matches and the freshly parsed ones.
    """
    Process_Data.delete_trash(match_rows)
    Process_Data.split_sets_and_dur(match_rows)
    Process_Data.fix_set_score(match_rows)
    Process_Data.split_stats(match_rows)
    Process_Data.create_srv_games(match_rows)
    Process_Data.create_bp_stat(match_rows)


## UPDATE OLD MATCHES
filename = 'Data/old_matches.csv'
old_matches = Process_Data.csv_to_list(filename)

# Freshly parsed matches are written to CSV and read back before being
# appended to the stored history.
matches = Process_Data.unpack_and_change(matches)
Process_Data.make_csv(matches, 'Data/new_matches.csv')
matches_after_parse = Process_Data.csv_to_list('Data/new_matches.csv')
# NOTE(review): this rewrites Data/old_matches.csv with old + new rows on every
# run, so re-running the cell for the same dates looks like it would store the
# same matches twice - confirm.
Process_Data.make_csv(old_matches + matches_after_parse, 'Data/old_matches.csv')


## PROCESS OLD MATCHES
# `old_matches` is the list loaded above, i.e. without the new matches.
process_matches(old_matches)


## PROCESS NEW MATCHES
filename = 'Data/new_matches.csv'
new_matches = Process_Data.csv_to_list(filename)
process_matches(new_matches)

## UNION MATCHES --> CSV
matches = old_matches + new_matches
filename = 'Data/matches_final.csv'
Process_Data.to_csv(matches, filename)

## CSV --> PICKLE
data = pd.read_csv(filename)
filename = 'Data/matches_final.pkl'
data.to_pickle(filename)


Parsed matches on date: 2020-11-18 4


  interactivity=interactivity, compiler=compiler, result=result)


## FEATURE ENGINEERING

In [2]:
## READ DATA
# NOTE(review): the download cell writes 'Data/matches_final.pkl', but this
# cell reads 'test.pkl' - confirm which input is intended.
df = pd.read_pickle('test.pkl')
# The players table is loaded twice so that each copy can receive its own
# name fix below.
players1 = pd.read_pickle('Data/players.pkl')
players2 = pd.read_pickle('Data/players.pkl')

## CHANGE TIME TYPE
fe.fix_time(df)

## fix players' names for transporting stats, then index each table by name
fe.fix_names1(players1)
players1 = players1.set_index(['name'])
fe.fix_names2(players2)
players2 = players2.set_index(['name'])

## transporting stats players --> dataframe
fe.add_stats(df, players1)
fe.add_stats(df, players2, 0)

## fix letters in data
fe.fix_letters(df)

## create retirement stat
fe.create_retirement_stat(df)

## converting age to datetime (once per player slot)
for player_no in (1, 2):
    fe.fix_age(df, player_no)

## converting some stats to datetime in df for the next steps
for date_column in ('date', 'p1_birth', 'p2_birth'):
    df[date_column] = pd.to_datetime(df[date_column])

# count prc of winning points on return
fe.create_feature_WRP(df)

## normalize to per_game stats
fe.create_aces_per_game(df)
fe.create_double_faults_per_game(df)

## create stats based on common opponents
stats = [
    'first_serve_prc_match_player1',
    'first_serve_points_prc_match_player1',
    'second_serve_points_prc_match_player1',
    'winning_on_return_prc_player1',
    'aces_per_game_player1',
    'df_per_game_player1',
    'break_points_prc_match_player1',
]
fe.create_common_stats(df, stats)

## create new features
fe.create_feature_WSP(df)
fe.create_feature_COMPLETE(df)
fe.create_feature_SERVEADV(df)

# add invert matches
df = fe.invert(df)

## drop Nan stats
# One call with every required column drops the same rows as dropping on each
# column in turn (a row is removed if any of these columns is missing).
required_columns = ['p1_height', 'p2_height', 'p1_birth', 'p2_birth', 'k1', 'k2']
df.dropna(subset=required_columns, inplace=True)

# FIX SOME STATS' FORMAT
df = fe.fix_format(df)

# CREATE LAG FEATURES USING MAIN STATS
df = fe.build_lag_features(df)

# Shorten the player/target column names, then index by (date, p1, p2).
short_names = {"player1": "p1", "player2": "p2", "player1_win": "p1_win"}
df.rename(columns=short_names, inplace=True)
df = df.set_index(["date", "p1", "p2"])

# Preview the first three rows of the engineered frame.
df.head(3)

FIXING PLAYERS' NAMES...


IntProgress(value=0, max=1357)

IntProgress(value=0, max=1357)

PLAYERS' NAMES FIXED

IMPORTING STATIC STATS...


IntProgress(value=0, max=2)

IntProgress(value=0, max=2)

STATIC STATS IMPORTED

FIXING SURNAMES...


IntProgress(value=0, max=2)

SURNAMES FIXED

CREATING STAT RETIREMENT...


IntProgress(value=0, max=2)

STAT RETIREMENT CREATED

CREATING STAT first_serve_prc_match__common...


IntProgress(value=0, max=2)

STAT first_serve_prc_match_common CREATED
CREATING STAT first_serve_points_prc_match__common...


IntProgress(value=0, max=2)

STAT first_serve_points_prc_match_common CREATED
CREATING STAT second_serve_points_prc_match__common...


IntProgress(value=0, max=2)

STAT second_serve_points_prc_match_common CREATED
CREATING STAT winning_on_return_prc__common...


IntProgress(value=0, max=2)

STAT winning_on_return_prc_common CREATED
CREATING STAT aces_per_game__common...


IntProgress(value=0, max=2)

STAT aces_per_game_common CREATED
CREATING STAT df_per_game__common...


IntProgress(value=0, max=2)

STAT df_per_game_common CREATED
CREATING STAT break_points_prc_match__common...


IntProgress(value=0, max=2)

STAT break_points_prc_match_common CREATED
ADDING INVERT MATCHES...
INVERT MATCHES ADDED

CREATING LAG FEATURES...
LAG FEATURES CREATED



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,index,match_url,status,Location,Surface,match_dur,k1,k2,p1_win,round,duration_set1,duration_set2,duration_set3,duration_set4,duration_set5,aces_match_player1,aces_match_player2,double_faults_match_player1,double_faults_match_player2,first_serve_prc_match_player1,first_serve_prc_match_player2,first_serve_cnt_match_player1,first_serve_cnt_match_player2,second_serve_prc_match_player1,second_serve_prc_match_player2,...,p1_filter_lag_1_match_dur_pctl10_7,p2_lag_1_match_dur_mean_1,p2_lag_1_match_dur_median_1,p2_lag_1_match_dur_pctl90_1,p2_lag_1_match_dur_pctl10_1,p2_lag_1_match_dur_mean_3,p2_lag_1_match_dur_median_3,p2_lag_1_match_dur_pctl90_3,p2_lag_1_match_dur_pctl10_3,p2_lag_1_match_dur_mean_7,p2_lag_1_match_dur_median_7,p2_lag_1_match_dur_pctl90_7,p2_lag_1_match_dur_pctl10_7,p2_filter_lag_1_match_dur_mean_1,p2_filter_lag_1_match_dur_median_1,p2_filter_lag_1_match_dur_pctl90_1,p2_filter_lag_1_match_dur_pctl10_1,p2_filter_lag_1_match_dur_mean_3,p2_filter_lag_1_match_dur_median_3,p2_filter_lag_1_match_dur_pctl90_3,p2_filter_lag_1_match_dur_pctl10_3,p2_filter_lag_1_match_dur_mean_7,p2_filter_lag_1_match_dur_median_7,p2_filter_lag_1_match_dur_pctl90_7,p2_filter_lag_1_match_dur_pctl10_7
date,p1,p2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1
2020-11-18,Djokovic N.,Medvedev D.,0,https://www.sofascore.com/medvedev-djokovic/HX...,Ended,London,0.0,81,1.4,3.0,0,0.0,47,34,,,,3.0,9.0,3.0,4.0,0.632353,0.714286,43.0,40.0,0.88,0.75,...,,,,,,,,,,,,,,,,,,,,,,,,,
2020-11-18,Medvedev D.,Djokovic N.,1,https://www.sofascore.com/medvedev-djokovic/HX...,Ended,London,0.0,81,3.0,1.4,1,0.0,47,34,,,,9.0,3.0,4.0,3.0,0.714286,0.632353,40.0,43.0,0.75,0.88,...,,,,,,,,,,,,,,,,,,,,,,,,,
2020-11-18,Schwartzman D.,Zverev A.,2,https://www.sofascore.com/zverev-schwartzman/Z...,Ended,London,0.0,131,3.75,1.28,0,0.0,39,53,39.0,,,0.0,10.0,2.0,3.0,0.677419,0.804878,63.0,66.0,0.933333,0.8125,...,,,,,,,,,,,,,,,,,,,,,,,,,


## CREATE MODELS

In [3]:
import Create_Models as cm

In [7]:
## LOAD FEATURES AND BUILD THE CLASSIFIER
# NOTE(review): 'Data/all_features.pkl' is not written by any cell shown in
# this notebook - confirm where it is produced.
features_path = 'Data/all_features.pkl'
df = pd.read_pickle(features_path)

# Turn the feature frame into the model input, then build the classifier;
# an empty string is passed as `startdate`.
data = cm.create_data1(df)
y_pred = cm.build_model_classifier(data, startdate='')

In [10]:
# Preview the last three rows of the feature frame.
df.tail(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,p2_filter_lag_1_max_points_in_a_row_match__median_7,p1_lag_1_points_won_match__pctl10_28,p1_lag_1_second_serve_prc_match__pctl90_7,p2_lag_1_second_serve_cnt_match__median_28,p2_filter_lag_1_max_points_in_a_row_match__pctl10_7,p2_lag_1_first_serve_points_cnt_match__pctl90_28,p1_lag_1_receiver_points_won_match__pctl10_7,p1_filter_lag_1_second_serve_points_cnt_match__median_7,p1_lag_1_match_dur_pctl90_1,p1_filter_lag_1_max_points_in_a_row_match__mean_7,p2_lag_1_aces_match__pctl90_28,p2_filter_lag_1_aces_match__median_7,p1_lag_1_first_serve_prc_match__pctl10_28,p2_lag_1_match_dur_median_1,p1_lag_1_second_serve_cnt_match__pctl10_7,p1_lag_1_first_serve_points_prc_match__mean_7,p1_filter_lag_1_break_points_prc_match__median_28,p1_lag_1_second_serve_points_cnt_match__pctl10_28,p1_filter_lag_1_first_serve_points_cnt_match__pctl90_7,p2_lag_1_points_won_match__median_28,p1_filter_lag_1_second_serve_prc_match__mean_28,p1_filter_lag_1_bp_saved_cnt__median_7_y,p2_lag_1_second_serve_points_prc_match__mean_7,p1_lag_1_second_serve_cnt_match__median_28,p2_lag_1_break_points_prc_match__pctl90_28,...,p2_filter_lag_1_second_serve_points_cnt_match__median_7,p2_filter_lag_1_receiver_points_won_match__mean_28,p1_lag_1_receiver_points_won_match__mean_7,p2_filter_lag_1_match_dur_median_1,p1_lag_1_second_serve_points_cnt_match__pctl90_28,p1_filter_lag_1_aces_match__mean_28,p1_lag_1_first_serve_cnt_match__mean_7,p2_lag_1_bp_saved_cnt__median_28_y,p1_lag_1_first_serve_cnt_match__pctl90_7,p1_lag_1_aces_match__median_7,p2_lag_1_break_points_cnt_match__pctl90_28,p2_filter_lag_1_break_points_prc_match__mean_28,p1_lag_1_second_serve_prc_match__pctl90_28,p2_filter_lag_1_first_serve_points_prc_match__pctl90_28,break_points_prc_match_common_player1,p1_filter_lag_1_points_won_match__pctl90_28,p1_lag_1_second_serve_points_cnt_match__mean_28,p2_lag_1_first_serve_points_prc_match__mean_28,p2_filter_lag_1_second_serve_prc_match__mean_7,p1_filter_lag_1
_second_serve_points_prc_match__median_7,p1_filter_lag_1_first_serve_prc_match__median_7,p1_filter_lag_1_points_won_match__median_28,p2_lag_1_receiver_points_won_match__pctl10_28,p2_lag_1_bp_saved_cnt__pctl90_7_x,p1_lag_1_first_serve_cnt_match__pctl10_28
date,p1,p2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1
2020-11-21,Thiem D.,Djokovic N.,7.0,60.2,0.974286,22.0,3.6,45.3,13.6,14.0,74.0,5.857143,10.6,3.0,0.586161,96.0,9.2,0.721599,0.366667,6.7,55.6,78.5,0.863272,3.0,0.472592,31.0,0.712121,...,10.0,25.857143,31.285714,96.0,28.0,5.964286,60.428571,3.0,92.4,8.0,7.0,0.441317,0.961286,0.856296,0.39519,101.0,18.464286,0.735739,0.869726,0.488372,0.742857,76.0,16.7,8.2,42.4
2020-11-22,Medvedev D.,Thiem D.,6.0,47.4,0.932143,31.0,4.6,71.2,24.6,9.0,155.0,7.285714,12.0,9.0,0.534339,174.0,6.2,0.805217,0.363636,5.0,48.8,100.5,0.914338,6.0,0.516149,18.0,0.6375,...,14.0,24.214286,32.142857,174.0,20.3,8.285714,48.142857,5.0,64.0,8.0,7.0,0.421457,1.0,0.817406,0.404553,96.1,12.321429,0.729398,0.903779,0.5625,0.740741,73.5,18.2,4.8,29.0
2020-11-22,Thiem D.,Medvedev D.,7.0,60.2,0.9625,18.0,5.2,48.6,13.6,14.0,174.0,6.0,14.3,8.0,0.586161,155.0,9.2,0.750604,0.4,6.7,63.4,78.5,0.861819,3.0,0.504796,31.0,0.734091,...,9.0,29.142857,24.857143,155.0,28.0,6.142857,55.857143,2.0,79.6,9.0,5.0,0.375495,0.961286,0.878538,0.3973,102.5,18.25,0.780779,0.842976,0.488372,0.742857,76.0,15.4,6.0,42.4


In [None]:
## TODO: make the parser fill in ALL the fields that the later transformations need (e.g. walkover?)

In [11]:
## IAN: 2021-01-01 - 2021-03-01 // EGOR: 2021-03-01 - 2021-05-21