In [2]:
import numpy as np
from pprint import pprint
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [3]:
def my_accuracy(y_true, y_pred, threshold=0.5):
    """Accuracy of binary predictions after thresholding ``y_pred``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; forwarded unchanged to ``accuracy_score``.
    y_pred : array-like
        Predicted scores or labels. Values strictly greater than
        ``threshold`` become 1, everything else becomes 0.
    threshold : float, default 0.5
        Decision cut-off applied to ``y_pred``.

    Returns
    -------
    The value returned by ``accuracy_score(y_true, binarised_predictions)``.
    """
    # np.asarray lets plain Python lists through: `list > float` raises
    # TypeError, whereas the array comparison is element-wise.
    labels = np.where(np.asarray(y_pred) > threshold, 1, 0)
    return accuracy_score(y_true, labels)

# Only k1/k2 as features

In [4]:
# Load the dataset for the k1/k2-only experiment and show its size and last rows.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle('test_only_coefs.pkl')
print(data.shape)
data.iloc[-5:]

(36822, 3)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,k1,k2,p1_win
date,player1,player2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-21,Medvedev D.,Nadal R.,1.72,2.1,1
2020-11-21,Nadal R.,Medvedev D.,2.1,1.72,0
2020-11-21,Thiem D.,Djokovic N.,2.37,1.57,1
2020-11-22,Medvedev D.,Thiem D.,1.66,2.2,1
2020-11-22,Thiem D.,Medvedev D.,2.2,1.66,0


In [5]:
# Features are every column except the target; the target is cast to int.
X = data.drop(columns=['p1_win'])
y = data.loc[:, 'p1_win'].astype(int)
# The 'date' index level drives every time-based mask used later on.
date_idx = X.index.get_level_values('date')

In [10]:
# Candidate evaluation windows, each a half-open interval [from, to).
# NOTE(review): the fourth window spans six months and 2020-04-01..2020-08-01
# is not covered by any window; presumably intentional, confirm.
test_periods = [
    ('2019-01-01', '2019-04-01'),
    ('2019-04-01', '2019-07-01'),
    ('2019-07-01', '2019-10-01'),
    ('2019-10-01', '2020-04-01'),
    ('2020-08-01', '2020-11-01'),
]

# Report how many rows fall into each window.
for p in test_periods:
    window_start, window_end = p
    in_window = (date_idx >= window_start) & (date_idx < window_end)
    n_test = len(data[in_window])
    print('test period: {}, match count: {:,}'.format(p, n_test))

test period: ('2019-01-01', '2019-04-01'), match count: 2,180
test period: ('2019-04-01', '2019-07-01'), match count: 2,216
test period: ('2019-07-01', '2019-10-01'), match count: 2,172
test period: ('2019-10-01', '2020-04-01'), match count: 2,610
test period: ('2020-08-01', '2020-11-01'), match count: 1,768


In [11]:
## TODO: evaluate on every window in test_periods, not only the first one
test_from, test_to = test_periods[0]
# Build each boolean mask once and reuse it for both X and y.
is_test = (date_idx >= test_from) & (date_idx < test_to)
is_train = date_idx < test_from
X_test, y_test = X[is_test], y[is_test]
X_train, y_train = X[is_train], y[is_train]
# Alternative kept for reference: limit the training history to 2016 onwards.
## X_train = X[(date_idx < test_from) & (date_idx  >= '2016-01-01')]
## y_train = y[(date_idx < test_from) & (date_idx >= '2016-01-01')]

In [14]:
# Hyper-parameter grid: 2 * 2 * 3 * 1 * 2 * 1 = 24 candidates.
grid_params = {
    'learning_rate': [0.01, 0.02],
    'max_depth': [1, 2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8],
    'n_estimators': [150, 250],
    'eval_metric': ["logloss"],
}

# Time-ordered folds scored with the thresholded accuracy defined above.
# Assumes the rows of X_train are in chronological order (TODO confirm).
cv = TimeSeriesSplit(n_splits=10)
metric = make_scorer(my_accuracy)
gs_reg = GridSearchCV(
    estimator=xgb.XGBClassifier(n_jobs=3),
    param_grid=grid_params,
    scoring=metric,
    cv=cv,
    verbose=True,
)
gs_reg.fit(X_train, y_train)
model = gs_reg  # fit() returns the search object itself

Fitting 10 folds for each of 24 candidates, totalling 240 fits


In [21]:
# Best mean cross-validated score first, then the parameters that achieved it.
for search_result in (model.best_score_, model.best_params_):
    pprint(search_result)
# Optional: dump the full per-candidate table.
# pprint(model.cv_results_)

0.68147512864494
{'colsample_bytree': 0.8,
 'eval_metric': 'logloss',
 'learning_rate': 0.01,
 'max_depth': 1,
 'n_estimators': 150,
 'subsample': 0.7}


In [16]:
# Pair every feature column with its importance from the best estimator.
importances = model.best_estimator_.feature_importances_
d = {X.columns[pos]: importances[pos] for pos in range(len(X.columns))}
f_importance = pd.DataFrame({'stats': X.columns, 'importance': importances})
# Show the (at most) 20 most important features.
f_importance.sort_values(by='importance', ascending=False).head(20)

Unnamed: 0,stats,importance
1,k2,0.530853
0,k1,0.469147


# No coefs in features, no lag

In [17]:
# Load the dataset for the "no coefs" experiment and show its size and last rows.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle('test_no_coefs.pkl')
print(data.shape)
data.tail()

(36822, 37)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Surface,hour,round,p1_win,p1_age,p2_age,p1_height,p2_height,p1_birthday_today,p2_birthday_today,...,df_per_game_common_player2,break_points_prc_match_common_player1,break_points_prc_match_common_player2,first_serve_prc_match_common__dif,first_serve_points_prc_match_common__dif,second_serve_points_prc_match_common__dif,winning_on_return_prc_common__dif,aces_per_game_common__dif,df_per_game_common__dif,break_points_prc_match_common__dif
date,player1,player2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2020-11-21,Medvedev D.,Nadal R.,2.0,23,28.0,1,24.794521,34.493151,1.98,1.85,False,False,...,0.159306,0.416665,0.460037,-0.055953,-3.8e-05,-0.063917,-0.032912,0.331972,0.120598,-0.043371
2020-11-21,Nadal R.,Medvedev D.,2.0,23,28.0,0,34.493151,24.794521,1.85,1.98,False,False,...,0.279904,0.460037,0.416665,-0.055953,-3.8e-05,-0.063917,-0.032912,0.331972,0.120598,-0.043371
2020-11-21,Thiem D.,Djokovic N.,2.0,17,28.0,1,27.235616,33.526027,1.85,1.88,False,False,...,0.203736,0.39519,0.478712,-0.030587,-0.010308,-0.02444,-0.021343,0.064095,0.024169,-0.083522
2020-11-22,Medvedev D.,Thiem D.,2.0,21,16.0,1,24.79726,27.238356,1.98,1.85,False,False,...,0.239209,0.404553,0.3973,-0.005792,0.002319,0.000523,0.001675,0.151128,0.04678,0.007253
2020-11-22,Thiem D.,Medvedev D.,2.0,21,16.0,0,27.238356,24.79726,1.85,1.98,False,False,...,0.285989,0.3973,0.404553,-0.005792,0.002319,0.000523,0.001675,0.151128,0.04678,0.007253


In [19]:
# Split the frame into features and integer target; keep the 'date' index
# level for the time-based masks.
X = data.drop(columns=['p1_win'])
y = data.loc[:, 'p1_win'].astype(int)
date_idx = X.index.get_level_values('date')

# Candidate evaluation windows, each a half-open interval [from, to).
test_periods = [
    ('2019-01-01', '2019-04-01'),
    ('2019-04-01', '2019-07-01'),
    ('2019-07-01', '2019-10-01'),
    ('2019-10-01', '2020-04-01'),
    ('2020-08-01', '2020-11-01'),
]

# Report how many rows fall into each window.
for p in test_periods:
    window_start, window_end = p
    in_window = (date_idx >= window_start) & (date_idx < window_end)
    n_test = len(data[in_window])
    print('test period: {}, match count: {:,}'.format(p, n_test))

test period: ('2019-01-01', '2019-04-01'), match count: 2,180
test period: ('2019-04-01', '2019-07-01'), match count: 2,216
test period: ('2019-07-01', '2019-10-01'), match count: 2,172
test period: ('2019-10-01', '2020-04-01'), match count: 2,610
test period: ('2020-08-01', '2020-11-01'), match count: 1,768


In [20]:
## TODO: evaluate on every window in test_periods, not only the first one
test_from, test_to = test_periods[0]
# Build each boolean mask once and reuse it for both X and y.
is_train = date_idx < test_from
is_test = (date_idx >= test_from) & (date_idx < test_to)
X_train, y_train = X[is_train], y[is_train]
X_test, y_test = X[is_test], y[is_test]
# Alternative kept for reference: limit the training history to 2016 onwards.
## X_train = X[(date_idx < test_from) & (date_idx  >= '2016-01-01')]
## y_train = y[(date_idx < test_from) & (date_idx >= '2016-01-01')]

In [22]:
# Hyper-parameter grid: 2 * 2 * 3 * 1 * 2 * 1 = 24 candidates.
grid_params = {
    'learning_rate': [0.01, 0.02],
    'max_depth': [1, 2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8],
    'n_estimators': [150, 250],
    'eval_metric': ["logloss"],
}

# Time-ordered folds scored with the thresholded accuracy helper.
# Assumes the rows of X_train are in chronological order (TODO confirm).
cv = TimeSeriesSplit(n_splits=10)
metric = make_scorer(my_accuracy)
gs_reg = GridSearchCV(
    estimator=xgb.XGBClassifier(n_jobs=3),
    param_grid=grid_params,
    scoring=metric,
    cv=cv,
    verbose=True,
)
gs_reg.fit(X_train, y_train)
model = gs_reg  # fit() returns the search object itself

Fitting 10 folds for each of 24 candidates, totalling 240 fits


In [23]:
# Only the best mean cross-validated score is reported for this feature set.
best_cv_score = model.best_score_
pprint(best_cv_score)
# Optional diagnostics, disabled:
# pprint(model.best_params_)
# pprint(model.cv_results_)

0.5616209262435679


In [33]:
# Pair every feature column with its importance from the best estimator.
# NOTE(review): the saved output of this cell lists k1/k2, which are absent
# from this section's dataset, and its execution count (33) is later than the
# next section's fit (32). It was run against the later model; re-run the
# notebook top to bottom to refresh it.
importances = model.best_estimator_.feature_importances_
d = {X.columns[pos]: importances[pos] for pos in range(len(X.columns))}
f_importance = pd.DataFrame({'stats': X.columns, 'importance': importances})
# Show the (at most) 20 most important features.
f_importance.sort_values(by='importance', ascending=False).head(20)

Unnamed: 0,stats,importance
1,k2,0.327473
0,k1,0.242725
14,complete_player2,0.055291
13,complete_player1,0.047856
11,overall_winning_serve_prc_player1,0.022881
21,second_serve_points_prc_match_common_player1,0.020908
12,overall_winning_serve_prc_player2,0.018398
6,p2_age,0.016386
5,p1_age,0.016181
19,first_serve_points_prc_match_common_player1,0.015421


# ALL STATS NO LAG + DIFF_FEATURES

In [29]:
# Load the dataset with all stats plus the "__dif" columns; show size and last rows.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle('test_all_stats.pkl')
print(data.shape)
data.iloc[-5:]

(36822, 39)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,k1,k2,Surface,hour,round,p1_win,p1_age,p2_age,p1_height,p2_height,...,df_per_game_common_player2,break_points_prc_match_common_player1,break_points_prc_match_common_player2,first_serve_prc_match_common__dif,first_serve_points_prc_match_common__dif,second_serve_points_prc_match_common__dif,winning_on_return_prc_common__dif,aces_per_game_common__dif,df_per_game_common__dif,break_points_prc_match_common__dif
date,player1,player2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2020-11-21,Medvedev D.,Nadal R.,1.72,2.1,2.0,23,28.0,1,24.794521,34.493151,1.98,1.85,...,0.159306,0.416665,0.460037,-0.055953,-3.8e-05,-0.063917,-0.032912,0.331972,0.120598,-0.043371
2020-11-21,Nadal R.,Medvedev D.,2.1,1.72,2.0,23,28.0,0,34.493151,24.794521,1.85,1.98,...,0.279904,0.460037,0.416665,-0.055953,-3.8e-05,-0.063917,-0.032912,0.331972,0.120598,-0.043371
2020-11-21,Thiem D.,Djokovic N.,2.37,1.57,2.0,17,28.0,1,27.235616,33.526027,1.85,1.88,...,0.203736,0.39519,0.478712,-0.030587,-0.010308,-0.02444,-0.021343,0.064095,0.024169,-0.083522
2020-11-22,Medvedev D.,Thiem D.,1.66,2.2,2.0,21,16.0,1,24.79726,27.238356,1.98,1.85,...,0.239209,0.404553,0.3973,-0.005792,0.002319,0.000523,0.001675,0.151128,0.04678,0.007253
2020-11-22,Thiem D.,Medvedev D.,2.2,1.66,2.0,21,16.0,0,27.238356,24.79726,1.85,1.98,...,0.285989,0.3973,0.404553,-0.005792,0.002319,0.000523,0.001675,0.151128,0.04678,0.007253


In [30]:
# Split the frame into features and integer target; keep the 'date' index
# level for the time-based masks.
X = data.drop(columns=['p1_win'])
y = data.loc[:, 'p1_win'].astype(int)
date_idx = X.index.get_level_values('date')

# Candidate evaluation windows, each a half-open interval [from, to).
test_periods = [
    ('2019-01-01', '2019-04-01'),
    ('2019-04-01', '2019-07-01'),
    ('2019-07-01', '2019-10-01'),
    ('2019-10-01', '2020-04-01'),
    ('2020-08-01', '2020-11-01'),
]

# Report how many rows fall into each window.
for p in test_periods:
    window_start, window_end = p
    in_window = (date_idx >= window_start) & (date_idx < window_end)
    n_test = len(data[in_window])
    print('test period: {}, match count: {:,}'.format(p, n_test))

test period: ('2019-01-01', '2019-04-01'), match count: 2,180
test period: ('2019-04-01', '2019-07-01'), match count: 2,216
test period: ('2019-07-01', '2019-10-01'), match count: 2,172
test period: ('2019-10-01', '2020-04-01'), match count: 2,610
test period: ('2020-08-01', '2020-11-01'), match count: 1,768


In [31]:
## TODO: evaluate on every window in test_periods, not only the first one
test_from, test_to = test_periods[0]
# Build each boolean mask once and reuse it for both X and y.
is_test = (date_idx >= test_from) & (date_idx < test_to)
is_train = date_idx < test_from
X_test, y_test = X[is_test], y[is_test]
X_train, y_train = X[is_train], y[is_train]
# Alternative kept for reference: limit the training history to 2016 onwards.
## X_train = X[(date_idx < test_from) & (date_idx  >= '2016-01-01')]
## y_train = y[(date_idx < test_from) & (date_idx >= '2016-01-01')]

In [32]:
# Hyper-parameter grid: 2 * 2 * 3 * 1 * 2 * 1 = 24 candidates.
grid_params = {
    'learning_rate': [0.01, 0.02],
    'max_depth': [1, 2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8],
    'n_estimators': [150, 250],
    'eval_metric': ["logloss"],
}

# Time-ordered folds scored with the thresholded accuracy helper.
# Assumes the rows of X_train are in chronological order (TODO confirm).
cv = TimeSeriesSplit(n_splits=10)
metric = make_scorer(my_accuracy)
gs_reg = GridSearchCV(
    estimator=xgb.XGBClassifier(n_jobs=3),
    param_grid=grid_params,
    scoring=metric,
    cv=cv,
    verbose=True,
)
gs_reg.fit(X_train, y_train)
model = gs_reg  # fit() returns the search object itself

Fitting 10 folds for each of 24 candidates, totalling 240 fits


In [34]:
# Only the best mean cross-validated score is reported for this feature set.
best_cv_score = model.best_score_
pprint(best_cv_score)
# Optional diagnostics, disabled:
# pprint(model.best_params_)
# pprint(model.cv_results_)

0.6815608919382504


In [35]:
# Pair every feature column with its importance from the best estimator.
importances = model.best_estimator_.feature_importances_
d = {X.columns[pos]: importances[pos] for pos in range(len(X.columns))}
f_importance = pd.DataFrame({'stats': X.columns, 'importance': importances})
# Show the (at most) 20 most important features.
f_importance.sort_values(by='importance', ascending=False).head(20)

Unnamed: 0,stats,importance
1,k2,0.327473
0,k1,0.242725
14,complete_player2,0.055291
13,complete_player1,0.047856
11,overall_winning_serve_prc_player1,0.022881
21,second_serve_points_prc_match_common_player1,0.020908
12,overall_winning_serve_prc_player2,0.018398
6,p2_age,0.016386
5,p1_age,0.016181
19,first_serve_points_prc_match_common_player1,0.015421


# ALL STATS NO LAG NO DIFF_FEATURES

In [36]:
# Load the dataset with all stats but without the "__dif" columns; show size and last rows.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle('test_all_stats_no_dif.pkl')
print(data.shape)
data.tail()

(36822, 32)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,k1,k2,Surface,hour,round,p1_win,p1_age,p2_age,p1_height,p2_height,...,second_serve_points_prc_match_common_player1,second_serve_points_prc_match_common_player2,winning_on_return_prc_common_player1,winning_on_return_prc_common_player2,aces_per_game_common_player1,aces_per_game_common_player2,df_per_game_common_player1,df_per_game_common_player2,break_points_prc_match_common_player1,break_points_prc_match_common_player2
date,player1,player2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2020-11-21,Medvedev D.,Nadal R.,1.72,2.1,2.0,23,28.0,1,24.794521,34.493151,1.98,1.85,...,0.526984,0.590901,0.369188,0.4021,0.623172,0.291201,0.279904,0.159306,0.416665,0.460037
2020-11-21,Nadal R.,Medvedev D.,2.1,1.72,2.0,23,28.0,0,34.493151,24.794521,1.85,1.98,...,0.590901,0.526984,0.4021,0.369188,0.291201,0.623172,0.159306,0.279904,0.460037,0.416665
2020-11-21,Thiem D.,Djokovic N.,2.37,1.57,2.0,17,28.0,1,27.235616,33.526027,1.85,1.88,...,0.531193,0.555633,0.372535,0.393878,0.486419,0.422324,0.227905,0.203736,0.39519,0.478712
2020-11-22,Medvedev D.,Thiem D.,1.66,2.2,2.0,21,16.0,1,24.79726,27.238356,1.98,1.85,...,0.522934,0.522412,0.368747,0.367072,0.616668,0.46554,0.285989,0.239209,0.404553,0.3973
2020-11-22,Thiem D.,Medvedev D.,2.2,1.66,2.0,21,16.0,0,27.238356,24.79726,1.85,1.98,...,0.522412,0.522934,0.367072,0.368747,0.46554,0.616668,0.239209,0.285989,0.3973,0.404553


In [37]:
# Split the frame into features and integer target; keep the 'date' index
# level for the time-based masks.
X = data.drop(columns=['p1_win'])
y = data.loc[:, 'p1_win'].astype(int)
date_idx = X.index.get_level_values('date')

# Candidate evaluation windows, each a half-open interval [from, to).
test_periods = [
    ('2019-01-01', '2019-04-01'),
    ('2019-04-01', '2019-07-01'),
    ('2019-07-01', '2019-10-01'),
    ('2019-10-01', '2020-04-01'),
    ('2020-08-01', '2020-11-01'),
]

# Report how many rows fall into each window.
for p in test_periods:
    window_start, window_end = p
    in_window = (date_idx >= window_start) & (date_idx < window_end)
    n_test = len(data[in_window])
    print('test period: {}, match count: {:,}'.format(p, n_test))

test period: ('2019-01-01', '2019-04-01'), match count: 2,180
test period: ('2019-04-01', '2019-07-01'), match count: 2,216
test period: ('2019-07-01', '2019-10-01'), match count: 2,172
test period: ('2019-10-01', '2020-04-01'), match count: 2,610
test period: ('2020-08-01', '2020-11-01'), match count: 1,768


In [38]:
## TODO: evaluate on every window in test_periods, not only the first one
test_from, test_to = test_periods[0]
# Build each boolean mask once and reuse it for both X and y.
is_train = date_idx < test_from
is_test = (date_idx >= test_from) & (date_idx < test_to)
X_train, y_train = X[is_train], y[is_train]
X_test, y_test = X[is_test], y[is_test]
# Alternative kept for reference: limit the training history to 2016 onwards.
## X_train = X[(date_idx < test_from) & (date_idx  >= '2016-01-01')]
## y_train = y[(date_idx < test_from) & (date_idx >= '2016-01-01')]

In [39]:
# Hyper-parameter grid: 2 * 2 * 3 * 1 * 2 * 1 = 24 candidates.
grid_params = {
    'learning_rate': [0.01, 0.02],
    'max_depth': [1, 2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8],
    'n_estimators': [150, 250],
    'eval_metric': ["logloss"],
}

# Time-ordered folds scored with the thresholded accuracy helper.
# Assumes the rows of X_train are in chronological order (TODO confirm).
cv = TimeSeriesSplit(n_splits=10)
metric = make_scorer(my_accuracy)
gs_reg = GridSearchCV(
    estimator=xgb.XGBClassifier(n_jobs=3),
    param_grid=grid_params,
    scoring=metric,
    cv=cv,
    verbose=True,
)
gs_reg.fit(X_train, y_train)
model = gs_reg  # fit() returns the search object itself

Fitting 10 folds for each of 24 candidates, totalling 240 fits


In [44]:
# Best mean cross-validated score first, then the parameters that achieved it.
for search_result in (model.best_score_, model.best_params_):
    pprint(search_result)
# Optional: dump the full per-candidate table.
# pprint(model.cv_results_)

0.6816466552315609
{'colsample_bytree': 0.8,
 'eval_metric': 'logloss',
 'learning_rate': 0.01,
 'max_depth': 2,
 'n_estimators': 150,
 'subsample': 0.8}


In [42]:
# Pair every feature column with its importance from the best estimator.
importances = model.best_estimator_.feature_importances_
d = {X.columns[pos]: importances[pos] for pos in range(len(X.columns))}
f_importance = pd.DataFrame({'stats': X.columns, 'importance': importances})
# Full ranking, most important first.
f_importance.sort_values(by='importance', ascending=False)

Unnamed: 0,stats,importance
0,k1,0.377298
1,k2,0.372827
13,complete_player1,0.051039
14,complete_player2,0.048503
16,serve_advantage_player2,0.044862
15,serve_advantage_player1,0.043312
11,overall_winning_serve_prc_player1,0.029497
12,overall_winning_serve_prc_player2,0.012896
6,p2_age,0.010998
5,p1_age,0.008767


Можно заметить, что статистики, полученные как разность stat_p1 - stat_p2, не несут никакого вклада; это достаточно логично, так как мы имеем дело с симметричными данными.

# TESTS

In [127]:
# Load the dataset used for the feature-removal tests; show size and last rows.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle('test.pkl')
print(data.shape)
data.iloc[-5:]

(36822, 29)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,k1,k2,p1_win,p1_age,p2_age,p1_height,p2_height,p1_birthday_today,p2_birthday_today,overall_winning_serve_prc_player1,...,first_serve_points_prc_match_common_player1,first_serve_points_prc_match_common_player2,second_serve_points_prc_match_common_player1,second_serve_points_prc_match_common_player2,winning_on_return_prc_common_player1,winning_on_return_prc_common_player2,aces_per_game_common_player1,aces_per_game_common_player2,df_per_game_common_player1,df_per_game_common_player2
date,player1,player2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2020-11-21,Medvedev D.,Nadal R.,1.72,2.1,1,24.794521,34.493151,1.98,1.85,False,False,0.655559,...,0.73037,0.730408,0.526984,0.590901,0.369188,0.4021,0.623172,0.291201,0.279904,0.159306
2020-11-21,Nadal R.,Medvedev D.,2.1,1.72,0,34.493151,24.794521,1.85,1.98,False,False,0.6869,...,0.730408,0.73037,0.590901,0.526984,0.4021,0.369188,0.291201,0.623172,0.159306,0.279904
2020-11-21,Thiem D.,Djokovic N.,2.37,1.57,1,27.235616,33.526027,1.85,1.88,False,False,0.664651,...,0.739388,0.749696,0.531193,0.555633,0.372535,0.393878,0.486419,0.422324,0.227905,0.203736
2020-11-22,Medvedev D.,Thiem D.,1.66,2.2,1,24.79726,27.238356,1.98,1.85,False,False,0.657046,...,0.734239,0.73192,0.522934,0.522412,0.368747,0.367072,0.616668,0.46554,0.285989,0.239209
2020-11-22,Thiem D.,Medvedev D.,2.2,1.66,0,27.238356,24.79726,1.85,1.98,False,False,0.656597,...,0.73192,0.734239,0.522412,0.522934,0.367072,0.368747,0.46554,0.616668,0.239209,0.285989


In [128]:
# Split the frame into features and integer target; keep the 'date' index
# level for the time-based masks.
X = data.drop(columns=['p1_win'])
y = data.loc[:, 'p1_win'].astype(int)
date_idx = X.index.get_level_values('date')

# Candidate evaluation windows, each a half-open interval [from, to).
test_periods = [
    ('2019-01-01', '2019-04-01'),
    ('2019-04-01', '2019-07-01'),
    ('2019-07-01', '2019-10-01'),
    ('2019-10-01', '2020-04-01'),
    ('2020-08-01', '2020-11-01'),
]

# Report how many rows fall into each window.
for p in test_periods:
    window_start, window_end = p
    in_window = (date_idx >= window_start) & (date_idx < window_end)
    n_test = len(data[in_window])
    print('test period: {}, match count: {:,}'.format(p, n_test))

test period: ('2019-01-01', '2019-04-01'), match count: 2,180
test period: ('2019-04-01', '2019-07-01'), match count: 2,216
test period: ('2019-07-01', '2019-10-01'), match count: 2,172
test period: ('2019-10-01', '2020-04-01'), match count: 2,610
test period: ('2020-08-01', '2020-11-01'), match count: 1,768


In [129]:
## TODO: evaluate on every window in test_periods, not only the first one
test_from, test_to = test_periods[0]
# Build each boolean mask once and reuse it for both X and y.
is_test = (date_idx >= test_from) & (date_idx < test_to)
is_train = date_idx < test_from
X_test, y_test = X[is_test], y[is_test]
X_train, y_train = X[is_train], y[is_train]
# Alternative kept for reference: limit the training history to 2016 onwards.
## X_train = X[(date_idx < test_from) & (date_idx  >= '2016-01-01')]
## y_train = y[(date_idx < test_from) & (date_idx >= '2016-01-01')]

In [130]:
# Reduced hyper-parameter grid: 2 * 1 * 2 * 1 * 2 * 1 = 8 candidates.
grid_params = {
    'learning_rate': [0.01, 0.02],
    'max_depth': [2],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.8],
    'n_estimators': [150, 250],
    'eval_metric': ["logloss"],
}

# Time-ordered folds scored with the thresholded accuracy helper.
# Assumes the rows of X_train are in chronological order (TODO confirm).
cv = TimeSeriesSplit(n_splits=10)
metric = make_scorer(my_accuracy)
gs_reg = GridSearchCV(
    estimator=xgb.XGBClassifier(n_jobs=3),
    param_grid=grid_params,
    scoring=metric,
    cv=cv,
    verbose=True,
)
gs_reg.fit(X_train, y_train)
model = gs_reg  # fit() returns the search object itself

Fitting 10 folds for each of 8 candidates, totalling 80 fits


In [131]:
# Best mean cross-validated score first, then the parameters that achieved it.
for search_result in (model.best_score_, model.best_params_):
    pprint(search_result)
# Optional: dump the full per-candidate table.
# pprint(model.cv_results_)

0.6823327615780446
{'colsample_bytree': 0.8,
 'eval_metric': 'logloss',
 'learning_rate': 0.02,
 'max_depth': 2,
 'n_estimators': 250,
 'subsample': 0.7}


In [132]:
# Pair every feature column with its importance from the best estimator.
importances = model.best_estimator_.feature_importances_
d = {X.columns[pos]: importances[pos] for pos in range(len(X.columns))}
f_importance = pd.DataFrame({'stats': X.columns, 'importance': importances})
# Full ranking, most important first.
f_importance.sort_values(by='importance', ascending=False)

Unnamed: 0,stats,importance
1,k2,0.329743
0,k1,0.322721
3,p2_age,0.020466
2,p1_age,0.019679
13,serve_advantage_player2,0.018877
10,complete_player1,0.017332
9,overall_winning_serve_prc_player2,0.017242
18,first_serve_points_prc_match_common_player1,0.016962
17,first_serve_prc_match_common_player2,0.016863
12,serve_advantage_player1,0.016855


- removed_stats: No - 0.6815 - много статистик, не вносящих вклад
- removed_stats: break_points_prc_match_common_player2, Surface - 0.6819 - почти нет статистик, не вносящих вклад; размазанный feature importance
- removed_stats: break_points_prc_match_common_player1, break_points_prc_match_common_player2, Surface, hour - 0.682
- removed_stats: break_points_prc_match_common_player1, break_points_prc_match_common_player2, Surface, hour, second_serve_points_prc_match_common_player1, second_serve_points_prc_match_common_player2 - 0.6819
- removed_stats: break_points_prc_match_common_player1, break_points_prc_match_common_player2, Surface, hour, round - 0.6823