In [63]:
import pandas
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import time
import datetime

In [68]:
features = pandas.read_csv('./features.csv', index_col='match_id')
features.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13.0,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16.0,2449,0,4,1974,3,63


In [69]:
X = features.loc[:, 'lobby_type':'dire_first_ward_time']
y = features.radiant_win # Целевая переменная

In [70]:
# Признаки которые имеют пропуски
X.columns[X.isna().any()].tolist()

['first_blood_time',
 'first_blood_team',
 'first_blood_player1',
 'first_blood_player2',
 'radiant_bottle_time',
 'radiant_courier_time',
 'radiant_flying_courier_time',
 'radiant_first_ward_time',
 'dire_bottle_time',
 'dire_courier_time',
 'dire_flying_courier_time',
 'dire_first_ward_time']

first_blood_time - Первой крови за первые 5 минут может и не быть.

first_blood_player2 - Даже если первая кровь была, она могла быть сделана без ассиста.

In [71]:
# Заменим пропуски на среднее.
na_columns = X.columns[X.isna().any()].tolist()
for na_column in na_columns:
    X[na_column].fillna(X[na_column].mean(), inplace=True)

In [72]:
cv = KFold(n_splits=5, shuffle=True, random_state=241)
for e in [10, 20, 30, 40, 50, 60, 70, 80]:
    scores = []
    
    start_time = datetime.datetime.now()
    
    for train_index, test_index in cv.split(X):
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]

        X_test = X.iloc[test_index]
        y_test = y.iloc[test_index]

        model = GradientBoostingClassifier(n_estimators=e, random_state=241)
        model.fit(X_train, y_train)

        predicts = model.predict_proba(X_test)[:, 1]
        scores.append(roc_auc_score(y_true=y_test, y_score=predicts))
        
    print('Time elapsed: ', datetime.datetime.now() - start_time)
    print(e, ': ', round(np.mean(scores), 2))
    print('')

Time elapsed:  0:00:30.111162
10 :  0.66

Time elapsed:  0:01:06.096356
20 :  0.68

Time elapsed:  0:02:18.638534
30 :  0.69

Time elapsed:  0:03:08.656419
40 :  0.69

Time elapsed:  0:02:04.842772
50 :  0.7

Time elapsed:  0:04:48.743401
60 :  0.7

Time elapsed:  0:05:41.421778
70 :  0.7

Time elapsed:  0:04:52.678831
80 :  0.7



При 30 деревьях качество 0.69. Расчеты длились 46 секунд.
Наилучшее качество получилось при 50 деревьях. Далее, при добавлении деревьев качество не улучшалось, а время расчетов быстро расло. Чтобы ускорить расчеты можно использовать меньшую выборку и установить ограничение на максимальную глубину деревьев.

## Логистическая регрессия

In [52]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [53]:
X = features.loc[:, 'lobby_type':'dire_first_ward_time']
y = features.radiant_win # Целевая переменная

In [54]:
# Заменим пропуски на среднее.
na_columns = X.columns[X.isna().any()].tolist()
for na_column in na_columns:
    X[na_column].fillna(X[na_column].mean(), inplace=True)

In [55]:
scaler = StandardScaler()
X_scaled = pandas.DataFrame(scaler.fit_transform(X))

In [57]:
cv = KFold(n_splits=5, shuffle=True)

for C in [0.001, 0.01, 0.1, 0.4, 0.8, 1.0, 1.5, 2.0]:
    scores = []
    start_time = datetime.datetime.now()
    
    for train_index, test_index in cv.split(X_scaled):
        X_train = X_scaled.iloc[train_index]
        y_train = y.iloc[train_index]

        X_test = X_scaled.iloc[test_index]
        y_test = y.iloc[test_index]

        model = LogisticRegression(penalty='l2')
        model.fit(X_train, y_train)
        
        predicts = model.predict_proba(X_test)[:, 1]
        scores.append(roc_auc_score(y_true=y_test, y_score=predicts))
        
    print('Time elapsed: ', datetime.datetime.now() - start_time)
    print(C, ': ', round(np.mean(scores), 10))
    print('')

Time elapsed:  0:00:16.628703
0.001 :  0.7167374217

Time elapsed:  0:00:27.476600
0.01 :  0.7167326852

Time elapsed:  0:00:25.464448
0.1 :  0.7166519236

Time elapsed:  0:00:25.715522
0.4 :  0.7166005997

Time elapsed:  0:00:25.822558
0.8 :  0.7168959123

Time elapsed:  0:00:26.008006
1.0 :  0.7166741781

Time elapsed:  0:00:14.071227
1.5 :  0.7168403511

Time elapsed:  0:00:13.545778
2.0 :  0.7167190414



In [58]:
categorical_features = [
    'lobby_type', 
    'r1_hero', 
    'r2_hero',
    'r3_hero',
    'r4_hero',
    'r5_hero', 
    'd1_hero', 
    'd2_hero',
    'd3_hero',
    'd4_hero',
    'd5_hero'
]

In [59]:
X_scaled_no_categorical = pandas.DataFrame(
    scaler.fit_transform(X.drop(categorical_features, axis=1))
)

In [62]:
cv = KFold(n_splits=5, shuffle=True)

for C in [0.001, 0.01, 0.1, 0.4, 0.8, 1.0, 1.5, 2.0]:
    scores = []
    start_time = datetime.datetime.now()
    
    for train_index, test_index in cv.split(X_scaled_no_categorical):
        X_train = X_scaled_no_categorical.iloc[train_index]
        y_train = y.iloc[train_index]

        X_test = X_scaled_no_categorical.iloc[test_index]
        y_test = y.iloc[test_index]

        model = LogisticRegression(penalty='l2')
        model.fit(X_train, y_train)
        
        predicts = model.predict_proba(X_test)[:, 1]
        scores.append(roc_auc_score(y_true=y_test, y_score=predicts))
        
    print('Time elapsed: ', datetime.datetime.now() - start_time)
    print(C, ': ', round(np.mean(scores), 10))
    print('')

Time elapsed:  0:00:10.515051
0.001 :  0.7167841908

Time elapsed:  0:00:10.197085
0.01 :  0.7167665475

Time elapsed:  0:00:11.156625
0.1 :  0.7168201358

Time elapsed:  0:00:10.429343
0.4 :  0.7166179366

Time elapsed:  0:00:09.575813
0.8 :  0.7169563391

Time elapsed:  0:00:09.928258
1.0 :  0.7167669272

Time elapsed:  0:00:09.504363
1.5 :  0.7167090278

Time elapsed:  0:00:09.847354
2.0 :  0.7169270382

