In [6]:
import pandas
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import time
import datetime

In [32]:
features = pandas.read_csv('./features.csv', index_col='match_id')
features.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13.0,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16.0,2449,0,4,1974,3,63


In [8]:
X = features.loc[:, 'lobby_type':'dire_first_ward_time']
y = features.radiant_win # Целевая переменная

In [9]:
# Признаки которые имеют пропуски
X.columns[X.isna().any()].tolist()

['first_blood_time',
 'first_blood_team',
 'first_blood_player1',
 'first_blood_player2',
 'radiant_bottle_time',
 'radiant_courier_time',
 'radiant_flying_courier_time',
 'radiant_first_ward_time',
 'dire_bottle_time',
 'dire_courier_time',
 'dire_flying_courier_time',
 'dire_first_ward_time']

first_blood_time - Первой крови за первые 5 минут может и не быть.

first_blood_player2 - Даже если первая кровь была, она могла быть сделана без ассиста.

In [10]:
# Заменим пропуски на среднее.
na_columns = X.columns[X.isna().any()].tolist()
for na_column in na_columns:
    X[na_column].fillna(X[na_column].mean(), inplace=True)

In [11]:
cv = KFold(n_splits=5, shuffle=True, random_state=241)
for e in [10, 20, 30, 40, 50, 60, 70, 80]:
    scores = []
    
    start_time = datetime.datetime.now()
    
    for train_index, test_index in cv.split(X):
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]

        X_test = X.iloc[test_index]
        y_test = y.iloc[test_index]

        model = GradientBoostingClassifier(n_estimators=e, random_state=241)
        model.fit(X_train, y_train)

        predicts = model.predict_proba(X_test)[:, 1]
        scores.append(roc_auc_score(y_true=y_test, y_score=predicts))
        
    print('Time elapsed: ', datetime.datetime.now() - start_time)
    print(e, ': ', round(np.mean(scores), 2))
    print('')

Time elapsed:  0:00:16.955408
10 :  0.66

Time elapsed:  0:00:32.010233
20 :  0.68

Time elapsed:  0:00:48.251998
30 :  0.69

Time elapsed:  0:01:06.852765
40 :  0.69

Time elapsed:  0:01:19.986767
50 :  0.7

Time elapsed:  0:01:34.421635
60 :  0.7

Time elapsed:  0:01:46.024453
70 :  0.7

Time elapsed:  0:02:05.403555
80 :  0.7



При 30 деревьях качество 0.69. Расчеты длились 46 секунд.
Наилучшее качество получилось при 50 деревьях. Далее, при добавлении деревьев качество не улучшалось, а время расчетов быстро расло. Чтобы ускорить расчеты можно использовать меньшую выборку и установить ограничение на максимальную глубину деревьев.

## Логистическая регрессия

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [14]:
X = features.loc[:, 'lobby_type':'dire_first_ward_time']
y = features.radiant_win # Целевая переменная

In [15]:
# Заменим пропуски на среднее.
na_columns = X.columns[X.isna().any()].tolist()
for na_column in na_columns:
    X[na_column].fillna(X[na_column].mean(), inplace=True)

In [16]:
scaler = StandardScaler()
X_scaled = pandas.DataFrame(scaler.fit_transform(X))

In [17]:
cv = KFold(n_splits=5, shuffle=True)

for C in [0.001, 0.01, 0.1, 0.4, 0.8, 1.0, 1.5, 2.0]:
    scores = []
    start_time = datetime.datetime.now()
    
    for train_index, test_index in cv.split(X_scaled):
        X_train = X_scaled.iloc[train_index]
        y_train = y.iloc[train_index]

        X_test = X_scaled.iloc[test_index]
        y_test = y.iloc[test_index]

        model = LogisticRegression(penalty='l2')
        model.fit(X_train, y_train)
        
        predicts = model.predict_proba(X_test)[:, 1]
        scores.append(roc_auc_score(y_true=y_test, y_score=predicts))
        
    print('Time elapsed: ', datetime.datetime.now() - start_time)
    print(C, ': ', round(np.mean(scores), 10))
    print('')

Time elapsed:  0:00:10.045954
0.001 :  0.7166502568

Time elapsed:  0:00:09.993703
0.01 :  0.7167419258

Time elapsed:  0:00:09.549398
0.1 :  0.7167478859

Time elapsed:  0:00:09.781853
0.4 :  0.7169114199

Time elapsed:  0:00:10.197858
0.8 :  0.7166747426

Time elapsed:  0:00:10.124864
1.0 :  0.7167092857

Time elapsed:  0:00:11.002961
1.5 :  0.7168975698

Time elapsed:  0:00:10.000777
2.0 :  0.7168783176



Качество 0.7169 - выше чем у градиентного бустинга в "лоб". Работает немного быстрее. Чем объяснить не знаю. Я ожидал что при нрадиентном бустинге качество будет выше. Видимо, такова особенность данных.

In [18]:
categorical_features = [
    'lobby_type', 
    'r1_hero', 
    'r2_hero',
    'r3_hero',
    'r4_hero',
    'r5_hero', 
    'd1_hero', 
    'd2_hero',
    'd3_hero',
    'd4_hero',
    'd5_hero'
]

In [19]:
X_scaled_no_categorical = pandas.DataFrame(
    scaler.fit_transform(X.drop(categorical_features, axis=1))
)

In [20]:
cv = KFold(n_splits=5, shuffle=True)

for C in [0.001, 0.01, 0.1, 0.4, 0.8, 1.0, 1.5, 2.0]:
    scores = []
    start_time = datetime.datetime.now()
    
    for train_index, test_index in cv.split(X_scaled_no_categorical):
        X_train = X_scaled_no_categorical.iloc[train_index]
        y_train = y.iloc[train_index]

        X_test = X_scaled_no_categorical.iloc[test_index]
        y_test = y.iloc[test_index]

        model = LogisticRegression(penalty='l2')
        model.fit(X_train, y_train)
        
        predicts = model.predict_proba(X_test)[:, 1]
        scores.append(roc_auc_score(y_true=y_test, y_score=predicts))
        
    print('Time elapsed: ', datetime.datetime.now() - start_time)
    print(C, ': ', round(np.mean(scores), 10))
    print('')

Time elapsed:  0:00:09.174038
0.001 :  0.7167824971

Time elapsed:  0:00:09.518603
0.01 :  0.7167842853

Time elapsed:  0:00:08.846918
0.1 :  0.7168074123

Time elapsed:  0:00:09.113286
0.4 :  0.7168273442

Time elapsed:  0:00:08.881563
0.8 :  0.716808573

Time elapsed:  0:00:09.465885
1.0 :  0.7167540532

Time elapsed:  0:00:08.813715
1.5 :  0.7167853514

Time elapsed:  0:00:09.222205
2.0 :  0.7166540247



При удалении категориальных признаков качество осталось прежним, . Я думаю это связано с тем что использование категориальных признаков как вещественных не имеет никакого смысло и никак не влияет на качество.

In [49]:
len(X.d5_hero.unique())

108

На время создания этой выборки в доте было 108 героев.

In [67]:
heroes = pandas.read_csv('./heroes.csv')
n_heroes = len(heroes)
print('Heroes number =', n_heroes)

Heroes number = 112


In [78]:
heroes.head()

Unnamed: 0,id,localized_name,name
0,1,Anti-Mage,antimage
1,2,Axe,axe
2,3,Bane,bane
3,4,Bloodseeker,bloodseeker
4,5,Crystal Maiden,crystal_maiden


In [77]:
in_data = pandas.read_csv('./features.csv', index_col='match_id')

x_pick = np.zeros((data.shape[0], 108))
x_pick = np.zeros((in_data.shape[0], n_heroes))
for i, match_id in enumerate(in_data.index):
    for p in range(5):
        x_pick[i, in_data.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        x_pick[i, in_data.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1
        
x_pick = pandas.DataFrame(x_pick, index=in_data.index)

In [81]:
X_no_categorical = pandas.DataFrame(X.drop(categorical_features, axis=1))

final_x = pandas.concat([X_no_categorical, x_pick], axis=1)
scale_x =  pandas.DataFrame(scaler.fit_transform(final_x))

In [82]:
cv = KFold(n_splits=5, shuffle=True)

for C in [0.001, 0.01, 0.1, 0.4, 0.8, 1.0, 1.5, 2.0]:
    scores = []
    start_time = datetime.datetime.now()
    
    for train_index, test_index in cv.split(scale_x):
        X_train = scale_x.iloc[train_index]
        y_train = y.iloc[train_index]

        X_test = scale_x.iloc[test_index]
        y_test = y.iloc[test_index]

        model = LogisticRegression(penalty='l2')
        model.fit(X_train, y_train)
        
        predicts = model.predict_proba(X_test)[:, 1]
        scores.append(roc_auc_score(y_true=y_test, y_score=predicts))
        
    print('Time elapsed: ', datetime.datetime.now() - start_time)
    print(C, ': ', round(np.mean(scores), 10))
    print('')

Time elapsed:  0:00:18.030924
0.001 :  0.7517072389

Time elapsed:  0:00:17.643991
0.01 :  0.7519869075

Time elapsed:  0:00:17.107909
0.1 :  0.7520997274

Time elapsed:  0:00:17.892382
0.4 :  0.7519681654

Time elapsed:  0:00:17.680313
0.8 :  0.7519902589

Time elapsed:  0:00:17.499091
1.0 :  0.7519422375

Time elapsed:  0:00:17.560783
1.5 :  0.7518564396

Time elapsed:  0:00:17.603676
2.0 :  0.7519952152

