# Градиентный бустинг

##### 1. Чтение данных

In [1]:
import pandas

features = pandas.read_csv('./features.csv', index_col='match_id')
features.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16,2449,0,4,1974,3,63


In [2]:
result_columns = ['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire']
X = features.drop(result_columns, axis=1)

##### 2. Поиск пропусков

In [3]:
row_count = len(X)
not_nan_counts = X.count()

columns_with_nan = filter(lambda axe: not_nan_counts[axe] < row_count, not_nan_counts.axes[0])

list(map(lambda column: [column, row_count - not_nan_counts[column]], columns_with_nan))

[['first_blood_time', 19553],
 ['first_blood_team', 19553],
 ['first_blood_player1', 19553],
 ['first_blood_player2', 43987],
 ['radiant_bottle_time', 15691],
 ['radiant_courier_time', 692],
 ['radiant_flying_courier_time', 27479],
 ['radiant_first_ward_time', 1836],
 ['dire_bottle_time', 16143],
 ['dire_courier_time', 676],
 ['dire_flying_courier_time', 26098],
 ['dire_first_ward_time', 1826]]

Возможные причины пропусков:
- `first_blood_time`, `first_blood_team`, `first_blood_player1`, `first_blood_player2` - событие "wirst blood" не произошло в первые 5 минут матча
- radiant_bottle_time, dire_bottle_time - предмет "bottle" не приобретен в первые 5 минут матча
- radiant_courier_time, dire_courier_time - предмет "courier" не приобретен в первые 5 минут матча
- radiant_flying_courier_time, dire_flying_courier_time - предмет "flying courier" не приобретен в первые 5 минут матча
- radiant_first_ward_time, dire_first_ward_time  - событие "first ward" не произошло в первые 5 минут матча

##### 3. Заполнение пропусков

In [4]:
X.fillna(0, inplace = True)

##### 4. Целевая переменная

Целевая переменная содержится в поле `radiant_win`

In [5]:
Y = features.radiant_win.values

##### 5. Обучение и проверка классификатора

In [17]:
import time
import datetime
from sklearn import ensemble
from sklearn import cross_validation

for n in [10, 20, 30, 40]:
    clf = ensemble.GradientBoostingClassifier(n_estimators = n, random_state = 241)
    
    start_time = datetime.datetime.now()
    kf = cross_validation.KFold(len(Y), n_folds = 5, shuffle = True, random_state = 1)
    scores = cross_validation.cross_val_score(clf, X, Y, cv = kf, scoring = 'roc_auc')
    time_elapsed = datetime.datetime.now() - start_time

    print("N = " + str(n) + ", score = " + str(scores.mean()) + ", time elapsed = " + str(time_elapsed))

N = 10, score = 0.664832922805, time elapsed = 0:00:38.943959
N = 20, score = 0.682114035889, time elapsed = 0:01:15.263798
N = 30, score = 0.689694757384, time elapsed = 0:01:52.541814
N = 40, score = 0.693934188425, time elapsed = 0:02:33.874123


# Логистическая регрессия

##### 1. Логистическая регрессия на необработанных данных

Функция обучения логистической регрессии:

In [6]:
from sklearn import linear_model
from sklearn import grid_search
from sklearn import cross_validation
from sklearn import metrics
import time
import datetime

def perform_grid_search(X, Y, C):
    start_time = datetime.datetime.now()
    
    parameters = { 'C': C }
    kf = cross_validation.KFold(len(Y), n_folds = 5, shuffle = True, random_state = 1)
    clf_search = grid_search.GridSearchCV(linear_model.LogisticRegression(), parameters, cv = kf, n_jobs = 4, verbose = True)
    clf_search.fit(X, Y)

    clf = clf_search.best_estimator_
    clf.fit(X, Y)

    C = clf_search.best_params_['C']
    scores = cross_validation.cross_val_score(clf, X, Y, cv = kf, scoring = 'roc_auc')
    
    time_elapsed = datetime.datetime.now() - start_time
    print("C = " + str(C) + ", score = " + str(scores.mean()) + ", time elapsed = " + str(time_elapsed))
    
    return clf

Масштабирование признаков:

In [7]:
from sklearn import preprocessing

def scale(data):
    scaler = preprocessing.StandardScaler()
    scaler.fit(data)
    return scaler.transform(data.copy())

Обучение с категориальными признаками:

In [8]:
C = [10 ** i for i in range(-3, 3)]
perform_grid_search(scale(X), Y, C)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
C = 0.1, score = 0.716350926752, time elapsed = 0:00:50.841526


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   34.1s finished


LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Удаление категориальных признаков:

In [23]:
hero_columns = ['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
categorial = ['lobby_type'] + hero_columns
X_without_categorial = X.drop(categorial, axis=1)
X_without_categorial_scaled = scale(X_without_categorial)

Обучение без категориальных признаков:

In [9]:
C = [10 ** i for i in range(-3, 3)]
perform_grid_search(X_without_categorial_scaled, Y, C)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
C = 0.1, score = 0.71638380048, time elapsed = 0:00:43.328206


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   29.3s finished


LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Формирование "мешка слов" по героям:

In [24]:
import numpy as np
import scipy

hero_ids = pandas.unique(X[hero_columns].values.ravel())
print("Total heroes count = " + str(len(hero_ids)))

max_hero_id = max(hero_ids)

X_pick = np.zeros((X_without_categorial.shape[0], max_hero_id))

for i, match_id in enumerate(X_without_categorial.index):
    for p in range(5):
        X_pick[i, X.ix[match_id, 'r%d_hero' % (p + 1)] - 1] = 1
        X_pick[i, X.ix[match_id, 'd%d_hero' % (p + 1)] - 1] = -1

categorial_matrix = scipy.sparse.hstack([X_without_categorial, X_pick]).todense()

columns = list(X_without_categorial.columns.values) + list(map(lambda i: 'hero_%d' % i, range(0, max_hero_id)))
X_with_categorial = pandas.DataFrame(categorial_matrix, columns = columns)
X_with_categorial_scaled = scale(X_with_categorial)

Total heroes count = 108


In [11]:
C = [10 ** i for i in range(-3, 3)]
clf = perform_grid_search(X_with_categorial_scaled, Y, C)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
C = 0.1, score = 0.751861720539, time elapsed = 0:01:30.710868


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   59.6s finished


Запуск модели на тестовой выборке:

In [16]:
X_test = pandas.read_csv('./features_test.csv', index_col='match_id')
X_test.fillna(0, inplace = True)
X_test_without_categorial = X_test.drop(categorial, axis=1)

hero_ids = pandas.unique(X_test[categorial].values.ravel())
max_hero_id = max(hero_ids)

X_pick = np.zeros((X_test_without_categorial.shape[0], max_hero_id))

for i, match_id in enumerate(X_test_without_categorial.index):
    for p in range(5):
        X_pick[i, X_test.ix[match_id, 'r%d_hero' % (p + 1)] - 1] = 1
        X_pick[i, X_test.ix[match_id, 'd%d_hero' % (p + 1)] - 1] = -1

categorial_matrix = scipy.sparse.hstack([X_test_without_categorial, X_pick]).todense()

columns = list(X_test_without_categorial.columns.values) + list(map(lambda i: 'hero_%d' % i, range(0, max_hero_id)))
X_test_with_categorial = pandas.DataFrame(categorial_matrix, columns = columns)
X_test_with_categorial_scaled = scale(X_test_with_categorial)

predictions = clf.predict_proba(X_test_with_categorial_scaled)[:, 1]

print("Min prediction = " + str(min(predictions)) + ", max prediction = " + str(max(predictions)))

Min prediction = 0.00868860295948, max prediction = 0.996471254335
