# Градиентный бустинг

In [105]:
import json
import bz2
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier
import time
import datetime
from sklearn.metrics import roc_auc_score

In [231]:
df = pd.read_csv('d2_features.csv', header=0)
test_df = pd.read_csv('d2_features_test.csv', header = 0)

### Целевая переменная находится в столбце 'radiant_win'

In [232]:
y = df.radiant_win
df.drop(['duration',
        'radiant_win',
        'tower_status_radiant',
        'tower_status_dire',
        'barracks_status_radiant',
        'barracks_status_dire',
        'match_id'], axis = 1, inplace = True)
test_df.drop('match_id',axis = 1, inplace = True)

In [108]:
columns = df.count()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97230 entries, 0 to 97229
Columns: 102 entries, start_time to dire_first_ward_time
dtypes: float64(12), int64(90)
memory usage: 75.7 MB


In [109]:
# найдем названия столбцов с пропусками
print(*zip(columns, df.columns))

(97230, 'start_time') (97230, 'lobby_type') (97230, 'r1_hero') (97230, 'r1_level') (97230, 'r1_xp') (97230, 'r1_gold') (97230, 'r1_lh') (97230, 'r1_kills') (97230, 'r1_deaths') (97230, 'r1_items') (97230, 'r2_hero') (97230, 'r2_level') (97230, 'r2_xp') (97230, 'r2_gold') (97230, 'r2_lh') (97230, 'r2_kills') (97230, 'r2_deaths') (97230, 'r2_items') (97230, 'r3_hero') (97230, 'r3_level') (97230, 'r3_xp') (97230, 'r3_gold') (97230, 'r3_lh') (97230, 'r3_kills') (97230, 'r3_deaths') (97230, 'r3_items') (97230, 'r4_hero') (97230, 'r4_level') (97230, 'r4_xp') (97230, 'r4_gold') (97230, 'r4_lh') (97230, 'r4_kills') (97230, 'r4_deaths') (97230, 'r4_items') (97230, 'r5_hero') (97230, 'r5_level') (97230, 'r5_xp') (97230, 'r5_gold') (97230, 'r5_lh') (97230, 'r5_kills') (97230, 'r5_deaths') (97230, 'r5_items') (97230, 'd1_hero') (97230, 'd1_level') (97230, 'd1_xp') (97230, 'd1_gold') (97230, 'd1_lh') (97230, 'd1_kills') (97230, 'd1_deaths') (97230, 'd1_items') (97230, 'd2_hero') (97230, 'd2_level')

### Мы видим пропуски в некоторых полях таблицы (first_blood_player1, first_blood_player2, first_blood_time' и тд.)
Это свазано с некоторыми моментами: <br>
1) Первая кровь была пролита после 5 минуты.<br>
2) Большинство первых кровопролитий было совершено с помощью еще одного героя-союзника (поля 'first blood_player_2')

In [233]:
df.fillna(0, inplace = True)

In [111]:
y.index

RangeIndex(start=0, stop=97230, step=1)

In [112]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 50)

Сразу проверим roc_auc_score на тестовых данных

In [113]:
for i in (20, 30):   
    for train_ind, test_ind in kf.split(df):
        print('Estimators: {}'.format(i))
        clf = GradientBoostingClassifier(n_estimators = i, random_state = 50)
        start_time = datetime.datetime.now()
        clf.fit(df.values[train_ind], y.values[train_ind])
        print('Time elapsed: ', datetime.datetime.now() - start_time)
        pred = clf.predict_proba(df.values[test_ind])
        print(roc_auc_score(y.values[test_ind], pred[:, 1]))

Estimators: 20
Time elapsed:  0:00:44.696261
0.6733003476784685
Estimators: 20
Time elapsed:  0:00:44.008201
0.6897105393706824
Estimators: 20
Time elapsed:  0:00:44.238628
0.6812092730714607
Estimators: 20
Time elapsed:  0:00:44.054323
0.6836447516594225
Estimators: 20
Time elapsed:  0:00:45.277284
0.6837961112152618
Estimators: 30
Time elapsed:  0:01:07.623122
0.6821366070551803
Estimators: 30
Time elapsed:  0:01:05.817411
0.6950518110250112
Estimators: 30
Time elapsed:  0:01:06.777655
0.6885288024651814
Estimators: 30
Time elapsed:  0:01:08.094445
0.689814165431073
Estimators: 30
Time elapsed:  0:01:07.518485
0.6917984238422203


In [114]:
# lets try to use more than 30 trees with different depth
for train_ind, test_ind in kf.split(df):
    clf = GradientBoostingClassifier(n_estimators = 40, random_state = 50, max_depth = 5)
    start_time = datetime.datetime.now()
    clf.fit(df.values[train_ind], y.values[train_ind])
    print('Time elapsed: ', datetime.datetime.now() - start_time)
    print(roc_auc_score(y.values[test_ind], pred[:, 1]))

Time elapsed:  0:02:32.132477
0.49701741190431115
Time elapsed:  0:02:28.983986
0.5022588468013716
Time elapsed:  0:02:31.068321
0.49325684932667635
Time elapsed:  0:02:27.201032
0.5004212506453252
Time elapsed:  0:02:27.096736
0.6917984238422203


In [115]:
# lets try to use more than 30 trees with different depth
for train_ind, test_ind in kf.split(df):
    clf = GradientBoostingClassifier(n_estimators = 40, random_state = 50, max_depth = 2)
    start_time = datetime.datetime.now()
    clf.fit(df.values[train_ind], y.values[train_ind])
    print('Time elapsed: ', datetime.datetime.now() - start_time)

Time elapsed:  0:01:00.613085
Time elapsed:  0:01:03.802698
Time elapsed:  0:01:03.321469
Time elapsed:  0:01:02.310811
Time elapsed:  0:01:03.497425


### Для уменьшения времени, затрачиваемого на обучение модели, можно прибегать к следующим методам: <br>
1) Уменьшение числа признаков. "Выкинуть" признаки из выборки <br>
2) Уменьшить объем выборки. Вместо 110000+ примеров можно взять две трети от этого количества, предварительно перемешав их.


#### Мы также видим, что при уменьшении размеров одного дерева до, например, 2, время при построении 40 деревьев практически совпадает с временем, затраченным на построение модели с 30 деревьями глубины 3

# Логистическая регрессия

In [116]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [117]:
scaler = StandardScaler()

##### Отскалируем значения признаков

In [118]:
X_scaled = scaler.fit_transform(df.values)
y_scaled = scaler.transform(test_df.values)

In [119]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 50)

In [120]:
# for C in np.linspace(0.001, 50, 10):
for train_ind, test_ind in kf.split(X_scaled):
    clf = LogisticRegression(random_state = 20, C = 0.0001)
    start_time = datetime.datetime.now()
    clf.fit(X_scaled[train_ind], y.values[train_ind])
    print('Time elapsed:', datetime.datetime.now() - start_time)
    print('ROC_AUC_SCORE with C = 0.0001',
          roc_auc_score(y.values[test_ind], clf.predict_proba(X_scaled[test_ind])[:, 1]))

Time elapsed: 0:00:00.394182
ROC_AUC_SCORE with C = 0.0001 0.7083801337581977
Time elapsed: 0:00:00.419318
ROC_AUC_SCORE with C = 0.0001 0.7129968933178444
Time elapsed: 0:00:00.429418
ROC_AUC_SCORE with C = 0.0001 0.7109604267716054
Time elapsed: 0:00:00.429569
ROC_AUC_SCORE with C = 0.0001 0.7131541711823542
Time elapsed: 0:00:00.430288
ROC_AUC_SCORE with C = 0.0001 0.7110365644569229


### Качество логистической регрессии чуть выше, чем у градиентного бустинга <br> 
### Скорость обучения модели логистической регрессии заметно выше, чем у градиентного бустинга <br>
Причинами разницы в результатах roc-auc_score могут следующие: <br>
Данные не разряжены, соответственно, модели, построенные на основе дереьев, должны показыавть "приемлемый" результат <br>
В данных могут присутствовать выбросы, которые повлияли на ГБ, но которые не оказали воздействия на ЛР из-за предварительного масштабирования.

In [121]:
df_dropped = df.drop(['lobby_type',
                        'r1_hero',
                        'r2_hero',
                        'r3_hero',
                        'r4_hero',
                        'r5_hero',
                        'd1_hero',
                        'd2_hero',
                        'd3_hero',
                        'd4_hero',
                        'd5_hero'], axis = 1)
test_df_dropped = test_df.drop(['lobby_type',
                        'r1_hero',
                        'r2_hero',
                        'r3_hero',
                        'r4_hero',
                        'r5_hero',
                        'd1_hero',
                        'd2_hero',
                        'd3_hero',
                        'd4_hero',
                        'd5_hero'], axis = 1)

In [122]:
df_dropped
X_scaled = scaler.fit_transform(df_dropped)
y_scaled = scaler.transform(test_df_dropped)

In [150]:
for train_ind, test_ind in kf.split(df_dropped):
    clf = LogisticRegression(random_state = 20, C = 0.01)
    start_time = datetime.datetime.now()
    clf.fit(X_scaled[train_ind], y.values[train_ind])
    print('Time elapsed:', datetime.datetime.now() - start_time)
    print(roc_auc_score(y.values[test_ind], clf.predict_proba(X_scaled[test_ind])[:, 1]))

Time elapsed: 0:00:00.903350
0.715150385895172
Time elapsed: 0:00:01.024810
0.7177918434949545
Time elapsed: 0:00:01.083111
0.7168748241698273
Time elapsed: 0:00:00.927034
0.7163772709196502
Time elapsed: 0:00:01.044895
0.7164433904312993


#### Значение roc_auc увеличилось на сотую. Причина тому- удаление "неправильно" закодированных категориальных признаков

In [152]:
heroes = ['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
         'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']

In [160]:
for i in heroes:
    count = np.unique(df[i])
    print(max(count))

112
112
112
112
112
112
112
112
112
112


Мы видим, что максимальное число героев в игре - 112. 

In [234]:
X_pick = np.zeros((df.shape[0], 112))

for i, index in enumerate(df.index):
    for p in range(5):
        X_pick[i, df.loc[index, 'r%d_hero' % (p + 1)] - 1] = 1
        X_pick[i, df.loc[index, 'd%d_hero' % (p + 1)] - 1] = -1

#### Конвертируем полученный ndarray в датафрейм, чтобы объединить его с уже имеющемся ДФ с помощью одного метода

In [235]:
heroes = pd.DataFrame(X_pick)
heroes

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,102,103,104,105,106,107,108,109,110,111
0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97225,1.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97226,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
97227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
97228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,...,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [236]:
# Перед тем, как сделать join, нам следуюет снова отмасштабировать признаки
df_scaled = scaler.fit_transform(df)
test_df_scaled = scaler.transform(test_df)

In [237]:
df_bag = np.hstack((df_scaled, heroes))

In [240]:
for train_ind, test_ind in kf.split(df_bag):
    for i in [0.001, 0.01, 0.1, 1, 10]:
        clf = LogisticRegression(random_state = 20, C = i)
        start_time = datetime.datetime.now()
        clf.fit(df_bag[train_ind], y.values[train_ind])
        print('Time elapsed:', datetime.datetime.now() - start_time)
        print('C = {}, ROC_AUC_SCORE: '.format(i), 
              roc_auc_score(y.values[test_ind], clf.predict_proba(df_bag[test_ind])[:, 1]))

Time elapsed: 0:00:01.267588
C = 0.001, ROC_AUC_SCORE:  0.7456773683505704
Time elapsed: 0:00:01.313863
C = 0.001, ROC_AUC_SCORE:  0.7456773683505704
Time elapsed: 0:00:02.536107
C = 0.01, ROC_AUC_SCORE:  0.7529953676257171


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:02.962032
C = 0.1, ROC_AUC_SCORE:  0.7538386417270018


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:03.178527
C = 1, ROC_AUC_SCORE:  0.7538960131522906


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:03.099175
C = 10, ROC_AUC_SCORE:  0.7539025526256701
Time elapsed: 0:00:01.788811
C = 0.001, ROC_AUC_SCORE:  0.7501776035930589
Time elapsed: 0:00:01.372383
C = 0.001, ROC_AUC_SCORE:  0.7501776035930589
Time elapsed: 0:00:02.520621
C = 0.01, ROC_AUC_SCORE:  0.7551361965733439


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:03.254207
C = 0.1, ROC_AUC_SCORE:  0.7549769170134325


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:03.141863
C = 1, ROC_AUC_SCORE:  0.7549070203980457


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:03.232912
C = 10, ROC_AUC_SCORE:  0.7548928822190244
Time elapsed: 0:00:01.216452
C = 0.001, ROC_AUC_SCORE:  0.7462924301509344
Time elapsed: 0:00:01.271511
C = 0.001, ROC_AUC_SCORE:  0.7462924301509344
Time elapsed: 0:00:02.420117
C = 0.01, ROC_AUC_SCORE:  0.7516746552881243


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:03.191791
C = 0.1, ROC_AUC_SCORE:  0.7518378595832953


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:03.432879
C = 1, ROC_AUC_SCORE:  0.7518096737947795


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:03.456903
C = 10, ROC_AUC_SCORE:  0.75180703634219
Time elapsed: 0:00:01.466216
C = 0.001, ROC_AUC_SCORE:  0.7444584308997113
Time elapsed: 0:00:01.225626
C = 0.001, ROC_AUC_SCORE:  0.7444584308997113
Time elapsed: 0:00:02.954083
C = 0.01, ROC_AUC_SCORE:  0.7479869774560787


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:03.169193
C = 0.1, ROC_AUC_SCORE:  0.7477606636927715


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:03.289827
C = 1, ROC_AUC_SCORE:  0.7476834152615626


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:03.237522
C = 10, ROC_AUC_SCORE:  0.7476763955644832
Time elapsed: 0:00:01.323927
C = 0.001, ROC_AUC_SCORE:  0.744668730405345
Time elapsed: 0:00:01.334097
C = 0.001, ROC_AUC_SCORE:  0.744668730405345
Time elapsed: 0:00:02.428973
C = 0.01, ROC_AUC_SCORE:  0.750352102052218


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:03.115058
C = 0.1, ROC_AUC_SCORE:  0.7506287748526672


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Time elapsed: 0:00:03.232518
C = 1, ROC_AUC_SCORE:  0.7506277790170399
Time elapsed: 0:00:03.140675
C = 10, ROC_AUC_SCORE:  0.7506220900411691


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Мы можем наблюдать незначительное увеличение времени обучения модели, однако, вместе с этим увеличилось и качество предсказаний- практически на 0,05. Это можно объяснить тем, что наличие или отсутствие каждого индивидуального героя является важным признаком, нежели просто обозначение наличия такового в выборке. В зависимости от пикнутого героя той или иной командой, алгоритм способен более качествено предположить победителя в матче.

In [246]:
test_df = pd.read_csv('d2_features_test.csv', index_col = 'match_id')
test_df

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1430287923,0,93,4,1103,1089,8,0,1,9,...,0,12.0,247.0,-86.0,272.0,3,4,2,0,118.0
7,1430293357,1,20,2,556,570,1,0,0,9,...,2,-29.0,168.0,-54.0,,3,2,2,1,16.0
10,1430301774,1,112,2,751,808,1,0,0,13,...,1,-22.0,46.0,-87.0,186.0,1,3,3,0,-34.0
13,1430323933,1,27,3,708,903,1,1,1,11,...,2,-49.0,30.0,-89.0,210.0,3,4,2,1,-26.0
16,1430331112,1,39,4,1259,661,4,0,0,9,...,0,36.0,180.0,-86.0,180.0,1,3,2,1,-33.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114369,1450212780,7,11,5,2054,1941,27,0,1,8,...,1,8.0,253.0,-87.0,,4,3,2,1,-33.0
114377,1450222875,1,3,3,748,605,1,0,0,12,...,1,-1.0,133.0,-85.0,184.0,2,3,4,1,-18.0
114378,1450223593,1,85,2,575,499,0,0,0,8,...,0,20.0,133.0,-88.0,239.0,4,4,4,0,-36.0
114393,1450244771,0,7,4,1844,1176,8,1,2,8,...,0,-28.0,,-83.0,,1,4,1,0,


In [293]:
df = pd.read_csv('d2_features.csv', header=0)
y = df.radiant_win
df.drop(['duration',
        'radiant_win',
        'tower_status_radiant',
        'tower_status_dire',
        'barracks_status_radiant',
        'barracks_status_dire',
        'match_id'], axis = 1, inplace = True)

In [296]:
df.fillna(0, inplace = True)

In [312]:
df_scaled = scaler.fit_transform(df)
test_df = test_df.fillna(0)
test_df_scaled = scaler.transform(test_df)
clf = LogisticRegression(random_state = 20, C = 0.1)
clf.fit(df_scaled, y.values)

LogisticRegression(C=0.1, random_state=20)

In [320]:
pred = clf.predict_proba(test_df_scaled)
pred[:10]

array([[0.46780127, 0.53219873],
       [0.34719846, 0.65280154],
       [0.66017913, 0.33982087],
       [0.12306621, 0.87693379],
       [0.76226289, 0.23773711],
       [0.5639391 , 0.4360609 ],
       [0.43367186, 0.56632814],
       [0.31846126, 0.68153874],
       [0.69533081, 0.30466919],
       [0.53994413, 0.46005587]])

In [347]:
res = pd.DataFrame(pred[:, 1], index = test_df.index, columns = ['radiant_win'])
res.to_csv('out.csv')

In [352]:
print('min =', pred.min(), 'max =', pred.max())

min = 0.007259504763141111 max = 0.9927404952368589
