# Методы обработки признаков 
## на примере данных игры Дота 
#### сслыки на описания:
https://github.com/esokolov/ml-course-hse/blob/master/2016-fall/contests/contest01-dota-statement.ipynb

https://inclass.kaggle.com/c/hse-dota2-win-prediction

## Использованные методы обработки данных
1. Масштабирование вещественных
2. One-hot-кодирование категориальных
3. Мешок слов
4. Добавление комбинаций признаков опираясь на знания о их смысловом значении

5. Разложение матрицы объект-признак -- **SVD**

In [1]:
*%pylab inline

import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [193]:
import json
import bz2

with bz2.BZ2File('./matches.jsonlines.bz2') as matches_file:
    for line in matches_file:
        match = json.loads(line)
        print len(match)
        # Обработка матча
        break

8


In [194]:
import pandas
features = pandas.read_csv('./features.csv', index_col='match_id')

features.head()


Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13.0,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16.0,2449,0,4,1974,3,63


In [4]:
features['radiant_win'].head()

match_id
0    1
1    1
2    0
3    0
4    0
Name: radiant_win, dtype: int64

In [195]:
df = features
X = df.drop(['duration', 'radiant_win', 'tower_status_radiant', 
             'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire'],
            axis=1, inplace=False)
y = features['radiant_win']

In [53]:
y

match_id
0         1
1         1
2         0
3         0
4         0
5         0
8         0
9         1
11        0
12        0
14        1
15        0
17        1
20        1
21        0
22        1
23        1
25        1
26        1
27        1
28        1
29        1
30        1
31        1
32        1
34        1
35        0
36        0
38        0
39        0
         ..
114373    0
114374    0
114375    0
114376    0
114379    1
114380    0
114381    0
114382    0
114383    1
114384    1
114385    0
114386    1
114387    1
114388    1
114389    1
114390    0
114391    0
114392    1
114394    0
114395    1
114396    1
114397    1
114399    0
114400    1
114401    1
114402    0
114403    1
114404    0
114405    0
114406    1
Name: radiant_win, dtype: int64

Проверим наличие категориальных признаков типа "object" в таблице объект-признак.

In [6]:
cat_features_mask = (features.dtypes == "object").values # категориальные признаки имеют тип "object"
cat_features_mask.sum()

0

# Работа с котегориальными и вещественными признаками
## Масштабирование вещественных
## One-hot-кодирование категориальных 

Некоторые из признаков в нашем датасете являются категориальными. Типичным подходом к работе с ними является бинарное, или [one-hot-кодирование](https://en.wikipedia.org/wiki/One-hot).

Реализуйте функцию transform_data, которая принимает на вход DataFrame с признаками и выполняет следующие шаги:
1. Замена пропущенных значений на нули для вещественных признаков и на строки 'nan' для категориальных.
2. Масштабирование вещественных признаков с помощью [StandardScaler](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html).
3. One-hot-кодирование категориальных признаков с помощью [DictVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html) или функции [pd.get_dummies](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html).

Метод должен возвращать преобразованный DataFrame, который должна состоять из масштабированных вещественных признаков и закодированных категориальных (исходные признаки должны быть исключены из выборки).

##### Среди признаков в выборке есть 11 категориальных: lobby_type и r1_hero, r2_hero, ..., r5_hero, d1_hero, d2_hero, ..., d5_hero.

In [105]:
from sklearn import preprocessing

In [8]:
# Если категорильные признаки не числовые и имеют тип "object".
def transform_data(data):
    # маска, указывающую на столбцы с категориальными признаками
    cat_features_mask = (data.dtypes == "object").values # категориальные признаки имеют тип "object"
    cat_features_mask.sum()
    
    # 1. Замена пропущенных значений на нули для вещественных признаков и на строки 'nan' для категориальных.
    # в вещественных признаках:
    data_real = data[data.columns[~cat_features_mask]]
    data[data.columns[~cat_features_mask]] = data_real.replace(np.nan, 0, regex=True)
    # в категориальных
    data_obj = data[data.columns[cat_features_mask]]
    data[data.columns[cat_features_mask]] = data_obj.replace(np.nan, 'nan', regex=True)
    
    # 2. Масштабирование вещественных признаков с помощью StandardScaler.
    normalizer = preprocessing.StandardScaler()
    X_no_mis = data[data.columns[~cat_features_mask]]
    X_real_norm_np = normalizer.fit_transform(X_no_mis)
    X_real_norm_pd = pd.DataFrame(data=X_real_norm_np, index=data[data.columns[~cat_features_mask]].index)
    data[data.columns[~cat_features_mask]] = X_real_norm_pd
    
    # 3. One-hot-кодирование категориальных признаков 
    data = pd.get_dummies(data, drop_first=True)
    return(data)

In [9]:
# Передаём в cat_features_names список названий столбцов с категориальными признаками.
def transform_data(data, cat_features_names):
    # Создаем список столбцов, задающих вещественные признаки.
    real_features_names = [col for col in data.columns.values if col not in cat_features_names]

    # 1. Замена пропущенных значений на нули для вещественных признаков и на строки 'nan' для категориальных.
    # в вещественных признаках:
    data_real = data[real_features_names]
    data[real_features_names] = data_real.replace(np.nan, 0, regex=True)
    # в категориальных
    data_obj = data[cat_features_names]
    data[cat_features_names] = data_obj.replace(np.nan, 'nan', regex=True)
    
    # 2. Масштабирование вещественных признаков с помощью StandardScaler.
    normalizer = preprocessing.StandardScaler()
    X_no_mis = data[real_features_names]
    X_real_norm_np = normalizer.fit_transform(X_no_mis)
    X_real_norm_pd = pd.DataFrame(data=X_real_norm_np, index=data[real_features_names].index)
    data[real_features_names] = X_real_norm_pd
    
    # 3. One-hot-кодирование категориальных признаков 
    data = pd.get_dummies(data, columns=cat_features_names, drop_first=True)
    return(data)

In [10]:
cat_features_names = ['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                      'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
X.columns.values
real_features_names = [col for col in X.columns.values if col not in cat_features_names]
real_features_names[1:6]

['r1_level', 'r1_xp', 'r1_gold', 'r1_lh', 'r1_kills']

In [33]:
print features_norm_oho.shape
features_norm_oho.head()

(97230, 102)


Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-2.544364,7,11,1.400808,1.525972,0.734957,0.969743,-0.537757,-0.578083,-0.509023,...,-0.987486,1.066448,-0.041743,-0.262922,0.640648,0.018054,0.562864,-0.551154,1.846004,-1.121494
1,-2.540452,0,42,0.501314,-0.080139,-0.24757,-0.246859,-0.537757,1.017574,1.49293,...,-0.987486,-0.338591,0.578946,-0.262922,0.379585,1.066668,0.562864,0.67817,0.437788,0.043947
2,-2.539231,7,33,0.501314,0.15107,0.263085,1.190944,-0.537757,-0.578083,1.49293,...,0.391203,-0.823968,-0.824352,0.158654,0.640648,0.018054,0.562864,0.67817,0.437788,0.490286
3,-2.532622,1,29,0.501314,0.96295,-0.198013,0.306142,-0.537757,-0.578083,-1.309804,...,-0.987486,-0.594053,0.241615,-0.022021,0.269135,-1.554868,0.562864,-0.551154,-0.970428,0.837439
4,-2.529221,7,13,0.501314,0.348745,-0.124754,-0.357459,0.968527,-0.578083,-0.108632,...,-0.987486,1.347455,1.024223,-0.022021,0.680811,1.590976,-0.302485,0.67817,-0.970428,-0.228816


In [11]:
features_norm_oho = transform_data(X, cat_features_names)
print(X.shape, features_norm_oho.shape)
features_norm_oho.head()
# print(data.columns)

((97230, 102), (97230, 1163))


Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,d5_hero_100,d5_hero_101,d5_hero_102,d5_hero_103,d5_hero_104,d5_hero_105,d5_hero_106,d5_hero_109,d5_hero_110,d5_hero_112
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-2.544364,1.400808,1.525972,0.734957,0.969743,-0.537757,-0.578083,-0.509023,-0.332256,-0.625222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-2.540452,0.501314,-0.080139,-0.24757,-0.246859,-0.537757,1.017574,1.49293,0.578881,0.732454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-2.539231,0.501314,0.15107,0.263085,1.190944,-0.537757,-0.578083,1.49293,-0.332256,0.224676,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2.532622,0.501314,0.96295,-0.198013,0.306142,-0.537757,-0.578083,-1.309804,-1.243393,-1.170813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-2.529221,0.501314,0.348745,-0.124754,-0.357459,0.968527,-0.578083,-0.108632,-1.243393,-1.008757,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# print features_norm_oho.shape
# print features_norm_oho.columns.values
# print features_norm_oho.iloc[0]
# features_norm_oho.head()

In [15]:
# print features.shape
# print features.columns.values
# print features.iloc[0]
# features.head()

In [20]:
X = features_norm_oho
y.head()

match_id
0    1
1    1
2    0
3    0
4    0
Name: radiant_win, dtype: int64

#### Разобьёмполучившуюся выборку на обучающую и контрольную в соотношении 70/30 с использованием перемешивания объектов.

При разбиении используйте значение параметра random_state=16.

In [17]:
from sklearn import cross_validation
cv = cross_validation.ShuffleSplit(n=df.shape[0], n_iter = 1, train_size=0.7,test_size=0.3, random_state=16)
cv

ShuffleSplit(97230, n_iter=1, test_size=0.3, random_state=16)

In [23]:
for tr_ind, test_ind in cv:
    X_train_full = X.iloc[tr_ind]
    X_test_full = X.iloc[test_ind]
    y_train = y.iloc[tr_ind]
    y_test = y.iloc[test_ind]  

X.head()

Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,d5_hero_100,d5_hero_101,d5_hero_102,d5_hero_103,d5_hero_104,d5_hero_105,d5_hero_106,d5_hero_109,d5_hero_110,d5_hero_112
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-2.544364,1.400808,1.525972,0.734957,0.969743,-0.537757,-0.578083,-0.509023,-0.332256,-0.625222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-2.540452,0.501314,-0.080139,-0.24757,-0.246859,-0.537757,1.017574,1.49293,0.578881,0.732454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-2.539231,0.501314,0.15107,0.263085,1.190944,-0.537757,-0.578083,1.49293,-0.332256,0.224676,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2.532622,0.501314,0.96295,-0.198013,0.306142,-0.537757,-0.578083,-1.309804,-1.243393,-1.170813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-2.529221,0.501314,0.348745,-0.124754,-0.357459,0.968527,-0.578083,-0.108632,-1.243393,-1.008757,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Применим логистическую регрессию

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

In [73]:
lr = LogisticRegression()
lr.fit(X_train_full, y_train)
preds = lr.predict_proba(X_test_full)[:,1]
print 'ROC-AUC: %.3f, ACC: %.3f' % (roc_auc_score(y_test, preds), 
                                    accuracy_score(y_test, (preds > 0.5).astype(int)))

ROC-AUC: 0.745, ACC: 0.676


In [74]:
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
log_loss(y_test, preds)

0.59381073489974501

### Тестирование на данных без ответов

In [8]:
import pandas as pd
features_test = pandas.read_csv('./features_test.csv', index_col='match_id')

features_test.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1430287923,0,93,4,1103,1089,8,0,1,9,...,0,12.0,247.0,-86.0,272.0,3,4,2,0,118.0
7,1430293357,1,20,2,556,570,1,0,0,9,...,2,-29.0,168.0,-54.0,,3,2,2,1,16.0
10,1430301774,1,112,2,751,808,1,0,0,13,...,1,-22.0,46.0,-87.0,186.0,1,3,3,0,-34.0
13,1430323933,1,27,3,708,903,1,1,1,11,...,2,-49.0,30.0,-89.0,210.0,3,4,2,1,-26.0
16,1430331112,1,39,4,1259,661,4,0,0,9,...,0,36.0,180.0,-86.0,180.0,1,3,2,1,-33.0


In [40]:
features_test_norm_oho = transform_data(features_test, cat_features_names)
print(features_test.shape, features_test_norm_oho.shape)
features_test_norm_oho.head()

((17177, 102), (17177, 1163))


Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,d5_hero_100,d5_hero_101,d5_hero_102,d5_hero_103,d5_hero_104,d5_hero_105,d5_hero_106,d5_hero_109,d5_hero_110,d5_hero_112
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,-2.514875,0.474746,-0.237743,-0.137094,-0.370005,-0.532732,1.003884,0.305165,-0.368985,0.003979,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-2.513895,-1.36563,-1.207913,-1.25566,-1.14509,-0.532732,-0.5932,0.305165,0.567766,0.023887,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,-2.512377,-1.36563,-0.862057,-0.742715,-1.14509,-0.532732,-0.5932,1.898546,-1.305736,-1.375081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,-2.508381,-0.445442,-0.938323,-0.537967,-1.14509,0.968865,1.003884,1.101855,-1.305736,-0.920823,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,-2.507087,0.474746,0.038942,-1.059534,-0.812911,-0.532732,-0.5932,0.305165,1.504517,0.94507,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
preds_test = lr.predict_proba(features_test_norm_oho)[:,1]
print preds_test

In [52]:
print preds_test

[ 0.80300885  0.83739756  0.18931082 ...,  0.23027624  0.50365967
  0.36029848]


In [61]:
for match_num in range(len(features_test_norm_oho.index.values)):
    match_id = features_test_norm_oho.index.values[match_num]
    match_pred = preds_test[match_num]
    # print '%d, %.11f' % (match_id, match_pred)

In [67]:
import csv
with open('test_probs.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['match_id', 'radiant_win'])
    for match_num in range(len(features_test_norm_oho.index.values)):
        match_id = features_test_norm_oho.index.values[match_num]
        match_pred = preds_test[match_num]
        # print '%d, %.11f' % (match_id, match_pred)
        writer.writerow([match_id, match_pred])


# Улучшения базовой модели

## Мешок слов

- Удаляем информацию о целевой переменной из обучающей выборки.
- Удаляем стобцы с информацие о героях

- Нормализуем вещественные столбцы
- One-hot-coding используем для кодирования категориального признака lobby_type

- Создаем стобцы с информацией о героях использую технику - "мешок слов"

In [106]:
from sklearn import preprocessing
# Передаём в cat_features_names список названий столбцов с категориальными признаками.
def transform_data(data, cat_features_names):
    # Создаем список столбцов, задающих вещественные признаки.
    real_features_names = [col for col in data.columns.values if col not in cat_features_names]

    # 1. Замена пропущенных значений на нули для вещественных признаков и на строки 'nan' для категориальных.
    # в вещественных признаках:
    data_real = data[real_features_names]
    data[real_features_names] = data_real.replace(np.nan, 0, regex=True)
    # в категориальных
    data_obj = data[cat_features_names]
    data[cat_features_names] = data_obj.replace(np.nan, 'nan', regex=True)
    
    # 2. Масштабирование вещественных признаков с помощью StandardScaler.
    normalizer = preprocessing.StandardScaler()
    X_no_mis = data[real_features_names]
    X_real_norm_np = normalizer.fit_transform(X_no_mis)
    X_real_norm_pd = pd.DataFrame(data=X_real_norm_np, index=data[real_features_names].index)
    data[real_features_names] = X_real_norm_pd
    
    # 3. One-hot-кодирование категориальных признаков 
    data = pd.get_dummies(data, columns=cat_features_names, drop_first=True)
    return(data)

In [8]:
data_clean = features.drop(['duration', 'radiant_win', 'tower_status_radiant', 
             'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire'],
            axis=1, inplace=False)

# remove categorial -- labels of the heros
data_clean.drop(['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                      'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis=1, inplace=True)
# normolize the data and the categorial lobbe_type with one-hot-encoding:
data_norm = transform_data(data_clean, ['lobby_type'])

In [9]:
print data_norm.shape
data_norm.head()

(97230, 93)


Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,lobby_type_1,lobby_type_7
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-2.544364,1.400808,1.525972,0.734957,0.969743,-0.537757,-0.578083,-0.509023,-0.332256,-0.625222,...,-0.041743,-0.262922,0.640648,0.018054,0.562864,-0.551154,1.846004,-1.121494,0.0,1.0
1,-2.540452,0.501314,-0.080139,-0.24757,-0.246859,-0.537757,1.017574,1.49293,0.578881,0.732454,...,0.578946,-0.262922,0.379585,1.066668,0.562864,0.67817,0.437788,0.043947,0.0,0.0
2,-2.539231,0.501314,0.15107,0.263085,1.190944,-0.537757,-0.578083,1.49293,-0.332256,0.224676,...,-0.824352,0.158654,0.640648,0.018054,0.562864,0.67817,0.437788,0.490286,0.0,1.0
3,-2.532622,0.501314,0.96295,-0.198013,0.306142,-0.537757,-0.578083,-1.309804,-1.243393,-1.170813,...,0.241615,-0.022021,0.269135,-1.554868,0.562864,-0.551154,-0.970428,0.837439,1.0,0.0
4,-2.529221,0.501314,0.348745,-0.124754,-0.357459,0.968527,-0.578083,-0.108632,-1.243393,-1.008757,...,1.024223,-0.022021,0.680811,1.590976,-0.302485,0.67817,-0.970428,-0.228816,0.0,1.0


### Создание и применение "Мешка слов":

In [10]:

from sklearn import cross_validation
from sklearn.cross_validation import KFold

# Код для формирования "мешка слов" по героям
data = features.drop(['duration', 'radiant_win', 'tower_status_radiant', 
             'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire'],
            axis=1, inplace=False)
# N — количество различных героев в выборке
N = 113

X_pick = np.zeros((data.shape[0], N))

for i, match_id in enumerate(data.index):
    for p in xrange(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
for i, match_id in enumerate(data.index):
    for p in xrange(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
        
heros = pd.DataFrame(X_pick)
heros.set_index(data.index.values, inplace = True)

print heros.shape, data.shape

features_heros_bag = data_norm.join(heros)
features_heros_bag
features_heros_bag.shape

X_train = features_heros_bag

from sklearn.cross_validation import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

ROC = 0
ACC = 0
LOG_LOSS = 0
n_folds_K = 3
kf = KFold(n=X.shape[0], n_folds=3)
for train_ids, test_ids in kf:
    #print("%s %s" % (train, test))
    lr = LogisticRegression()
    lr.fit(X_svd.iloc[train_ids], y.iloc[train_ids])
    preds = lr.predict_proba(X_svd.iloc[test_ids])[:,1]
    ROC = ROC + roc_auc_score(y.iloc[test_ids], preds)
    ACC += accuracy_score(y.iloc[test_ids], (preds > 0.5).astype(int))
    # print 'ROC-AUC: %.3f, ACC: %.3f' % (roc_auc_score(y.iloc[test_ids], preds), 
    #                                    accuracy_score(y.iloc[test_ids], (preds > 0.5).astype(int))
    LOG_LOSS += log_loss(y.iloc[test_ids], preds)
print ' ROC-AUC of K-folds: %.3f, ACC: %.3f, log-los: %.4f' % (ROC / n_folds_K, ACC / n_folds_K, LOG_LOSS / n_folds_K)


In [10]:
# Код для формирования "мешка слов" по героям
data = features.drop(['duration', 'radiant_win', 'tower_status_radiant', 
             'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire'],
            axis=1, inplace=False)
# N — количество различных героев в выборке
N = 113

X_pick = np.zeros((data.shape[0], N))

for i, match_id in enumerate(data.index):
    for p in xrange(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
for i, match_id in enumerate(data.index):
    for p in xrange(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1


In [11]:
X_pick.shape

(97230, 113)

In [12]:
heros = pd.DataFrame(X_pick)
heros.set_index(data.index.values, inplace = True)
#data + heros

In [13]:
print heros.shape, data.shape

(97230, 113) (97230, 102)


In [14]:
features_heros_bag = data_norm.join(heros)
features_heros_bag
features_heros_bag.shape

(97230, 206)

In [15]:
features_heros_bag.head()

Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,103,104,105,106,107,108,109,110,111,112
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-2.544364,1.400808,1.525972,0.734957,0.969743,-0.537757,-0.578083,-0.509023,-0.332256,-0.625222,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-2.540452,0.501314,-0.080139,-0.24757,-0.246859,-0.537757,1.017574,1.49293,0.578881,0.732454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-2.539231,0.501314,0.15107,0.263085,1.190944,-0.537757,-0.578083,1.49293,-0.332256,0.224676,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2.532622,0.501314,0.96295,-0.198013,0.306142,-0.537757,-0.578083,-1.309804,-1.243393,-1.170813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-2.529221,0.501314,0.348745,-0.124754,-0.357459,0.968527,-0.578083,-0.108632,-1.243393,-1.008757,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Обучаемся и проверяем качество.

In [16]:
# y = features['radiant_win']
X = features_heros_bag

In [17]:
from sklearn.cross_validation import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score



In [18]:
#cv = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.3)
cv = ShuffleSplit(n=X.shape[0], n_iter = 1, train_size=0.7,test_size=0.3, random_state=16)

for train_ids, test_ids in cv:
    lr = LogisticRegression()
    lr.fit(X.iloc[train_ids], y.iloc[train_ids])
    preds = lr.predict_proba(X.iloc[test_ids])[:,1]
    print 'ROC-AUC: %.3f, ACC: %.3f' % (roc_auc_score(y.iloc[test_ids], preds), 
                                        accuracy_score(y.iloc[test_ids], (preds > 0.5).astype(int)))



ROC-AUC: 0.752, ACC: 0.680


### Обучаемся и проверяем качество. Используем технологию K-fold.

In [19]:
from sklearn.cross_validation import KFold, cross_val_score

In [20]:
import numpy as np
from sklearn import cross_validation
from sklearn.cross_validation import KFold

In [21]:
ROC = 0
n_folds_K = 3
kf = KFold(n=X.shape[0], n_folds=3)
for train_ids, test_ids in kf:
    #print("%s %s" % (train, test))
    lr = LogisticRegression()
    lr.fit(X.iloc[train_ids], y.iloc[train_ids])
    preds = lr.predict_proba(X.iloc[test_ids])[:,1]
    ROC = ROC + roc_auc_score(y.iloc[test_ids], preds)
    print 'ROC-AUC: %.3f, ACC: %.3f' % (roc_auc_score(y.iloc[test_ids], preds), 
                                        accuracy_score(y.iloc[test_ids], (preds > 0.5).astype(int)))

print 'ROC-AUC of K-folds: %.3f' % (ROC / n_folds_K)

ROC-AUC: 0.756, ACC: 0.684
ROC-AUC: 0.738, ACC: 0.671
ROC-AUC: 0.748, ACC: 0.680
ROC-AUC of K-folds: 0.747


In [22]:
print 'ROC-AUC of K-folds: %.3f' % (ROC / n_folds_K)

ROC-AUC of K-folds: 0.747


In [24]:
lr = LogisticRegression()
lr.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Применим результат обучения на тестовых данных.

- Удаляем стобцы с информацие о героях

- Нормализуем вещественные столбцы
- One-hot-coding используем для кодирования категориального признака lobby_type

- Создаем стобцы с информацией о героях использую технику - "мешок слов"

In [25]:
import pandas
features_test = pandas.read_csv('./features_test.csv', index_col='match_id')

features_test.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1430287923,0,93,4,1103,1089,8,0,1,9,...,0,12.0,247.0,-86.0,272.0,3,4,2,0,118.0
7,1430293357,1,20,2,556,570,1,0,0,9,...,2,-29.0,168.0,-54.0,,3,2,2,1,16.0
10,1430301774,1,112,2,751,808,1,0,0,13,...,1,-22.0,46.0,-87.0,186.0,1,3,3,0,-34.0
13,1430323933,1,27,3,708,903,1,1,1,11,...,2,-49.0,30.0,-89.0,210.0,3,4,2,1,-26.0
16,1430331112,1,39,4,1259,661,4,0,0,9,...,0,36.0,180.0,-86.0,180.0,1,3,2,1,-33.0


In [26]:
# remove categorial -- labels of the heros
data_clean_test = features_test.drop(['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                      'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis=1, inplace=False)
# normolize the data and the categorial lobbe_type with one-hot-encoding:
data_norm_test = transform_data(data_clean_test, ['lobby_type'])
print data_norm_test.shape
data_norm_test.head()

(17177, 93)


Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,lobby_type_1,lobby_type_7
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,-2.514875,0.474746,-0.237743,-0.137094,-0.370005,-0.532732,1.003884,0.305165,-0.368985,0.003979,...,1.908352,-0.400363,1.149479,0.017743,0.55426,-0.571552,-0.983356,3.200079,0.0,0.0
7,-2.513895,-1.36563,-1.207913,-1.25566,-1.14509,-0.532732,-0.5932,0.305165,0.567766,0.023887,...,0.836945,1.72308,-1.597294,0.017743,-1.177689,-0.571552,0.433564,0.600867,1.0,0.0
10,-2.512377,-1.36563,-0.862057,-0.742715,-1.14509,-0.532732,-0.5932,1.898546,-1.305736,-1.375081,...,-0.817633,-0.466721,0.281014,-1.038655,-0.311714,0.668978,-0.983356,-0.673257,1.0,0.0
13,-2.508381,-0.445442,-0.938323,-0.537967,-1.14509,0.968865,1.003884,1.101855,-1.305736,-0.920823,...,-1.034626,-0.599436,0.523376,0.017743,0.55426,-0.571552,0.433564,-0.469397,1.0,0.0
16,-2.507087,0.474746,0.038942,-1.059534,-0.812911,-0.532732,-0.5932,0.305165,1.504517,0.94507,...,0.99969,-0.400363,0.220423,-1.038655,-0.311714,-0.571552,0.433564,-0.647774,1.0,0.0


In [27]:
features_test.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1430287923,0,93,4,1103,1089,8,0,1,9,...,0,12.0,247.0,-86.0,272.0,3,4,2,0,118.0
7,1430293357,1,20,2,556,570,1,0,0,9,...,2,-29.0,168.0,-54.0,,3,2,2,1,16.0
10,1430301774,1,112,2,751,808,1,0,0,13,...,1,-22.0,46.0,-87.0,186.0,1,3,3,0,-34.0
13,1430323933,1,27,3,708,903,1,1,1,11,...,2,-49.0,30.0,-89.0,210.0,3,4,2,1,-26.0
16,1430331112,1,39,4,1259,661,4,0,0,9,...,0,36.0,180.0,-86.0,180.0,1,3,2,1,-33.0


In [28]:
# Код для формирования "мешка слов" по героям
data = features_test
# N — количество различных героев в выборке
N = 113

X_pick_test = np.zeros((data.shape[0], N))

for i, match_id in enumerate(data.index):
    for p in xrange(5):
        X_pick_test[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick_test[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
for i, match_id in enumerate(data.index):
    for p in xrange(5):
        X_pick_test[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick_test[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1


In [29]:
print X_pick_test.shape, data_norm_test.shape

(17177, 113) (17177, 93)


In [31]:
heros_test = pd.DataFrame(X_pick_test)
heros_test.set_index(data_norm_test.index.values, inplace = True)
features_heros_bag_test = data_norm_test.join(heros_test)
print features_heros_bag_test.shape
features_heros_bag_test.head()

(17177, 206)


Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,103,104,105,106,107,108,109,110,111,112
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,-2.514875,0.474746,-0.237743,-0.137094,-0.370005,-0.532732,1.003884,0.305165,-0.368985,0.003979,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-2.513895,-1.36563,-1.207913,-1.25566,-1.14509,-0.532732,-0.5932,0.305165,0.567766,0.023887,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,-2.512377,-1.36563,-0.862057,-0.742715,-1.14509,-0.532732,-0.5932,1.898546,-1.305736,-1.375081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,-2.508381,-0.445442,-0.938323,-0.537967,-1.14509,0.968865,1.003884,1.101855,-1.305736,-0.920823,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
16,-2.507087,0.474746,0.038942,-1.059534,-0.812911,-0.532732,-0.5932,0.305165,1.504517,0.94507,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
preds_bag_test = lr.predict_proba(features_heros_bag_test)[:,1]
print preds_bag_test

[ 0.81618172  0.76357625  0.19155221 ...,  0.24476982  0.61039686
  0.43473302]


In [33]:
import csv
with open('test_probs_bag.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['match_id', 'radiant_win'])
    for match_num in range(len(features_heros_bag_test.index.values)):
        match_id = features_heros_bag_test.index.values[match_num]
        match_pred = preds_bag_test[match_num]
        # print '%d, %.11f' % (match_id, match_pred)
        writer.writerow([match_id, match_pred])


In [34]:
features_heros_bag_test

Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,103,104,105,106,107,108,109,110,111,112
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,-2.514875,0.474746,-0.237743,-0.137094,-0.370005,-0.532732,1.003884,0.305165,-0.368985,0.003979,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-2.513895,-1.365630,-1.207913,-1.255660,-1.145090,-0.532732,-0.593200,0.305165,0.567766,0.023887,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,-2.512377,-1.365630,-0.862057,-0.742715,-1.145090,-0.532732,-0.593200,1.898546,-1.305736,-1.375081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,-2.508381,-0.445442,-0.938323,-0.537967,-1.145090,0.968865,1.003884,1.101855,-1.305736,-0.920823,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
16,-2.507087,0.474746,0.038942,-1.059534,-0.812911,-0.532732,-0.593200,0.305165,1.504517,0.945070,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,-2.506518,1.394934,1.472027,0.854314,0.405079,-0.532732,-0.593200,-0.093180,0.567766,0.139714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,-2.506387,0.474746,1.108435,0.720690,1.401616,-0.532732,1.003884,-1.288215,-0.368985,-0.519050,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
24,-2.503777,1.394934,1.337232,1.789686,0.847984,2.470463,-0.593200,0.703510,0.567766,0.574063,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,-2.500708,-1.365630,-1.277085,-1.406527,-1.255816,-0.532732,1.003884,0.305165,0.567766,0.731515,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,-2.499155,-0.445442,-0.314009,-0.994877,-0.702184,-0.532732,-0.593200,-0.889870,0.567766,0.060083,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Разложение матриц

## NMF -- Non-negative matrix factorization
### sklearn.decomposition import NMF --
-- нельзя использовать, так как матрица содержит отрицательные значения.

http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html

In [94]:
from sklearn.decomposition import NMF

In [99]:
import numpy as np
X_sample = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
print X_sample
from sklearn.decomposition import NMF
model = NMF(n_components=2, init='random', random_state=0)
model.fit(X_sample)

[[ 1.   1. ]
 [ 2.   1. ]
 [ 3.   1.2]
 [ 4.   1. ]
 [ 5.   0.8]
 [ 6.   1. ]]


NMF(alpha=0.0, beta=1, eta=0.1, init='random', l1_ratio=0.0, max_iter=200,
  n_components=2, nls_max_iter=2000, random_state=0, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [98]:
print model.components_
#array([[ 2.09783018,  0.30560234],
#       [ 2.13443044,  2.13171694]])
print model.reconstruction_err_ 
# 0.00115993...


[[ 2.09783018  0.30560234]
 [ 2.13443044  2.13171694]]
0.0011599349216


In [100]:
model

NMF(alpha=0.0, beta=1, eta=0.1, init='random', l1_ratio=0.0, max_iter=200,
  n_components=2, nls_max_iter=2000, random_state=0, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

## SVD - Singular value decomposition 
### TruncatedSVD
http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

In [101]:
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

In [102]:
X_sample = sparse_random_matrix(100, 100, density=0.01, random_state=42)
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
svd.fit(X) 

TruncatedSVD(algorithm='randomized', n_components=5, n_iter=7,
       random_state=42, tol=0.0)

In [103]:
print(svd.explained_variance_ratio_) 
print(svd.explained_variance_ratio_.sum()) 

[ 0.0534992   0.04089556  0.03868098  0.03816072  0.03785247]
0.209088923733


In [104]:
print(svd.explained_variance_) 

[ 5.42750056  4.14885956  3.92419034  3.87140985  3.8401379 ]


In [36]:
from sklearn.utils.extmath import randomized_svd

In [37]:
U, Sigma, VT = randomized_svd(X, n_components=46,
                                      n_iter=16,
                                      random_state=16)

In [None]:
print X.shape
print U.shape, Sigma.shape, VT.shape

In [None]:
X_svd = pd.DataFrame(U)
X_svd.set_index(X.index.values, inplace = True)
X_svd.head()

In [None]:
ROC = 0
n_folds_K = 3
kf = KFold(n=X.shape[0], n_folds=3)
for train_ids, test_ids in kf:
    #print("%s %s" % (train, test))
    lr = LogisticRegression()
    lr.fit(X_svd.iloc[train_ids], y.iloc[train_ids])
    preds = lr.predict_proba(X_svd.iloc[test_ids])[:,1]
    ROC = ROC + roc_auc_score(y.iloc[test_ids], preds)
    print 'ROC-AUC: %.3f, ACC: %.3f' % (roc_auc_score(y.iloc[test_ids], preds), 
                                        accuracy_score(y.iloc[test_ids], (preds > 0.5).astype(int)))

print 'ROC-AUC of K-folds: %.3f' % (ROC / n_folds_K)

Найдем наилучшее К

In [43]:
for k in range(46, X.shape[1] - 61):
    U, Sigma, VT = randomized_svd(X, n_components=k, n_iter=6, random_state=16)
    X_svd = pd.DataFrame(U)
    X_svd.set_index(X.index.values, inplace = True)

    ROC = 0
    ACC = 0
    n_folds_K = 3
    kf = KFold(n=X.shape[0], n_folds=3)
    for train_ids, test_ids in kf:
        #print("%s %s" % (train, test))
        lr = LogisticRegression()
        lr.fit(X_svd.iloc[train_ids], y.iloc[train_ids])
        preds = lr.predict_proba(X_svd.iloc[test_ids])[:,1]
        ROC = ROC + roc_auc_score(y.iloc[test_ids], preds)
        ACC = accuracy_score(y.iloc[test_ids], (preds > 0.5).astype(int))
        # print 'ROC-AUC: %.3f, ACC: %.3f' % (roc_auc_score(y.iloc[test_ids], preds), 
        #                                    accuracy_score(y.iloc[test_ids], (preds > 0.5).astype(int)))

    print 'K = %d - ROC-AUC of K-folds: %.3f, ACC: %.3f' % (k, ROC / n_folds_K, ACC)

K = 46 - ROC-AUC of K-folds: 0.695, ACC: 0.639
K = 47 - ROC-AUC of K-folds: 0.695, ACC: 0.639
K = 48 - ROC-AUC of K-folds: 0.697, ACC: 0.639
K = 49 - ROC-AUC of K-folds: 0.697, ACC: 0.639
K = 50 - ROC-AUC of K-folds: 0.697, ACC: 0.640
K = 51 - ROC-AUC of K-folds: 0.697, ACC: 0.640
K = 52 - ROC-AUC of K-folds: 0.697, ACC: 0.640
K = 53 - ROC-AUC of K-folds: 0.697, ACC: 0.640
K = 54 - ROC-AUC of K-folds: 0.699, ACC: 0.641
K = 55 - ROC-AUC of K-folds: 0.703, ACC: 0.643
K = 56 - ROC-AUC of K-folds: 0.705, ACC: 0.646
K = 57 - ROC-AUC of K-folds: 0.705, ACC: 0.646
K = 58 - ROC-AUC of K-folds: 0.706, ACC: 0.646
K = 59 - ROC-AUC of K-folds: 0.706, ACC: 0.646
K = 60 - ROC-AUC of K-folds: 0.705, ACC: 0.645
K = 61 - ROC-AUC of K-folds: 0.706, ACC: 0.646
K = 62 - ROC-AUC of K-folds: 0.705, ACC: 0.645
K = 63 - ROC-AUC of K-folds: 0.707, ACC: 0.646
K = 64 - ROC-AUC of K-folds: 0.709, ACC: 0.650
K = 65 - ROC-AUC of K-folds: 0.710, ACC: 0.650
K = 66 - ROC-AUC of K-folds: 0.712, ACC: 0.652
K = 67 - ROC-

KeyboardInterrupt: 

In [51]:
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

In [55]:
k = 140
while k < X.shape[1]:
    U, Sigma, VT = randomized_svd(X, n_components=k, n_iter=6, random_state=16)
    X_svd = pd.DataFrame(U)
    X_svd.set_index(X.index.values, inplace = True)

    ROC = 0
    ACC = 0
    LOG_LOSS = 0
    n_folds_K = 3
    kf = KFold(n=X.shape[0], n_folds=3)
    for train_ids, test_ids in kf:
        #print("%s %s" % (train, test))
        lr = LogisticRegression()
        lr.fit(X_svd.iloc[train_ids], y.iloc[train_ids])
        preds = lr.predict_proba(X_svd.iloc[test_ids])[:,1]
        ROC = ROC + roc_auc_score(y.iloc[test_ids], preds)
        ACC += accuracy_score(y.iloc[test_ids], (preds > 0.5).astype(int))
        # print 'ROC-AUC: %.3f, ACC: %.3f' % (roc_auc_score(y.iloc[test_ids], preds), 
        #                                    accuracy_score(y.iloc[test_ids], (preds > 0.5).astype(int))
        LOG_LOSS += log_loss(y.iloc[test_ids], preds)

    print 'K = %d - ROC-AUC of K-folds: %.3f, ACC: %.3f, log-los: %.4f' % (k, ROC / n_folds_K, ACC / n_folds_K, LOG_LOSS / n_folds_K)
    k += 1

K = 140 - ROC-AUC of K-folds: 0.738, ACC: 0.643, log-los: 0.6704
K = 141 - ROC-AUC of K-folds: 0.738, ACC: 0.642, log-los: 0.6705
K = 142 - ROC-AUC of K-folds: 0.738, ACC: 0.643, log-los: 0.6705
K = 143 - ROC-AUC of K-folds: 0.738, ACC: 0.643, log-los: 0.6704
K = 144 - ROC-AUC of K-folds: 0.738, ACC: 0.643, log-los: 0.6705
K = 145 - ROC-AUC of K-folds: 0.738, ACC: 0.643, log-los: 0.6704
K = 146 - ROC-AUC of K-folds: 0.738, ACC: 0.644, log-los: 0.6704
K = 147 - ROC-AUC of K-folds: 0.738, ACC: 0.643, log-los: 0.6704
K = 148 - ROC-AUC of K-folds: 0.738, ACC: 0.643, log-los: 0.6704
K = 149 - ROC-AUC of K-folds: 0.738, ACC: 0.643, log-los: 0.6704
K = 150 - ROC-AUC of K-folds: 0.738, ACC: 0.643, log-los: 0.6703
K = 151 - ROC-AUC of K-folds: 0.738, ACC: 0.643, log-los: 0.6704
K = 152 - ROC-AUC of K-folds: 0.739, ACC: 0.643, log-los: 0.6703
K = 153 - ROC-AUC of K-folds: 0.739, ACC: 0.643, log-los: 0.6703
K = 154 - ROC-AUC of K-folds: 0.738, ACC: 0.643, log-los: 0.6703
K = 155 - ROC-AUC of K-fo

MemoryError: 

In [None]:
k = 163
U, Sigma, VT = randomized_svd(X, n_components=k, n_iter=6, random_state=16)
X_svd = pd.DataFrame(U)
X_svd.set_index(X.index.values, inplace = True)

In [None]:
lr_svd = LogisticRegression()
lr_svd.fit(X_svd.iloc[train_ids], y.iloc[train_ids])

In [3]:
U, Sigma, VT = randomized_svd(features_heros_bag_test, n_components=k, n_iter=6, random_state=16)
X_svd_test = pd.DataFrame(U)
X_svd_test.set_index(X.index.values, inplace = True)

In [None]:
# preds = lr.predict_proba(X_svd.iloc[test_ids])[:,1]
preds_svd_test = lr_svd.predict_proba(X_svd_test)[:,1]
print preds_svd_test

In [None]:
#preds_svd_test = lr_svd.predict_proba(X_svd_test)[:,1]
#print preds_svd_test
import csv
with open('test_probs_svd.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['match_id', 'radiant_win'])
    for match_num in range(len(features_heros_bag_test.index.values)):
        match_id = features_heros_bag_test.index.values[match_num]
        match_pred = preds_svd_test[match_num]
        # print '%d, %.11f' % (match_id, match_pred)
        writer.writerow([match_id, match_pred])

# Добавление признаков опираясь на знания об из смысле
Дополнительно Предобработаем данные

In [2]:
import pandas
features = pandas.read_csv('./features.csv', index_col='match_id')

features.head()


Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13.0,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16.0,2449,0,4,1974,3,63


In [3]:
df = features
X = df.drop(['duration', 'radiant_win', 'tower_status_radiant', 
             'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire'],
            axis=1, inplace=False)
y = features['radiant_win']

In [4]:
print X.columns.values
X.head()

['start_time' 'lobby_type' 'r1_hero' 'r1_level' 'r1_xp' 'r1_gold' 'r1_lh'
 'r1_kills' 'r1_deaths' 'r1_items' 'r2_hero' 'r2_level' 'r2_xp' 'r2_gold'
 'r2_lh' 'r2_kills' 'r2_deaths' 'r2_items' 'r3_hero' 'r3_level' 'r3_xp'
 'r3_gold' 'r3_lh' 'r3_kills' 'r3_deaths' 'r3_items' 'r4_hero' 'r4_level'
 'r4_xp' 'r4_gold' 'r4_lh' 'r4_kills' 'r4_deaths' 'r4_items' 'r5_hero'
 'r5_level' 'r5_xp' 'r5_gold' 'r5_lh' 'r5_kills' 'r5_deaths' 'r5_items'
 'd1_hero' 'd1_level' 'd1_xp' 'd1_gold' 'd1_lh' 'd1_kills' 'd1_deaths'
 'd1_items' 'd2_hero' 'd2_level' 'd2_xp' 'd2_gold' 'd2_lh' 'd2_kills'
 'd2_deaths' 'd2_items' 'd3_hero' 'd3_level' 'd3_xp' 'd3_gold' 'd3_lh'
 'd3_kills' 'd3_deaths' 'd3_items' 'd4_hero' 'd4_level' 'd4_xp' 'd4_gold'
 'd4_lh' 'd4_kills' 'd4_deaths' 'd4_items' 'd5_hero' 'd5_level' 'd5_xp'
 'd5_gold' 'd5_lh' 'd5_kills' 'd5_deaths' 'd5_items' 'first_blood_time'
 'first_blood_team' 'first_blood_player1' 'first_blood_player2'
 'radiant_bottle_time' 'radiant_courier_time' 'radiant_flying_courier

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,0,35.0,103.0,-84.0,221.0,3,4,2,2,-52.0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,0,-20.0,149.0,-84.0,195.0,5,4,3,1,-5.0
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,1,-39.0,45.0,-77.0,221.0,3,4,3,1,13.0
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,0,-30.0,124.0,-80.0,184.0,0,4,2,0,27.0
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,0,46.0,182.0,-80.0,225.0,6,3,3,0,-16.0


In [5]:
cols_r = ['r1_level', 'r1_xp', 'r1_gold', 'r1_lh', 'r1_kills', 'r1_deaths', 'r1_items',
        'r2_level', 'r2_xp', 'r2_gold', 'r2_lh', 'r2_kills', 'r2_deaths', 'r2_items',
        'r3_level', 'r3_xp', 'r3_gold', 'r3_lh', 'r3_kills', 'r3_deaths', 'r3_items', 
        'r4_level','r4_xp', 'r4_gold', 'r4_lh', 'r4_kills', 'r4_deaths', 'r4_items',
        'r5_level', 'r5_xp', 'r5_gold', 'r5_lh', 'r5_kills', 'r5_deaths', 'r5_items']
cols_d = ['d1_level', 'd1_xp', 'd1_gold', 'd1_lh', 'd1_kills', 'd1_deaths', 'd1_items',
         'd2_level', 'd2_xp', 'd2_gold', 'd2_lh', 'd2_kills', 'd2_deaths', 'd2_items',
         'd3_level', 'd3_xp', 'd3_gold', 'd3_lh', 'd3_kills', 'd3_deaths', 'd3_items',
         'd4_level', 'd4_xp', 'd4_gold', 'd4_lh', 'd4_kills', 'd4_deaths', 'd4_items',
         'd5_level', 'd5_xp', 'd5_gold', 'd5_lh', 'd5_kills', 'd5_deaths', 'd5_items' ]

In [6]:
import numpy as np
cols_r = pd.DataFrame(np.array(cols_r).reshape(5,7))
cols_d = pd.DataFrame(np.array(cols_d).reshape(5,7))

In [7]:
cols_r

Unnamed: 0,0,1,2,3,4,5,6
0,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items
1,r2_level,r2_xp,r2_gold,r2_lh,r2_kills,r2_deaths,r2_items
2,r3_level,r3_xp,r3_gold,r3_lh,r3_kills,r3_deaths,r3_items
3,r4_level,r4_xp,r4_gold,r4_lh,r4_kills,r4_deaths,r4_items
4,r5_level,r5_xp,r5_gold,r5_lh,r5_kills,r5_deaths,r5_items


In [9]:
X.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,0,35.0,103.0,-84.0,221.0,3,4,2,2,-52.0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,0,-20.0,149.0,-84.0,195.0,5,4,3,1,-5.0
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,1,-39.0,45.0,-77.0,221.0,3,4,3,1,13.0
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,0,-30.0,124.0,-80.0,184.0,0,4,2,0,27.0
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,0,46.0,182.0,-80.0,225.0,6,3,3,0,-16.0


Сортируем внутри каждого показателя каждой группы по всем героям

In [10]:
for hero in range(7):
    # print cols_r[hero]
    X[cols_r[hero]] = X[cols_r[hero]].apply( lambda x :  sorted(x), axis = 1)
    X[cols_d[hero]] = X[cols_d[hero]].apply( lambda x :  sorted(x), axis = 1)

In [72]:
'sum_' + str(cols_r[2][0])

'sum_r1_gold'

In [11]:
for atr in range(7):
    X['sum_' + str(cols_r[atr][0])] = X[cols_r[atr]].apply( lambda x :  sum(x), axis = 1)
    X['sum_' + str(cols_d[atr][0])] = X[cols_d[atr]].apply( lambda x :  sum(x), axis = 1)

In [12]:
for atr in range(7):
    X['dif_' + str(cols_r[atr][0])] = X['sum_' + str(cols_r[atr][0])] - X['sum_' + str(cols_d[atr][0])]
    X['prop_' + str(cols_r[atr][0])] = X['sum_' + str(cols_r[atr][0])] / X['sum_' + str(cols_d[atr][0])]
    # X.drop(['prop_' + str(cols_r[atr][0])], axis=1, inplace=True)# = X['sum_' + str(cols_r[atr][0])] / X['sum_' + str(cols_d[atr][0])]
    

In [100]:
'sum_' + str(cols_r[4][0])
'sum_' + str(cols_d[5][0])

'sum_d1_deaths'

In [13]:
X['dif_k_d_r_d'] = X['sum_' + str(cols_r[4][0])] - X['sum_' + str(cols_d[5][0])]
X['dif_k_d_d_r'] = X['sum_' + str(cols_d[4][0])] - X['sum_' + str(cols_r[5][0])]

In [14]:
X['dif_k_d_r'] = X['sum_' + str(cols_r[5][0])] / X['sum_' + str(cols_r[4][0])]
X['dif_k_d_d'] = X['sum_' + str(cols_d[5][0])] / X['sum_' + str(cols_d[4][0])]

In [137]:
# X.drop(['dif_k_d_r', 'dif_k_d_d'], axis=1, inplace=True)

In [16]:
X.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dif_r1_kills,prop_r1_kills,dif_r1_deaths,prop_r1_deaths,dif_r1_items,prop_r1_items,dif_k_d_r_d,dif_k_d_d_r,dif_k_d_r,dif_k_d_d
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,3,732,658,4,0,0,4,...,-1,0.0,1,inf,4,1.121212,0,0,inf,0.0
1,1430220345,0,42,2,415,539,1,0,0,5,...,-1,0.5,1,2.0,-3,0.926829,0,0,2.0,0.5
2,1430227081,7,33,3,1297,775,0,0,0,6,...,1,2.0,-1,0.5,-3,0.928571,0,0,0.5,2.0
3,1430263531,1,29,2,539,499,0,0,0,5,...,0,,0,,-8,0.783784,0,0,,
4,1430282290,7,13,2,629,552,0,0,0,7,...,2,3.0,-2,0.333333,2,1.05,0,0,0.333333,3.0


In [17]:
from sklearn import preprocessing
# Передаём в cat_features_names список названий столбцов с категориальными признаками.
def transform_data(data, cat_features_names):
    # Создаем список столбцов, задающих вещественные признаки.
    real_features_names = [col for col in data.columns.values if col not in cat_features_names]

    # 1. Замена пропущенных значений на нули для вещественных признаков и на строки 'nan' для категориальных.
    # в вещественных признаках:
    data_real = data[real_features_names]
    data[real_features_names] = data[real_features_names].replace([np.inf, -np.inf, np.nan], 0, regex=True)

    # data[real_features_names] = data[real_features_names].astype(np.float64)
    # в категориальных
    data_obj = data[cat_features_names]
    data[cat_features_names] = data_obj.replace(np.nan, 'nan', regex=True)
    
    # 2. Масштабирование вещественных признаков с помощью StandardScaler.
    normalizer = preprocessing.StandardScaler()
    X_no_mis = data[real_features_names]
    X_real_norm_np = normalizer.fit_transform(data[real_features_names])
    X_real_norm_pd = pd.DataFrame(data=X_real_norm_np, index=data[real_features_names].index)
    data[real_features_names] = X_real_norm_pd
    
    # 3. One-hot-кодирование категориальных признаков 
    data = pd.get_dummies(data, columns=cat_features_names, drop_first=True)
    return(data)

In [19]:
data_clean_train = X.copy()

# remove categorial -- labels of the heros
data_clean_train.drop(['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                       'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis=1, inplace=True)
# normolize the data and the categorial lobbe_type with one-hot-encoding:
data_norm_train = transform_data(data_clean_train, ['lobby_type'])

In [20]:
data_norm_train.shape

(97230, 125)

In [21]:
data_norm_train.head()

Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,dif_r1_deaths,prop_r1_deaths,dif_r1_items,prop_r1_items,dif_k_d_r_d,dif_k_d_d_r,dif_k_d_r,dif_k_d_d,lobby_type_1,lobby_type_7
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-2.544364,1.337596,0.862781,0.231343,2.641212,-0.051386,-0.083613,-1.086651,0.511396,-0.172941,...,0.44919,-0.741324,0.580864,0.59938,-0.001033,0.001033,-0.742127,-0.736359,0.0,1.0
1,-2.540452,-0.218061,-0.687324,-0.635715,0.121646,-0.051386,-0.083613,-0.454584,0.511396,-0.610532,...,0.44919,1.053455,-0.393982,-0.42732,-0.001033,0.001033,1.055994,-0.2718,0.0,0.0
2,-2.539231,1.337596,3.625586,1.083829,-0.718209,-0.051386,-0.083613,0.177483,0.511396,2.596815,...,-0.516841,-0.292629,-0.393982,-0.418118,-0.001033,0.001033,-0.292596,1.121876,0.0,1.0
3,-2.532622,-0.218061,-0.080974,-0.927163,-0.718209,-0.051386,-0.083613,-0.454584,-1.258234,-0.998398,...,-0.033826,-0.741324,-1.0903,-1.182864,-0.001033,0.001033,-0.742127,-0.736359,1.0,0.0
4,-2.529221,-0.218061,0.359119,-0.540995,-0.718209,-0.051386,-0.083613,0.809551,0.511396,0.458584,...,-0.999857,-0.442194,0.302337,0.223249,-0.001033,0.001033,-0.44244,2.050994,0.0,1.0


In [22]:

# Код для формирования "мешка слов" по героям
data = features.drop(['duration', 'radiant_win', 'tower_status_radiant', 
             'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire'],
            axis=1, inplace=False)
# N — количество различных героев в выборке
N = 113

X_pick = np.zeros((data.shape[0], N))

for i, match_id in enumerate(data.index):
    for p in xrange(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
for i, match_id in enumerate(data.index):
    for p in xrange(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
        
heros = pd.DataFrame(X_pick)
heros.set_index(data.index.values, inplace = True)

print heros.shape, data.shape

features_heros_bag = data_norm_train.join(heros)
features_heros_bag
features_heros_bag.shape

X_train = features_heros_bag


from sklearn.cross_validation import ShuffleSplit, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss


ROC = 0
ACC = 0
LOG_LOSS = 0
n_folds_K = 3
kf = KFold(n=X_train.shape[0], n_folds=3)
for train_ids, test_ids in kf:
    #print("%s %s" % (train, test))
    lr = LogisticRegression()
    lr.fit(X_train.iloc[train_ids], y.iloc[train_ids])
    preds = lr.predict_proba(X_train.iloc[test_ids])[:,1]
    ROC = ROC + roc_auc_score(y.iloc[test_ids], preds)
    ACC += accuracy_score(y.iloc[test_ids], (preds > 0.5).astype(int))
    # print 'ROC-AUC: %.3f, ACC: %.3f' % (roc_auc_score(y.iloc[test_ids], preds), 
    #                                    accuracy_score(y.iloc[test_ids], (preds > 0.5).astype(int))
    LOG_LOSS += log_loss(y.iloc[test_ids], preds)
print ' ROC-AUC of K-folds: %.3f, ACC: %.3f, log-los: %.4f' % (ROC / n_folds_K, ACC / n_folds_K, LOG_LOSS / n_folds_K)

lr = LogisticRegression()
lr.fit(X_train, y)
    

(97230, 113) (97230, 102)




 ROC-AUC of K-folds: 0.749, ACC: 0.682, log-los: 0.5889


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
X_train.shape

(97230, 238)

Test

In [24]:
import pandas as pd
features_test = pandas.read_csv('./features_test.csv', index_col='match_id')

features_test.head()


Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1430287923,0,93,4,1103,1089,8,0,1,9,...,0,12.0,247.0,-86.0,272.0,3,4,2,0,118.0
7,1430293357,1,20,2,556,570,1,0,0,9,...,2,-29.0,168.0,-54.0,,3,2,2,1,16.0
10,1430301774,1,112,2,751,808,1,0,0,13,...,1,-22.0,46.0,-87.0,186.0,1,3,3,0,-34.0
13,1430323933,1,27,3,708,903,1,1,1,11,...,2,-49.0,30.0,-89.0,210.0,3,4,2,1,-26.0
16,1430331112,1,39,4,1259,661,4,0,0,9,...,0,36.0,180.0,-86.0,180.0,1,3,2,1,-33.0


In [25]:
X = features_test
for hero in range(7):
    # print cols_r[hero]
    X[cols_r[hero]] = X[cols_r[hero]].apply( lambda x :  sorted(x), axis = 1)
    X[cols_d[hero]] = X[cols_d[hero]].apply( lambda x :  sorted(x), axis = 1)
for atr in range(7):
    X['sum_' + str(cols_r[atr][0])] = X[cols_r[atr]].apply( lambda x :  sum(x), axis = 1)
    X['sum_' + str(cols_d[atr][0])] = X[cols_d[atr]].apply( lambda x :  sum(x), axis = 1)

for atr in range(7):
    X['dif_' + str(cols_r[atr][0])] = X['sum_' + str(cols_r[atr][0])] - X['sum_' + str(cols_d[atr][0])]
    X['prop_' + str(cols_r[atr][0])] = X['sum_' + str(cols_r[atr][0])] / X['sum_' + str(cols_d[atr][0])]

X['dif_k_d_r_d'] = X['sum_' + str(cols_r[4][0])] - X['sum_' + str(cols_d[5][0])]
X['dif_k_d_d_r'] = X['sum_' + str(cols_d[4][0])] - X['sum_' + str(cols_r[5][0])]
X['dif_k_d_r'] = X['sum_' + str(cols_r[5][0])] / X['sum_' + str(cols_r[4][0])]
X['dif_k_d_d'] = X['sum_' + str(cols_d[5][0])] / X['sum_' + str(cols_d[4][0])]

In [27]:
print X.shape
X.head()

(17177, 134)


Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dif_r1_kills,prop_r1_kills,dif_r1_deaths,prop_r1_deaths,dif_r1_items,prop_r1_items,dif_k_d_r_d,dif_k_d_d_r,dif_k_d_r,dif_k_d_d
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1430287923,0,93,3,917,933,3,0,0,6,...,-1,0.666667,1,1.5,-5,0.886364,0,0,1.5,0.666667
7,1430293357,1,20,2,556,570,0,0,0,6,...,0,1.0,0,1.0,-5,0.883721,0,0,1.0,1.0
10,1430301774,1,112,2,421,569,1,0,0,4,...,-1,0.5,1,2.0,0,1.0,0,0,2.0,0.5
13,1430323933,1,27,2,672,901,1,0,0,7,...,2,3.0,-2,0.333333,8,1.242424,0,0,0.333333,3.0
16,1430331112,1,39,2,484,609,2,0,0,6,...,0,1.0,0,1.0,-4,0.902439,0,0,1.0,1.0


In [56]:
data_clean_test.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,...,dif_r1_kills,prop_r1_kills,dif_r1_deaths,prop_r1_deaths,dif_r1_items,prop_r1_items,dif_k_d_r_d,dif_k_d_d_r,dif_k_d_r,dif_k_d_d
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,-2.514875,0,1.314506,1.760213,2.247348,1.787564,-0.060188,-0.089666,0.186272,0.493791,...,-0.439765,-0.115431,0.444346,0.583075,-0.661851,-0.630224,0.002041,-0.002041,0.591902,-0.114599
7,-2.513895,1,-0.278837,-0.006075,-0.409891,-0.715288,-0.060188,-0.089666,0.186272,-1.418258,...,0.043156,0.194291,-0.043169,0.140629,-0.661851,-0.644232,0.002041,-0.002041,0.145856,0.197547
10,-2.512377,1,-0.278837,-0.666598,-0.417212,0.118996,-0.060188,-0.089666,-1.070442,-1.418258,...,-0.439765,-0.270292,0.444346,1.02552,0.037954,-0.027878,0.002041,-0.002041,1.037949,-0.270671
13,-2.508381,1,-0.278837,0.561485,2.013101,0.118996,-0.060188,-0.089666,0.814629,0.493791,...,1.008997,2.052625,-1.018198,-0.449298,1.157642,1.257126,0.002041,-0.002041,-0.448873,2.070418
16,-2.507087,1,-0.278837,-0.358354,-0.124403,0.95328,-0.060188,-0.089666,0.186272,0.493791,...,0.043156,0.194291,-0.043169,0.140629,-0.52189,-0.545014,0.002041,-0.002041,0.145856,0.197547


In [57]:
data_clean_test = X.copy()

# remove categorial -- labels of the heros
data_clean_test.drop(['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                       'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis=1, inplace=True)
# normolize the data and the categorial lobbe_type with one-hot-encoding:
data_norm_test = transform_data(data_clean_test, ['lobby_type'])

In [58]:
data_norm_test

Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,dif_r1_deaths,prop_r1_deaths,dif_r1_items,prop_r1_items,dif_k_d_r_d,dif_k_d_d_r,dif_k_d_r,dif_k_d_d,lobby_type_1,lobby_type_7
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,-2.514875,1.314506,1.760213,2.247348,1.787564,-0.060188,-0.089666,0.186272,0.493791,1.552328,...,0.444346,0.583075,-0.661851,-0.630224,0.002041,-0.002041,0.591902,-0.114599,0.0,0.0
7,-2.513895,-0.278837,-0.006075,-0.409891,-0.715288,-0.060188,-0.089666,0.186272,-1.418258,-0.090988,...,-0.043169,0.140629,-0.661851,-0.644232,0.002041,-0.002041,0.145856,0.197547,1.0,0.0
10,-2.512377,-0.278837,-0.666598,-0.417212,0.118996,-0.060188,-0.089666,-1.070442,-1.418258,-0.205870,...,0.444346,1.025520,0.037954,-0.027878,0.002041,-0.002041,1.037949,-0.270671,1.0,0.0
13,-2.508381,-0.278837,0.561485,2.013101,0.118996,-0.060188,-0.089666,0.814629,0.493791,-0.420650,...,-1.018198,-0.449298,1.157642,1.257126,0.002041,-0.002041,-0.448873,2.070418,1.0,0.0
16,-2.507087,-0.278837,-0.358354,-0.124403,0.953280,-0.060188,-0.089666,0.186272,0.493791,1.532348,...,-0.043169,0.140629,-0.521890,-0.545014,0.002041,-0.002041,0.145856,0.197547,1.0,0.0
18,-2.506518,-0.278837,-0.974842,-0.622178,0.118996,-0.060188,-0.089666,1.442986,0.493791,0.937957,...,0.444346,0.583075,0.177915,0.089914,0.002041,-0.002041,0.591902,-0.114599,0.0,0.0
19,-2.506387,-0.278837,-0.373032,-0.285448,0.118996,-0.060188,-0.089666,-1.698799,0.493791,-0.560507,...,0.444346,1.025520,-2.621304,-1.928108,0.002041,-0.002041,1.037949,-0.270671,1.0,0.0
24,-2.503777,-0.278837,-0.774239,-0.285448,0.118996,-0.060188,-0.089666,0.186272,-1.418258,0.278633,...,-1.018198,-0.744261,-0.102007,-0.148347,0.002041,-0.002041,-0.746237,-0.738889,1.0,0.0
33,-2.500708,-0.278837,-0.196893,-0.922307,-0.715288,-0.060188,-0.089666,0.186272,2.405840,2.766083,...,1.419375,-0.744261,0.037954,-0.027878,0.002041,-0.002041,-0.746237,-0.738889,1.0,0.0
37,-2.499155,1.314506,1.217116,0.475855,3.456131,-0.060188,-0.089666,-0.442085,0.493791,1.337548,...,-1.018198,-0.744261,0.737759,0.800348,0.002041,-0.002041,-0.746237,-0.738889,0.0,1.0


In [64]:

# Код для формирования "мешка слов" по героям
data = features_test
# N — количество различных героев в выборке
N = 113

X_pick = np.zeros((data.shape[0], N))

for i, match_id in enumerate(data.index):
    for p in xrange(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
for i, match_id in enumerate(data.index):
    for p in xrange(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
        
heros_test = pd.DataFrame(X_pick)
heros_test.set_index(data.index.values, inplace = True)

print heros_test.shape, data.shape

features_heros_bag_add = data_norm_test.join(heros_test)
features_heros_bag_add
features_heros_bag_add.shape

X_add_feat_test = features_heros_bag_add

lr = LogisticRegression()
lr.fit(X_train, y)



(17177, 113) (17177, 134)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [55]:
(features_heros_bag_add.values == -np.inf).sum()

0

In [67]:
data_norm_test
preds_add_feat_test = lr.predict_proba(features_heros_bag_add)[:,1]


In [69]:
preds_add_feat_test = lr.predict_proba(X_add_feat_test)[:,1]
print preds_add_feat_test.shape
import csv
with open('test_probs_add_feat.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['match_id', 'radiant_win'])
    for match_num in range(len(features_heros_bag_add.index.values)):
        match_id = features_heros_bag_add.index.values[match_num]
        match_pred = preds_add_feat_test[match_num]
        # print '%d, %.11f' % (match_id, match_pred)
        writer.writerow([match_id, match_pred])

(17177,)
