In [37]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import time
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold

In [2]:
from sklearn.cross_validation import StratifiedKFold
import matplotlib.pyplot as plt
%matplotlib inline
import sys
import math

In [26]:
data = pd.read_csv('./features.csv', index_col='match_id')

data.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16,2449,0,4,1974,3,63


#### Описание признаков в таблице

- `match_id`: идентификатор матча в наборе данных
- `start_time`: время начала матча (unixtime)
- `lobby_type`: тип комнаты, в которой собираются игроки (расшифровка в `dictionaries/lobbies.csv`)
- Наборы признаков для каждого игрока (игроки команды Radiant — префикс `rN`, Dire — `dN`):
    - `r1_hero`: герой игрока (расшифровка в dictionaries/heroes.csv)
    - `r1_level`: максимальный достигнутый уровень героя (за первые 5 игровых минут)
    - `r1_xp`: максимальный полученный опыт
    - `r1_gold`: достигнутая ценность героя
    - `r1_lh`: число убитых юнитов
    - `r1_kills`: число убитых игроков
    - `r1_deaths`: число смертей героя
    - `r1_items`: число купленных предметов
- Признаки события "первая кровь" (first blood). Если событие "первая кровь" не успело произойти за первые 5 минут, то признаки принимают пропущенное значение
    - `first_blood_time`: игровое время первой крови
    - `first_blood_team`: команда, совершившая первую кровь (0 — Radiant, 1 — Dire)
    - `first_blood_player1`: игрок, причастный к событию
    - `first_blood_player2`: второй игрок, причастный к событию
- Признаки для каждой команды (префиксы `radiant_` и `dire_`)
    - `radiant_bottle_time`: время первого приобретения командой предмета "bottle"
    - `radiant_courier_time`: время приобретения предмета "courier" 
    - `radiant_flying_courier_time`: время приобретения предмета "flying_courier" 
    - `radiant_tpscroll_count`: число предметов "tpscroll" за первые 5 минут
    - `radiant_boots_count`: число предметов "boots"
    - `radiant_ward_observer_count`: число предметов "ward_observer"
    - `radiant_ward_sentry_count`: число предметов "ward_sentry"
    - `radiant_first_ward_time`: время установки командой первого "наблюдателя", т.е. предмета, который позволяет видеть часть игрового поля
- Итог матча (данные поля отсутствуют в тестовой выборке, поскольку содержат информацию, выходящую за пределы первых 5 минут матча)
    - `duration`: длительность
    - `radiant_win`: 1, если победила команда Radiant, 0 — иначе
    - Состояние башен и барраков к концу матча (см. описание полей набора данных)
        - `tower_status_radiant`
        - `tower_status_dire`
        - `barracks_status_radiant`
        - `barracks_status_dire`

# Признаки каждой команды

In [9]:
def get_command_feature(data):
    features_time = [u'radiant_bottle_time', u'radiant_courier_time', u'radiant_flying_courier_time', 
                    u'radiant_first_ward_time', u'dire_bottle_time', u'dire_courier_time', 
                    u'dire_flying_courier_time', u'dire_first_ward_time']
    features_count = [u'radiant_tpscroll_count', u'radiant_boots_count', u'radiant_ward_observer_count',
                     u'radiant_ward_sentry_count',  u'dire_tpscroll_count', u'dire_boots_count',
                     u'dire_ward_observer_count', u'dire_ward_sentry_count']
    command_data = data[features_time + features_count]
    command_data[features_time] = command_data[features_time].fillna(command_data[features_time].max())
    command_data[features_count] = command_data[features_count].fillna(command_data[features_count].min())
    
    return command_data

# Признаки первой крови

In [10]:
def get_firstBlood_feature(data):
    blood_data = data['first_blood_team'] 
    return blood_data.fillna(0.5)

# Выделяем признаки каждого героя

In [11]:
def get_hero_features(data):
    hero_features = []

    for i in range(1, 6):
        hero_features.append('r' + str(i) + '_xp')
        hero_features.append('r' + str(i) + '_gold')
        hero_features.append('r' + str(i) + '_lh')
        hero_features.append('r' + str(i) + '_kills')
        hero_features.append('r' + str(i) + '_deaths')
        hero_features.append('r' + str(i) + '_items')
    
    for i in range(1, 6):
        hero_features.append('d' + str(i) + '_xp')
        hero_features.append('d' + str(i) + '_gold')
        hero_features.append('d' + str(i) + '_lh')
        hero_features.append('d' + str(i) + '_kills')
        hero_features.append('d' + str(i) + '_deaths')
        hero_features.append('d' + str(i) + '_items')

    hero_data = data[hero_features]
    return hero_data

# Разность ризнаков

In [12]:
def get_subfeatures(X):
    num_columns = int(len(X.columns) / 2)
    r_features = X.columns[:num_columns]
    d_features = X.columns[num_columns:]
    return X[r_features].values - X[d_features].values

# Имена героев

In [13]:
def get_heroNames_features(data):
    hero_feature_names = []

    for i in range(1, 6):
        hero_feature_names.append('r' + str(i) + '_hero')
    for i in range(1, 6):
        hero_feature_names.append('d' + str(i) + '_hero')

    hero_name_data = data[hero_feature_names]

    return hero_name_data

# Вычисление точности

In [14]:
def get_scores(X_train, X_test, y_train, y_test, clf):
    clf.fit(X_train, y_train)
    scores = clf.predict_proba(X_test)[:, 1]
    print ('Score: ', roc_auc_score(y_test, scores))
    return scores, roc_auc_score(y_test, scores)

# StandartScaler

In [15]:
def get_scaling_data(X_train, X_test):
    scaler = StandardScaler()
    train_scaler = scaler.fit_transform(X_train)
    test_scaler = scaler.transform(X_test)
    return train_scaler, test_scaler

# OneHotEncoder

In [16]:
def get_OneHot(X_train, X_test):
    enc = OneHotEncoder()
    enc_train = enc.fit_transform(X_train)
    enc_test = enc.transform(X_test)
    return enc_train, enc_test

# Мешок слов над уровнями героев

In [17]:
def get_BagLevels(X):
    R_pick, D_pick = np.zeros((X.shape[0], 8)), np.zeros((X.shape[0], 8))
    for i, match_id in enumerate(X.index):
        for p in range(5):
            R_pick[i, X.ix[match_id, 'r%d_level' % (p+1)]] += 1
            D_pick[i, X.ix[match_id, 'd%d_level' % (p+1)]] += -1
    return R_pick + D_pick

# Мешок слов над именами героев

In [18]:
def get_BagHeroes(X):
    X_pick = np.zeros((X.shape[0], 112))
    for i, match_id in enumerate(X.index):
        for p in range(5):
            X_pick[i, X.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
            X_pick[i, X.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
    return X_pick

# Синергия и антисинергия

In [19]:
def calc_rating(data):
    N = 113 # heroes
    # calculate each hero-pair synergy and antisynergy
    synergy = np.zeros((N,N))     # sum of wins in matches played together
    antisynergy = np.zeros((N,N)) # sum of wins when played against
    matchcounts = np.zeros((N,N)) # count of matches played together
    matchcounta = np.zeros((N,N)) # count of matches played against
    
    for match_counter, match_id in enumerate(data.index):
        #synergy when both heroes in win team
        winteam = 'r' if data.ix[match_id, 'radiant_win'] == 1 else 'd'
        looseteam = 'd' if winteam =='r' else 'r'
        pind     = [0] * 5 #player indexes    
        antipind = [0] * 5 # looser indicies
        # get indexes of players in each tem
        for i in range(5):
            pind[i] = data.ix[match_id, winteam + '%d_hero'%(i + 1)] - 1
        for i in range(5):
            antipind[i] = data.ix[match_id, looseteam + '%d_hero'%(i + 1)] - 1
        # accumulate synergy of pairs
        for i in range(5):
            for j in range(i + 1, 5):
                synergy[pind[i], pind[j]] += 1
                synergy[pind[j], pind[i]] += 1
        # accumulate match counter for playing together
        for i in range(5):
            for j in range(5):
                matchcounts[pind[i], pind[j]] +=1 #together and win
                matchcounts[antipind[i], antipind[j]] +=1 # together and loose
                
        #antisynergy when hero i in winteam while hero j in loose team
        for i in range(5):
            for j in range(5):
                antisynergy[pind[i], antipind[j]] += 1
                matchcounta[pind[i], antipind[j]] += 1
                matchcounta[antipind[j], pind[i]] += 1
    
        if match_counter % 10000 == 0:
            print 'rating preprocessing %d' %match_counter
        
    # normalize
    synergyrate = np.zeros((N,N))
    antisynergyrate = np.zeros((N,N))
    synergyrate = (synergy + 1) / (matchcounts + 2) 
    antisynergyrate = (antisynergy + 1) / (matchcounta + 2)
    
    return synergyrate, antisynergyrate
    
    
# calculate aggreagtes synergy and antisyn    
def get_syn(data, synergyrate, antisynergyrate):
    syn = np.zeros(len(data))
    antisyn = np.zeros(len(data))
    for match_counter, match_id in enumerate(data.index):
        rind = [0] * 5 #radiant indicies    
        dind = [0] * 5 # dire indicies
        # get indexes of players in each team
        for i in range(5):
            rind[i] = data.ix[match_id, 'r%d_hero'%(i+1)]-1
        for i in range(5):
            dind[i] = data.ix[match_id, 'd%d_hero'%(i+1)]-1
        # accumulate synergy of radiants minus synergy of dires
        # + radiants synergy
        for i in range(5):
            for j in range(i + 1,5):
                syn[match_counter] += synergyrate[rind[i], rind[j]]
        # - dire synergy
        for i in range(5):
            for j in range(i + 1,5):
                syn[match_counter] -= synergyrate[dind[i], dind[j]]
        # accumulate antisynergy
        for i in range(5):
            for j in range(5):
                antisyn[match_counter] += antisynergyrate[rind[i], dind[j]] 
    return syn, antisyn

# исключаем финишные признаки, формируем матрицу признаков x и вектор классов y


# Подсчет синергии и антисинергии через фолдинг 

In [20]:
def count_syn(X, is_test=False, sum_syn=np.zeros((113, 113)), sum_antisyn=np.zeros((113, 113))):
    n_folds=7
    i = 0
    S = np.zeros((X.shape[0], 2))
    if is_test == False:        
        cv = KFold(X.shape[0], n_folds=n_folds, shuffle=True, random_state=241)
        for train, test in cv:
            print i
            i = i + 1
            synergyrate, antisynergyrate = calc_rating(X.iloc[train])
            sum_syn += synergyrate
            sum_antisyn += antisynergyrate
            S[test, 0], S[test, 1] = get_syn(X.iloc[test], synergyrate, antisynergyrate)    
        sum_syn /= n_folds
        sum_antisyn /= n_folds
    else:
        S[:, 0], S[:, 1] = get_syn(X, sum_syn, sum_antisyn)
        
    return S, sum_syn, sum_antisyn

# Сортировка количественных признаков каждой команды

In [21]:
def get_sort_heroes_feature(hero_feature, w):
    xp_features = []
    gold_features = []
    lh_features = []
    kills_features = []
    deaths_features = []
    items_features = []

    for i in range(1, 6):
            xp_features.append(w + str(i) + '_xp')
            gold_features.append(w + str(i) + '_gold')
            lh_features.append(w + str(i) + '_lh')
            kills_features.append(w + str(i) + '_kills')
            deaths_features.append(w + str(i) + '_deaths')
            items_features.append(w + str(i) + '_items')

    xp = hero_feature[xp_features]
    gold = hero_feature[gold_features]
    lh = hero_feature[lh_features]
    kills = hero_feature[kills_features]
    deaths = hero_feature[deaths_features]
    items = hero_feature[items_features]
    
    df_xp = pd.DataFrame(np.sort(xp, axis=1))
    df_gold = pd.DataFrame(np.sort(gold, axis=1))
    df_lh = pd.DataFrame(np.sort(lh, axis=1))
    df_kills = pd.DataFrame(np.sort(kills, axis=1))
    df_deaths = pd.DataFrame(np.sort(deaths, axis=1))
    df_items = pd.DataFrame(np.sort(items, axis=1))
    
    return pd.concat([df_xp, df_gold, df_lh, df_kills, df_deaths, df_items], axis=1)

# Извлечение всех признаков

In [24]:
def get_features(data):
    command_feature = get_subfeatures(get_command_feature(data))
    heroNames_feature = get_heroNames_features(data)
    hero_feature = get_hero_features(data)
    r_heroes_feature = get_sort_heroes_feature(hero_feature, 'r')
    d_heroes_feature = get_sort_heroes_feature(hero_feature, 'd')
    heroes_feature = r_heroes_feature - d_heroes_feature
    bag_features = get_BagHeroes(heroNames_feature)
    num_features = np.concatenate((command_feature, heroes_feature), axis=1)
    blood_feature = get_firstBlood_feature(data).reshape(-1, 1)
    lobby_features = data['lobby_type'].reshape(-1, 1)
    level_features = get_BagLevels(data)
    return num_features, bag_features, blood_feature, lobby_features, level_features

# Объединение всех признаков

In [29]:
def get_data(num_features, bag_features, blood_features, lobby_features, level_features, syn_features):
    enc = OneHotEncoder()
    enc_lobby = enc.fit_transform(lobby_features).toarray()
    
    scaler = StandardScaler()
    sc_num = scaler.fit_transform(np.concatenate((num_features, syn_features, blood_features), axis=1))
    
    data = np.concatenate((sc_num, bag_features, enc_lobby, level_features), axis=1)
    return data

In [27]:
targets = data['radiant_win'].values
num_features, bag_features, blood_features, lobby_features, level_features = get_features(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [28]:
syn_features, sum_syn, sum_antisyn = count_syn(data)

0
rating preprocessing 0
rating preprocessing 10000
rating preprocessing 20000
rating preprocessing 30000
rating preprocessing 40000
rating preprocessing 50000
rating preprocessing 60000
rating preprocessing 70000
rating preprocessing 80000
1
rating preprocessing 0
rating preprocessing 10000
rating preprocessing 20000
rating preprocessing 30000
rating preprocessing 40000
rating preprocessing 50000
rating preprocessing 60000
rating preprocessing 70000
rating preprocessing 80000
2
rating preprocessing 0
rating preprocessing 10000
rating preprocessing 20000
rating preprocessing 30000
rating preprocessing 40000
rating preprocessing 50000
rating preprocessing 60000
rating preprocessing 70000
rating preprocessing 80000
3
rating preprocessing 0
rating preprocessing 10000
rating preprocessing 20000
rating preprocessing 30000
rating preprocessing 40000
rating preprocessing 50000
rating preprocessing 60000
rating preprocessing 70000
rating preprocessing 80000
4
rating preprocessing 0
rating prep

In [30]:
X_train = get_data(num_features, bag_features, blood_features, 
                   lobby_features, level_features, syn_features)

# Тестовые выборка

In [31]:
data_test = pd.read_csv('./features_test.csv', index_col='match_id')

data_test.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1430287923,0,93,4,1103,1089,8,0,1,9,...,0,12,247,-86,272.0,3,4,2,0,118
7,1430293357,1,20,2,556,570,1,0,0,9,...,2,-29,168,-54,,3,2,2,1,16
10,1430301774,1,112,2,751,808,1,0,0,13,...,1,-22,46,-87,186.0,1,3,3,0,-34
13,1430323933,1,27,3,708,903,1,1,1,11,...,2,-49,30,-89,210.0,3,4,2,1,-26
16,1430331112,1,39,4,1259,661,4,0,0,9,...,0,36,180,-86,180.0,1,3,2,1,-33


In [32]:
Tnum_features, Tbag_features, Tblood_features, Tlobby_features, Tlevel_features = get_features(data_test)

In [33]:
Tsyn_features, _, _ = count_syn(data_test, True, sum_syn, sum_antisyn)

In [35]:
X_test = get_data(Tnum_features, Tbag_features, Tblood_features, Tlobby_features, Tlevel_features, Tsyn_features)

# Метапризнаки над градиентным бустингом

In [59]:
np.random.seed(0)

n_folds = 10
verbose = True
shuffle = False

X, y, X_submission = X_train, targets, X_test

if shuffle:
    idx = np.random.permutation(y.size)
    X = X[idx]
    y = y[idx]

skf = list(StratifiedKFold(y, n_folds))

clfs = [GradientBoostingClassifier(n_estimators=200, max_depth=6)]

dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

for j, clf in enumerate(clfs):
    print j, clf
    dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
    for i, (train, test) in enumerate(skf):
        Xtrain = X[train]
        y_train = y[train]
        Xtest = X[test]
        y_test = y[test]
        clf.fit(Xtrain, y_train)
        y_submission = clf.predict_proba(Xtest)[:,1]
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:,1]
    dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)

# XGBOOST

In [49]:
clf = xgb.XGBClassifier(n_estimators=300, max_depth=6, nthread=4)
clf.fit(X_train, targets)
XGB_scores = clf.predict_proba(X_test)[:, 1]

In [39]:
DATA = np.concatenate((X_train, dataset_blend_train), axis=1)

In [719]:
TDATA = np.concatenate((X_test, dataset_blend_test), axis=1)

# LogisticRegression

In [729]:
LR_clf = LogisticRegression(C=0.02)
LR_clf.fit(DATA, targets)
print 'fit finish..'
LR_scores = LR_clf.predict_proba(TDATA)[:, 1]

fit finish..


# Ансамбль

In [730]:
scores = 0.72 * LR_scores + 0.28 * XGB_scores

# Готовим сабмит

In [731]:
def make_submission(name_file, scores):
    f = open(name_file, 'w')
    f.write('match_id,' + 'radiant_win' + '\n')
    for i, game_id in enumerate(data_test.index):
        f.write(str(game_id) + ',' + str(scores[i]) + '\n')
    f.close()
    pass

In [732]:
make_submission('sub.csv', scores)