#Продолжение экспериментов

In [1]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
features = pandas.read_csv('data/features.csv', index_col='match_id')
features_test = pandas.read_csv('data/features_test.csv', index_col='match_id')

In [3]:
# matches = features.copy().fillna(0)
# test_matches = features_test.copy().fillna(0)
matches = features.copy().fillna(0)
train_size = int(matches.shape[0] * 2 / 3)
test_matches = matches[train_size:]
matches = matches[:train_size]
y_test = test_matches['radiant_win']
del test_matches['duration']
del test_matches['radiant_win']
del test_matches['tower_status_dire']
del test_matches['tower_status_radiant']
del test_matches['barracks_status_radiant']
del test_matches['barracks_status_dire']

In [4]:
fields = ['level', 'gold', 'xp', 'lh', 'kills', 'deaths', 'items']
teams = ['r', 'd']
players = range(1, 6)
for field in fields:
    for team in teams:
        name = team + '_' + field
        matches[name] = 0
        test_matches[name] = 0
        for player in players:
            matches[name] += matches[team + str(player) + '_' + field] 
            test_matches[name] += test_matches[team + str(player) + '_' + field] 
            del matches[team + str(player) + '_' + field] 
            del test_matches[team + str(player) + '_' + field] 

In [5]:
def preprocess(data, cat_cols, other_cols, train_size=None):
    if train_size is None:
        train_size = int(2 * data.shape[0] / 3)
    if train_size == 0:
        transformer = DictVectorizer(sparse=False)
        data_arr = transformer.fit_transform(data[cat_cols].astype(str).T.to_dict().values())
        train = np.hstack((data_arr, data[other_cols]))
#         train = data
#         print(list(data.columns))
        if 'radiant_win' in list(data.columns):
            y_train = np.array(data['radiant_win'])
            del data['duration']
            del data['radiant_win']
            del data['tower_status_dire']
            del data['tower_status_radiant']
            del data['barracks_status_radiant']
            del data['barracks_status_dire']
            return train, y_train
        return train
    
    y_train = np.array(data['radiant_win'])
    y_test = y_train[train_size:]
    y_train = y_train[:train_size]
    del data['duration']
    del data['radiant_win']
    del data['tower_status_dire']
    del data['tower_status_radiant']
    del data['barracks_status_radiant']
    del data['barracks_status_dire']
    transformer = DictVectorizer(sparse=False)
    data_arr = transformer.fit_transform(data[cat_cols].astype(str).T.to_dict().values())
    data = np.hstack((data_arr, data[other_cols]))
    test = data[train_size:, :]
    train = data[:train_size, :]
    return train, y_train, test, y_test

##Логистическая регрессия на выбранных героях

In [6]:
N = 113
data = matches
X_pick = np.zeros((data.shape[0], N))
for i, match_id in enumerate(data.index):
    for p in range(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
for i, match_id in enumerate(data.index):
    for p in range(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [7]:
train = X_pick
y_train = matches['radiant_win']

In [8]:
N = 113
data = test_matches
X_pick = np.zeros((data.shape[0], N))
for i, match_id in enumerate(data.index):
    for p in range(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
for i, match_id in enumerate(data.index):
    for p in range(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [9]:
test = X_pick

In [10]:
logreg = LogisticRegression()
logreg.fit(train, y_train)
logreg_train_preds = logreg.predict_proba(train)[:, 1]
logreg_test_preds = logreg.predict_proba(test)[:, 1]

##xgboost

In [11]:
matches['log_heroes'] = logreg_train_preds
test_matches['log_heroes'] = logreg_test_preds

In [12]:
bad_columns = ['start_time', 'duration', 'radiant_win', 'tower_status_dire', 'tower_status_radiant',
               'barracks_status_radiant', 'barracks_status_dire', 'first_blood_player1', 'first_blood_player2']
categorical_columns = ['lobby_type']
for i in range(1, 6):
    bad_columns.append('r'+str(i)+'_hero')
    bad_columns.append('d'+str(i)+'_hero')
other_cols = [col for col in matches.columns if not (col in categorical_columns)
             and not (col in bad_columns)]

In [13]:
train, y_train = preprocess(matches, categorical_columns, other_cols, train_size=0)
test = preprocess(test_matches, categorical_columns, other_cols, train_size=0)

In [14]:
dtrain = xgb.DMatrix(train, label=y_train)
dtest = xgb.DMatrix(test)

In [15]:
param = {'bst:max_depth':3, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
param['nthread'] = 4
param['eval_metric'] = 'auc'
evallist  = [(dtrain,'train')]

In [16]:
num_round = 10
bst = xgb.train(param, dtrain, num_round)

In [17]:
boost_preds = bst.predict(dtest,ntree_limit=bst.best_ntree_limit)

##Random Forest

In [41]:
forest = RandomForestClassifier(n_estimators=100)
forest.fit(train, y_train)
forest_preds = forest.predict_proba(test)[:, 1]


In [18]:
preds = boost_preds
# preds = 0.5 * (boost_preds + forest_preds)
# test_matches['radiant_win'] = preds
# final = test_matches[['radiant_win']]

In [19]:
roc_auc_score(y_test, preds)

0.72304277696606256

##Другие признаки

In [None]:
import json
import bz2

new_features = matches.copy()
new_features['r_num_obs'] = 0
new_features['r_num_sen'] = 0
new_features['d_num_obs'] = 0
new_features['d_num_sen'] = 0
# new_features['r_buy_back'] = 0
# new_features['r_num_tower_kills'] = 0
# new_features['r_pick_ban'] = 0
# new_features['d_buy_back'] = 0
# new_features['d_num_tower_kills'] = 0
# new_features['d_pick_ban'] = 0
i = 0
with bz2.BZ2File('data/matches.jsonlines.bz2') as matches_file:
    for line in matches_file:
        match = json.loads(line.decode())
        match_id = match['match_id']
        for j in range(5):
            new_features.loc[match_id, 'r_num_obs'] += 1#len(match['players'][j]['obs_log'])
            new_features.loc[match_id, 'r_num_sen'] += len(match['players'][j]['sen_log'])
        for j in range(5, 10):
            new_features.loc[match_id, 'd_num_obs'] += len(match['players'][j]['obs_log'])
            new_features.loc[match_id, 'd_num_sen'] += len(match['players'][j]['sen_log'])
        i += 1
        if not (i % 100):
            print('Match', i)
    

Match 100
Match 200
Match 300
Match 400
Match 500
Match 600
Match 700
Match 800
Match 900
Match 1000
Match 1100
Match 1200
Match 1300
Match 1400
Match 1500
Match 1600
Match 1700
Match 1800
Match 1900
Match 2000
Match

In [49]:
new_features

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r2_hero,r3_hero,r4_hero,r5_hero,d1_hero,d2_hero,d3_hero,...,d_kills,r_deaths,d_deaths,r_items,d_items,log_heroes,r_num_obs,r_num_sen,d_num_obs,d_num_sen
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,67,29,20,105,4,42,21,...,1,1,0,37,33,0.600975,5,2,17,8
1,1430220345,0,42,49,67,37,26,39,88,79,...,2,2,1,38,41,0.606024,0,0,0,0
2,1430227081,7,33,98,20,27,4,22,66,86,...,1,1,2,39,42,0.459030,0,0,0,0
3,1430263531,1,29,30,75,37,41,96,48,15,...,0,0,0,29,37,0.323037,0,0,0,0
4,1430282290,7,13,27,30,72,93,26,69,22,...,1,1,3,42,40,0.353572,0,0,0,0
5,1430284186,1,11,20,28,25,65,55,52,3,...,2,2,1,30,49,0.397774,0,0,0,0
8,1430293701,1,8,57,7,21,36,22,30,72,...,4,4,3,46,55,0.577407,0,0,0,0
9,1430299335,7,35,15,83,29,101,100,25,26,...,0,0,0,31,37,0.462537,0,0,0,0
11,1430308974,1,17,91,53,72,30,90,96,35,...,2,2,1,30,38,0.578084,0,0,0,0
12,1430316105,7,15,41,74,90,42,76,20,83,...,2,2,0,37,39,0.477917,0,0,0,0


In [33]:
match['players'][0]['sen_log']

[{'time': 799, 'xy': [116, 129]}]

In [27]:
# matches.iloc[10]['lobby_type']

1.0