In [1]:
import os
import pickle
from IPython.display import HTML, display

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

In [63]:
df_matches = pd.read_csv('./data/dataset.csv', index_col=0)

In [64]:
df_matches.head()

Unnamed: 0,account_id,assists,camps_stacked,creeps_stacked,deaths,denies,duration,gold_per_min,gold_spent,hero_damage,...,cm_enabled,int_gain,legs,move_speed,pro_ban,pro_pick,pro_win,projectile_speed,str_gain,turn_rate
0,57516313,12,3.0,7.0,2,0,1635,277,6655,7851,...,True,2.8,2,290,71.0,157.0,65.0,1100,2.6,0.5
1,26356855,11,3.0,6.0,5,0,1635,392,11140,8836,...,True,1.8,2,285,5.0,15.0,4.0,1100,3.2,1.0
2,121769650,10,0.0,0.0,2,20,1635,618,13750,13525,...,True,1.8,0,285,142.0,115.0,52.0,2000,2.6,0.5
3,62533910,7,2.0,6.0,0,20,1635,713,17845,10318,...,True,1.4,2,320,50.0,38.0,13.0,900,2.4,0.5
4,92367063,7,0.0,0.0,3,3,1635,474,12785,8587,...,True,1.7,2,305,94.0,128.0,73.0,900,3.3,0.5


In [65]:
'lane_role' in df_matches.columns

True

In [66]:
LANES = {
    'safe': 1,
    'mid': 2,
    'off': 3,
    'jungle': 4
}

In [67]:
ROLES = {v: k for k, v in LANES.items()}

In [68]:
df_matches['lane_role'].value_counts()

2.0    3469
1.0    3405
3.0    2939
4.0     117
Name: lane_role, dtype: int64

In [69]:
for col in df_matches.dtypes.items():
    print(col)

('account_id', dtype('int64'))
('assists', dtype('int64'))
('camps_stacked', dtype('float64'))
('creeps_stacked', dtype('float64'))
('deaths', dtype('int64'))
('denies', dtype('int64'))
('duration', dtype('int64'))
('gold_per_min', dtype('int64'))
('gold_spent', dtype('int64'))
('hero_damage', dtype('int64'))
('hero_healing', dtype('int64'))
('hero_id', dtype('int64'))
('is_roaming', dtype('bool'))
('kills', dtype('int64'))
('lane', dtype('float64'))
('lane_role', dtype('float64'))
('last_hits', dtype('int64'))
('level', dtype('int64'))
('match_id', dtype('int64'))
('observers_placed', dtype('float64'))
('player_slot', dtype('int64'))
('roshans_killed', dtype('float64'))
('rune_pickups', dtype('float64'))
('start_time', dtype('int64'))
('stuns', dtype('float64'))
('teamfight_participation', dtype('float64'))
('tower_damage', dtype('int64'))
('towers_killed', dtype('float64'))
('win', dtype('bool'))
('xp_per_min', dtype('int64'))
('id', dtype('O'))
('datetime', dtype('O'))
('t_item_aeth

In [70]:
id_columns = [
    'hero_id', 'match_id', 'account_id', 'team_id',
    'start_time', 'id', 'datetime', 'leaguename',
    'win'
]
categorical_columns = [
    c 
    for c in df_matches.columns 
    if df_matches[c].dtype.name in ('object', 'bool') and c not in id_columns
]
numerical_columns = [
    c 
    for c in df_matches.columns 
    if df_matches[c].dtype.name not in ('object', 'bool') and c not in id_columns
]
valid_columns = numerical_columns + categorical_columns

### Drop id columns

In [71]:
X = df_matches[valid_columns]
y = df_matches['win']

In [72]:
model = {}
for role, df_role in df_matches.groupby('lane_role'):
    print('Model for role: {}'.format(role))
    X = df_role[valid_columns]
    y = df_role['win']
    
    df_dummies = pd.get_dummies(X[categorical_columns])
    binary_columns = df_dummies.columns
    X = pd.concat([X, df_dummies], axis = 1)
    X.drop(categorical_columns, axis=1, inplace=True)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 12345)
    N_train, _ = X_train.shape 
    N_test,  _ = X_test.shape 
    print(N_train, N_test)
    
    for col in X_train.columns:
        if col in numerical_columns:
            train_denum = X_train[col].max() - X_train[col].min()
            if train_denum:
                X_train[col] = (X_train[col] - X_train[col].min()) / train_denum
            test_denum = (X_test[col].max() - X_test[col].min())
            if test_denum:
                X_test[col] = (X_test[col] - X_test[col].min()) / test_denum
    
    lm = LogisticRegression(penalty = 'l1', C = 1, fit_intercept=False)
#     lm = GradientBoostingClassifier()
    lm.fit(X_train, y_train)
    model[role] = lm
    
    print("Train accuracy = %s" %metrics.accuracy_score(y_train, lm.predict(X_train)))
    print("Test accuracy = %s" %metrics.accuracy_score(y_test, lm.predict(X_test)))

    print("Train AUC = %s" %metrics.roc_auc_score(y_train, lm.predict_proba(X_train)[:,1]))
    print("Test AUC = %s" %metrics.roc_auc_score(y_test, lm.predict_proba(X_test)[:,1]))

    print("Train Recall = %s" %metrics.recall_score(y_train, lm.predict(X_train)))
    print("Test Recall = %s" %metrics.recall_score(y_test, lm.predict(X_test)))

    print("Train Precision = %s" %metrics.precision_score(y_train, lm.predict(X_train)))
    print("Test Precision = %s" %metrics.precision_score(y_test, lm.predict(X_test)))
    
    print()
    break

Model for role: 1.0
2383 1022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Train accuracy = 0.991607217793
Test accuracy = 0.957925636008
Train AUC = 0.999708622236
Test AUC = 0.991906865516
Train Recall = 0.992831541219
Test Recall = 0.955801104972
Train Precision = 0.989285714286
Test Precision = 0.96468401487



In [119]:
for role, model_obj in model.items():
    file_name = 'model_{}.sklearn'.format(role)
    file_path = os.path.join(os.path.abspath('data'), file_name)
    with open(file_path, 'wb') as model_file:
        pickle.dump(model_obj, model_file)

In [None]:
{
    "gold_reasons_0": "Другие",
    "gold_reasons_1": "Смерть",
    "gold_reasons_2": "Выкуп",
    "NULL_gold_reasons_5": "Покинувшие",
    "NULL_gold_reasons_6": "Продажа",
    "gold_reasons_11": "Строение",
    "gold_reasons_12": "Герой",
    "gold_reasons_13": "Крипы",
    "gold_reasons_14": "Рошан",
    "NULL_gold_reasons_15": "Курьер",
    "xp_reasons_0": "Другие",
    "xp_reasons_1": "Герой",
    "xp_reasons_2": "Крипы",
    "xp_reasons_3": "Рошан",
}

In [73]:
for role, model_obj in model.items():
    if hasattr(model_obj, 'feature_importances_'):
        importance = model_obj.feature_importances_
    elif hasattr(model_obj, 'coef_'):
        importance = model.obj.coef_[0]
    else:
        break
    importance = list(zip(X_train.columns.tolist(), importance))
    importance = sorted(importance, key=lambda tpl: tpl[1], reverse=True)
    importance = [tpl for tpl in importance if tpl[1] != 0]
    print('role:', ROLES[int(role)])
    display(importance)
    print()

role: safe


[('gold_reason_11', 0.24680057785842313),
 ('gold_reason_1', 0.099583475104105576),
 ('assists', 0.07818635405936647),
 ('duration', 0.056704919438365843),
 ('gold_reason_2', 0.042127716991875681),
 ('teamfight_participation', 0.025755053087492943),
 ('tower_damage', 0.023944762097324653),
 ('xp_per_min', 0.020613044706550881),
 ('towers_killed', 0.019916185513948161),
 ('deaths', 0.01756374302691471),
 ('gold_reason_14', 0.015180010714938217),
 ('gold_reason_0', 0.01512036331014477),
 ('xp_reason_1', 0.01197483086368909),
 ('kills', 0.011486666507234072),
 ('base_armor', 0.010824554888002779),
 ('base_health_regen', 0.01053573699159556),
 ('gold_per_min', 0.010454367514273222),
 ('t_item_ultimate_scepter', 0.010390254980145969),
 ('t_item_force_staff', 0.0097622454001771736),
 ('t_item_black_king_bar', 0.0085485536448079453),
 ('xp_reason_2', 0.0084810159077033081),
 ('t_expireance_3_sum', 0.0082788763424058148),
 ('gold_reason_5', 0.0076756643613610778),
 ('4000_win', 0.0074241687658




In [231]:
%matplotlib notebook
import matplotlib.pyplot as plt

In [234]:
pd.DataFrame(model_obj.predict_proba(X_test)).describe()

Unnamed: 0,0,1
count,21.0,21.0
mean,0.525425,0.474575
std,0.50972,0.50972
min,0.000306,0.000235
25%,0.000306,0.000235
50%,0.999484,0.000516
75%,0.999765,0.999694
max,0.999765,0.999694


In [100]:
import requests
# For missings
INTEGER_MISSING = -666
DEFAULT_VALUES = {
    't_': INTEGER_MISSING,
    'is_roaming': False,
    'team_id': INTEGER_MISSING,
    'pings': INTEGER_MISSING,
    'gold_': INTEGER_MISSING,
    'xp_': INTEGER_MISSING,
    'kill_': INTEGER_MISSING,
    'multi_': INTEGER_MISSING,
    'hero_': 0,
}


def query_opendota(sql):
    resp = requests.get('https://api.opendota.com/api/explorer', params={'sql': sql})
    data = resp.json()
    if resp.status_code == 400 and data is not None:
        sys.stderr.write(data.get('error'))
    resp.raise_for_status()
    return pd.DataFrame.from_records(data['rows'])


def create_unique_id(df_matches):
    unique_ids = []
    for _, row in df_matches.iterrows():
        unique_id = '{}_{}'.format(row['match_id'], row['account_id'])
        unique_ids.append(unique_id)
    return unique_ids


def create_core_items_timings(df_matches):
    t_item = dict()
    core_items = [
        'blink',
        'power_treads',
        'ultimate_scepter',
        'phase_boots',
        'travel_boots',
        'blade_mail',
        'arcane_boots',
        'black_king_bar',
        'desolator',
        'magic_wand',
        'tranquil_boots',
        'invis_sword',
        'echo_sabre',
        'manta',
        'ring_of_aquila',
        'force_staff',
        'aether_lens',
        'wind_lace',
        'silver_edge',
        'hand_of_midas',
        'sphere',
        'boots',
        'hurricane_pike',
        'bottle'
    ]
    column_names = ['id']
    for item in core_items:
        column_names.append("t_item_" + item)
    items = pd.DataFrame(columns=column_names)
    for i, row in df_matches.iterrows():
        purchase_list = row['purchase_log']
        for item in core_items:
            column_name = "t_item_" + item
            t_item[column_name] = np.nan

        if purchase_list is not None:
            for purchase in purchase_list:
                for item in core_items:
                    column_name = "t_item_" + item
                    if purchase['item_name'] == item:
                        t_item[column_name] = purchase['time']
                        unique_id = '{}_{}'.format(row['match_id'], row['account_id'])
                        t_item['id'] = unique_id
            items_row = pd.DataFrame.from_dict(t_item, orient='index').transpose()
            items = items.append(items_row, ignore_index=True)
    return items


def create_first_kill_timing(df_matches):
    kill_logs = df_matches['kills_log']
    t_first_kills = []
    for kill_log in kill_logs:
        if kill_log:
            t_first_kill = kill_log[0]['time']
            t_first_kills.append(t_first_kill)
        else:
            t_first_kills.append(None)
    return t_first_kills


# Calculates the number of events in log in fixed time range
# log = [{'time': 1}, {'time': 2}, {'time': 6}] -->
# --> aggregations = {'t_<log_name>_5_cnt': 2, 't_<log_name>_10_cnt': 1, ...}
def aggregate_by_times(log, log_name, times=None):
    aggregations = {}
    if times is None:
        max_min = 31
        period = 3
        times = list(range(0, max_min, period))
    for agg_type in ('cnt', 'rpm'):
        aggregations.update({
            't_{}_{}_{}'.format(log_name, t, agg_type): 0
            for t in times
        })
        for record in log:
            for t in times:
                if record['time'] <= t * 60:
                    agg_name = 't_{}_{}_{}'.format(log_name, t, agg_type)
                    if agg_type == 'cnt':
                        aggregations[agg_name] += 1
                    elif agg_type == 'rpm':
                        aggregations[agg_name] += 1 / float(t if t != 0 else 1)
    return aggregations


def aggregate_array_of_ints_by_times(array_of_ints_log, log_name, times=None):
    if times is None:
        max_min = 31
        period = 3
        times = list(range(0, max_min, period))
    if isinstance(array_of_ints_log, str):
        array_of_ints_log = eval(array_of_ints_log)
    aggregation = {}
    for t in times:
        try:
            agg_name = 't_{}_{}_{}'.format(log_name, t, 'sum')
            aggregation[agg_name] = array_of_ints_log[t]
        except IndexError:
            pass
    return aggregation


# From column of dicts the function creates columns with dict keys as names
def create_aggregations_from_logs(df_matches):
    df_observers = df_matches['obs_log'].apply(lambda log: aggregate_by_times(log, 'obs')).apply(pd.Series)
    df_sentries = df_matches['sen_log'].apply(lambda log: aggregate_by_times(log, 'sen')).apply(pd.Series)
    df_runes = df_matches['runes_log'].apply(lambda log: aggregate_by_times(log, 'runes')).apply(pd.Series)
    df_buyback = df_matches['buyback_log'].apply(lambda log: aggregate_by_times(log, 'buybacks'))
    df_kills = df_matches['kills_log'].apply(lambda log: aggregate_by_times(log, 'kills')).apply(pd.Series)
    df_gold_reasons = df_matches['gold_reasons'].apply(pd.Series)
    # df_hero_roles = df_matches['role_log'].apply(
    #     lambda roles:
    #     {'hero_role_{}'.format(role.lower()): 1 for role in roles}
    # ).apply(pd.Series)
    for col in df_gold_reasons.columns:
        df_gold_reasons.rename(columns={col: '{}_{}'.format('gold_reason', col)}, inplace=True)
    df_xp_reasons = df_matches['xp_reasons'].apply(pd.Series)
    for col in df_xp_reasons.columns:
        df_xp_reasons.rename(columns={col: '{}_{}'.format('xp_reason', col)}, inplace=True)
    df_kill_streaks = df_matches['kill_streaks'].apply(pd.Series)
    for col in df_kill_streaks.columns:
        # We should skip kill streaks longer then 11 because of rare values
        if int(col) > 11:
            df_kill_streaks.drop(col, axis=1, inplace=True)
        else:
            df_kill_streaks.rename(columns={col: '{}_{}'.format('kill_streak', col)}, inplace=True)
    df_multi_kills = df_matches['multi_kills'].apply(pd.Series)
    for col in df_multi_kills.columns:
        df_multi_kills.rename(columns={col: '{}_{}'.format('multi_kill', col)}, inplace=True)

    # df_matches['pings'] = df_matches['ping_log'].apply(lambda dct: dct.get('0'))
    df_gold = df_matches['gold_t'].apply(lambda log: aggregate_array_of_ints_by_times(log, 'gold')).apply(pd.Series)
    df_lh = df_matches['lh_t'].apply(lambda log: aggregate_array_of_ints_by_times(log, 'lasthits')).apply(pd.Series)
    df_xp = df_matches['xp_t'].apply(lambda log: aggregate_array_of_ints_by_times(log, 'expireance')).apply(pd.Series)

    df_matches = pd.concat([
        df_matches, df_observers, df_sentries,
        df_runes, df_buyback, df_kills,
        df_gold_reasons, df_xp_reasons, df_kill_streaks,
        df_multi_kills,
        # df_hero_roles,
        df_gold,
        df_lh,
        df_xp,
    ], axis=1)
    df_matches.drop([
        'obs_log', 'sen_log',
        'runes_log', 'buyback_log', 'kills_log',
        'gold_reasons', 'xp_reasons', 'kill_streaks',
        'multi_kills',
        # 'role_log',
        # 'ping_log',
        'gold_t',
        'lh_t',
        'xp_t',

    ], axis=1, inplace=True)
    return df_matches


def create_hero_stats_table():
    hero_stats_url = 'https://api.opendota.com/api/heroStats'
    resp = requests.get(hero_stats_url)
    data = resp.json()
    if resp.status_code == 400 and data is not None:
        sys.stderr.write(data.get('error'))
    resp.raise_for_status()
    df_heroes = pd.DataFrame(data)
    df_heroes.drop([
        'id', 'icon', 'img', 'name', 'roles',
        'attack_type', 'localized_name', 'primary_attr'
    ], axis=1, inplace=True)
    df_heroes = df_heroes.fillna(0)
    return df_heroes


def fill_missings(df_matches):
    columns = df_matches.columns
    for col in columns:
        for key, column_default_value in DEFAULT_VALUES.items():
            if col.startswith(key):
                df_matches[col].fillna(column_default_value, inplace=True)
    return df_matches


def create_dataset(df_matches):
    items = create_core_items_timings(df_matches)
    df_matches = pd.concat([df_matches, items], axis=1)
    df_matches.drop('purchase_log', axis=1, inplace=True)
    df_matches['t_first_kill'] = create_first_kill_timing(df_matches)
    df_matches = create_aggregations_from_logs(df_matches)
    df_heroes = create_hero_stats_table()
    df_matches = df_matches.merge(df_heroes, on='hero_id', how='left', suffixes=('', ''), copy=False)
    df_matches = fill_missings(df_matches)
    return df_matches

In [144]:
df_player = pd.DataFrame(pd.Series(p)).T
df_player = create_dataset(df_player)
df_player = df_player.merge(pd.DataFrame(columns=valid_columns), how='left').fillna(0)
df_player

Unnamed: 0,ability_upgrades,ability_upgrades_arr,ability_uses,account_id,actions,assists,backpack_0,backpack_1,backpack_2,camps_stacked,...,kill_streak_5,kill_streak_6,kill_streak_7,kill_streak_8,kill_streak_9,multi_kill_3,multi_kill_4,multi_kill_5,multi_kill_6,multi_kill_7
0,"[{'ability': 5609, 'time': 801, 'level': 1}, {...","[5609, 5608, 5608, 5610, 5608, 5612, 5608, 561...","{'earth_spirit_rolling_boulder': 55, 'earth_sp...",113331514,"{'1': 3960, '2': 55, '3': 25, '4': 350, '5': 1...",13,0,40,0,2,...,0,0,0,0,0,0,0,0,0,0


In [148]:
import json
with open('data/3549145408_1618162017_anal.jsonl', 'r') as json_file:
    d = json.load(json_file)

In [182]:
win_proba = []
for odota_player in d['odota']['players']:
    df_player = pd.DataFrame(pd.Series(odota_player)).T
    df_player = create_dataset(df_player)
    df_player = df_player.merge(pd.DataFrame(columns=valid_columns), how='left').fillna(-666)
    
    df_player = df_player[valid_columns]
    df_dummies = pd.get_dummies(df_player[categorical_columns])
    binary_columns = df_dummies.columns
    df_player = pd.concat([df_player, df_dummies], axis = 1)
    df_player.drop(categorical_columns, axis=1, inplace=True)
    
    for col in df_player.columns:
        if col in numerical_columns:
            denum = X_train[col].max() - X_train[col].min()
            if train_denum:
                df_player[col] = (df_player[col] - X_train[col].min()) / train_denum
    win_class_index = list(model_obj.classes_).index(True)
    win_proba.append(model_obj.predict_proba(df_player)[0][win_class_index])

In [183]:
win_proba

[0.59036913099820976,
 0.18791029321148603,
 0.24336472278894089,
 0.5682731093953125,
 0.27816149877142671,
 0.51563234166344929,
 0.35481302303810741,
 0.17881262473500478,
 0.44213010223936194,
 0.31320858025111104]

In [184]:
radiant_cumulative_proba = sum(win_proba[:5])
dire_cumulative_proba = sum(win_proba[5:])

In [188]:
print(radiant_cumulative_proba)
print(dire_cumulative_proba)

1.86807875517
1.80459667193


In [187]:
contribution = [0] * 10
for slot in range(10):
    if slot < 5:
        contribution[slot] = win_proba[slot] / radiant_cumulative_proba
    elif 5 <= slot < 10:
        contribution[slot] = win_proba[slot] / dire_cumulative_proba

In [189]:
contribution

[0.31603010813425053,
 0.10059013448544404,
 0.13027540841948954,
 0.30420190145838094,
 0.14890244750243511,
 0.28573273445796199,
 0.19661624592226534,
 0.099087307162137303,
 0.24500217091015372,
 0.1735615415474816]

In [191]:
rr = [
    11.2,
    23.9,
    12.8,
    35.7,
    18.4,
]

In [192]:
[r / sum(rr) for r in rr]

[0.10980392156862745,
 0.23431372549019605,
 0.12549019607843137,
 0.35000000000000003,
 0.1803921568627451]