In [11]:
import pandas as pd
import numpy as np
from sklearn import ensemble 
from sklearn import metrics

# Minimal walkthrough: only the match table and the per-player table are used.

# Training inputs.
players = pd.read_csv('./data/players.csv')
matches = pd.read_csv('./data/match.csv', index_col=0)

# Held-out inputs.
test_players = pd.read_csv('data/test_player.csv')
test_labels = pd.read_csv('./data/test_labels.csv', index_col=0)

# Training target: the radiant_win column cast to integers.
train_labels = matches.radiant_win.astype(int)


# Predicting Match Outcome

In [12]:
# Preview the first five rows of the match table.
matches.head(n=5)

Unnamed: 0_level_0,start_time,duration,tower_status_radiant,tower_status_dire,barracks_status_dire,barracks_status_radiant,first_blood_time,game_mode,radiant_win,negative_votes,positive_votes,cluster
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1446750112,2375,1982,4,3,63,1,22,True,0,1,155
1,1446753078,2582,0,1846,63,0,221,22,False,0,2,154
2,1446764586,2716,256,1972,63,48,190,22,False,0,0,132
3,1446765723,3085,4,1924,51,3,40,22,False,0,0,191
4,1446796385,1887,2047,0,0,63,58,22,True,0,0,156


In [94]:
# Preview the first fifteen rows of the per-player table.
players.head(n=15)

Unnamed: 0,match_id,account_id,hero_id,player_slot,gold,gold_spent,gold_per_min,xp_per_min,kills,deaths,...,unit_order_glyph,unit_order_eject_item_from_stash,unit_order_cast_rune,unit_order_ping_ability,unit_order_move_to_direction,unit_order_patrol,unit_order_vector_target_position,unit_order_radar,unit_order_set_item_combine_lock,unit_order_continue
0,0,0,86,0,3261,10960,347,362,9,3,...,,,,6.0,,,,,,
1,0,1,51,1,2954,17760,494,659,13,3,...,,,,14.0,,,,,,
2,0,0,83,2,110,12195,350,385,0,4,...,,,,17.0,,,,,,
3,0,2,11,3,1179,22505,599,605,8,4,...,1.0,,,13.0,,,,,,
4,0,3,67,4,3307,23825,613,762,20,3,...,3.0,,,23.0,,,,,,
5,0,4,106,128,476,12285,397,524,5,6,...,,,,2.0,,,,,,
6,0,0,102,129,317,10355,303,369,4,13,...,4.0,1.0,,1.0,,,,,,
7,0,5,46,130,2390,13395,452,517,4,8,...,,,,4.0,110.0,,,,,
8,0,0,7,131,475,5035,189,223,1,14,...,1.0,,,4.0,,,,,,
9,0,6,73,132,60,17550,496,456,1,11,...,1.0,,,14.0,,,,,,


In [15]:
# Features to use: the names of the player-table columns at positions 4..16.
feature_columns = players.columns[4:17].tolist()
feature_columns

['gold',
 'gold_spent',
 'gold_per_min',
 'xp_per_min',
 'kills',
 'deaths',
 'assists',
 'denies',
 'last_hits',
 'stuns',
 'hero_damage',
 'hero_healing',
 'tower_damage']

In [18]:
# Estimate each account's typical performance as the per-account mean of the
# selected features over all of that account's rows in `players`.
#
# numeric_only=True is explicit on purpose: feature_columns contains 'stuns',
# yet the averaged frame has no 'stuns' column, so this cell depended on pandas
# silently dropping columns it could not average (presumably 'stuns' is not
# stored with a numeric dtype -- confirm). Newer pandas raises a TypeError
# instead of dropping such columns.
player_groups = players.groupby('account_id')
feature_components = player_groups[feature_columns].mean(numeric_only=True)
feature_components.head()

Unnamed: 0_level_0,gold,gold_spent,gold_per_min,xp_per_min,kills,deaths,assists,denies,last_hits,hero_damage,hero_healing,tower_damage
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1800.798735,13955.154883,407.672621,447.691653,7.436487,8.029696,11.644845,4.403066,124.977535,12227.711667,427.988298,1232.203666
1,8642.5,21200.0,627.5,667.5,20.5,1.5,13.5,8.0,242.0,31304.5,0.0,2256.0
2,1756.333333,20576.666667,537.666667,520.0,10.0,7.333333,16.666667,2.333333,277.0,14060.666667,1066.666667,3525.666667
3,3307.0,23825.0,613.0,762.0,20.0,3.0,17.0,13.0,245.0,33740.0,243.0,1833.0
4,763.5,12597.5,381.0,480.0,5.5,8.5,10.0,6.0,146.5,11819.0,0.0,324.5


In [19]:
# Match-level features are assembled from the per-account components.
# match_id identifies the match; account_id is kept because it is the key
# used to join against feature_components.
id_columns = ['match_id', 'account_id']
train_ids = players[id_columns]
test_ids = test_players[id_columns]

In [31]:
# For every (match_id, account_id) row, attach that account's average
# performance. The left join keeps every player row; account_id is dropped
# afterwards because it was only needed as the join key.
train_feat_comp = train_ids.merge(
    feature_components,
    how='left',
    left_on='account_id',
    right_index=True,
).drop('account_id', axis=1)

test_feat_comp = test_ids.merge(
    feature_components,
    how='left',
    left_on='account_id',
    right_index=True,
).drop('account_id', axis=1)


In [34]:
# Preview the first five joined training rows.
train_feat_comp.head(n=5)

Unnamed: 0,match_id,gold,gold_spent,gold_per_min,xp_per_min,kills,deaths,assists,denies,last_hits,hero_damage,hero_healing,tower_damage
0,0,1800.798735,13955.154883,407.672621,447.691653,7.436487,8.029696,11.644845,4.403066,124.977535,12227.711667,427.988298,1232.203666
1,0,8642.5,21200.0,627.5,667.5,20.5,1.5,13.5,8.0,242.0,31304.5,0.0,2256.0
2,0,1800.798735,13955.154883,407.672621,447.691653,7.436487,8.029696,11.644845,4.403066,124.977535,12227.711667,427.988298,1232.203666
3,0,1756.333333,20576.666667,537.666667,520.0,10.0,7.333333,16.666667,2.333333,277.0,14060.666667,1066.666667,3525.666667
4,0,3307.0,23825.0,613.0,762.0,20.0,3.0,17.0,13.0,245.0,33740.0,243.0,1833.0


In [74]:
# Flattens the player rows of one match into a single feature vector.
#
# drop:        removes the match_id column, which is identical on every row
# unstack:     stacks the remaining columns, one after another, into a Series
# reset_index: replaces the (column, row) labels with 0..n-1 so that each
#              match produces the same labels when used with groupby().apply()
def unstack_simplify(df, id_column='match_id'):
    """Return the feature values of ``df`` as one flat, 0..n-1 indexed Series.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows belonging to a single match (one row per player).
    id_column : str, optional
        Name of the match identifier column to exclude from the result.
        It is ignored when ``df`` does not contain it.

    Returns
    -------
    pandas.Series
        All values of the remaining columns, column by column.

    Notes
    -----
    The identifier is removed by name rather than by slicing off the first
    10 values, so the result stays correct when a match does not have exactly
    10 rows or when ``match_id`` is not the first column.
    """
    features = df.drop(id_column, axis=1, errors='ignore')
    return features.unstack().reset_index(drop=True)
def unstack_simplify2(df, id_column='match_id'):
    """Return the feature values of ``df`` as one Series, keeping its labels.

    Same flattening as a column-by-column unstack of ``df``, but the resulting
    (column name, row label) index is kept instead of being reset.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows belonging to a single match (one row per player).
    id_column : str, optional
        Name of the match identifier column to exclude from the result.
        It is ignored when ``df`` does not contain it.

    Returns
    -------
    pandas.Series
        All values of the remaining columns, indexed by (column, row label).

    Notes
    -----
    The identifier is removed by name rather than by slicing off the first
    10 values, so the result stays correct when a match does not have exactly
    10 rows or when ``match_id`` is not the first column.
    """
    features = df.drop(id_column, axis=1, errors='ignore')
    return features.unstack()

In [86]:
# Collapse the test player rows into one feature row per match_id, then show
# the resulting frame together with its dimensions.
test_feat_group = test_feat_comp.groupby(by='match_id')
test_feats = test_feat_group.apply(unstack_simplify)

(test_feats, test_feats.shape)

(                  0            1            2            3            4    \
 match_id                                                                    
 50000      623.500000          NaN          NaN  6420.000000  1588.250000   
 50001     2250.222222  1800.798735  1800.798735          NaN  2358.000000   
 50002     1133.000000          NaN  2587.272727  2935.000000  1800.798735   
 50003             NaN  1800.798735  1800.798735  2002.140351    77.000000   
 50004     1800.798735  2944.500000  1800.798735  1800.798735  1800.798735   
 50005     1855.000000  1467.500000  1562.272727  3322.307692  1800.798735   
 50006     1800.798735   462.000000  1800.798735  1851.666667  2702.000000   
 50007             NaN  1828.312500          NaN  1800.798735  1800.798735   
 50008     2821.000000  4128.500000  1800.798735  2226.714286          NaN   
 50009      956.000000  1073.500000          NaN  3673.500000  1910.741935   
 50010      916.000000  1800.798735  1800.798735  1800.798735  3

In [77]:
# Collapse the training player rows into one feature row per match_id.
train_feat_group = train_feat_comp.groupby(by='match_id')
train_feats = train_feat_group.apply(unstack_simplify)

In [88]:
# Fit a random forest on the match-level training features.
# A fixed random_state makes the fitted forest, and therefore the metrics
# computed from its predictions, reproducible across kernel restarts.
RANDOM_SEED = 42
rf = ensemble.RandomForestClassifier(n_estimators=150, n_jobs=-1,
                                     random_state=RANDOM_SEED)
rf.fit(train_feats, train_labels)

# NOTE(review): feature_components is averaged over the same `players` rows
# that make up the training matches, so each training row includes statistics
# from the very match being predicted -- possibly why held-out scores are
# close to chance; confirm.

# Zero-filling is a crude way to deal with missing values. NaNs come from the
# left join: test accounts without a row in feature_components get no stats.
# The frame is rebound instead of mutated in place.
test_feats = test_feats.replace(np.nan, 0)

test_probs = rf.predict_proba(test_feats)
test_preds = rf.predict(test_feats)



In [83]:
# Array of win/loss labels, one entry per test match.
# print() is called as a function so the cell runs on Python 2 and Python 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(test_labels.values.ravel())



[0 1 0 ..., 1 1 0]


In [89]:
# Log loss of the second predict_proba column against the flattened labels.
metrics.log_loss(y_true=test_labels.values.ravel(),
                 y_pred=test_probs[:, 1])



0.76797726559665791

In [90]:
# ROC AUC of the second predict_proba column against the true labels.
# The labels are flattened to 1-D, as in the log_loss cell, instead of
# passing the (n, 1) column array and relying on implicit flattening.
metrics.roc_auc_score(test_labels.values.ravel(), test_probs[:, 1])

0.50249720905462414

In [92]:
# Per-class precision/recall/F1 of the hard predictions.
# The labels are flattened to 1-D, as in the log_loss cell, instead of
# passing the (n, 1) column array and relying on implicit flattening.
print(metrics.classification_report(test_labels.values.ravel(), test_preds))

             precision    recall  f1-score   support

          0       0.48      0.46      0.47     48139
          1       0.52      0.54      0.53     51861

avg / total       0.50      0.50      0.50    100000

