In [1]:
import csv
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.externals import joblib

# Data preprocessing
## Importing heroes.json

In [2]:
import json
json_file = json.load(open("heroes.json", "rt"))
heroes_list = json_file["heroes"]

In [3]:
def find_hero(index):
    for hero in heroes_list:
        if int(hero["id"]) == index:
            return hero["localized_name"]

In [4]:
input_file = open("706d.csv", "rt")
csv_reader = csv.reader(input_file, delimiter=",")
input_data = np.array(list(csv_reader))

In [5]:
input_data.shape

(506029, 14)

## Filtering games according to the desired MMR

In [6]:
filtered_data = []
for i in range(input_data.shape[0]):
    if int(input_data[i][-1]) >= 4000:
        filtered_data.append(input_data[i])
        
filtered_data = np.array(filtered_data)
print filtered_data.shape

(44709, 14)


In [7]:
NUMBER_OF_FEATURES = 2 * 114
print NUMBER_OF_FEATURES

228


## Initializing dictionaries for storing winrates

In [8]:
X_matrix = np.zeros((filtered_data.shape[0], NUMBER_OF_FEATURES), dtype=np.float32)
y_matrix = np.zeros((filtered_data.shape[0], ), dtype=np.int8)

In [9]:
winrates_radiant = {}
winrates_radiant['apps'] = np.zeros((114, 114), dtype=np.float32)
winrates_radiant['wins'] = np.zeros((114, 114), dtype=np.float32)
winrates_radiant['winrate'] = np.full((114, 114), -1, dtype=np.float32)

In [10]:
winrates_dire = {}
winrates_dire['apps'] = np.zeros((114, 114), dtype=np.float32)
winrates_dire['wins'] = np.zeros((114, 114), dtype=np.float32)
winrates_dire['winrate'] = np.full((114, 114), -1, dtype=np.float32)

In [11]:
counter_radiant = {}
counter_radiant['apps'] = np.zeros((114, 114), dtype=np.float32)
counter_radiant['wins'] = np.zeros((114, 114), dtype=np.float32)
counter_radiant['winrate'] = np.full((114, 114), -1, dtype=np.float32)

In [12]:
counter_dire = {}
counter_dire['apps'] = np.zeros((114, 114), dtype=np.float32)
counter_dire['wins'] = np.zeros((114, 114), dtype=np.float32)
counter_dire['winrate'] = np.full((114, 114), -1, dtype=np.float32)

## Converting the filtered data to the 228 features format

In [13]:
%%time

for i in range(filtered_data.shape[0]):
    hero_list = filtered_data[i][2:12]
    radiant_win = filtered_data[i][1]
    
    for j in range(5):
        
        X_matrix[i][int(hero_list[j]) - 1] = 1
        
    for j in range(5):
        X_matrix[i][int(hero_list[j + 5]) - 1 + 114] = 1
    
    y_matrix[i] = int(radiant_win)

print "Prepared the data for ML"

Prepared the data for ML
CPU times: user 328 ms, sys: 64 ms, total: 392 ms
Wall time: 330 ms


## Splitting the data into train and test sets

In [14]:
X_train, X_test, y_train, y_test = \
    train_test_split(X_matrix, y_matrix, test_size=0.2, random_state=42)

## Updating the winrates for synergies and counters if there is enough information

In [15]:
%%time

for i in range(X_train.shape[0]):
    if i % 10000 == 9999:
        print (i + 1)
        
    radiant_win = y_train[i]
    heroes = X_train[i]
    
    indices_list = []
    for j in range(len(heroes)):
        if heroes[j] == 1:
            indices_list.append(j)
            
    for j in range(len(indices_list)):
        for k in range(len(indices_list)):
            if j != k and indices_list[j] < 114 and indices_list[k] < 114:
                winrates_radiant['apps'][indices_list[j]][indices_list[k]] += 1
            
                if radiant_win == 1:
                    winrates_radiant['wins'][indices_list[j]][indices_list[k]] += 1
                    
            if j != k and indices_list[j] >= 114 and indices_list[k] >= 114:
                winrates_dire['apps'][indices_list[j] - 114][indices_list[k] - 114] += 1
                
                if radiant_win == 0:
                    winrates_dire['wins'][indices_list[j] - 114][indices_list[k] - 114] += 1
                    
            if indices_list[j] < 114 and indices_list[k] >= 114:
                counter_radiant['apps'][indices_list[j]][indices_list[k] - 114] += 1
                counter_dire['apps'][indices_list[k] - 114][indices_list[j]] += 1
                
                if radiant_win == 1:
                    counter_radiant['wins'][indices_list[j]][indices_list[k] - 114] += 1
                else:
                    counter_dire['wins'][indices_list[k] - 114][indices_list[j]] += 1
                

10000
20000
30000
CPU times: user 19.1 s, sys: 0 ns, total: 19.1 s
Wall time: 19.1 s


In [16]:
for i in range(114):
    for j in range(114):
        if i != j and i != 23 and j != 23:
            if winrates_radiant['apps'][i][j] < 10:
                print "Didn't find enough matches together of heroes %d and %d on radiant" % (i + 1, j + 1)
            else:
                winrates_radiant['winrate'][i][j] = winrates_radiant['wins'][i][j] / float(winrates_radiant['apps'][i][j])
                
            if winrates_dire['apps'][i][j] < 10:
                print "Didn't find enough matches together of heroes %d and %d on dire" % (i + 1, j + 1)
            else:
                winrates_dire['winrate'][i][j] = winrates_dire['wins'][i][j] / float(winrates_dire['apps'][i][j])
                
            if counter_radiant['apps'][i][j] < 10:
                print "Didn't find enough matches together of heroes %d and %d on radiant" % (i + 1, j + 1)
            else:
                counter_radiant['winrate'][i][j] = counter_radiant['wins'][i][j] / float(counter_radiant['apps'][i][j])
                
            if counter_dire['apps'][i][j] < 10:
                print "Didn't find enough matches together of heroes %d and %d on radiant" % (i + 1, j + 1)
            else:
                counter_dire['winrate'][i][j] = counter_dire['wins'][i][j] / float(counter_dire['apps'][i][j])

Didn't find enough matches together of heroes 1 and 38 on radiant
Didn't find enough matches together of heroes 1 and 67 on radiant
Didn't find enough matches together of heroes 1 and 72 on dire
Didn't find enough matches together of heroes 1 and 89 on radiant
Didn't find enough matches together of heroes 1 and 109 on radiant
Didn't find enough matches together of heroes 1 and 109 on dire
Didn't find enough matches together of heroes 2 and 38 on radiant
Didn't find enough matches together of heroes 2 and 38 on dire
Didn't find enough matches together of heroes 2 and 52 on radiant
Didn't find enough matches together of heroes 2 and 78 on dire
Didn't find enough matches together of heroes 2 and 89 on radiant
Didn't find enough matches together of heroes 3 and 10 on radiant
Didn't find enough matches together of heroes 3 and 10 on radiant
Didn't find enough matches together of heroes 3 and 12 on dire
Didn't find enough matches together of heroes 3 and 12 on radiant
Didn't find enough matc

Didn't find enough matches together of heroes 59 and 15 on radiant
Didn't find enough matches together of heroes 59 and 19 on radiant
Didn't find enough matches together of heroes 59 and 38 on radiant
Didn't find enough matches together of heroes 59 and 38 on dire
Didn't find enough matches together of heroes 59 and 38 on radiant
Didn't find enough matches together of heroes 59 and 43 on radiant
Didn't find enough matches together of heroes 59 and 49 on radiant
Didn't find enough matches together of heroes 59 and 49 on dire
Didn't find enough matches together of heroes 59 and 49 on radiant
Didn't find enough matches together of heroes 59 and 52 on radiant
Didn't find enough matches together of heroes 59 and 52 on dire
Didn't find enough matches together of heroes 59 and 52 on radiant
Didn't find enough matches together of heroes 59 and 58 on radiant
Didn't find enough matches together of heroes 59 and 58 on dire
Didn't find enough matches together of heroes 59 and 58 on radiant
Didn't 

Didn't find enough matches together of heroes 92 and 94 on radiant
Didn't find enough matches together of heroes 92 and 94 on radiant
Didn't find enough matches together of heroes 92 and 96 on radiant
Didn't find enough matches together of heroes 92 and 100 on radiant
Didn't find enough matches together of heroes 92 and 100 on dire
Didn't find enough matches together of heroes 92 and 100 on radiant
Didn't find enough matches together of heroes 92 and 101 on dire
Didn't find enough matches together of heroes 92 and 101 on radiant
Didn't find enough matches together of heroes 92 and 102 on radiant
Didn't find enough matches together of heroes 92 and 102 on dire
Didn't find enough matches together of heroes 92 and 102 on radiant
Didn't find enough matches together of heroes 92 and 102 on radiant
Didn't find enough matches together of heroes 92 and 103 on radiant
Didn't find enough matches together of heroes 92 and 103 on dire
Didn't find enough matches together of heroes 92 and 103 on rad

## Only save the advantages (current_winrate - global_winrate)

In [17]:
for j in range(114):
    if j != 23:
        apps_radiant = np.sum(winrates_radiant['apps'][j])
        wins_radiant = np.sum(winrates_radiant['wins'][j])

        global_radiant_winrate = wins_radiant / apps_radiant

        for i in range(len(winrates_radiant['winrate'][j])):
            if winrates_radiant['winrate'][j][i] != 0.:
                winrates_radiant['winrate'][j][i] -= global_radiant_winrate

        apps_dire = np.sum(winrates_dire['apps'][j])
        wins_dire = np.sum(winrates_dire['wins'][j])

        global_dire_winrate = wins_dire / apps_dire

        for i in range(len(winrates_dire['winrate'][j])):
            if winrates_dire['winrate'][j][i] != 0.:
                winrates_dire['winrate'][j][i] -= global_dire_winrate
    

## Augment game data

In [20]:
def augment_synergy(hero_list):
    syn_rad = 0
    syn_dire = 0
    cnt_rad = 0
    cnt_dire = 0
    
    for i in range(5):
        for j in range(5):
            if i > j:
                syn_rad += winrates_radiant['winrate'][int(hero_list[i]) - 1][int(hero_list[j]) - 1]
                syn_dire += winrates_dire['winrate'][int(hero_list[i + 5]) - 1][int(hero_list[j + 5]) - 1]
                
            cnt_rad += counter_radiant['winrate'][int(hero_list[i]) - 1][int(hero_list[j + 5]) - 1]
            cnt_dire += counter_dire['winrate'][int(hero_list[j + 5]) - 1][int(hero_list[i]) - 1]
    
    return (syn_rad - syn_dire, cnt_rad - cnt_dire)

## Augment the train data set

In [21]:
%%time

X_train_aug = np.zeros((X_train.shape[0], X_train.shape[1] + 2), dtype=np.float32)
X_train_aug[:,:-2] = X_train

for i in range(X_train.shape[0]):
    if i % 10000 == 9999:
        print (i + 1)
        
    not_indexed = []
    for j in range(len(X_train_aug[i])):
        if X_train_aug[i][j] == 1:
            not_indexed.append(j % 114 + 1)
            
    X_train_aug[i][-2:] = augment_synergy(not_indexed)

10000
20000
30000
CPU times: user 11.3 s, sys: 300 ms, total: 11.6 s
Wall time: 10.9 s


## Augment the test data set

In [22]:
%%time

X_test_aug = np.zeros((X_test.shape[0], X_test.shape[1] + 2), dtype=np.float32)
X_test_aug[:,:-2] = X_test

for i in range(X_test.shape[0]):
    if i % 10000 == 9999:
        print (i + 1)
        
    not_indexed = []
    for j in range(len(X_test_aug[i])):
        if X_test_aug[i][j] == 1:
            not_indexed.append(j % 114 + 1)
            
    X_test_aug[i][-2:] = augment_synergy(not_indexed)

CPU times: user 2.56 s, sys: 16 ms, total: 2.58 s
Wall time: 2.56 s


In [23]:
print "X train shape: ", X_train.shape
print "X test shape: ", X_test.shape
print "y train shape: ", y_train.shape
print "y test shape: ", y_test.shape

X train shape:  (35767, 228)
X test shape:  (8942, 228)
y train shape:  (35767,)
y test shape:  (8942,)


# Train the models
## Train the non-augmented model

In [24]:
model = LogisticRegression(n_jobs=-1)
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
probabilities = model.predict_proba(X_test)
roc_auc_score = metrics.roc_auc_score(y_test, probabilities[:, 1])

In [26]:
print roc_auc_score

0.600901330557


## Train the augmented model

In [27]:
model_aug = LogisticRegression(n_jobs=-1)
model_aug.fit(X_train_aug, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
probabilities_aug = model_aug.predict_proba(X_test_aug)
roc_auc_score_aug = metrics.roc_auc_score(y_test, probabilities_aug[:, 1])

In [29]:
print roc_auc_score_aug

0.561353197022


## Make predictions with the augmented model
Higher score means higher chance for dire to win

Need to implement for radiant too

In [30]:
#hero_list = [102, 65, 74, 77, 86, 94, 59, 13, 84] # github example
hero_list = [114, 108, 16, 95, 74, 88, 70, 83, 9]

In [31]:
hero_dict = {}
for i in range(114):
    new_list = np.zeros(230)
    if (i + 1) not in hero_list and (i + 1) != 24:
        hero_list.append(i + 1)
        
        for j in range(10):
            if j < 5:
                new_list[hero_list[j] - 1] = 1
            else:
                new_list[hero_list[j] - 1 + 114] = 1
            
        synergy = augment_synergy(hero_list)
        new_list[228] = synergy[0]
        new_list[229] = synergy[1]
        del hero_list[-1]
        
        hero_dict[i + 1] = model_aug.predict_proba(new_list.reshape(1, -1))[0][0]  

In [32]:
import operator
sorted_dict = sorted(hero_dict.items(), key=operator.itemgetter(1), reverse=True)

for (k, v) in sorted_dict:
    print (find_hero(k) ,v)

(u'Gyrocopter', 0.8958928681384527)
(u'Naga Siren', 0.88783839807396925)
(u'Drow Ranger', 0.85680400114301702)
(u'Leshrac', 0.7929109222697508)
(u'Batrider', 0.77871637570125363)
(u'Visage', 0.72345539974493378)
(u'Night Stalker', 0.71261654812637432)
(u'Sniper', 0.70306277109684223)
(u'Enchantress', 0.68928772562426288)
(u'Jakiro', 0.67998715337846771)
(u'Skywrath Mage', 0.66547708386593296)
(u'Enigma', 0.66358006888537791)
(u'Necrophos', 0.66125057094007611)
(u'Bane', 0.65595757320725978)
(u'Wisp', 0.64593813574314196)
(u'Riki', 0.64577832280720093)
(u'Dark Seer', 0.64524807891006497)
(u'Abaddon', 0.62014120268075423)
(u'Elder Titan', 0.61708465872823848)
(u'Phantom Lancer', 0.61579160269462396)
(u'Phoenix', 0.61175243592281392)
(u'Bloodseeker', 0.60888866001556008)
(u'Spirit Breaker', 0.60459476306908955)
(u'Medusa', 0.60299879897815722)
(u'Warlock', 0.58835982616106719)
(u'Crystal Maiden', 0.58342041158006441)
(u'Witch Doctor', 0.55878106526419291)
(u'Brewmaster', 0.552806503536586