# Synergy Analysis

The goal of this notebook is to analyze pairs of heroes and assess how much synergy there is between them. Synergy is defined primarily by the win rate when a specific pair of heroes is chosen in a match.

In [2]:
import os

import pandas as pd
import psycopg2
import seaborn as sbn

%matplotlib inline

In [3]:
#progress bar widget https://github.com/alexanderkuk/log-progress
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while driving a notebook progress widget.

    Adapted from https://github.com/alexanderkuk/log-progress.

    Parameters
    ----------
    sequence : iterable
        Items to walk through. If ``len(sequence)`` raises ``TypeError`` and no
        ``size`` is given, the total is treated as unknown.
    every : int, optional
        The widget is refreshed on the first item and on every ``every``-th
        item after that. When omitted and the total is known it is 1 for up
        to 200 items and ``int(size / 200)`` otherwise. It has to be supplied
        when the total is unknown.
    size : int, optional
        Total number of items; taken from ``len(sequence)`` when not given.
    name : str
        Prefix used in the text label.

    Yields
    ------
    Each item of ``sequence``, unchanged and in order.

    Raises
    ------
    AssertionError
        When the total is unknown and ``every`` was not provided.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Try to learn the total; remember when the sequence cannot report one.
    length_unknown = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            length_unknown = True

    # Pick the refresh interval (or insist on one when there is no total).
    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    # With no total the bar is shown full in the 'info' style.
    if length_unknown:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    display(VBox(children=[label, progress]))

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            refresh = index == 1 or index % every == 0
            if refresh and length_unknown:
                label.value = '{name}: {index} / ?'.format(name=name, index=index)
            elif refresh:
                progress.value = index
                label.value = u'{name}: {index} / {size}'.format(
                    name=name, index=index, size=size)
            yield record
    except:
        # Anything that interrupts the iteration turns the bar red, then propagates.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(name=name, index=str(index or '?'))

In [4]:
## Functions
def unstack_simplify(df, column='hero_id', n_players=10):
    """Return the hero ids of one match as a flat Series indexed from 0.

    The previous implementation flattened the whole group column by column
    and took positions 10-19. That only yields the hero ids when the group
    has exactly 10 rows and ``hero_id`` is the second column; with fewer rows
    the slice runs into the next column. Selecting the column by name gives
    the same result for a complete match and never mixes in other columns.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows of a single match (e.g. one ``groupby('match_id')`` group),
        in the order the heroes should appear.
    column : str
        Name of the column to extract. Defaults to ``'hero_id'``.
    n_players : int
        Maximum number of values to return. Defaults to 10.

    Returns
    -------
    pandas.Series
        Up to ``n_players`` values of ``column`` in row order, with a fresh
        0..n-1 index and no name.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    # Going through the raw values drops both the original index and the
    # column name, matching the unnamed, 0-indexed Series returned before.
    values = df[column].values[:n_players]
    return pd.Series(values)

In [5]:
# Postgres connection settings.
# Each value can be overridden with the corresponding libpq environment
# variable so that real credentials never have to be written into the
# notebook; the fallbacks are the local development values used so far.
hostname = os.environ.get('PGHOST', 'localhost')
username = os.environ.get('PGUSER', 'postgres')
password = os.environ.get('PGPASSWORD', '')
database = os.environ.get('PGDATABASE', 'dota2_data')


In [6]:
# Open a Postgres connection and load the source tables into DataFrames.

myConnection = psycopg2.connect(
    host=hostname,
    user=username,
    password=password,
    dbname=database,
)
cur = myConnection.cursor()

# Matches and heroes are read in full; for player_matches only the four
# columns used by this analysis are selected.
matches_df = pd.read_sql_query(
    sql="select * from matches",
    con=myConnection,
)
players_df = pd.read_sql_query(
    sql="select match_id, hero_id, account_id, player_slot from player_matches",
    con=myConnection,
)
heros_df = pd.read_sql_query(
    sql="select * from heroes",
    con=myConnection,
)

In [7]:
# Preview the first rows of the hero lookup frame read from the `heroes` table.
heros_df.head()

Unnamed: 0,id,name,localized_name,primary_attr,attack_type,roles
0,21,npc_dota_hero_windrunner,Windranger,int,Ranged,"[Carry, Support, Disabler, Escape, Nuker]"
1,26,npc_dota_hero_lion,Lion,int,Ranged,"[Support, Disabler, Nuker, Initiator]"
2,36,npc_dota_hero_necrolyte,Necrophos,int,Ranged,"[Carry, Nuker, Durable, Disabler]"
3,41,npc_dota_hero_faceless_void,Faceless Void,agi,Melee,"[Carry, Initiator, Disabler, Escape, Durable]"
4,46,npc_dota_hero_templar_assassin,Templar Assassin,agi,Ranged,"[Carry, Escape]"


In [8]:
# Order the rows by match and then by player slot, so that the rows of one
# match sit next to each other in slot order.
players_df = players_df.sort_values(['match_id', 'player_slot'])
# Preview the first ten rows of the sorted frame.
players_df.head(10)

Unnamed: 0,match_id,hero_id,account_id,player_slot
248394,17955123,31,89782335,0
248395,17955123,32,82262664,1
248396,17955123,29,92926005,2
248397,17955123,39,85815961,3
248398,17955123,52,93712171,4
248399,17955123,9,69325073,128
307657,17955123,79,58017868,129
307658,17955123,28,88704095,130
307659,17955123,21,86747757,131
307660,17955123,33,88792641,132


Some issues were found with players_df: some of the matches do not have the expected 10 rows, which was breaking the unstacking of the dataframe when processing the data. Some cleaning is needed here.

In [18]:
# Some matches do not have 10 rows: find them and drop all of their rows.
# print() with parentheses works on both Python 2 and Python 3.
print(players_df.shape)  # shape before filtering

# Per-match count of non-null values in every column.
test = players_df.groupby(['match_id']).agg('count')
# Match ids whose hero_id count differs from the expected 10.
bad_matches = test[test['hero_id'] != 10].index.values
players_df = players_df[~players_df['match_id'].isin(bad_matches)]
players_df.shape  # shape after filtering

(478060, 4)


(478060, 4)

In [19]:
# Display players_df after the step that drops matches without 10 rows.
# NOTE(review): this renders the whole frame; `.head()` would keep the output small.
players_df

Unnamed: 0,match_id,hero_id,account_id,player_slot
248394,17955123,31,89782335,0
248395,17955123,32,82262664,1
248396,17955123,29,92926005,2
248397,17955123,39,85815961,3
248398,17955123,52,93712171,4
248399,17955123,9,69325073,128
307657,17955123,79,58017868,129
307658,17955123,28,88704095,130
307659,17955123,21,86747757,131
307660,17955123,33,88792641,132


In [30]:
# Drop every column that contains at least one missing value.
# Rebinding instead of inplace=True leaves the originally loaded object
# untouched and keeps the cell safe to re-run.
matches_df = matches_df.dropna(axis=1)
matches_df.head()

Unnamed: 0,match_id,radiant_win,start_time
0,1340936766,True,1427043504
1,1297534463,True,1425602112
2,1121216226,True,1419718875
3,1118709508,True,1419639325
4,962134110,True,1413381801


In [70]:
# Goal: one entry per match_id carrying that match's 10 hero ids, produced by
# applying unstack_simplify to each match_id group.
# The full computation takes time, so the result is saved to CSV.
# NOTE(review): only the first 20 rows of players_df are processed here.

agg_df = players_df.head(20).groupby('match_id').apply(unstack_simplify)
agg_df.to_csv('synergy_analysis_agg_df.csv')
# Shown with a fresh 0..n-1 index; the result is not assigned back to agg_df.
agg_df.reset_index(drop=True)

0     31
1     32
2     29
3     39
4     52
5      9
6     79
7     28
8     21
9     33
10    53
11    39
12    29
13    10
14    79
15    41
16    55
17    40
18    74
19    33
dtype: int64

In [119]:
# Re-display players_df.
# NOTE(review): duplicate of an earlier display cell; candidate for removal.
players_df


Unnamed: 0,match_id,hero_id,account_id,player_slot
248394,17955123,31,89782335,0
248395,17955123,32,82262664,1
248396,17955123,29,92926005,2
248397,17955123,39,85815961,3
248398,17955123,52,93712171,4
248399,17955123,9,69325073,128
307657,17955123,79,58017868,129
307658,17955123,28,88704095,130
307659,17955123,21,86747757,131
307660,17955123,33,88792641,132


In [139]:
# Synthetic frame for experimenting with the per-match unstack logic:
# 500k records, 50k distinct match_ids (10 consecutive rows per match).
columns = ['match_id', 'hero', 'a', 'b']
# Row i belongs to match i // 10. Computing the id directly avoids the
# running counter that used to be stored in a global named `id`, which
# shadowed the builtin id() for every later cell.
rows = [
    [i // 10, 'hero' + str(i + 1), i + 2, i + 3]
    for i in range(500000)
]
df = pd.DataFrame(rows, columns=columns)
# Preview only; the full frame has 500k rows.
df.head(10)

Unnamed: 0,match_id,hero,a,b
0,0,hero1,2,3
1,0,hero2,3,4
2,0,hero3,4,5
3,0,hero4,5,6
4,0,hero5,6,7
5,0,hero6,7,8
6,0,hero7,8,9
7,0,hero8,9,10
8,0,hero9,10,11
9,0,hero10,11,12


In [140]:
def ustack(df):
    """Flatten ``df`` column by column and return entries 10-15 of the result.

    The selected entries are returned as a Series re-indexed from 0.
    """
    column_major = df.unstack()
    selected = column_major.iloc[10:16]
    return selected.reset_index(drop=True)
# Run ustack once per match_id group of df and display the combined result.
df.groupby('match_id').apply(ustack)

Unnamed: 0_level_0,0,1,2,3,4,5
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,hero1,hero2,hero3,hero4,hero5,hero6
1,hero11,hero12,hero13,hero14,hero15,hero16
2,hero21,hero22,hero23,hero24,hero25,hero26
3,hero31,hero32,hero33,hero34,hero35,hero36
4,hero41,hero42,hero43,hero44,hero45,hero46
5,hero51,hero52,hero53,hero54,hero55,hero56
6,hero61,hero62,hero63,hero64,hero65,hero66
7,hero71,hero72,hero73,hero74,hero75,hero76
8,hero81,hero82,hero83,hero84,hero85,hero86
9,hero91,hero92,hero93,hero94,hero95,hero96


In [143]:
# Inspect the rows at positions 300000-399999 of players_df.
# NOTE(review): a 100k-row slice; `.head()` on it would keep the output small.
players_df.iloc[300000:400000]

Unnamed: 0,match_id,hero_id,account_id,player_slot
340139,1962916430,106,113705693,0
160620,1962916430,50,133084797,1
155505,1962916430,25,89166519,2
160982,1962916430,100,111291593,3
161585,1962916430,52,137193239,4
161830,1962916430,93,125581247,128
161853,1962916430,101,120569619,129
161845,1962916430,28,138885864,130
168247,1962916430,19,111189717,131
340996,1962916430,3,131043881,132


In [205]:
 #   return df.unstack().iloc[10:20].reset_index(drop=True)
    
    
def test(df):
    """Flatten ``df`` column by column and return entries 10-15, re-indexed from 0."""
    column_major = df.unstack()
    return column_major.iloc[10:16].reset_index(drop=True)


# Apply the helper per match_id on a small positional window of players_df
# and write the result to CSV.
agg_df = (
    players_df
    .iloc[338592:338600]
    .groupby('match_id')
    .apply(test)
)
agg_df.to_csv('synergy_analysis_agg_df.csv')

In [206]:
# Reload the saved aggregation CSV and preview up to 20 rows of it.
agg_df = pd.read_csv('synergy_analysis_agg_df.csv')
agg_df.head(20)

Unnamed: 0,2292934732,0,12
0,2292934732,1,53
1,2292934732,2,67
2,2292934732,3,62
3,2292934732,4,11550182
4,2292934732,5,53178236


In [225]:
def test(df):
    """Flatten ``df`` column by column and return entries 10-15, re-indexed from 0."""
    column_major = df.unstack()
    return column_major.iloc[10:16].reset_index(drop=True)


# Apply the helper per match_id on the first 50 rows of players_df, then
# round-trip the result through CSV and preview up to 20 rows.
agg_df = (
    players_df
    .iloc[0:50]
    .groupby('match_id')
    .apply(test)
)
agg_df.to_csv('synergy_analysis_agg_df.csv')
agg_df = pd.read_csv('synergy_analysis_agg_df.csv')
agg_df.head(20)

Unnamed: 0,match_id,0,1,2,3,4,5
0,17955123,31,32,29,39,52,9
1,17962237,53,39,29,10,79,41
2,17972480,33,17,21,53,20,62
3,18096178,33,40,74,60,38,52
4,18244878,31,16,9,74,33,52


In [229]:
def test(df):
    """Flatten ``df`` column by column and return entries 10-15, re-indexed from 0."""
    column_major = df.unstack()
    return column_major.iloc[10:16].reset_index(drop=True)


# Apply the helper per match_id on rows 338573-338589 of players_df, then
# round-trip the result through CSV and preview up to 20 rows.
agg_df = (
    players_df
    .iloc[338573:338590]
    .groupby('match_id')
    .apply(test)
)
agg_df.to_csv('synergy_analysis_agg_df.csv')
agg_df = pd.read_csv('synergy_analysis_agg_df.csv')
agg_df.head(20)

Unnamed: 0,match_id,0,1,2,3,4,5
0,2292850179,81,33,38,39,50,107
1,2292874498,86,38,74,111,92949094,95430068


In [226]:
# Look at the raw rows at positions 338580-338589 of players_df.
players_df.iloc[338580:338590]

Unnamed: 0,match_id,hero_id,account_id,player_slot
400445,2292850179,3,123444610,130
400446,2292850179,13,86840554,131
400447,2292850179,53,123787524,132
399747,2292874498,66,92949094,0
399748,2292874498,8,95430068,1
399749,2292874498,43,107855479,2
399750,2292874498,86,86953414,3
399751,2292874498,38,27178898,4
399752,2292874498,74,116094746,128
399753,2292874498,111,26356855,129
