Install pygraphviz

In [None]:
!sudo apt-get install graphviz graphviz-dev
!pip install pygraphviz

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'libgraphviz-dev' instead of 'graphviz-dev'
graphviz is already the newest version (2.42.2-6).
libgraphviz-dev is already the newest version (2.42.2-6).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


Load in the libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import pygraphviz as pgv
from sklearn.metrics import accuracy_score

Load in the data

In [None]:
# Read the filtered match export into a DataFrame and preview the first rows.
df = pd.read_json(path_or_buf='filtered_matches.json')
df.head(5)

Unnamed: 0,radiant_win,duration,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire,radiant_score,dire_score,picks_bans,region
0,False,1373,260,2046,51,63,11,50,"[{'is_pick': True, 'hero_id': 109, 'team': 1, ...",
1,True,1529,1983,0,63,0,43,10,"[{'is_pick': True, 'hero_id': 11, 'team': 1, '...",5.0
2,True,2068,1974,6,63,3,28,21,"[{'is_pick': True, 'hero_id': 72, 'team': 1, '...",3.0
3,True,2064,1975,0,63,0,45,15,"[{'is_pick': True, 'hero_id': 9, 'team': 1, 'o...",3.0
4,True,1963,1974,0,63,0,45,21,"[{'is_pick': True, 'hero_id': 53, 'team': 0, '...",5.0


Drop some columns that weren't dropped in the other notebook

In [None]:
# Columns to discard before modelling.
columns_to_drop = [
    'match_id', 'chat', 'cosmetics', 'objectives', 'players', 'pre_game_duration', 'start_time',
    'match_seq_num', 'series_id', 'series_type', 'replay_salt', 'all_word_counts', 'my_word_counts',
    'throw', 'loss', 'replay_url', 'comeback', 'stomp', 'metadata', 'version', 'draft_timings', 'teamfights',
    'radiant_gold_adv', 'radiant_xp_adv', 'engine', 'cluster', 'lobby_type', 'human_players', 'leagueid',
    'game_mode', 'flags', 'patch', 'first_blood_time', 'od_data', 'region',
]
# errors='ignore' skips any listed name that is not present in this frame
df = df.drop(labels=columns_to_drop, axis='columns', errors='ignore')
df.head()

Unnamed: 0,radiant_win,duration,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire,radiant_score,dire_score,picks_bans
0,False,1373,260,2046,51,63,11,50,"[{'is_pick': True, 'hero_id': 109, 'team': 1, ..."
1,True,1529,1983,0,63,0,43,10,"[{'is_pick': True, 'hero_id': 11, 'team': 1, '..."
2,True,2068,1974,6,63,3,28,21,"[{'is_pick': True, 'hero_id': 72, 'team': 1, '..."
3,True,2064,1975,0,63,0,45,15,"[{'is_pick': True, 'hero_id': 9, 'team': 1, 'o..."
4,True,1963,1974,0,63,0,45,21,"[{'is_pick': True, 'hero_id': 53, 'team': 0, '..."


A function that takes the information from the picks_bans arrays and turns it into separate feature columns

In [None]:
def extract_pick_info(row):
    """Flatten one match's draft into ``pick_N`` / ``pick_N_team`` features.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row['picks_bans']``: a sequence of dicts, each with
        ``'hero_id'`` and ``'team'`` entries.

    Returns
    -------
    dict
        ``{'pick_1': hero_id, 'pick_1_team': team, ..., 'pick_10_team': team}``,
        numbered in the order the entries appear in ``picks_bans``. An empty
        dict is returned when the draft is missing or does not contain exactly
        10 entries.
    """
    picks_bans = row['picks_bans']
    try:
        entry_count = len(picks_bans)
    except TypeError:
        # A missing draft (None / NaN) has no length; treat it as incomplete
        return {}
    if entry_count != 10:
        return {}

    pick_dict = {}
    # Slots are numbered by position in the list, not by the entry's own 'order' field
    for i, pick in enumerate(picks_bans, start=1):
        pick_dict[f'pick_{i}'] = int(pick['hero_id'])
        pick_dict[f'pick_{i}_team'] = int(pick['team'])
    return pick_dict

Extract the pick info and load it into the features

In [None]:
# Apply the function to each row and expand the resulting dictionary into columns
df_picks = pd.DataFrame(df.apply(extract_pick_info, axis=1).tolist())

# Concatenate the original DataFrame with the new DataFrame containing pick information
df_combined = pd.concat([df, df_picks], axis=1)

# Remove rows containing NaN values
df_combined = df_combined.dropna()

# Example: Convert columns to integers after handling non-integer values
cols_to_convert = ['pick_1', 'pick_2', 'pick_3', 'pick_4', 'pick_5', 'pick_6', 'pick_7', 'pick_8', 'pick_9', 'pick_10',
                   'pick_1_team', 'pick_2_team', 'pick_3_team', 'pick_4_team', 'pick_5_team', 'pick_6_team',
                   'pick_7_team', 'pick_8_team', 'pick_9_team', 'pick_10_team']

# Handle non-integer values and convert to integers
for col in cols_to_convert:
    # Convert to integers
    df_combined[col] = df_combined[col].astype(int)


# Reset index if needed
df_combined = df_combined.reset_index(drop=True)

Drop the pick_bans column now that the information has been extracted

In [None]:
# The raw draft list is no longer needed once its contents live in the pick_* columns
df = df_combined.drop(columns=['picks_bans'])
df.head()

Unnamed: 0,radiant_win,duration,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire,radiant_score,dire_score,pick_1,pick_1_team,...,pick_6,pick_6_team,pick_7,pick_7_team,pick_8,pick_8_team,pick_9,pick_9_team,pick_10,pick_10_team
0,False,1373,260,2046,51,63,11,50,109,1,...,110,0,54,1,68,1,60,1,120,0
1,True,1529,1983,0,63,0,43,10,11,1,...,53,0,18,1,60,0,106,1,4,0
2,True,2068,1974,6,63,3,28,21,72,1,...,13,1,129,0,85,1,95,0,93,1
3,True,2064,1975,0,63,0,45,15,9,1,...,47,0,1,0,59,1,63,1,68,0
4,True,1963,1974,0,63,0,45,21,53,0,...,10,1,57,0,113,0,21,0,45,1


These feature pairs measure the same thing for each team, so we combine each pair into a single radiant-minus-dire difference column and have 1 column rather than 2

In [None]:
# Per-team columns that are replaced by a single radiant-minus-dire column each
colsToDrop = ['tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire',
              'radiant_score', 'dire_score']
# NOTE(review): the tower/barracks status values (e.g. 2046, 63) look like bitmasks —
# confirm that a plain subtraction is the intended comparison.
df = df.assign(
    radiant_tower_status_difference=df['tower_status_radiant'].sub(df['tower_status_dire']),
    radiant_barracks_status_difference=df['barracks_status_radiant'].sub(df['barracks_status_dire']),
    radiant_score_difference=df['radiant_score'].sub(df['dire_score']),
).drop(columns=colsToDrop)

Convert the class column into a string with 'radiant' or 'dire' as the options

In [None]:
# Convert boolean column to string and replace True with 'radiant' and False with 'dire'
# Build the string class label ('radiant' / 'dire') from the boolean radiant_win flag,
# then remove the flag so that only the label remains
df = df.assign(
    win=df['radiant_win'].replace({True: 'radiant', False: 'dire'}).astype(str)
).drop(columns=['radiant_win'])

df.head()

Unnamed: 0,duration,pick_1,pick_1_team,pick_2,pick_2_team,pick_3,pick_3_team,pick_4,pick_4_team,pick_5,...,pick_8,pick_8_team,pick_9,pick_9_team,pick_10,pick_10_team,radiant_tower_status_difference,radiant_barracks_status_difference,radiant_score_difference,win
0,1373,109,1,22,1,86,0,19,0,41,...,68,1,60,1,120,0,-1786,-12,-39,dire
1,1529,11,1,3,0,30,1,84,0,93,...,60,0,106,1,4,0,1983,63,33,radiant
2,2068,72,1,102,1,123,0,19,0,61,...,85,1,95,0,93,1,1968,60,7,radiant
3,2064,9,1,109,0,72,0,64,1,16,...,59,1,63,1,68,0,1975,63,30,radiant
4,1963,53,0,109,1,136,1,105,0,15,...,113,0,21,0,45,1,1974,63,24,radiant


Just checking on the data to make sure there are no NaN values and all the types are properly set

In [None]:
# Check for NaN values in the DataFrame
nan_rows = df[df.isna().any(axis=1)]

# Print the rows with NaN values
print(len(df))
print(len(nan_rows))

1827
0


In [None]:
# Print the dtype of every column as a sanity check on the conversions above
print(df.dtypes)

duration                               int64
pick_1                                 int64
pick_1_team                            int64
pick_2                                 int64
pick_2_team                            int64
pick_3                                 int64
pick_3_team                            int64
pick_4                                 int64
pick_4_team                            int64
pick_5                                 int64
pick_5_team                            int64
pick_6                                 int64
pick_6_team                            int64
pick_7                                 int64
pick_7_team                            int64
pick_8                                 int64
pick_8_team                            int64
pick_9                                 int64
pick_9_team                            int64
pick_10                                int64
pick_10_team                           int64
radiant_tower_status_difference        int64
radiant_ba

Using a decision tree to visualize and explore our data

In [None]:
def decisionTree(data, max_depth, random_state=42):
    """Train a decision tree on ``data``, render it to an image and print its test accuracy.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus a ``'win'`` column that holds the class label.
    max_depth : int
        Maximum depth of the fitted tree (forwarded to ``DecisionTreeClassifier``).
    random_state : int, optional
        Seed forwarded to ``DecisionTreeClassifier`` so that repeated runs fit
        the same tree. The train/test split keeps its fixed seed of 42.

    Side effects
    ------------
    Overwrites ``match_tree.dot`` and ``new_tree.png`` in the working directory
    and prints ``Accuracy: <score>`` for the held-out 20% of the rows.
    """
    train, test = train_test_split(data, test_size=0.2, random_state=42)
    train_data = train.drop(['win'], axis=1)
    test_data = test.drop(['win'], axis=1)
    train_labels_mc = train['win']
    test_labels_mc = test['win']

    # Seed the tree as well as the split; an unseeded tree may differ between fits
    tree_clf = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    tree_clf.fit(train_data, train_labels_mc)

    export_graphviz(
        tree_clf,
        out_file="match_tree.dot",
        # Plain lists of strings rather than ndarrays; presumably safer across scikit-learn versions
        feature_names=list(train_data.columns),
        class_names=[str(label) for label in tree_clf.classes_],
        rounded=True,
        filled=True,
    )

    t = pgv.AGraph("match_tree.dot")
    t.layout(prog="dot")  # use dot layout
    t.draw("new_tree.png")  # this file will appear under the folder icon on the left-hand side menu in Google Colab

    # Evaluate the model; accuracy_score takes the true labels first, then the predictions
    test_pred = tree_clf.predict(test_data)
    accuracy = accuracy_score(test_labels_mc, test_pred)
    print("Accuracy:", accuracy)

Testing with all features

In [None]:
# Baseline: every feature is available to the tree
decisionTree(data=df, max_depth=23)

Accuracy: 0.9972677595628415


Three features that stand out:

radiant_tower_status_difference

radiant_barracks_status_difference

radiant_score_difference

Exploring how the accuracy changes as we remove these 3 features from the data, 1 by 1 (each turn removing the root)

In [None]:
# Remove the tower difference feature and refit
df = df.drop(columns=['radiant_tower_status_difference'])
decisionTree(data=df, max_depth=22)

Accuracy: 0.9234972677595629


In [None]:
# Remove the barracks difference feature as well and refit
df = df.drop(columns=['radiant_barracks_status_difference'])
decisionTree(data=df, max_depth=21)

Accuracy: 0.8688524590163934


In [None]:
# Remove the score difference feature as well and refit
df = df.drop(columns=['radiant_score_difference'])
decisionTree(data=df, max_depth=20)

Accuracy: 0.4972677595628415


We can see that without the 3 features, our accuracy drops from ~99.7% to ~50%, i.e. no better than a coin flip

Those 3 features, however, are not available until a match is over...

In [None]:
# Remove the match duration too, leaving only the pick columns and the label
df = df.drop(columns=['duration'])
decisionTree(data=df, max_depth=19)

Accuracy: 0.5136612021857924


The duration feature doesn't make as big of an impact but is also not available until a match is over

So lets try to use the pick features to try and predict a match as those are the only thing known before a match begins

A function to take the current pick data and transform it into phases so that it can be explored better to see if the ~51% can be improved

In [None]:
def transform_picks(row):
    """Regroup a row's ten picks into the per-team phase layout.

    Reads ``pick_1`` .. ``pick_10`` and ``pick_1_team`` .. ``pick_10_team``
    from ``row``. A pick whose team value equals 0 is counted as a radiant
    pick; any other team value is counted as a dire pick.

    Returns
    -------
    list
        Ten hero ids ordered as Radiant_P1P1, Radiant_P1P2, Dire_P1P1,
        Dire_P1P2, Radiant_P2P1, Radiant_P2P2, Dire_P2P1, Dire_P2P2,
        Radiant_P3P1, Dire_P3P1. When radiant does not have exactly five
        picks, a list of ten NaN values is returned instead.
    """
    radiant, dire = [], []
    for slot in range(1, 11):
        hero = row[f'pick_{slot}']
        side = radiant if row[f'pick_{slot}_team'] == 0 else dire
        side.append(hero)

    # Ten picks in total, so exactly five for radiant also means five for dire
    if len(radiant) != 5:
        return [np.nan] * 10

    # (team list, position within that team's picks) for each output slot
    layout = (
        (radiant, 0), (radiant, 1),   # Radiant_P1P1 Radiant_P1P2
        (dire, 0), (dire, 1),         # Dire_P1P1 Dire_P1P2
        (radiant, 2), (radiant, 3),   # Radiant_P2P1 Radiant_P2P2
        (dire, 2), (dire, 3),         # Dire_P2P1 Dire_P2P2
        (radiant, 4), (dire, 4),      # Radiant_P3P1 Dire_P3P1
    )
    return [team_picks[position] for team_picks, position in layout]

Creating the new columns, filling them up with data then combining it with the original dataframe

In [None]:
new_columns = ['Radiant_P1P1', 'Radiant_P1P2', 'Dire_P1P1', 'Dire_P1P2',
               'Radiant_P2P1', 'Radiant_P2P2', 'Dire_P2P1', 'Dire_P2P2',
               'Radiant_P3P1', 'Dire_P3P1']
# Apply the transform_picks function to the DataFrame (one 10-element list per row)
transformed_data = df.apply(transform_picks, axis=1)

# Create a DataFrame with the transformed data. Reusing df's index keeps every
# transformed row attached to its source row: concat(axis=1) aligns on index
# labels, so a fresh 0..n-1 index would misalign whenever df is indexed differently.
transformed_df = pd.DataFrame(transformed_data.tolist(), columns=new_columns, index=df.index)

concatenated_df = pd.concat([df, transformed_df], axis=1)

# Check for NaN values in the DataFrame (rows for which transform_picks returned NaNs)
nan_rows = concatenated_df[concatenated_df.isna().any(axis=1)]

# Print the total number of rows, then the number of rows with NaN values
print(len(concatenated_df))
print(len(nan_rows))

1827
7


We find 7 rows of bad data (The teams have an unbalanced amount of heroes) so we remove them

In [None]:
# Drop rows with NaN values
concatenated_df = concatenated_df.dropna()

# Convert columns to integers
for col in new_columns:
    # Convert to integers
    concatenated_df[col] = concatenated_df[col].astype(int)

# Verify the changes
print(len(concatenated_df))

1820


In [None]:
# Remove pick_1 .. pick_10 and pick_1_team .. pick_10_team; the phase columns replace them
concatenated_df = concatenated_df.drop(
    columns=[f'pick_{n}' for n in range(1, 11)] + [f'pick_{n}_team' for n in range(1, 11)]
)
concatenated_df.head()

Unnamed: 0,win,Radiant_P1P1,Radiant_P1P2,Dire_P1P1,Dire_P1P2,Radiant_P2P1,Radiant_P2P2,Dire_P2P1,Dire_P2P2,Radiant_P3P1,Dire_P3P1
0,dire,86,19,109,22,41,110,54,68,120,60
1,radiant,3,84,11,30,53,60,93,18,4,106
2,radiant,123,19,72,102,61,129,13,85,95,93
3,radiant,109,72,9,64,47,1,16,59,68,63
4,radiant,53,105,109,136,57,113,15,10,21,45


Now that we transformed our data and removed the bad data, lets explore the different phases

In [None]:
# Independent working copy used by all of the phase experiments
dfWithPhases = concatenated_df.copy(deep=True)

In [None]:
# Fit on all ten phase columns
decisionTree(data=dfWithPhases, max_depth=10)

Accuracy: 0.5


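With the 7 bad rows removed and the picks regrouped by team, the accuracy is ~50%, essentially unchanged from the ~51% above; a difference this small on a 364-match test set is not meaningful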

In [None]:
# Keep only the phase 1 picks (plus the label) by removing the phase 2 and phase 3 columns
dfPhase1 = dfWithPhases.drop(
    columns=['Radiant_P2P1', 'Radiant_P2P2', 'Dire_P2P1', 'Dire_P2P2', 'Radiant_P3P1', 'Dire_P3P1']
)
decisionTree(data=dfPhase1, max_depth=4)

Accuracy: 0.5027472527472527


In [None]:
# Keep only the phase 2 picks (plus the label) by removing the phase 1 and phase 3 columns
dfPhase2 = dfWithPhases.drop(
    columns=['Radiant_P1P1', 'Radiant_P1P2', 'Dire_P1P1', 'Dire_P1P2', 'Radiant_P3P1', 'Dire_P3P1']
)
decisionTree(data=dfPhase2, max_depth=4)

Accuracy: 0.5274725274725275


In [None]:
# Keep only the phase 3 picks (plus the label) by removing the phase 1 and phase 2 columns
dfPhase3 = dfWithPhases.drop(
    columns=['Radiant_P1P1', 'Radiant_P1P2', 'Dire_P1P1', 'Dire_P1P2',
             'Radiant_P2P1', 'Radiant_P2P2', 'Dire_P2P1', 'Dire_P2P2']
)
decisionTree(data=dfPhase3, max_depth=2)

Accuracy: 0.5467032967032966


We see that looking at primarily only what is being picked in phase 3 of the draft, we get the best results which makes sense as that is the final pick who has the most information on the game and seeing certain heroes being last picked makes it easier to predict who is going to win

Ok we looked at the picks in their respective phases, but what about individual picks?

In [None]:
# One two-column frame (a single pick slot plus the label) per slot, in slot order
(
    df_with_pick1, df_with_pick2, df_with_pick3, df_with_pick4, df_with_pick5,
    df_with_pick6, df_with_pick7, df_with_pick8, df_with_pick9, df_with_pick10,
) = (
    dfWithPhases[[slot, 'win']].copy()
    for slot in (
        'Radiant_P1P1', 'Radiant_P1P2', 'Dire_P1P1', 'Dire_P1P2',
        'Radiant_P2P1', 'Radiant_P2P2', 'Dire_P2P1', 'Dire_P2P2',
        'Radiant_P3P1', 'Dire_P3P1',
    )
)

In [None]:
# Fit a depth-1 tree on each single-pick frame in turn; one accuracy line is printed per slot
for single_pick_df in (
    df_with_pick1, df_with_pick2, df_with_pick3, df_with_pick4, df_with_pick5,
    df_with_pick6, df_with_pick7, df_with_pick8, df_with_pick9, df_with_pick10,
):
    decisionTree(single_pick_df, 1)

Accuracy: 0.5274725274725275
Accuracy: 0.5274725274725275
Accuracy: 0.5137362637362637
Accuracy: 0.5274725274725275
Accuracy: 0.5274725274725275
Accuracy: 0.5274725274725275
Accuracy: 0.5274725274725275
Accuracy: 0.532967032967033
Accuracy: 0.5274725274725275
Accuracy: 0.5274725274725275


Ok.. No single pick beats the accuracy of the phase 3 model (~55%); the best individual pick (Dire_P2P2) only reaches ~53%..

Well, mathematically, we just may not have enough data to compute a higher accuracy for this.

If we take a look at our data:

In [None]:
# Dataset size and the nominal 80% / 20% share of it
n_matches = len(dfWithPhases)
print("Dataset length: ", n_matches)
print("Training set length: ", n_matches*0.8)
print("Testing set length: ", n_matches*0.2)

Dataset length:  1820
Training set length:  1456.0
Testing set length:  364.0


We can see that we are only training on 1456 matches, which means that we only see the results of 1456 combinations of heroes (it is very unlikely that any 2 matches in our dataset have the same combination of heroes).

Some math on the total combinations:

Total of 124 different heroes to choose from

Ways to choose the first team's 5 heroes: (124! / (5! (124-5)!))

Ways to choose the second team's 5 heroes from the remaining 119 heroes: (119! / (5! (119-5)!))

Since the side a team plays on doesn't matter, we can divide by 2, so we have:

((124! / (5! (124-5)!)) * (119! / (5! (119-5)!))) / 2

= 20,560,393,199,622,276

Which is about 20 quadrillion total combinations of heroes

However, I believe that the 54% isn't pure luck, as in this sort of game, there are heroes that are considered meta (high popularity) so a lot of games start seeing certain heroes being picked every single game...

In [None]:
# Step 1: Calculate top 3 most frequent heroes in each column and win rate for each hero
top_heroes = {}
win_rates = {}
all_top_heroes = set()  # Using a set to ensure no duplicates

hero_columns = [name for name in dfWithPhases.columns if name != 'win']

for column in hero_columns:
    top_heroes[column] = dfWithPhases[column].value_counts().nlargest(3).index.tolist()
    all_top_heroes.update(top_heroes[column])  # Update the set with top heroes from each column

    # A hero in a Dire_* column wins when 'dire' wins; every other column is scored
    # against 'radiant'. Counting 'radiant' wins for Dire_* columns would report the
    # opponent's win rate instead of the hero's.
    own_side = 'dire' if column.startswith('Dire') else 'radiant'

    win_rates[column] = {}
    for hero in top_heroes[column]:
        wins = dfWithPhases[dfWithPhases[column] == hero]['win'].value_counts()
        win_rates[column][hero] = wins.get(own_side, 0) / (wins.get('radiant', 0) + wins.get('dire', 0))

print("The top 3 heroes in each column are:", top_heroes)
print("The win rates of the top 3 heroes in each column are:", win_rates)

# Step 2: Create three DataFrames with varying constraints on the number of top heroes each row must contain
# Count the distinct top heroes in every row once, then reuse the count for each threshold.
top_hero_counts = dfWithPhases[hero_columns].apply(
    lambda picks: len(set(picks) & all_top_heroes), axis=1
)
# Boolean-mask selection keeps the original row labels, column order and dtypes
dfLimited = [
    dfWithPhases[top_hero_counts >= min_top_heroes]
    for min_top_heroes in range(1, 4)  # For each constraint: 1, 2, and 3
]

# Print the size of each DataFrame.
# The loop variable must not be called `df`: that would overwrite the notebook's main DataFrame.
for i, limited_df in enumerate(dfLimited, start=1):
    print(f"DataFrame {i} with at least {i} of the most frequent heroes:")
    print(len(limited_df))

The top 3 heroes in each column are: {'Radiant_P1P1': [123, 14, 86], 'Radiant_P1P2': [123, 14, 26], 'Dire_P1P1': [123, 14, 53], 'Dire_P1P2': [123, 86, 53], 'Radiant_P2P1': [129, 54, 48], 'Radiant_P2P2': [54, 10, 129], 'Dire_P2P1': [54, 129, 1], 'Dire_P2P2': [48, 41, 54], 'Radiant_P3P1': [13, 22, 52], 'Dire_P3P1': [13, 22, 76]}
The win rates of the top 3 heroes in each column are: {'Radiant_P1P1': {123: 0.5462184873949579, 14: 0.5849056603773585, 86: 0.49382716049382713}, 'Radiant_P1P2': {123: 0.55, 14: 0.4714285714285714, 26: 0.4927536231884058}, 'Dire_P1P1': {123: 0.5283018867924528, 14: 0.5208333333333334, 53: 0.5208333333333334}, 'Dire_P1P2': {123: 0.5432098765432098, 86: 0.5, 53: 0.5409836065573771}, 'Radiant_P2P1': {129: 0.5555555555555556, 54: 0.5254237288135594, 48: 0.3793103448275862}, 'Radiant_P2P2': {54: 0.5454545454545454, 10: 0.5185185185185185, 129: 0.49056603773584906}, 'Dire_P2P1': {54: 0.49230769230769234, 129: 0.559322033898305, 1: 0.5555555555555556}, 'Dire_P2P2': {48

In [None]:
# Row count and a preview of the strictest subset (at least 3 of the most frequent heroes)
print(dfLimited[2].shape[0])
dfLimited[2].head(5)

886


Unnamed: 0,win,Radiant_P1P1,Radiant_P1P2,Dire_P1P1,Dire_P1P2,Radiant_P2P1,Radiant_P2P2,Dire_P2P1,Dire_P2P2,Radiant_P3P1,Dire_P3P1
0,dire,86,19,109,22,41,110,54,68,120,60
2,radiant,123,19,72,102,61,129,13,85,95,93
5,radiant,123,26,86,31,104,35,19,41,59,21
9,dire,66,26,58,87,2,129,96,41,42,52
10,dire,26,91,109,48,25,52,15,14,35,13


In [None]:
# Fit a depth-10 tree on each of the three filtered subsets, loosest constraint first
for subset_index in (0, 1, 2):
    decisionTree(dfLimited[subset_index], 10)

Accuracy: 0.5300859598853869
Accuracy: 0.501779359430605
Accuracy: 0.4438202247191011


So we can see some similar hero_ids recurring across the picks of the same phase, showing that each phase has popular picks. However, none of the popular picks' win rates are high enough for the model to learn "if a team gets hero x then they will win with high certainty", and even when we use only the rows of data that contain the most meta heroes, it still doesn't provide better accuracy, which is probably due to those win rates.

In [None]:
# Phase 1 picks only, fitted on each of the three filtered subsets
for i in range(3):
    dfWithPhases2 = dfLimited[i].copy()
    dfPhase21 = dfWithPhases2.drop(
        columns=['Radiant_P2P1', 'Radiant_P2P2', 'Dire_P2P1', 'Dire_P2P2', 'Radiant_P3P1', 'Dire_P3P1']
    )
    decisionTree(data=dfPhase21, max_depth=4)

Accuracy: 0.5358166189111748
Accuracy: 0.5124555160142349
Accuracy: 0.4606741573033708


In [None]:
# Phase 2 picks only, fitted on each of the three filtered subsets
for i in range(3):
    dfWithPhases2 = dfLimited[i].copy()
    dfPhase22 = dfWithPhases2.drop(
        columns=['Radiant_P1P1', 'Radiant_P1P2', 'Dire_P1P1', 'Dire_P1P2', 'Radiant_P3P1', 'Dire_P3P1']
    )
    decisionTree(data=dfPhase22, max_depth=4)

Accuracy: 0.5329512893982808
Accuracy: 0.49110320284697506
Accuracy: 0.4943820224719101


In [None]:
# Phase 3 picks only, fitted on each of the three filtered subsets
for i in range(3):
    dfWithPhases2 = dfLimited[i].copy()
    dfPhase23 = dfWithPhases2.drop(
        columns=['Radiant_P1P1', 'Radiant_P1P2', 'Dire_P1P1', 'Dire_P1P2',
                 'Radiant_P2P1', 'Radiant_P2P2', 'Dire_P2P1', 'Dire_P2P2']
    )
    decisionTree(data=dfPhase23, max_depth=2)

Accuracy: 0.5558739255014327
Accuracy: 0.597864768683274
Accuracy: 0.47752808988764045


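Even lowering the number of possible combinations, by keeping only the games that contain certain popular heroes (which shrinks the effective hero pool), doesn't change the results much. The best result was ~60% (phase 3 picks only, on games with at least 2 of the most frequent heroes), which is not bad considering the small dataset. Underfitting is also another problem that can occur with this sort of tactic, as the filtering leaves even fewer matches to learn from.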