# Dota 2 Shop items bought and used during match

# Items
- Purchased with gold in shop during match
- Some are dropped by creeps
- Lower tier items (basic) can be combined into higher tier items (upgraded) using recipes
- Heroes have 6 item slots

https://dota2.fandom.com/wiki/Items

# OpenDota API match JSON
- `match.players.item_uses`
- `match.players.purchase`
- Item list will have both `bloodthorn` and `recipe_bloodthorn`


In [1]:
# Imports
from tqdm import tqdm
import glob
import os
import json
from pprint import pprint
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import os
import sys
sys.path.append(os.path.join("..", "src"))
from game_data_collector.api import Role, get_hero_ids_of_role

### build df

In [2]:
# Building the data frame from the raw data
# TODO I'm skipping the database integration for now (query building etc.) to cut some time for faster EDA

json_files = list(glob.iglob("C:\\Users\\cedri\\dota-project\\mmr-predictor\\resources\\data\\*.json"))
print(f"found {len(json_files):,} match dumps")

found 22,529 match dumps


In [3]:
# Get all distinct items from the data for the cols
distinct_items = set()

# 5m20s
for j in tqdm(json_files, total=len(json_files)):
    with open(j, "r", encoding="utf-8") as fi:
        match = json.load(fi)
        for player in match["players"]:
            # Combine item uses and purchases into one loop
            distinct_items.update(player["item_uses"].keys())
            distinct_items.update(player["purchase"].keys())

print(f"found {len(distinct_items):,} distinct items in the data")
# print("\n".join([d for d in distinct_items]))

distinct_items_only = [i for i in distinct_items if "recipe" not in i]
print(f"found {len(distinct_items_only):,} items without recipes")

100%|██████████| 22529/22529 [00:33<00:00, 664.01it/s]

found 305 distinct items in the data
found 226 items without recipes





### player item uses

In [4]:
# Let's start with item uses

cols = ["hero_id", "rank_tier", "kda", "gold_per_min"] + list(distinct_items_only)
data_rows = []

# 34s
for j in tqdm(json_files, total=len(json_files)):
    with open(j, "r", encoding="utf-8") as fi:
        match = json.load(fi)
                
        for player in match["players"]:
            
            # filter players not in rank tier interval (0, 80)
            if (player["rank_tier"] == None) or (player["rank_tier"] == 80):
                continue
            
            # Initialize a dictionary for the current row
            row_data = {col: 0 for col in cols}  # Set default counts to 0
            
            # player stats
            row_data["hero_id"] = player["hero_id"]
            row_data["rank_tier"] = player["rank_tier"]
            row_data["kda"] = player["kda"]
            row_data["gold_per_min"] = player["gold_per_min"]

            # Update counts based on the player's item uses and purchases
            for item, count in player["item_uses"].items():
                if item in row_data:
                    row_data[item] += count
        
            data_rows.append(row_data)

df_item_uses = pd.DataFrame(data_rows)
print(f"df with {len(df_item_uses):,} players item uses") # note that 10 player per match = match count times 10

100%|██████████| 22529/22529 [00:34<00:00, 654.59it/s]


df with 92,005 players item uses


In [5]:
# Calculate the correlation matrix
correlation_matrix = df_item_uses.corr()

# Extract the correlation of all columns with 'rank_tier'
rank_tier_correlation = correlation_matrix['rank_tier'].drop(columns=["hero_id", "kda", "gold_per_min"]).sort_values(ascending=False).dropna()

# Print the correlation with rank_tier
print(rank_tier_correlation)

rank_tier          1.000000
quelling_blade     0.106584
power_treads       0.100497
gold_per_min       0.084440
tango              0.084408
                     ...   
glimmer_cape      -0.016106
ward_dispenser    -0.018241
enchanted_mango   -0.024239
shadow_amulet     -0.025911
invis_sword       -0.028520
Name: rank_tier, Length: 132, dtype: float64


### PCA of item used


TODO: Look at specific hero roles rather than at all classes might produce some more meaningful results

In [6]:
# do a PCA to figure out important items (and reduce dimensions)

# Separate features and target variable
X = df_item_uses.drop(columns=['rank_tier', "hero_id", "kda", "gold_per_min"])  # Drop identifiers and target
y = df_item_uses['rank_tier']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

pc_all_kpis = pd.DataFrame(X_pca, columns=[f"PC{n}" for n in range(1, X_pca.shape[1]+1)])
print(pc_all_kpis.shape)
print("\npreview of the first 5 PCs:")
print(pc_all_kpis.iloc[:, :5].head())

print('\nProportion of Variance Explained per PC:', pca.explained_variance_ratio_[:5])


(92005, 226)

preview of the first 5 PCs:
        PC1       PC2       PC3       PC4       PC5
0  2.219334 -0.199512 -0.346145 -0.377145 -0.089146
1 -0.691853  0.750845 -0.882895 -1.324274 -0.909313
2 -0.424086  0.654621 -0.729224 -1.384323 -0.862887
3 -0.417838  0.554094 -0.558946 -0.387742 -0.985827
4  0.781095 -0.225483 -1.409099  0.240768 -0.614009

Proportion of Variance Explained per PC: [0.03742774 0.01990786 0.01608841 0.01522237 0.01446721]


In [7]:
print(f"First 10 most important features for PC1 ({pca.explained_variance_ratio_[0]:.2} of PVE):\n")

for f, i in sorted(zip(df_item_uses.columns[5:], pca.components_[0]), key=lambda x: x[1], reverse=True)[:20]:
    print(f"{i:.2}\t{f}")

First 10 most important features for PC1 (0.037 of PVE):

0.36	ultimate_orb
0.34	boots
0.3	gauntlets
0.29	tpscroll
0.25	branches
0.25	orb_of_venom
0.19	bloodstone
0.18	force_staff
0.14	chainmail
0.13	rod_of_atos
0.12	helm_of_iron_will
0.11	fluffy_hat
0.11	blade_of_alacrity
0.11	craggy_coat
0.1	monkey_king_bar
0.1	ghost
0.093	dagon_5
0.084	helm_of_the_dominator
0.07	aether_lens
0.069	blades_of_attack


In [8]:
print(f"First 10 most important features for PC2 ({pca.explained_variance_ratio_[1]:.2} of PVE):\n")

for f, i in sorted(zip(df_item_uses.columns[5:], pca.components_[1]), key=lambda x: x[1], reverse=True)[:20]:
    print(f"{i:.2}\t{f}")

First 10 most important features for PC2 (0.02 of PVE):

0.37	refresher
0.37	ethereal_blade
0.37	polliwog_charm
0.36	arcane_blink
0.35	tiara_of_selemene
0.3	boots_of_elves
0.16	vanguard
0.15	ward_observer
0.12	javelin
0.095	ultimate_scepter_roshan
0.082	blade_of_alacrity
0.077	magic_wand
0.071	lifesteal
0.071	claymore
0.069	gauntlets
0.056	heart
0.049	talisman_of_evasion
0.042	pirate_hat
0.036	quelling_blade
0.033	silver_edge


In [9]:
print(f"First 10 most important features for PC3 ({pca.explained_variance_ratio_[2]:.2} of PVE):\n")

for f, i in sorted(zip(df_item_uses.columns[5:], pca.components_[2]), key=lambda x: x[1], reverse=True)[:20]:
    print(f"{i:.2}\t{f}")

First 10 most important features for PC3 (0.016 of PVE):

0.38	quelling_blade
0.33	basher
0.3	shivas_guard
0.24	talisman_of_evasion
0.2	silver_edge
0.19	magic_wand
0.16	sange_and_yasha
0.14	essence_ring
0.14	trusty_shovel
0.12	lifesteal
0.11	heart
0.11	demon_edge
0.11	vladmir
0.11	mithril_hammer
0.1	unstable_wand
0.1	monkey_king_bar
0.098	great_famango
0.097	bullwhip
0.094	broadsword
0.092	nullifier
