In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# import the required libraries for web scraping
from bs4 import BeautifulSoup
import requests
import re
import time
import random

In [None]:
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 200)

In [None]:
dota_players = pd.read_csv('data/dota_players.csv')
dota_teams = pd.read_csv('data/dota_teams.csv')
dota_heroes = pd.read_csv('data/dota_heroes.csv')
hero_stats = pd.read_csv('data/hero_stats.csv')

In [434]:
hero_stats

Unnamed: 0,Hero,Kills_Min,Kills_Max,Kills_Mean,Deaths_Min,Deaths_Max,Deaths_Mean,Assists_Min,Assists_Max,Assists_Mean,GPM_Min,GPM_Max,GPM_Mean,XPM_Min,XPM_Max,...,LVL_Max,LVL_Mean,HD_Min,HD_Max,HD_Mean,TD_Min,TD_Max,TD_Mean,HH_Min,HH_Max,HH_Mean,GS_Min,GS_Max,GS_Mean,HERO_ID
0,Abaddon,0,2,1,2,5,3,4,22,13,214,298,256,251,579,...,23,18,4645,7268,5956,131,354,242,596,12983,6789,9455,13435,11445,1
1,Ancient Apparition,3,3,3,6,6,6,16,16,16,320,320,320,484,484,...,20,20,12624,12624,12624,589,589,589,0,0,0,12035,12035,12035,2
2,Anti-Mage,10,10,10,0,0,0,9,9,9,866,866,866,997,997,...,24,24,20489,20489,20489,16618,16618,16618,0,0,0,29100,29100,29100,3
3,Arc Warden,3,10,5,0,5,2,7,17,12,603,910,798,606,856,...,27,23,21184,57266,39504,601,7521,4387,0,0,0,19515,38665,30241,4
4,Axe,5,13,9,4,11,8,2,10,7,299,579,471,322,926,...,26,21,20286,44131,29495,315,1271,770,0,0,0,10330,24110,18076,5
5,Bane,0,3,1,5,10,7,6,22,12,196,269,233,214,361,...,17,14,3026,11634,6504,0,214,91,0,0,0,5575,10200,8050,6
6,Batrider,1,16,7,0,11,4,1,30,13,300,745,526,349,982,...,30,22,8837,66647,25457,0,3730,350,0,228,16,6065,38300,18835,7
7,Beastmaster,0,9,3,0,10,4,1,36,13,335,650,512,345,826,...,27,19,6581,32255,15012,499,16267,6648,308,14017,3465,8515,33785,17555,8
8,Bloodseeker,0,16,5,0,9,4,2,17,7,391,769,566,373,910,...,28,20,6959,71068,21327,0,13110,3423,0,195,13,6860,38645,19778,9
9,Bounty Hunter,0,4,1,3,8,5,4,9,5,185,555,344,211,540,...,18,14,4192,19602,9655,0,874,480,0,0,0,5015,18305,11866,10


In [None]:
# drop the row where Hero is Outworld Devourer
hero_stats = hero_stats[hero_stats['Hero'] != 'Outworld Devourer']

In [None]:
# hero_stats.HERO_ID = hero_stats.index
# hero_stats.drop(columns=['Unnamed: 0', 'index'], inplace=True)
hero_stats.HERO_ID = hero_stats.HERO_ID + 1

In [None]:
hero_stats

In [None]:
single_performances = pd.read_csv('data/datdota_singleperformances.csv')

In [None]:
single_performances

In [None]:
# get a list of how gpm scales with kills and last hits
gpm_kills = single_performances.groupby('Kills')['GPM'].mean()
gpm_lh = single_performances.groupby('Last Hits')['GPM'].mean()
xpm_kills = single_performances.groupby('Kills')['XPM'].mean()
xpm_lh = single_performances.groupby('Last Hits')['XPM'].mean()
xpm_level = single_performances.groupby('LVL')['XPM'].mean()

# plot the data
fig, ax = plt.subplots(figsize=(10, 6))
fig2, ax2 = plt.subplots(figsize=(10, 6))
ax.plot(gpm_kills.index, gpm_kills.values, label='GPM per kill')
ax2.plot(gpm_lh.index, gpm_lh.values, label='GPM per last hit')
ax.set_xlabel('Kills')
ax.set_ylabel('GPM')
ax2.set_xlabel('Last Hits')
ax2.set_ylabel('GPM')
ax2.legend()
ax.legend()
plt.show()

In [None]:
# drop the major outliers in single_performances
single_performances = single_performances[single_performances['Kills'] < 30]

In [None]:
# get the value ranges for all columns in single_performances sorted by hero, ignoring Match, Player, Result, and End Game Items
hero_stats = single_performances.drop(['Match', 'Player', 'Result', 'End Game Items', 'KDA'], axis=1).groupby('Hero').agg(['min', 'max', 'mean'])

In [None]:
# get the value ranges for all columns in single_performances and put them in a dictionary
value_ranges = {}
value_averages = {}
for col in single_performances.columns:
    if col != 'Match' and col != 'Hero' and col != 'Player' and col != 'Result' and col != 'End Game Items':
        value_ranges[col] = [min(single_performances[col]), max(single_performances[col])]
        value_averages[col] = np.mean(single_performances[col])
value_ranges

In [None]:
player_options = dota_players['ID'].unique()

In [None]:
# a class to store single performance data
class SinglePerformance:
    def __init__(
        self,
        match_id,
        player_id,
        hero_id,
        kills,
        deaths,
        assists,
        last_hits,
        denies,
        gpm,
        xpm,
        hero_damage,
        tower_damage,
        hero_healing,
        level,
        win,
    ):
        self.match_id = match_id
        self.player_id = player_id
        self.hero_id = hero_id
        self.kills = kills
        self.deaths = deaths
        self.assists = assists
        self.last_hits = last_hits
        self.denies = denies
        self.gpm = gpm
        self.xpm = xpm
        self.hero_damage = hero_damage
        self.tower_damage = tower_damage
        self.hero_healing = hero_healing
        self.level = level
        self.win = win

    def get_info(self):
        return {
            "Match": self.match_id,
            "Player": self.player_id,
            "Hero": self.hero_id,
            "Kills": self.kills,
            "Deaths": self.deaths,
            "Assists": self.assists,
            "Last_Hits": self.last_hits,
            "Denies": self.denies,
            "GPM": self.gpm,
            "XPM": self.xpm,
            "Hero_Damage": self.hero_damage,
            "Tower_Damage": self.tower_damage,
            "Hero_Healing": self.hero_healing,
            "Level": self.level,
            "Win": self.win,
        }

In [None]:
# drop heroes from dota_heroes that are not in hero_stats
dota_heroes = dota_heroes[dota_heroes['HERO'].isin(hero_stats.Hero)]

In [None]:
# drop the row where Hero is Outworld Devourer
hero_stats = hero_stats[hero_stats['Hero'] != 'Outworld Devourer']

In [None]:
# a function to create random match data. it accepts 2 arrays of player ids. it returns a list of 10 SinglePerformance objects. The hero ids are randomly selected from the list of heroes, and if a hero has already been chosen, choose a new one. The stats are generated based on the hero values in hero_stats.
def create_random_match(players1, players2):
    # drop players if there are more than 5
    if len(players1) > 5:
        players1 = players1[:5]
    if len(players2) > 5:
        players2 = players2[:5]

    match_id = random.randint(100000, 999999)
    performances = []
    for player in players1:
        # select a random hero
        hero_id = random.choice(dota_heroes['HERO_ID'].values)
        while hero_id in [x.hero_id for x in performances]:
            hero_id = random.choice(dota_heroes['HERO_ID'].values)
        kills = random.randint(hero_stats.loc[hero_id]['Kills_Min'], hero_stats.loc[hero_id]['Kills_Max'])
        deaths = random.randint(hero_stats.loc[hero_id]['Deaths_Min'], hero_stats.loc[hero_id]['Deaths_Max'])
        assists = random.randint(hero_stats.loc[hero_id]['Assists_Min'], hero_stats.loc[hero_id]['Assists_Max'])
        last_hits = random.randint(hero_stats.loc[hero_id]['LH_Min'], hero_stats.loc[hero_id]['LH_Max'])
        denies = random.randint(hero_stats.loc[hero_id]['Denies_Min'], hero_stats.loc[hero_id]['Denies_Max'])
        gpm = random.randint(hero_stats.loc[hero_id]['GPM_Min'], hero_stats.loc[hero_id]['GPM_Max'])
        xpm = random.randint(hero_stats.loc[hero_id]['XPM_Min'], hero_stats.loc[hero_id]['XPM_Max'])
        hero_damage = random.randint(hero_stats.loc[hero_id]['HD_Min'], hero_stats.loc[hero_id]['HD_Max'])
        tower_damage = random.randint(hero_stats.loc[hero_id]['TD_Min'], hero_stats.loc[hero_id]['TD_Max'])
        hero_healing = random.randint(hero_stats.loc[hero_id]['HH_Min'], hero_stats.loc[hero_id]['HH_Max'])
        level = random.randint(hero_stats.loc[hero_id]['LVL_Min'], hero_stats.loc[hero_id]['LVL_Max'])
        win = -1

        performances.append(SinglePerformance(match_id, player, hero_id, kills, deaths, assists, last_hits, denies, gpm, xpm, hero_damage, tower_damage, hero_healing, level, win))

    for player in players2:
        # select a random hero
        hero_id = random.choice(dota_heroes['HERO_ID'].values)
        while hero_id in [x.hero_id for x in performances]:
            hero_id = random.choice(dota_heroes['HERO_ID'].values)
        kills = random.randint(hero_stats.loc[hero_id]['Kills_Min'], hero_stats.loc[hero_id]['Kills_Max'])
        deaths = random.randint(hero_stats.loc[hero_id]['Deaths_Min'], hero_stats.loc[hero_id]['Deaths_Max'])
        assists = random.randint(hero_stats.loc[hero_id]['Assists_Min'], hero_stats.loc[hero_id]['Assists_Max'])
        last_hits = random.randint(hero_stats.loc[hero_id]['LH_Min'], hero_stats.loc[hero_id]['LH_Max'])
        denies = random.randint(hero_stats.loc[hero_id]['Denies_Min'], hero_stats.loc[hero_id]['Denies_Max'])
        gpm = random.randint(hero_stats.loc[hero_id]['GPM_Min'], hero_stats.loc[hero_id]['GPM_Max'])
        xpm = random.randint(hero_stats.loc[hero_id]['XPM_Min'], hero_stats.loc[hero_id]['XPM_Max'])
        hero_damage = random.randint(hero_stats.loc[hero_id]['HD_Min'], hero_stats.loc[hero_id]['HD_Max'])
        tower_damage = random.randint(hero_stats.loc[hero_id]['TD_Min'], hero_stats.loc[hero_id]['TD_Max'])
        hero_healing = random.randint(hero_stats.loc[hero_id]['HH_Min'], hero_stats.loc[hero_id]['HH_Max'])
        level = random.randint(hero_stats.loc[hero_id]['LVL_Min'], hero_stats.loc[hero_id]['LVL_Max'])
        win = -1

        performances.append(SinglePerformance(match_id, player, hero_id, kills, deaths, assists, last_hits, denies, gpm, xpm, hero_damage, tower_damage, hero_healing, level, win))

    # set the team with the least deaths as the winning team
    if sum([x.deaths for x in performances[:5]]) > sum([x.deaths for x in performances[5:]]):
        for performance in performances[5:]:
            performance.win = 1
        for performance in performances[:5]:
            performance.win = 0
    else:
        for performance in performances[:5]:
            performance.win = 1
        for performance in performances[5:]:
            performance.win = 0
        

    for performance in performances:
        performances[performances.index(performance)] = performance.get_info()
    return pd.DataFrame(performances)

In [None]:
randommatch = create_random_match(["A", "B", "C", "D", "E"],[ "F", "G", "H", "I", "J"])
randommatch

In [None]:
# convert all hero_stats columns to integers except for the Hero column
for col in hero_stats.columns:
    if col != 'Hero':
        hero_stats[col] = hero_stats[col].astype(float).astype(int)

In [None]:
# create a dataframe of dataframes for each team_name in dota_players
team_dataframes = {}
for team_name in dota_players['team_name'].unique():
    team_dataframes[team_name] = dota_players[dota_players['team_name'] == team_name]

# get a subframe of all teams with at least 5 players
team_dataframes_5 = {k: v for k, v in team_dataframes.items() if len(v) >= 5}

# get a list of all team names in team_dataframes_5
team_names = list(team_dataframes_5.keys())

In [None]:
# get a subframe of all teams with at least 5 players
team_dataframes_5 = {k: v for k, v in team_dataframes.items() if len(v) >= 5}

In [None]:
# get a list of all team names in team_dataframes_5
team_names = list(team_dataframes_5.keys())

In [None]:
team_dataframes_5["Alliance.LATAM"]

In [None]:
# a function to select 2 random teams from team_names, then create a random match between their players
def create_random_match_data():
    # select 2 random teams
    team1 = random.choice(team_names)
    team2 = random.choice(team_names)
    while team1 == team2:
        team2 = random.choice(team_names)

    # create a random match between the 2 teams
    randommatch = create_random_match(team_dataframes_5[team1]['p_id'].values, team_dataframes_5[team2]['p_id'].values)

    return randommatch

In [429]:
create_random_match_data()

Unnamed: 0,Match,Player,Hero,Kills,Deaths,Assists,Last_Hits,Denies,GPM,XPM,Hero_Damage,Tower_Damage,Hero_Healing,Level,Win
0,140011,199,49,5,7,6,32,6,307,472,10028,384,104,20,1
1,140011,200,42,9,0,24,52,6,270,279,36364,1606,10544,17,1
2,140011,201,26,2,4,16,229,25,549,555,35580,20909,48,26,1
3,140011,202,116,11,3,17,135,24,511,857,21349,351,919,20,1
4,140011,203,97,4,5,25,147,11,338,567,17787,6396,9981,27,1
5,140011,145,29,1,9,11,25,3,285,450,17156,140,151,15,0
6,140011,146,72,2,8,28,105,0,318,564,16633,318,14045,23,0
7,140011,147,11,11,1,12,429,7,752,737,25044,10319,0,21,0
8,140011,148,104,7,5,11,159,21,423,567,22967,494,3708,15,0
9,140011,149,1,3,6,16,40,2,320,484,12624,589,0,20,0


In [None]:
regions = ['North America', 'Europe', 'China', 'Southeast Asia', 'South America', 'CIS']

In [None]:
# function that takes a pandas dataframe and assign a region based on each player country
def assign_region(df):
    df['Region'] = df['country'].apply(lambda x: 'North America' if x in ['United States', 'Canada'] else 'Europe' if x in ['United Kingdom', 'Germany', 'France', 'Spain', 'Italy', 'Netherlands', 'Poland', 'Sweden', 'Denmark', 'Norway', 'Finland', 'Belgium', 'Switzerland', 'Austria', 'Portugal', 'Greece', 'Czech Republic', 'Hungary', 'Romania', 'Bulgaria', 'Serbia', 'Slovenia', 'Bosnia and Herzegovina', 'Croatia', 'Ireland', 'Luxembourg', 'Slovakia', 'Estonia', 'Latvia', 'Lithuania', 'Moldova', 'Montenegro', 'Albania', 'Cyprus', 'Macedonia', 'Malta', 'Ukraine', 'Armenia', 'Azerbaijan', 'Georgia', 'Kazakhstan', 'Russia', 'Turkey', 'Belarus', 'Iceland', 'Liechtenstein', 'Monaco', 'San Marino', 'Vatican City'] else 'China' if x in ['China', 'Hong Kong', 'Taiwan'] else 'Southeast Asia' if x in ['Singapore', 'Malaysia', 'Philippines', 'Indonesia', 'Thailand', 'Vietnam', 'Cambodia', 'Laos', 'Myanmar', 'Brunei'] else 'South America' if x in ['Brazil', 'Argentina', 'Chile', 'Colombia', 'Ecuador', 'Peru', 'Venezuela', 'Uruguay', 'Bolivia', 'Paraguay', 'Guyana', 'Suriname', 'French Guiana', 'Easter Island', 'Falkland Islands', 'South Georgia and the South Sandwich Islands', 'South Sandwich Islands', 'Antarctica'] else 'CIS' if x in ['Russia', 'Ukraine', 'Belarus', 'Kazakhstan', 'Azerbaijan', 'Armenia', 'Georgia', 'Kyrgyzstan', 'Moldova', 'Tajikistan', 'Turkmenistan', 'Uzbekistan', 'Kosovo', 'Abkhazia', 'South Ossetia'] else 'Other')
    return df

In [None]:
dota_players = assign_region(dota_active_players)

In [None]:
# function that detects first occurence of char
def find_first(string, char):
    for i, c in enumerate(string):
        if c == char:
            return i
    return -1

# function that detects last occurence of char
def find_last(string, char):
    for i, c in enumerate(string[::-1]):
        if c == char:
            return len(string) - i - 1
    return -1

In [None]:
# split the team column into team name and role
dota_players['team_name'] = dota_players['team'].apply(lambda x: x[:find_first(x, '(')].strip())
dota_players['role'] = dota_players['team'].apply(lambda x: x[find_first(x, '(')+1:find_last(x, ')')])
dota_players.drop('team', axis=1, inplace=True)

In [None]:
roles = ['Support', 'Offlaner', 'Solo Middle', 'Carry', 'Coach']

In [None]:
# filter the roles
dota_players = dota_players[dota_players['role'].isin(roles)]

### Below is for scraping dota data from liquipedia

In [None]:
# scrape the data from the website
url = "https://liquipedia.net/dota2/Players_(all)"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
rows = soup.findAll("tr")
# # find the table with the data
# table = soup.find('table', class_='wikitable')

In [None]:
rows[1]

In [None]:
indexes = rows[1]
index_values = []
for cell in indexes.find_all("th"):
    index_values.append(cell.get_text().rstrip())
players = []

In [None]:
index_values

In [None]:
for row in rows:
    if len(row) > 3:
        player = {}
        cells = row.find_all("td")
        for i in range(0, len(cells)):
            key = index_values[i]
            if key == " Links":
                key = "country"
                value = cells[0].find("a").get("title")
            else:
                value = cells[i].get_text().rstrip()
            player[key] = value
        if len(player) > 0:
            players.append(player)

In [None]:
player_df = pd.DataFrame(players)

In [None]:
player_df.rename(
    columns={" Real Name": "name", " Team": "team", " ID": "ID"}, inplace=True
)

In [None]:
filter_roles = {
    "Carry",
    "Mid",
    "Offlane",
    "Support",
    "Hard Support",
    "Soft Support",
    "Hard",
    "Soft",
    "Hard Carry",
    "Soft Carry",
    "Hard Carry",
    "Solo Middle",
    "Analyst",
    "Caster",
    "Host",
    "Offlaner"
}
# drop the rows that are in the filter_roles
player_df = player_df[~player_df["team"].isin(filter_roles)]

In [None]:
player_df.to_csv("liquipedia_dota_players.csv", index=False)

In [None]:
# strip all leading and trailing spaces
player_df = player_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [None]:
url = "https://liquipedia.net/dota2/Portal:Teams"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
teams = []
divs = soup.find_all('div',class_="panel-box")

In [None]:


def get_teams():
	url = "https://liquipedia.net/dota2/Portal:Teams"
	page = requests.get(url)
	soup = BeautifulSoup(page.content, "html.parser")
	teams = []
	divs = soup.find_all('div',class_="panel-box")
	templates = soup.find_all('span',class_="team-template-team-standard")
	for team in templates:
		teams.append(team.a['title'])
			
	return teams

In [None]:
teamdf = pd.DataFrame(get_teams())

In [None]:
teamdf.to_csv("activeteams.csv")

In [None]:
print(
    "Min: {}, Max: {}".format(
        dota_performances["GPM"].min(), dota_performances["GPM"].max()
    )
)