In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# import the required libraries for web scraping
# from bs4 import BeautifulSoup
# import requests
# import re
# import time
# import random

In [2]:
# Raise the pandas display limits used when frames are rendered in this notebook.
for option_name, option_value in (
    ('display.max_columns', 30),
    ('display.max_rows', 200),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the reference tables (players, teams, heroes, hero stats, items, shop items and
# item abilities) from CSV files under the relative data/ directory.
# NOTE(review): hero_stats is rebound further down from an aggregation of
# single_performances, which replaces the frame loaded here once that cell runs.
dota_players = pd.read_csv('data/dota_players_final.csv')
dota_teams = pd.read_csv('data/dota_teams.csv')
dota_heroes = pd.read_csv('data/dota_heroes.csv')
hero_stats = pd.read_csv('data/hero_stats.csv')
dota_items = pd.read_csv('data/dota_items.csv')
dota_shop_items = pd.read_csv('data/dota_shop_items.csv')
item_abilities = pd.read_csv('data/item_abilities.csv')

In [2]:
# Load the generated match tables from the working directory (no data/ prefix here).
# NOTE(review): this cell binds random_matches, while several later cells read
# random_match_data, which is not assigned anywhere in the visible notebook - confirm
# whether both names are meant to refer to the same frame.
# NOTE(review): a later cell writes r_games to 'data/r_team_games.csv', but this cell
# reads 'r_team_games.csv' - confirm which location is current.
random_matches = pd.read_csv('random_matches.csv')
random_game_data = pd.read_csv('random_game_data.csv')
random_team_games = pd.read_csv('r_team_games.csv')

In [16]:
# Keep only the rows of random_matches whose g_win flag equals 1.
# The per-game winner table is derived from this frame in the next cell.

is_winning_row = random_matches['g_win'].eq(1)
winning_teams = random_matches.loc[is_winning_row]

In [21]:
# Collapse the winning rows to one row per (game_id, team_id) pair, keeping the first occurrence.
w_teams = winning_teams.drop_duplicates(subset=['game_id', 'team_id'])

In [23]:
# Overwrite random_game_data['g_win'] with the team_id recorded in w_teams for the same game_id.
# Games that have no row in w_teams end up with a missing value.
# NOTE(review): assumes w_teams holds a single row per game_id - confirm.

winner_by_game = w_teams.set_index('game_id')['team_id']
random_game_data['g_win'] = random_game_data['game_id'].map(winner_by_game)

In [28]:
# Convert the game_date column to pandas datetimes, replacing the existing column.
random_game_data.game_date = pd.to_datetime(random_game_data.game_date)

In [25]:
# Write the updated frame back to random_game_data.csv - the same file it was loaded
# from above - without the row index.
random_game_data.to_csv('random_game_data.csv', index=False)

In [11]:
def split_team_id(df):
    """Copy the first two entries of ``df["team_ids"]`` into ``team_1`` and ``team_2``.

    The argument is modified in place and also returned. A later cell applies
    this function to each row of a frame via ``DataFrame.apply(..., axis=1)``.

    Args:
        df: mapping-like record (for example one DataFrame row) whose
            ``"team_ids"`` value supports indexing with 0 and 1.

    Returns:
        The same object, now carrying ``team_1`` and ``team_2``.
    """
    team_ids = df["team_ids"]
    first_team, second_team = team_ids[0], team_ids[1]
    df["team_1"] = first_team
    df["team_2"] = second_team
    return df

In [None]:
# Run split_team_id on every row (axis=1) to add the team_1 / team_2 columns.
# NOTE(review): r_m_d is not assigned anywhere in the visible notebook; it has to
# exist in the kernel already for this cell to run.
r_games = r_m_d.apply(split_team_id, axis=1)

In [None]:
# Save the split frame under data/ without the row index.
# NOTE(review): an earlier cell reads 'r_team_games.csv' from the working directory,
# not from data/ - confirm the two paths are meant to differ.
r_games.to_csv('data/r_team_games.csv', index=False)

In [None]:
# Map every team_id to the list of [game_id, g_win] pairs found on its rows of
# random_match_data (g_win is the per-row win indicator).
# One groupby pass replaces the earlier loop, which re-scanned the whole frame with a
# boolean mask once per team. sort=False keeps teams in order of first appearance,
# the same order unique() produced.
# NOTE(review): rows with a missing team_id are skipped by groupby; the earlier loop
# stored an empty list under the missing key instead.
team_id_dict = {
    team_id: team_rows[['game_id', 'g_win']].values.tolist()
    for team_id, team_rows in random_match_data.groupby('team_id', sort=False)
}

In [None]:
# Attach two synthetic columns to random_game_data: a random game_duration in whole
# minutes and a random game_date.
# np.random.randint excludes its upper bound, so durations fall in 20..59 and dates
# run from 2017-03-19 00:00:00 through 2022-12-31 23:59:59.
# The date window is derived from named dates: the previously hard-coded lower epoch
# (1489987200) corresponds to 2017-03-20 05:20 UTC, not the documented 2017-03-19.
# NOTE(review): no random seed is set, so each run generates different values.
first_game_day = pd.Timestamp('2017-03-19')
window_seconds = int((pd.Timestamp('2023-01-01') - first_game_day).total_seconds())
n_games = random_game_data.shape[0]

random_game_data['game_duration'] = np.random.randint(20, 60, n_games)
random_game_data['game_date'] = first_game_day + pd.to_timedelta(
    np.random.randint(0, window_seconds, n_games), unit='s'
)
random_game_data.head()

In [None]:
# Display random_match_data for inspection.
# NOTE(review): this name is not assigned in the visible notebook.
random_match_data

In [None]:
# Rebuild random_game_data from random_match_data: keep only game_id and g_win and
# retain the first row seen for each game_id.
# NOTE(review): the result has just these two columns, so cells that read
# game_duration or game_date need those columns to be added after this cell runs.
# NOTE(review): g_win here is whatever value the first row of each game carries,
# which does not look like the winning team_id - confirm this is intended.
random_game_data = random_match_data[['game_id', 'g_win']].drop_duplicates(subset=['game_id'])

In [None]:
# Order the games by game_date (oldest first), modifying random_game_data in place.
random_game_data.sort_values(by='game_date', ascending=True, inplace=True)

In [None]:
# Display random_game_data for inspection.
random_game_data

In [None]:
# Total the g_win values per team_id from random_match_data, then divide the totals by 5.
# NOTE(review): the division presumably compensates for every team win appearing on
# five rows (one per player) - confirm.
wins_by_team = random_match_data.groupby('team_id')['g_win'].sum()
team_wins = wins_by_team.reset_index()
team_wins['g_win'] = team_wins['g_win'] / 5

In [None]:
# Load the single-performance table that the GPM/XPM cells below work on.
single_performances = pd.read_csv('data/datdota_singleperformances.csv')

In [None]:
# Write item_abilities back to the CSV path it was loaded from, without the row index.
item_abilities.to_csv('data/item_abilities.csv', index=False)

In [None]:
# Display dota_shop_items for inspection.
dota_shop_items

In [None]:
# Write dota_shop_items back to the CSV path it was loaded from, without the row index.
dota_shop_items.to_csv('data/dota_shop_items.csv', index=False)

In [None]:
# Mean GPM and XPM for each distinct kill count and last-hit count, plus mean XPM per level.
by_kills = single_performances.groupby('Kills')
by_last_hits = single_performances.groupby('Last Hits')
gpm_kills = by_kills['GPM'].mean()
gpm_lh = by_last_hits['GPM'].mean()
xpm_kills = by_kills['XPM'].mean()
xpm_lh = by_last_hits['XPM'].mean()
xpm_level = single_performances.groupby('LVL')['XPM'].mean()

# Two separate figures, one per GPM curve; the XPM series are computed but not drawn here.
fig, ax = plt.subplots(figsize=(10, 6))
fig2, ax2 = plt.subplots(figsize=(10, 6))

ax.plot(gpm_kills.index, gpm_kills.values, label='GPM per kill')
ax.set(xlabel='Kills', ylabel='GPM')
ax.legend()

ax2.plot(gpm_lh.index, gpm_lh.values, label='GPM per last hit')
ax2.set(xlabel='Last Hits', ylabel='GPM')
ax2.legend()

plt.show()

In [None]:
# Treat performances with 30 or more kills as outliers: keep only rows with Kills < 30.
# This rebinds single_performances, so the unfiltered frame is only recoverable by
# reloading the CSV.
single_performances = single_performances[single_performances['Kills'] < 30]

In [None]:
# Per-hero min / max / mean of every remaining column of single_performances.
# Match, Player, Result, End Game Items and KDA are dropped before grouping by Hero.
# NOTE(review): this rebinds hero_stats, replacing the frame read from
# data/hero_stats.csv in the loading cell.
hero_stats = single_performances.drop(['Match', 'Player', 'Result', 'End Game Items', 'KDA'], axis=1).groupby('Hero').agg(['min', 'max', 'mean'])

In [None]:
# Collect the [min, max] pair and the mean of every column of single_performances,
# skipping the identifier / text columns listed below.
# Series.min() / Series.max() replace the builtin min() / max(): the builtins compare
# element by element, so a NaN in a column makes the result depend on row order,
# whereas the mean already ignored NaN. All three statistics now treat missing values
# the same way.
skipped_columns = {'Match', 'Hero', 'Player', 'Result', 'End Game Items'}
value_ranges = {}
value_averages = {}
for col in single_performances.columns:
    if col in skipped_columns:
        continue
    column = single_performances[col]
    value_ranges[col] = [column.min(), column.max()]
    value_averages[col] = column.mean()
value_ranges

In [None]:
# Distinct values of the ID column of dota_players.
player_options = dota_players['ID'].unique()

In [None]:
# Build a dict that maps each team_name in dota_players to the sub-frame of that
# team's rows.
# A single groupby pass replaces the earlier loop, which filtered the whole frame
# once per team. sort=False keeps teams in order of first appearance.
# NOTE(review): rows with a missing team_name are skipped by groupby; the earlier loop
# stored an empty frame for them, which never passed the 5-row filter below anyway.
team_dataframes = {
    team_name: roster
    for team_name, roster in dota_players.groupby('team_name', sort=False)
}

# Keep only the teams that have at least 5 rows (players).
team_dataframes_5 = {name: roster for name, roster in team_dataframes.items() if len(roster) >= 5}

# Names of the teams that passed the 5-row filter.
team_names = list(team_dataframes_5)

In [None]:
# Keep only the teams with at least 5 rows (players).
# NOTE(review): this repeats the same statement found at the end of the previous cell.
team_dataframes_5 = {k: v for k, v in team_dataframes.items() if len(v) >= 5}

In [None]:
# Names of the teams kept in team_dataframes_5.
# NOTE(review): this repeats a statement that already appears in an earlier cell.
team_names = list(team_dataframes_5.keys())

In [None]:
# Spot-check one team's sub-frame; raises KeyError if this team is not in team_dataframes_5.
team_dataframes_5["Alliance.LATAM"]

In [None]:
# Pick two different teams from team_names and build a random match between their players.
def create_random_match_data():
    """Create a random match between two distinct, randomly chosen teams.

    Reads the notebook-level ``team_names`` and ``team_dataframes_5`` and passes
    each chosen team's ``p_id`` values to ``create_random_match``.
    NOTE(review): ``create_random_match`` is not defined in the visible notebook;
    it has to be available in the kernel.

    Returns:
        Whatever ``create_random_match`` returns for the two player-id arrays.

    Raises:
        ValueError: if ``team_names`` holds fewer than two teams.
    """
    # Imported locally because the notebook's top-level `import random` is commented out.
    import random

    # sample() draws two different names in one call. With a single team it raises
    # ValueError, where the previous retry loop would never terminate.
    team1, team2 = random.sample(team_names, 2)

    return create_random_match(
        team_dataframes_5[team1]['p_id'].values,
        team_dataframes_5[team2]['p_id'].values,
    )

In [None]:
# Generate one random match and display the result.
create_random_match_data()

In [None]:
# Region labels used in this notebook; assign_region additionally emits 'Other' for
# countries it does not recognise.
regions = ['North America', 'Europe', 'China', 'Southeast Asia', 'South America', 'CIS']

In [None]:
# Countries belonging to each region label. Every country is listed under exactly one
# region. Russia, Ukraine, Belarus, Kazakhstan, Azerbaijan, Armenia, Georgia and Moldova
# used to appear in both the Europe and CIS lists, and because Europe was tested first
# they could never be labelled CIS; they are now listed under CIS only.
_REGION_COUNTRIES = {
    'North America': ['United States', 'Canada'],
    'Europe': [
        'United Kingdom', 'Germany', 'France', 'Spain', 'Italy', 'Netherlands', 'Poland',
        'Sweden', 'Denmark', 'Norway', 'Finland', 'Belgium', 'Switzerland', 'Austria',
        'Portugal', 'Greece', 'Czech Republic', 'Hungary', 'Romania', 'Bulgaria', 'Serbia',
        'Slovenia', 'Bosnia and Herzegovina', 'Croatia', 'Ireland', 'Luxembourg', 'Slovakia',
        'Estonia', 'Latvia', 'Lithuania', 'Montenegro', 'Albania', 'Cyprus', 'Macedonia',
        'Malta', 'Turkey', 'Iceland', 'Liechtenstein', 'Monaco', 'San Marino', 'Vatican City',
    ],
    'China': ['China', 'Hong Kong', 'Taiwan'],
    'Southeast Asia': [
        'Singapore', 'Malaysia', 'Philippines', 'Indonesia', 'Thailand', 'Vietnam',
        'Cambodia', 'Laos', 'Myanmar', 'Brunei',
    ],
    'South America': [
        'Brazil', 'Argentina', 'Chile', 'Colombia', 'Ecuador', 'Peru', 'Venezuela', 'Uruguay',
        'Bolivia', 'Paraguay', 'Guyana', 'Suriname', 'French Guiana', 'Easter Island',
        'Falkland Islands', 'South Georgia and the South Sandwich Islands',
        'South Sandwich Islands', 'Antarctica',
    ],
    'CIS': [
        'Russia', 'Ukraine', 'Belarus', 'Kazakhstan', 'Azerbaijan', 'Armenia', 'Georgia',
        'Kyrgyzstan', 'Moldova', 'Tajikistan', 'Turkmenistan', 'Uzbekistan', 'Kosovo',
        'Abkhazia', 'South Ossetia',
    ],
}

# Reverse lookup: country name -> region label.
_COUNTRY_TO_REGION = {
    country: region
    for region, countries in _REGION_COUNTRIES.items()
    for country in countries
}


def assign_region(df):
    """Add a ``Region`` column derived from each row's ``country`` value.

    Args:
        df: DataFrame with a ``country`` column of country names.

    Returns:
        The same DataFrame (modified in place) with a ``Region`` column holding
        one of 'North America', 'Europe', 'China', 'Southeast Asia',
        'South America', 'CIS', or 'Other' for any country not in the table
        (including missing values).
    """
    df['Region'] = df['country'].apply(lambda country: _COUNTRY_TO_REGION.get(country, 'Other'))
    return df

In [None]:
# Label every player with a region. assign_region modifies its argument in place and
# returns it, so dota_players and dota_active_players refer to the same frame afterwards.
# NOTE(review): dota_active_players is not assigned in the visible notebook.
dota_players = assign_region(dota_active_players)

In [None]:
# Position of the first element of `string` equal to `char`.
def find_first(string, char):
    """Return the index of the first element of ``string`` equal to ``char``, or -1.

    Elements are compared one at a time, so a multi-character ``char`` never
    matches when ``string`` is a str.
    """
    return next(
        (position for position, current in enumerate(string) if current == char),
        -1,
    )

# Position of the last element of `string` equal to `char`.
def find_last(string, char):
    """Return the index of the last element of ``string`` equal to ``char``, or -1.

    The sequence is scanned from its end towards its start, one element at a time.
    """
    for position in range(len(string) - 1, -1, -1):
        if string[position] == char:
            return position
    return -1

In [None]:
# Split the combined team text, of the form "<team name> (<role>)", into separate
# team_name and role columns, then drop the original column.
def _split_team_entry(entry):
    """Return ``(team_name, role)`` for one raw team string."""
    open_idx = find_first(entry, '(')
    if open_idx == -1:
        # No parenthesised role: the whole text is the team name. The -1 sentinel was
        # previously used directly as a slice bound, which cut off the last character.
        return entry.strip(), ''
    close_idx = find_last(entry, ')')
    if close_idx == -1:
        # Unclosed parenthesis: take everything after '(' instead of dropping a character.
        close_idx = len(entry)
    return entry[:open_idx].strip(), entry[open_idx + 1:close_idx]


dota_players['team_name'] = dota_players['team'].apply(lambda x: _split_team_entry(x)[0])
dota_players['role'] = dota_players['team'].apply(lambda x: _split_team_entry(x)[1])
# NOTE(review): the in-place drop makes this cell fail with KeyError if it is run twice.
dota_players.drop('team', axis=1, inplace=True)

In [None]:
# Role labels accepted by the role filter in the next cell.
roles = ['Support', 'Offlaner', 'Solo Middle', 'Carry', 'Coach']

In [None]:
# Keep only the players whose role is one of the labels in roles; rows with any other
# role text (or a missing role) are dropped.
dota_players = dota_players[dota_players['role'].isin(roles)]

### The cells below scrape Dota 2 data from Liquipedia

In [None]:
# Download the Liquipedia player index and collect every table row (<tr>) on the page.
url = "https://liquipedia.net/dota2/Players_(all)"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")
# find_all is the current name of the legacy findAll alias.
rows = soup.find_all("tr")

In [None]:
# Inspect the second <tr>, which the next cell uses as the header row.
rows[1]

In [None]:
# Column names are the texts of the <th> cells in the second table row, with trailing
# whitespace removed (leading whitespace is kept).
indexes = rows[1]
index_values = [header_cell.get_text().rstrip() for header_cell in indexes.find_all("th")]
# Accumulator that the row-parsing cell appends player dicts to.
players = []

In [None]:
# Display the extracted column names.
index_values

In [None]:
# Turn every data row into a {column name: cell text} dict and append it to players.
# NOTE(review): players is created in an earlier cell, so re-running only this cell
# appends the same rows again.
for row in rows:
    # Rows whose length is 3 or less are skipped.
    if len(row) <= 3:
        continue
    cells = row.find_all("td")
    player = {}
    # zip() pairs each cell with its header and stops at the shorter list, so a row
    # with more cells than headers no longer raises IndexError.
    for key, cell in zip(index_values, cells):
        if key == " Links":
            # The " Links" column is stored as "country", taken from the title of the
            # first link in the row's first cell.
            key = "country"
            link = cells[0].find("a")
            # A first cell without a link yields None instead of raising AttributeError.
            value = link.get("title") if link is not None else None
        else:
            value = cell.get_text().rstrip()
        player[key] = value
    # Rows without any <td> cell (for example header rows) produce an empty dict.
    if player:
        players.append(player)

In [None]:
# One row per scraped player dict; keys missing from a dict become missing values.
player_df = pd.DataFrame(players)

In [None]:
# Rename the scraped headers to short column names, in place. The source keys carry a
# leading space because only trailing whitespace was removed when the headers were read.
player_df.rename(
    columns={" Real Name": "name", " Team": "team", " ID": "ID"}, inplace=True
)

In [None]:
# Role / occupation labels; a row whose team value is exactly one of these is removed.
filter_roles = {
    "Carry",
    "Mid",
    "Offlane",
    "Support",
    "Hard Support",
    "Soft Support",
    "Hard",
    "Soft",
    "Hard Carry",
    "Soft Carry",
    "Solo Middle",
    "Analyst",
    "Caster",
    "Host",
    "Offlaner",
}
# Drop the rows whose team value is one of the labels above. Values are stripped for
# the comparison because the whitespace clean-up of player_df only happens in a later
# cell; with leading spaces still present the exact-match test would match nothing.
player_df = player_df[~player_df["team"].str.strip().isin(filter_roles)]

In [None]:
# Save the scraped players to the working directory without the row index.
# NOTE(review): this runs before the whitespace-stripping cell below, so the saved
# file contains the unstripped values.
player_df.to_csv("liquipedia_dota_players.csv", index=False)

In [None]:
# Strip leading and trailing whitespace from every object-dtype (text) column; columns
# of any other dtype are passed through unchanged.
player_df = player_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [None]:
# Download the Liquipedia team portal and collect its "panel-box" divs.
# NOTE(review): these statements duplicate the opening lines of get_teams() in the
# next cell, and teams / divs are not read again in the visible notebook.
url = "https://liquipedia.net/dota2/Portal:Teams"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
teams = []
divs = soup.find_all('div',class_="panel-box")

In [None]:


def get_teams():
    """Return the team names listed on Liquipedia's Dota 2 team portal.

    Downloads the portal page and reads the ``title`` attribute of the link
    inside every ``span.team-template-team-standard`` element.

    Returns:
        list: one title string per team span that contains a titled link, in
        page order.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within 30 seconds.
    """
    url = "https://liquipedia.net/dota2/Portal:Teams"
    # Timeout and status check: fail fast instead of hanging or parsing an error page.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    templates = soup.find_all('span', class_="team-template-team-standard")
    # Spans without a titled link are skipped rather than raising on a missing <a>.
    return [
        team.a['title']
        for team in templates
        if team.a is not None and team.a.has_attr('title')
    ]

In [None]:
# Scrape the team names and wrap them in a single-column DataFrame (the column is named 0).
teamdf = pd.DataFrame(get_teams())

In [None]:
# Save the team names to the working directory.
# NOTE(review): unlike the other to_csv calls in this notebook, index=False is not
# passed here, so the row index is written as an extra column - confirm this is intended.
teamdf.to_csv("activeteams.csv")

In [None]:
# Print the smallest and largest GPM value in dota_performances.
# NOTE(review): dota_performances is not assigned in the visible notebook; the GPM
# cells above use single_performances - confirm which frame is meant.
print(
    "Min: {}, Max: {}".format(
        dota_performances["GPM"].min(), dota_performances["GPM"].max()
    )
)