In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
from urllib.parse import urlparse, unquote
import re

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install pyspark==3.3.2
!pip install beautifulsoup4 requests

import os
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col, regexp_replace, format_number
from pyspark.sql.types import FloatType

# Tell PySpark which Python interpreter and which JDK to use before the
# session (and its JVM) is started.
os.environ.update({
    "PYSPARK_PYTHON": "python3",
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
})

# Create (or reuse) the notebook-wide Spark session
spark = SparkSession.builder.appName("HeroData").getOrCreate()

# Let a bare DataFrame at the end of a cell render itself, up to 10000 rows.
for option_name, option_value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.repl.eagerEval.maxNumRows", 10000),
):
    spark.conf.set(option_name, option_value)

Collecting pyspark==3.3.2
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5 (from pyspark==3.3.2)
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824005 sha256=b2a4d07d043ac9fb5dfbd38eb5006c0b13d37a942ce9e66426c75f9613a0ef2e
  Stored in directory: /root/.cache/pip/wheels/89/d6/52/1178e354ba2207673484f0ccd7b2ded0ab6671ae5c1fc5b49a
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py

In [46]:
# URL for the Dota 2 heroes list on Gamepedia
url = "https://dota2.gamepedia.com/Heroes"

# Fetch the HTML content from the URL. The timeout keeps a stalled connection
# from hanging the cell, and raise_for_status() stops us from parsing an
# error page as if it were the hero list.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

# Find all hero names within <span> elements with the specified style.
# The match is on the exact inline style string, so any change to the page's
# styling makes this return an empty list.
hero_spans = soup.find_all("span", style="font-size:17px; color:white; text-shadow:-1px 0 0.2em black, 0 1px 0.2em black, 1px 0 0.2em black, 0 -1px 0.2em black;")

# Extract hero names and store them in a list
hero_names = [span.text.strip() for span in hero_spans]

# Fail here, next to the cause, rather than later with a confusing
# "no match" error when the hero URLs are looked up.
assert hero_names, f"No hero names found on {url}; the page layout may have changed."

# One dota2protracker page URL per hero; quote() percent-encodes the name
# (e.g. spaces become %20).
all_hero_names = [
    "https://dota2protracker.com/hero/" + quote(name) + "/new" for name in hero_names
]


In [47]:
# Maps each analysis-mode key to the position label string that scraped rows
# are compared against (their `data-pos` attribute).
analysis_modes = dict(
    pos1="pos 1,carry,core",
    pos2="pos 2,mid,core",
    pos3="pos 3,offlane,core",
    pos4="pos 4,support",
    pos5="pos 5,support",
)

# Element ids of the per-position boxes on a hero page, ordered by position:
# index 0 belongs to position 1, index 4 to position 5.
roles = [
    "th-pos-1",
    "th-pos-2",
    "th-pos-3",
    "th-pos-4",
    "th-pos-5"
]

def get_role_by_input_parameter(input_parameter):
    """Map a 1-based position number to its entry in ``roles``.

    Args:
        input_parameter: Position number, from 1 to ``len(roles)``.

    Returns:
        The matching id string from ``roles``, or the invalid-role message
        string when the number is outside that range.
    """
    # Explicit range check: plain roles[input_parameter - 1] lets 0 and
    # negative numbers wrap around to the end of the list (0 -> "th-pos-5")
    # instead of being reported as invalid.
    if 1 <= input_parameter <= len(roles):
        return roles[input_parameter - 1]
    return "Invalid input_parameter. Please choose a valid role."


In [72]:
def _resolve_hero_link(name_fragment):
    """Return the one URL in ``all_hero_names`` whose hero slug matches ``name_fragment``.

    The fragment is lower-cased and its spaces are turned into ``%20`` so it
    can be compared with the percent-encoded slug. Only the slug (the path
    segment before ``/new``) is searched, so text from the site's domain or
    path cannot produce a match. When several heroes contain the fragment, a
    hero whose slug equals it exactly wins.

    Raises:
        AssertionError: if no hero, or more than one hero, matches.
    """
    fragment = name_fragment.lower().replace(" ", "%20")

    # link -> lower-cased slug; keyed by link so a repeated URL counts once.
    candidates = {}
    for link in all_hero_names:
        slug = urlparse(link).path.split('/')[-2].lower()
        if fragment in slug:
            candidates.setdefault(link, slug)

    # Without this tie-break, a short name such as "io" could never be
    # selected because it is also a substring of longer hero names.
    exact = [link for link, slug in candidates.items() if slug == fragment]
    if len(exact) == 1:
        return exact[0]

    assert len(candidates) == 1, (
        f"Expected exactly one hero matching {name_fragment!r}, "
        f"found {len(candidates)}: {list(candidates)}"
    )
    return next(iter(candidates))


def _hero_column_name(link):
    """Turn a hero URL into the upper-case identifier used as its SQL column alias."""
    slug = urlparse(link).path.split('/')[-2]
    return (
        unquote(slug)
        .replace('%20', '_')
        .replace('%27', '')
        .replace('#', '')
        .replace(' ', '_')
        .replace('-', '_')
        .replace("'", '')
        .upper()
    )


def _scrape_hero_rows(url, role_id, position, minimum_matches, hero_label):
    """Download one hero page and return its rows for ``position`` as a list of dicts.

    Each dict has the keys ``data_hero`` and ``data_wr``, copied from the
    row's ``data-hero`` and ``data-wr`` attributes. A row is kept only when
    its ``data-matches`` is strictly greater than ``minimum_matches`` and its
    ``data-pos`` equals ``position``.

    Raises:
        requests.HTTPError: if the page responds with an error status.
        AssertionError: if the box with id ``role_id`` is not on the page.
    """
    # Timeout so one stalled request cannot hang the whole run; status check
    # so an error page is not parsed as if it were hero data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    # The div with the requested role id and class "top-heroes-box".
    role_box = soup.find(id=role_id, class_="top-heroes-box")
    assert role_box is not None, (
        f"Could not find the '{role_id}' box on {url}. "
        f"Check that the role given for {hero_label} is a number from 1 to 5."
    )

    rows = role_box.find_all(class_="flex py-1 px-2 bg-d2pt-gray-3 justify-start border-solid border-b border-d2pt-gray-5")

    return [
        {'data_hero': row.get('data-hero'), 'data_wr': row.get('data-wr')}
        for row in rows
        if int(row.get('data-matches')) > minimum_matches and row.get('data-pos') == position
    ]


def _build_draft_query(column_names):
    """Build the SQL that joins the ``hero_<i>`` temp views into one ranking table.

    ``column_names[i]`` becomes the alias of the ``data_wr`` column taken from
    view ``hero_<i>``. Only heroes present in every view are returned, sorted
    by the average of those columns in ascending order.
    """
    view_ids = range(len(column_names))

    union_queries = "\n    UNION\n    ".join(
        f"(SELECT data_hero FROM hero_{i})"  # Add WHERE data_wr < .5 ORDER BY data_wr ASC if you want the old version
        for i in view_ids
    )
    select_queries = ",\n    ".join(
        f"b{i}.data_wr AS {column}" for i, column in enumerate(column_names)
    )
    aggregate_queries = " + ".join(f"b{i}.data_wr" for i in view_ids)
    left_join_queries = "\n  ".join(
        f"LEFT JOIN hero_{i} b{i} ON a.data_hero = b{i}.data_hero" for i in view_ids
    )
    where_conditions = "\n    AND ".join(f"b{i}.data_wr IS NOT NULL" for i in view_ids)

    # ORDER BY sits on the outermost SELECT: ordering applied inside a
    # derived table is not guaranteed to survive into the final result.
    return f'''
    SELECT *
    FROM (
      SELECT
        UPPER(a.data_hero)                                    AS POTENTIAL_HERO,
        ROUND(({aggregate_queries}) / {len(column_names)}, 3) AS LOSS_PROBABILITY_SCORE,
        {select_queries}
      FROM (
        {union_queries}
      ) a
      {left_join_queries}
      WHERE {where_conditions}
    )
    WHERE 1=1
      --AND LOSS_PROBABILITY_SCORE < 0.5
    ORDER BY LOSS_PROBABILITY_SCORE ASC
  '''


def best_heroes_to_draft(input_heroes, analysis_mode, minimum_matches):
    """Scrape each input hero's page and print a combined ranking table.

    For every ``[name_fragment, role]`` pair in ``input_heroes`` the hero's
    dota2protracker page is fetched, the rows in that role's box are filtered
    by ``analysis_mode`` and ``minimum_matches``, and the result is registered
    as the Spark temp view ``hero_<i>``. The views are then joined and the
    table is printed with ``DataFrame.show()``.

    Args:
        input_heroes: Sequence of ``[name_fragment, role]`` items, where
            ``name_fragment`` identifies exactly one hero and ``role`` is the
            1-based position number passed to ``get_role_by_input_parameter``.
        analysis_mode: A key of ``analysis_modes`` ("pos1" .. "pos5").
        minimum_matches: Rows are kept only when their ``data-matches`` value
            is strictly greater than this number.

    Returns:
        None. The table is printed and the ``hero_<i>`` views stay registered.

    Raises:
        AssertionError: for an empty ``input_heroes``, an unknown
            ``analysis_mode``, a name that does not identify exactly one
            hero, a missing role box, or a hero with no rows left after
            filtering.
        requests.HTTPError: if a hero page responds with an error status.
    """
    # Validate up front so a typo does not cost a round of web requests and
    # then surface as a misleading "No data found" error.
    position = analysis_modes.get(analysis_mode)
    assert position is not None, (
        f"Unknown analysis_mode {analysis_mode!r}. "
        f"Available options: {', '.join(analysis_modes)}"
    )
    assert len(input_heroes) > 0, "input_heroes is empty; provide at least one [name, role] pair."

    print("Correlating input heroes with their pages...")
    # Resolve every input to exactly one page so hero_list[i] always belongs
    # to input_heroes[i]; comparing only the total number of matches would
    # let one hero matching twice hide another matching nothing.
    hero_list = [_resolve_hero_link(input_hero[0]) for input_hero in input_heroes]

    display_cleaned_hero_list = [_hero_column_name(link) for link in hero_list]

    # Web-scrape the data and register one temp view per hero
    print("Web scraping data into pyspark dataframe...")

    for index, url in enumerate(hero_list):
        role_id = get_role_by_input_parameter(input_heroes[index][1])
        heroes_data = _scrape_hero_rows(
            url, role_id, position, minimum_matches, input_heroes[index][0]
        )

        assert len(heroes_data) > 0, f"No data found on {input_heroes[index][0]}'s choosen role. Try setting it's popular role instead."
        print(url, "Done.")

        df = spark.createDataFrame(heroes_data)

        # "53.2%" -> 0.532, then format_number renders it with 3 decimals.
        # format_number returns a string column; the arithmetic in the query
        # relies on Spark casting it back to a number.
        df = df.withColumn("data_wr", (regexp_replace(col("data_wr"), "%", "").cast(FloatType()) / 100))
        df = df.withColumn("data_wr", format_number(col("data_wr"), 3))

        df.createOrReplaceTempView(f"hero_{index}")

    print("Generating SQL query...")
    sql_query = _build_draft_query(display_cleaned_hero_list)

    # Execute the SQL query
    spark.sql(sql_query).show()

In [73]:
# Heroes to analyse, one [name fragment, position number] pair per hero.
# The fragment is matched against the scraped hero URLs; the number (1-5)
# selects which position box of that hero's page is read.
input_heroes = [
    ["necro", 1],
    ["templar", 2],
    ["wraith", 3],
    ["phoenix", 4],
    ["grim", 5],
]

# Available analysis mode options: "pos1", "pos2", "pos3", "pos4", "pos5"
best_heroes_to_draft(
    input_heroes,
    analysis_mode='pos3',
    minimum_matches=1,
)

Correlating input heroes with their pages...
Web scraping data into pyspark dataframe...
https://dota2protracker.com/hero/Necrophos/new Done.
https://dota2protracker.com/hero/Templar%20Assassin/new Done.
https://dota2protracker.com/hero/Wraith%20King/new Done.
https://dota2protracker.com/hero/Phoenix/new Done.
https://dota2protracker.com/hero/Grimstroke/new Done.
Generating SQL query...
+-----------------+----------------------+---------+----------------+-----------+-------+----------+
|   POTENTIAL_HERO|LOSS_PROBABILITY_SCORE|NECROPHOS|TEMPLAR_ASSASSIN|WRAITH_KING|PHOENIX|GRIMSTROKE|
+-----------------+----------------------+---------+----------------+-----------+-------+----------+
|     CHAOS KNIGHT|                 0.361|    0.083|           0.367|      0.469|  0.470|     0.414|
|       LONE DRUID|                 0.446|    0.286|           0.578|      0.460|  0.473|     0.432|
|CENTAUR WARRUNNER|                 0.451|    0.357|           0.489|      0.455|  0.455|     0.501|
|   

In [6]:
# TODO: only include heroes with more than 15 matches; if that is not possible, set the threshold to the minimum and specify which heroes have this condition
# TODO: improve performance so a run takes about 10s or less (web scraping currently takes longer)

In [50]:
# Echo the module-level `url` (the heroes-list page). The `url` loop variable
# inside best_heroes_to_draft is local to that function and does not overwrite it.
url

'https://dota2.gamepedia.com/Heroes'

In [7]:
# Look up Treant in the first hero's temp view. Spark's LIKE is case-sensitive,
# so the stored name is upper-cased before comparing it with the upper-case
# pattern; otherwise a mixed-case value would never match.
spark.sql('''
select * from hero_0 WHERE UPPER(data_hero) LIKE "%TREANT%"
''')

data_hero,data_wr


In [8]:
# Row count of the temp views hero_0 .. hero_3, one output row per view.
# In the visible code these views are only created by best_heroes_to_draft,
# one per input hero, so that function must have run in this Spark session.
# NOTE(review): the recorded AnalysisException may come from a view that did
# not exist when this cell ran - confirm.
spark.sql('''
select count(*) from hero_0
union all
select count(*) from hero_1
union all
select count(*) from hero_2
union all
select count(*) from hero_3
''')

AnalysisException: ignored