In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
from urllib.parse import urlparse, unquote
import re

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install pyspark==3.3.2
!pip install beautifulsoup4 requests

import os
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col, regexp_replace, format_number
from pyspark.sql.types import FloatType

# Tell PySpark which Python interpreter and which JDK to use.
os.environ.update(
    {
        "PYSPARK_PYTHON": "python3",
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    }
)

# Start a Spark session for this notebook, or reuse the one already running.
spark = SparkSession.builder.appName("HeroData").getOrCreate()

# Eager evaluation settings for DataFrames shown in the notebook
# (row cap raised to 10000).
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.conf.set("spark.sql.repl.eagerEval.maxNumRows", 10000)



In [2]:
# URL for the Dota 2 heroes list on Gamepedia
url = "https://dota2.gamepedia.com/Heroes"

# Fetch the page. The timeout stops the cell from hanging forever on a stalled
# connection, and raise_for_status() turns an HTTP error response into an
# exception instead of letting an error page be parsed as if it were the list.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

# Hero names are located purely by this exact inline style string, so any
# change to the page styling makes the lookup return nothing.
hero_spans = soup.find_all("span", style="font-size:17px; color:white; text-shadow:-1px 0 0.2em black, 0 1px 0.2em black, 1px 0 0.2em black, 0 -1px 0.2em black;")

# Extract hero names and store them in a list
hero_names = [span.text.strip() for span in hero_spans]

# Fail here, next to the cause, rather than later with an unrelated-looking
# error when no input hero can be matched against an empty list.
if not hero_names:
    raise RuntimeError(
        f"No hero names were found on {url}; the page layout or styling may have changed."
    )

# Despite the name, these are the per-hero dota2protracker page URLs;
# quote() percent-encodes spaces and apostrophes in the hero names.
all_hero_names = [
    "https://dota2protracker.com/hero/" + quote(name) + "/new" for name in hero_names
]


In [3]:
# Analysis mode key -> the position string that a scraped row's "data-pos"
# attribute is compared against in best_heroes_to_draft.
analysis_modes = dict(
    pos1="pos 1,carry,core",
    pos2="pos 2,mid,core",
    pos3="pos 3,offlane,core",
    pos4="pos 4, support",
    pos5="pos 5, support",
)

# Element ids looked up on a hero page, ordered so that position number N
# is stored at index N - 1.
roles = ["th-pos-1", "th-pos-2", "th-pos-3", "th-pos-4", "th-pos-5"]

def get_role_by_input_parameter(input_parameter):
    """Translate a 1-based position number into its entry of ``roles``.

    Args:
        input_parameter: Position number; valid values run from 1 to
            ``len(roles)`` inclusive.

    Returns:
        ``roles[input_parameter - 1]`` for a valid number, otherwise the
        message string "Invalid input_parameter. Please choose a valid role."
    """
    # Check the range explicitly: plain list indexing accepts 0 and negative
    # numbers and would silently wrap around to the end of ``roles``
    # (0 -> "th-pos-5", -1 -> "th-pos-4") instead of reporting bad input.
    if 1 <= input_parameter <= len(roles):
        return roles[input_parameter - 1]
    return "Invalid input_parameter. Please choose a valid role."

# def get_analysis_mode(position):
#   if position is not None:
#       print(position)
#   else:
#       print(f"Invalid analysis_mode: {analysis_mode}")

In [4]:
# Seconds to wait for a hero page before giving up on the request.
_REQUEST_TIMEOUT_SECONDS = 30

# CSS class string that identifies one matchup row inside a role section.
_MATCHUP_ROW_CLASS = "flex py-1 px-2 bg-d2pt-gray-3 justify-start border-solid border-b border-d2pt-gray-5"


def _find_hero_url(hero_name, hero_urls):
    """Return the one URL in ``hero_urls`` that corresponds to ``hero_name``."""
    # Encode the name with quote(), the same way the URLs were built, so that
    # apostrophes (%27) line up as well as spaces (%20).
    needle = quote(hero_name).lower()
    candidates = [link for link in hero_urls if needle in link.lower()]

    # Prefer an exact hit on the hero segment of the URL, so a short name is
    # not rejected merely because it is also a substring of longer names.
    exact = [link for link in candidates if link.lower().split("/")[-2] == needle]
    if len(exact) == 1:
        return exact[0]
    if len(candidates) == 1:
        return candidates[0]

    if not candidates:
        raise ValueError(f"No hero page matches {hero_name!r}.")
    raise ValueError(f"Hero name {hero_name!r} is ambiguous; it matches: {candidates}")


def _column_alias(hero_url):
    """Turn a hero page URL into an upper-case SQL column alias."""
    name = unquote(urlparse(hero_url).path.split("/")[-2])
    for unwanted in ("#", "'"):
        name = name.replace(unwanted, "")
    return name.replace(" ", "_").replace("-", "_").upper()


def _scrape_matchups(hero_url, role_id, position, min_matches):
    """Fetch one hero page and return its matchup rows that pass the filters."""
    response = requests.get(hero_url, timeout=_REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the element with the role id and class "top-heroes-box".
    role_section = soup.find(id=role_id, class_="top-heroes-box")
    if role_section is None:
        raise ValueError(
            f"No {role_id!r} section found on {hero_url}; "
            "check the position number given for this hero."
        )

    matchups = []
    for row in role_section.find_all(class_=_MATCHUP_ROW_CLASS):
        data_matches = row.get('data-matches')
        if data_matches is None:
            # A row without a match count cannot be filtered; skip it.
            continue
        # Keep rows with enough matches for the position being analysed.
        if int(data_matches) > min_matches and row.get('data-pos') == position:
            matchups.append({'data_hero': row.get('data-hero'), 'data_wr': row.get('data-wr')})
    return matchups


def _build_draft_query(column_aliases):
    """Build the SQL that joins the hero_<i> views into the final table."""
    hero_count = len(column_aliases)
    indices = range(hero_count)

    # Every hero that appears in at least one input hero's view.
    # Add WHERE data_wr < .5 ORDER BY data_wr ASC if you want the old version
    union_queries = "\n    UNION\n    ".join(
        f"(SELECT data_hero FROM hero_{i})" for i in indices
    )
    select_queries = ",\n    ".join(
        f"b{i}.data_wr AS {alias}" for i, alias in enumerate(column_aliases)
    )
    aggregate_queries = " + ".join(f"b{i}.data_wr" for i in indices)
    left_join_queries = "\n  ".join(
        f"LEFT JOIN hero_{i} b{i} ON a.data_hero = b{i}.data_hero" for i in indices
    )
    where_conditions = "\n    AND ".join(f"b{i}.data_wr IS NOT NULL" for i in indices)

    # ORDER BY sits on the outermost query: ordering applied inside a
    # subquery is not guaranteed to carry through to the final result.
    return f'''
    SELECT *
    FROM (
      SELECT
        UPPER(a.data_hero)                         AS POTENTIAL_HERO,
        ROUND(({aggregate_queries}) / {hero_count}, 3) AS LOSS_PROBABILITY_SCORE,
        {select_queries}
      FROM (
        {union_queries}
      ) a
      {left_join_queries}
      WHERE {where_conditions}
    )
    WHERE 1=1
      AND LOSS_PROBABILITY_SCORE < 0.5
    ORDER BY LOSS_PROBABILITY_SCORE ASC
    '''


def best_heroes_to_draft(input_heroes, analysis_mode, min_matches=15):
    """Scrape matchup data for the given heroes and show the lowest-scoring picks.

    For each input hero, its page is fetched, the rows of the requested role
    section are filtered by match count and by the position selected through
    ``analysis_mode``, and the result is registered as the Spark temp view
    ``hero_<i>``. The views are then joined on ``data_hero``; heroes present
    in every view are listed with the mean of their ``data_wr`` values
    (LOSS_PROBABILITY_SCORE), keeping only scores below 0.5, sorted ascending.
    NOTE(review): ``data_wr`` is presumably a win rate; its exact meaning is
    defined by the scraped site - confirm.

    Args:
        input_heroes: Non-empty sequence of ``[hero name, position number]``
            pairs. The name is matched case-insensitively against the URLs in
            ``all_hero_names`` (exact hero name preferred, otherwise a unique
            substring); the number is passed to ``get_role_by_input_parameter``.
        analysis_mode: Key of ``analysis_modes`` selecting the position string
            that rows must carry in their ``data-pos`` attribute.
        min_matches: Rows are kept only if ``data-matches`` is strictly
            greater than this value. Defaults to 15.

    Returns:
        None. The result table is printed via ``DataFrame.show()``.

    Raises:
        ValueError: If ``analysis_mode`` is unknown, ``input_heroes`` is
            empty, a hero name matches no page or several pages, a role
            section is missing from a page, or no row survives the filters
            for one of the heroes.
    """
    if analysis_mode not in analysis_modes:
        raise ValueError(
            f"Invalid analysis_mode: {analysis_mode!r}. "
            f"Choose one of {sorted(analysis_modes)}."
        )
    position = analysis_modes[analysis_mode]

    if not input_heroes:
        raise ValueError("input_heroes must contain at least one [hero name, position] pair.")

    print("Correlating input heroes with their pages...")
    # Resolve every hero individually so each URL stays aligned with its own
    # position number; a total-count check alone cannot guarantee that.
    hero_list = [_find_hero_url(hero[0], all_hero_names) for hero in input_heroes]
    display_cleaned_hero_list = [_column_alias(hero_url) for hero_url in hero_list]

    # Web-scrape the data and register one temp view per input hero.
    print("Web scraping data into pyspark dataframe...")
    for index, hero_url in enumerate(hero_list):
        role_id = get_role_by_input_parameter(input_heroes[index][1])
        heroes_data = _scrape_matchups(hero_url, role_id, position, min_matches)
        if not heroes_data:
            # Spark cannot infer a schema from an empty list; report the real cause.
            raise ValueError(
                f"No rows with more than {min_matches} matches at position "
                f"{position!r} were found for {display_cleaned_hero_list[index]}."
            )

        df = spark.createDataFrame(heroes_data)

        # "43.3%" -> 0.433, then formatted to three decimals.
        df = df.withColumn("data_wr", (regexp_replace(col("data_wr"), "%", "").cast(FloatType()) / 100))
        df = df.withColumn("data_wr", format_number(col("data_wr"), 3))

        df.createOrReplaceTempView(f"hero_{index}")

    print("Generating SQL query...")
    sql_query = _build_draft_query(display_cleaned_hero_list)

    # Execute the SQL query
    spark.sql(sql_query).show()

In [6]:
# Heroes to analyse, each written as [hero name, position number 1-5].
input_heroes = [
    ["PUCK", 2],
    ["GRIMSTROKE", 4],
    ["SPECTRE", 1],
    ["CENTAUR", 3],
    # ["PHOENIX", 5]
]

# Available analysis mode options: "pos1", "pos2", "pos3", "pos4", "pos5"
best_heroes_to_draft(input_heroes, analysis_mode="pos3")

Correlating input heroes with their pages...
Web scraping data into pyspark dataframe...
Generating SQL query...
+--------------+----------------------+-----+----------+-------+-----------------+
|POTENTIAL_HERO|LOSS_PROBABILITY_SCORE| PUCK|GRIMSTROKE|SPECTRE|CENTAUR_WARRUNNER|
+--------------+----------------------+-----+----------+-------+-----------------+
|        VISAGE|                 0.409|0.500|     0.343|  0.392|            0.400|
|         VIPER|                 0.433|0.480|     0.424|  0.386|            0.441|
|     PANGOLIER|                 0.448|0.368|     0.520|  0.405|            0.500|
| NIGHT STALKER|                  0.45|0.373|     0.451|  0.512|            0.463|
|        KUNKKA|                 0.456|0.429|     0.441|  0.493|            0.460|
|         LYCAN|                 0.462|0.375|     0.419|  0.400|            0.652|
|   EARTHSHAKER|                 0.471|0.433|     0.472|  0.569|            0.409|
|         MARCI|                 0.474|0.432|     0.509| 