# Scrape

In [2]:
from bs4 import BeautifulSoup
import requests
import polars as pl
import re

league_name = 'nemelee'
nemelee_tournaments_url = f'https://braacket.com/league/{league_name}/tournament?rows=200'

In [2]:
tournament_url = 'https://braacket.com/tournament/765DFCA4-947A-4218-A1F8-3EBF3E9CDCBE/match'

# Define scraper components

In [3]:
def scrape_tournament_tags(url):
    response = requests.get(url)
    tournament_soup = BeautifulSoup(response.content, 'html.parser')
    num_pages = (
        tournament_soup
        .find(class_='search-pagination')
        .find_all(class_='input-group-addon')[-1]
        .text.split(' ')[-1].strip()
    )

    tournament_tags = []
    for i in range(1, int(num_pages) + 1):
        url = f'{nemelee_tournaments_url}&page={i}'
        response = requests.get(url)
        tournament_soup = BeautifulSoup(response.content, 'html.parser')
        tournament_tags += list(
            map(
                lambda x: x.find_parent(class_='panel'),
                tournament_soup.find_all(string='Detail')
            )
        )
    return tournament_tags


In [4]:
tournament_tags = scrape_tournament_tags(nemelee_tournaments_url)

In [57]:
tournament_tags[0].find(class_='country_flag').find('img').get('src', None)

'/assets/images/country/flag/us.png'

In [5]:
def extract_tournament_data(tournament_tags):
    tournaments = []
    for tournament_tag in tournament_tags:
        data = {
            'url': tournament_tag.find(class_='panel-heading').find('a')['href'],
            'name': tournament_tag.find(class_='panel-heading').find('a').text.strip(),
            'date': tournament_tag.find(string='Date').parent
                     .find_next_sibling().text.strip() if tournament_tag.find(string='Date').parent.find_next_sibling() else None,
            'country': (tournament_tag.find(class_='country_flag').find('img').get('src', None)
                        if tournament_tag.find(class_='country_flag') else None),
            'region': (tournament_tag.find(class_='country_region_flag').get('src', None)
                       if tournament_tag.find(class_='country_region_flag') else None),
            'number_of_players': (int(tournament_tag.find(data_original_title_='Imported players').text.strip())
                                  if tournament_tag.find(data_original_title_='Imported players') else None)
        }
        tournaments.append(data)
    return tournaments


In [None]:
tournaments = extract_tournament_data(tournament_tags)
# Convert the list of dictionaries to a Polars DataFrame
tournaments_df = pl.DataFrame(tournaments, schema={col: pl.String for col in tournaments[0].keys()}).filter(pl.col('url').is_not_null())
# Save the DataFrame to a CSV file
pl.config.set_tbl_rows(2)
tournaments_df

In [None]:
def scrape_stage_urls(match_url):
    response = requests.get(match_url)
    match_soup = BeautifulSoup(response.content, 'html.parser')
    stage_urls = list(set(
        map(
            lambda x: x['href'],
            match_soup.find_all(attrs={"href": re.compile(r"^/tournament/.*/stage/.*$")}),
        )
    ))
    return stage_urls


In [None]:
match_url = f'https://braacket.com/{tournaments[0]['url']}/match'
print(match_url)
stage_urls = scrape_stage_urls(match_url)

In [None]:
tournament_url = 'https://braacket.com/tournament/765DFCA4-947A-4218-A1F8-3EBF3E9CDCBE/match'
scrape_stage_urls(tournament_url)

In [7]:
player_url = '/tournament/633C3E6E-A97A-45F9-B1B4-1ADA348C350C/player/F6E88BFB-8602-4A5A-9A02-7FA6D3098A6C'

def scrape_player_url(player_url):
    response = requests.get(f'https://braacket.com/{player_url}')
    player_soup = BeautifulSoup(response.content, 'html.parser')
    league_links = player_soup.find_all(attrs={"href": re.compile(rf"^/league/{league_name}/player/.*$")})
    if not league_links:
        return None
    player_url = league_links[0]['href']
    return player_url
scrape_player_url(player_url)

'/league/nemelee/player/C2069576-DB5B-42E0-9C4B-31182802A60F?'

In [8]:
def scrape_stage_matches(tournament: dict, stage_url: str):
    matches = []
    response = requests.get(f'https://braacket.com/{stage_url}')
    stage_soup = BeautifulSoup(response.content, 'html.parser')

    player_url_dict = {}
    stage_encounters = stage_soup.find_all(class_='tournament_encounter-row')
    for encounter in stage_encounters:
        winner_tournament_url = encounter.select('.tournament_encounter_opponent.winner a')[0]['href']
        loser_tournament_url = encounter.select('.tournament_encounter_opponent.loser a')[0]['href']
        if winner_tournament_url not in player_url_dict:
            player_url_dict[winner_tournament_url] = scrape_player_url(winner_tournament_url)
        if loser_tournament_url not in player_url_dict:
            player_url_dict[loser_tournament_url] = scrape_player_url(loser_tournament_url)
        data = {
            'encounter_id': encounter.find(class_='tournament_encounter-id').text.strip(),
            'winner_url': player_url_dict[winner_tournament_url],
            'winner': encounter.select('.tournament_encounter_opponent.winner')[0].text.strip(),
            'loser_url': player_url_dict[loser_tournament_url],
            'loser': encounter.select('.tournament_encounter_opponent.loser')[0].text.strip(),
            'winner_score': encounter.select('.tournament_encounter-score.winner')[0].text.strip(),
            'loser_score': encounter.select('.tournament_encounter-score.loser')[0].text.strip(),
        }
        data.update({
            f'tournament_{k}': v
            for k, v in tournament.items()
        })
        matches.append(data)
    return matches


#matches = scrape_stage_matches(tournaments[0], stage_urls[0])

In [None]:
# Convert the list of dictionaries to a Polars DataFrame
matches_df = (
    pl.DataFrame(matches, schema={col: pl.String for col in matches[0].keys()})
)
# Save the DataFrame to a CSV file
matches_df

In [78]:
response = requests.get(f'https://braacket.com/{stage_urls[0]}')
stage_soup = BeautifulSoup(response.content, 'html.parser')

stage_encounters = stage_soup.find_all(class_='tournament_encounter-row')


In [79]:
encounter = stage_encounters[0]
data = {
    'encounter_id': encounter.find(class_='tournament_encounter-id').text.strip(),
    'winner_url': encounter.select('.tournament_encounter_opponent.winner a')[0]['href'],
    'winner': encounter.select('.tournament_encounter_opponent.winner')[0].text.strip(),
    'loser_url': encounter.select('.tournament_encounter_opponent.loser a')[0]['href'],
    'loser': encounter.select('.tournament_encounter_opponent.loser')[0].text.strip(),
    'winner_score': encounter.select('.tournament_encounter-score.winner')[0].text.strip(),
    'loser_score': encounter.select('.tournament_encounter-score.loser')[0].text.strip(),
}
data

{'encounter_id': '1',
 'winner_url': '/tournament/633C3E6E-A97A-45F9-B1B4-1ADA348C350C/player/F6E88BFB-8602-4A5A-9A02-7FA6D3098A6C',
 'winner': 'MATE | Kalvar',
 'loser_url': '/tournament/633C3E6E-A97A-45F9-B1B4-1ADA348C350C/player/81101D13-20BA-4EAE-B751-833DA282F716',
 'loser': 'kuro',
 'winner_score': '3',
 'loser_score': '0'}

In [80]:
encounter.select('.tournament_encounter_opponent.loser a')

[<a href="/tournament/633C3E6E-A97A-45F9-B1B4-1ADA348C350C/player/81101D13-20BA-4EAE-B751-833DA282F716">kuro</a>]

## Run Scraper

In [84]:
tournament_tags = scrape_tournament_tags(nemelee_tournaments_url)

In [None]:
tournaments = extract_tournament_data(tournament_tags)
tournaments_df = pl.DataFrame(tournaments, schema={col: pl.String for col in tournaments[0].keys()}).filter(pl.col('url').is_not_null())

matches = []
for tournament in tournaments:
    tournament_matches = []
    try:
        match_url = f'https://braacket.com/{tournament["url"]}/match'
        stage_urls = scrape_stage_urls(match_url)
        for stage_url in stage_urls:
            tournament_matches += scrape_stage_matches(tournament, stage_url)
    except Exception as e:
        print(f"Error processing tournament {tournament['name']}, {tournament['url']}: {e}")
        continue
    matches += tournament_matches


Error processing tournament FreePlay Melee #104, /tournament/5DBE0AC6-EBEB-4C2E-B64A-F8F432F85EF8: list index out of range
Error processing tournament New Game Plus Revival 7.7, /tournament/765DFCA4-947A-4218-A1F8-3EBF3E9CDCBE: list index out of range
Error processing tournament Melee Monday @ Bumper 9, /tournament/CADFAF5F-9AEE-4071-BE1E-977E41C05085: list index out of range
Error processing tournament Buffalo Wild Wednesdays v5.1.3, /tournament/9651E764-012D-4652-B7D7-9B81BC56B081: list index out of range
Error processing tournament The 2024 Upstate Melee Championship & Arcadian [NE Only], /tournament/6F173939-0D68-46C0-BAB3-B0392200CE1F: list index out of range
Error processing tournament One Up Melee 9.4.24, /tournament/343299CA-F3AE-47B1-9899-E6E0E1B17DFF: list index out of range
Error processing tournament One Up Melee 8.7.24, /tournament/AF7BAD62-EBED-461E-BC34-04A47B1F7B48: list index out of range
Error processing tournament New Game Plus Revival 6.9, /tournament/8DB1E88B-E0BF-

KeyboardInterrupt: 

In [13]:
match_url = 'https://braacket.com/tournament/5DBE0AC6-EBEB-4C2E-B64A-F8F432F85EF8/match'
tournament_matches = []
stage_urls = scrape_stage_urls(match_url)
for stage_url in stage_urls:
    tournament_matches += scrape_stage_matches(stage_url)

ConnectionError: HTTPSConnectionPool(host='braacket.com', port=443): Max retries exceeded with url: /tournament/5DBE0AC6-EBEB-4C2E-B64A-F8F432F85EF8/match (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x74503654fb10>: Failed to resolve 'braacket.com' ([Errno -2] Name or service not known)"))

In [None]:
# Only scrape matches from tournaments not already in matches
matches_df = pl.read_csv('data/matches.csv')

tournaments = extract_tournament_data(tournament_tags)
tournaments_df = pl.DataFrame(tournaments, schema={col: pl.String for col in tournaments[0].keys()}).filter(pl.col('url').is_not_null())

matches = []
for tournament in tournaments:
    if matches_df.select((pl.col('tournament_url') == tournament['url']).any()).item():
        continue
    print(f"Scraping matches for tournament {tournament['name']}, {tournament['url']}")
    tournament_matches = []
    try:
        match_url = f'https://braacket.com/{tournament["url"]}/match'
        stage_urls = scrape_stage_urls(match_url)
        for stage_url in stage_urls:
            tournament_matches += scrape_stage_matches(tournament, stage_url)
    except Exception as e:
        print(f"Error processing tournament {tournament['name']}, {tournament['url']}: {e}")
        continue
    matches += tournament_matches

Scraping matches for tournament New Game Plus Revival 8.3, /tournament/EBFBECF8-08C9-4962-8EA1-53039F6048A7
Scraping matches for tournament FreePlay Melee #114, /tournament/271B645C-277C-48E8-ADD9-7B66683698AC
Scraping matches for tournament Pho Tai Melee #97 OwO, /tournament/A75B8B35-2596-464F-8F26-35B5889CEB5B
Scraping matches for tournament FreePlay Melee #104, /tournament/5DBE0AC6-EBEB-4C2E-B64A-F8F432F85EF8
Scraping matches for tournament New Game Plus Revival 7.7, /tournament/765DFCA4-947A-4218-A1F8-3EBF3E9CDCBE
Scraping matches for tournament FreePlay Melee #98, /tournament/A812E6CC-BB8F-447E-A868-672565EEDF59
Scraping matches for tournament Melee Monday @ Bumper’s 12, /tournament/19F24CF6-D9D0-4615-869D-09B637E364E6
Scraping matches for tournament Return to Vacationland 7, /tournament/E237231B-7841-4665-BE3E-207D170BA6EB
Scraping matches for tournament SSS 20.2 - A New Hampshire Melee Monthly!, /tournament/69327137-A2AB-4549-BEC3-90FC65436081
Scraping matches for tournament Pho

KeyboardInterrupt: 

In [15]:
matches_df = (
    pl.DataFrame(matches, schema={col: pl.String for col in matches[0].keys()})
    # Parse date from format like '01 April 2018'
    .with_columns(pl.col('tournament_date').str.strptime(pl.Date, format='%d %B %Y'))
    .sort('encounter_id', descending=False)
    .sort('tournament_date')
)
#matches_df.write_csv('data/matches.csv')
pl.Config(tbl_rows=2)
matches_df

encounter_id,winner_url,winner,loser_url,loser,winner_score,loser_score,tournament_url,tournament_name,tournament_date,tournament_country,tournament_region,tournament_number_of_players
str,str,str,str,str,str,str,str,str,date,str,str,str
"""1""","""/league/nemelee/player/C9B8492…","""bonfire10""","""/league/nemelee/player/BAB54F3…","""Electroman""","""3""","""0""","""/tournament/FEC28155-A4E2-40EF…","""The Function 2 [NE Only]""",2022-05-07,"""/assets/images/country/flag/us…","""/assets/images/country/regions…",
…,…,…,…,…,…,…,…,…,…,…,…,…
"""94""","""/league/nemelee/player/BAB54F3…","""OUG | Electroman""","""/league/nemelee/player/0CDB11B…","""hc | kraft""","""3""","""1""","""/tournament/633C3E6E-A97A-45F9…","""Mass Madness 50 - 10th Anniver…",2025-05-03,"""/assets/images/country/flag/us…","""/assets/images/country/regions…",
