In [4]:
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

In [5]:
# Archive listing endpoint; the page number is appended to this prefix.
base_url = 'https://th.sportscorpion.com/eng/tournament/archive/?page='

# Crawling starts from the first archive page.
page_num = 1

# Every tournament URL known so far (earlier runs plus this session).
all_urls = set()

# Seed the set from the on-disk cache written by earlier runs, if present.
# Blank lines are skipped so an empty string is never treated as a tournament
# URL by the scraping code further down.
if os.path.exists('tournament_urls.txt'):
    with open('tournament_urls.txt', 'r') as f:
        all_urls.update(line.strip() for line in f if line.strip())

In [144]:

# Crawl the archive listing page by page and record tournament URLs that are
# not known yet. The crawl always restarts at page 1, so re-running this cell
# does not silently resume from wherever a previous run left `page_num`.
page_num = 1
new_urls = set()

while True:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(base_url + str(page_num), timeout=30)

    # A non-200 answer is taken to mean we ran past the last archive page.
    if response.status_code != 200:
        break

    soup = BeautifulSoup(response.text, 'html.parser')

    # The tournament list lives in the 'sTable' table; no table -> stop.
    table = soup.find('table', {'class': 'sTable'})
    if table is None:
        break

    # Keep only hrefs whose second-to-last character is a digit (presumably
    # links of the form ".../<id>/"). Anchors without an href, or with one
    # too short to index, are skipped instead of raising.
    page_urls = set()
    for link in table.find_all('a'):
        href = link.get('href')
        if href and len(href) >= 2 and href[-2].isdigit():
            page_urls.add(href)

    # Drop everything already known from the cache or from earlier pages.
    fresh_urls = page_urls - all_urls

    # Persist the newly found URLs right away so an interrupted crawl keeps them.
    if fresh_urls:
        with open('tournament_urls.txt', 'a') as f:
            f.writelines(url + '\n' for url in fresh_urls)

    all_urls.update(fresh_urls)
    new_urls.update(fresh_urls)

    # Go to the next page
    page_num += 1

# Now new_urls contains all the new unique URLs
print(f"Collected {len(new_urls)} new unique URLs.")


Collected 0 new unique URLs.


In [3]:
# Size of the known-URL set (cached file contents plus anything crawled this session).
len(all_urls)


5483

In [146]:
# Number of URLs discovered by the most recent run of the crawl loop.
len(new_urls)

0

In [6]:
# Root of the site; relative links scraped from its pages are joined onto this.
BASE_URL = "https://th.sportscorpion.com"

# Playoff subheader label -> numeric round value (the fraction of the bracket
# the round represents). Specific labels are listed before the bare "Final",
# which is a substring of several of them.
PLAYOFF_STAGE_MAP = {
    **{f"1/{denominator} final": 1 / denominator for denominator in (64, 32, 16, 8)},
    "Quarterfinal": 0.25,
    "Semi-final": 0.5,
    "Final": 1,
    "Match for the third place": 1,
}

def fetch_page(session, url: str, timeout: float = 30) -> BeautifulSoup:
    """Download ``url`` through ``session`` and parse it with the lxml parser.

    Args:
        session: Object exposing a requests-style ``get(url, timeout=...)``.
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection can block a worker forever.

    Returns:
        The parsed document as a ``BeautifulSoup`` object.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status. Raising here stops an error page from being
            parsed as if it were a real (empty) tournament or stage page.
    """
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')

def get_playoff_stage_fraction(stage_name: str) -> Optional[float]:
    """Convert a playoff stage label into a numeric value, e.g. 'Quarterfinal' -> 0.25.

    Args:
        stage_name: Label text of the stage; case and surrounding whitespace
            are ignored.

    Returns:
        The numeric value for the stage, or ``None`` when the label is not
        recognised.
    """
    normalized = stage_name.strip().lower()

    # Generic "1/N final" labels are computed directly, so depths missing from
    # PLAYOFF_STAGE_MAP (e.g. "1/4 final", "1/128 final") are not swallowed by
    # the bare "Final" entry and reported as 1.
    fraction_match = re.search(r'(\d+)\s*/\s*(\d+)\s*final', normalized)
    if fraction_match:
        numerator, denominator = map(int, fraction_match.groups())
        if denominator:
            return numerator / denominator

    # Try the longest labels first: "final" is a substring of most other keys,
    # so the outcome must not depend on the dictionary's insertion order.
    for label in sorted(PLAYOFF_STAGE_MAP, key=len, reverse=True):
        if label.lower() in normalized:
            return PLAYOFF_STAGE_MAP[label]

    # Unknown stage label.
    return None

def _clean_score_text(raw_score: str) -> str:
    """Strip overtime/walkover markers and layout characters from a score cell's text."""
    cleaned = raw_score
    for token in ('(OT)', '(W.O)', '\xa0', '*', '\n'):
        cleaned = cleaned.replace(token, '')
    return cleaned


def _parse_score(raw_score: str) -> Tuple[int, int, str]:
    """Parse a score cell's text into (goals_1, goals_2, overtime flag 'Yes'/'No').

    Raises ValueError when the cleaned text is not exactly two integers
    separated by ':'.
    """
    goals_1, goals_2 = map(int, _clean_score_text(raw_score).split(':'))
    overtime = 'Yes' if '(OT)' in raw_score else 'No'
    return goals_1, goals_2, overtime


def get_match_info(session, url: str) -> List[Tuple[str, str, str, int, int, str, str, Optional[float]]]:
    """Fetch a stage page and return one tuple per parsed match.

    Each tuple is ``(url, player_1, player_2, goals_1, goals_2, overtime,
    stage_kind, round_value)``:

    * ``overtime`` is ``'Yes'`` or ``'No'``.
    * ``stage_kind`` is ``'Playoff'`` or ``'Round-Robin'``.
    * ``round_value`` is the playoff fraction (e.g. 0.25 for a quarterfinal)
      for playoff matches, the tour number as a float for round-robin
      matches, or ``None`` when it cannot be determined.

    Args:
        session: requests-style session, passed through to ``fetch_page``.
        url: URL of the stage's schedule/results page.
    """
    soup = fetch_page(session, url)

    # Drop the 'saved-matches' section so matches carried over from earlier
    # stages are not counted a second time.
    saved_matches_div = soup.find('div', class_='saved-matches')
    if saved_matches_div:
        saved_matches_div.decompose()

    match_info = []

    # A page containing series rows is treated as a playoff stage.
    is_playoff = len(soup.select('tr.series-container')) > 0

    if is_playoff:
        # Playoff matches are grouped under subheaders (Quarterfinal, Semi-final, ...).
        for subheader in soup.select('div.subheader'):
            stage_name = subheader.get_text(strip=True)
            playoff_fraction = get_playoff_stage_fraction(stage_name)

            # Walk the match blocks following this subheader and stop at the
            # first one that sits under a different subheader.
            for block in subheader.find_all_next('div', class_='gr_match'):
                owner = block.find_previous_sibling('div', class_='subheader')
                # Identity, not ==: bs4 compares tags structurally, so two
                # subheaders with the same label would look "equal" and the
                # later group's blocks would be collected twice.
                if owner and owner is not subheader:
                    break

                for serie in block.select('tr.series-container'):
                    players = serie.select('td[class^="ma_name"] a')
                    if len(players) < 2:
                        continue
                    player_1 = players[0].text.strip()
                    player_2 = players[1].text.strip()

                    scores = serie.select('td[class^="ma_result_"]')
                    # The last cell is the series total, not a single game.
                    for score in scores[:-1]:
                        if ':' not in score.text:
                            continue
                        try:
                            goals_player_1, goals_player_2, overtime = _parse_score(score.text)
                        except ValueError:
                            # Unparseable game cells are skipped.
                            continue
                        match_info.append((url, player_1, player_2, goals_player_1, goals_player_2, overtime, 'Playoff', playoff_fraction))

    else:
        # Round-robin: one table per tour.
        for table in soup.select('table.grTable'):
            round_number = None
            header = table.select_one('th:-soup-contains("Tour")')
            if header:
                round_match = re.search(r'(\d+)\s*Tour', header.get_text(strip=True))
                if round_match:
                    round_number = float(round_match.group(1))

            for row in table.select('tr[id^="match"]'):
                name_cell_1 = row.select_one('td.ma_name1')
                name_cell_2 = row.select_one('td.ma_name2')
                # A row without both name cells cannot describe a match; skip it
                # rather than abort the whole page with an AttributeError.
                if name_cell_1 is None or name_cell_2 is None:
                    continue
                player_1 = name_cell_1.text.strip()
                player_2 = name_cell_2.text.strip()

                score = row.select_one('td[class^="ma_result_"]')
                if not (score and ':' in score.text):
                    continue
                try:
                    goals_player_1, goals_player_2, overtime = _parse_score(score.text)
                except ValueError:
                    score_cleaned = _clean_score_text(score.text)
                    print(f"Unable to parse score '{score_cleaned}' from match {url}")
                    continue
                match_info.append((url, player_1, player_2, goals_player_1, goals_player_2, overtime, 'Round-Robin', round_number))

    return match_info

def _fetch_tournament_data(url: str, existing_stage_ids: set, headers: dict) -> list:
    """Scrape one tournament and return a row tuple per match of its new stages.

    Team tournaments yield an empty list. Stages whose id is already in
    ``existing_stage_ids`` are skipped. Each row is ordered like the columns of
    the frame built by ``get_tournament_matches``.
    """
    with requests.Session() as session:
        session.headers.update(headers)
        # Presumably URLs look like ".../<id>/"; stripping the slash first also
        # handles a URL that lacks the trailing slash (plain [-2] would then
        # pick the path segment before the id).
        tournament_id = url.rstrip('/').split('/')[-1]
        tournament_url = f"{BASE_URL}/eng/tournament/id/{tournament_id}/"
        tournament_soup = fetch_page(session, tournament_url)

        # Team tournaments are skipped.
        tournament_type_element = tournament_soup.select_one("th:-soup-contains('Tournament type') + td")
        tournament_type = tournament_type_element.text.strip() if tournament_type_element else 'Unknown'
        if tournament_type.lower() == 'team':
            return []

        # Tournament name and date, with 'Unknown' as the fallback.
        tournament_name_element = tournament_soup.select_one("h1#header")
        tournament_name = tournament_name_element.text.strip() if tournament_name_element else 'Unknown'

        date_element = tournament_soup.select_one("th:-soup-contains('Date of the tournament') + td")
        date = date_element.text.strip() if date_element else 'Unknown'

        # Collect (stage_id, stage_url, stage_sequence) for every stage row
        # that has a "Schedule and results" link.
        stage_data = []
        for row in tournament_soup.select('table.stages-table tr'):
            seq_cell = row.select_one('td.stage-gr')
            if not seq_cell:
                continue
            sched_link = row.select_one('a:-soup-contains("Schedule and results")')
            if not sched_link:
                continue
            stage_url = f"{BASE_URL}{sched_link['href']}?print"
            stage_id = stage_url.split('/')[-3]
            stage_data.append((stage_id, stage_url, seq_cell.get_text(strip=True)))

        stage_matches = []
        for stage_id, stage_url, stage_sequence in stage_data:
            if stage_id in existing_stage_ids:
                continue
            for match in get_match_info(session, stage_url):
                # match: (url, Player1, Player2, G1, G2, OT, Stage, RoundNumber);
                # the leading url is replaced by the stage id.
                stage_matches.append((
                    stage_id,
                    *match[1:8],
                    date,
                    tournament_name,
                    tournament_id,
                    stage_sequence,
                ))
        return stage_matches


def get_tournament_matches(tournament_urls: List[str], existing_stage_ids: set[str]) -> pd.DataFrame:
    """Scrape the given tournaments concurrently and return all matches as one frame.

    Args:
        tournament_urls: Tournament page URLs (or paths) to process.
        existing_stage_ids: Stage ids that are already stored and are skipped.

    Returns:
        A DataFrame with columns StageID, Player1, Player2, GoalsPlayer1,
        GoalsPlayer2, Overtime, Stage, RoundNumber, Date, TournamentName,
        TournamentID, StageSequence. Rows are sorted by date, stage sequence
        and round; drawn playoff games are removed. A tournament whose
        scraping raises is reported on stdout and left out.
    """
    all_matches = []
    headers = {'User-Agent': 'Mozilla/5.0'}
    processed_tournaments = 0

    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {
            executor.submit(_fetch_tournament_data, url, existing_stage_ids, headers): url
            for url in tournament_urls
        }
        for future in tqdm(as_completed(future_to_url), total=len(future_to_url), desc="Processing tournaments", unit="tournament"):
            url = future_to_url[future]
            try:
                matches = future.result()
                all_matches.extend(matches)
                processed_tournaments += 1
                tqdm.write(f"\rProcessed tournaments: {processed_tournaments}", end='')
            except Exception as exc:
                print(f'{url} generated an exception: {exc}')

    df = pd.DataFrame(
        all_matches,
        columns=[
            'StageID', 'Player1', 'Player2', 'GoalsPlayer1', 'GoalsPlayer2',
            'Overtime', 'Stage', 'RoundNumber', 'Date', 'TournamentName',
            'TournamentID', 'StageSequence'
        ]
    )

    df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%Y', errors='coerce')
    df['GoalsPlayer1'] = pd.to_numeric(df['GoalsPlayer1'], errors='coerce')
    df['GoalsPlayer2'] = pd.to_numeric(df['GoalsPlayer2'], errors='coerce')

    # StageSequence is scraped as text, so sorting it directly would put "10"
    # before "2". Sort on a temporary numeric key instead; the text column
    # stays as a tie-breaker for non-numeric values and is not modified.
    df['_StageSequenceOrder'] = pd.to_numeric(df['StageSequence'], errors='coerce')
    df = (
        df.sort_values(
            by=['Date', '_StageSequenceOrder', 'StageSequence', 'RoundNumber'],
            na_position='last',
        )
        .drop(columns='_StageSequenceOrder')
    )

    # Remove playoff draws.
    playoff_draw = (df['Stage'] == 'Playoff') & (df['GoalsPlayer1'] == df['GoalsPlayer2'])
    return df[~playoff_draw].reset_index(drop=True)

In [7]:
# Scrape every known tournament. An empty existing_stage_ids set means no stage
# is skipped, i.e. this is a full scrape rather than an incremental one.
df = get_tournament_matches(list(all_urls), existing_stage_ids=set())

Processing tournaments:   0%|          | 0/5483 [00:00<?, ?tournament/s]

Processed tournaments: 5483

In [8]:
# Display the resulting match table (pandas truncates the rendering of large frames).
df

Unnamed: 0,StageID,Player1,Player2,GoalsPlayer1,GoalsPlayer2,Overtime,Stage,RoundNumber,Date,TournamentName,TournamentID,StageSequence
0,186,Andreev Igor,Alexey Chernov,3,3,No,Round-Robin,1.0,2010-01-19,"Невская Хоккейная Лига, осень-2010, 10-й этап,...",80,1
1,186,Alexandr Danilov,Anton Fedoseev,1,1,No,Round-Robin,1.0,2010-01-19,"Невская Хоккейная Лига, осень-2010, 10-й этап,...",80,1
2,186,Alexey Klimko,Nikolay Smirnov,2,1,No,Round-Robin,1.0,2010-01-19,"Невская Хоккейная Лига, осень-2010, 10-й этап,...",80,1
3,186,Igor Masloboev,Vitaly Skorobogatov,4,3,No,Round-Robin,1.0,2010-01-19,"Невская Хоккейная Лига, осень-2010, 10-й этап,...",80,1
4,186,Mishurinskikh Konstantin,Mikhail Sivakov,2,1,No,Round-Robin,1.0,2010-01-19,"Невская Хоккейная Лига, осень-2010, 10-й этап,...",80,1
...,...,...,...,...,...,...,...,...,...,...,...,...
784186,19886,Андрей Васильев,Daniil Perov,1,6,No,Round-Robin,8.0,2024-12-19,Малая Хоккейная Лига. Зима. 4 этап,6899,2
784187,19886,Андрей Лукашук,Фёдор Марченко,0,2,No,Round-Robin,8.0,2024-12-19,Малая Хоккейная Лига. Зима. 4 этап,6899,2
784188,19886,Пётр Марченко,Daniil Perov,0,6,No,Round-Robin,9.0,2024-12-19,Малая Хоккейная Лига. Зима. 4 этап,6899,2
784189,19886,Андрей Васильев,Фёдор Марченко,1,0,No,Round-Robin,9.0,2024-12-19,Малая Хоккейная Лига. Зима. 4 этап,6899,2


In [10]:
# Append this run's matches to the cumulative CSV file. No header row and no
# index column are written; 'utf-8-sig' is the text encoding.
# NOTE(review): mode='a' appends on every run, so re-running this cell after a
# full scrape adds the same rows again — confirm the file is deduplicated or
# removed beforehand.
csv_write_options = dict(index=False, encoding='utf-8-sig', header=False, mode='a')
df.to_csv('th_matches.csv', **csv_write_options)