In [6]:
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
import requests
import pandas as pd
from datetime import datetime
import re
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

In [12]:
def _parse_int(text):
    """Return *text* as an int, or None when it is not a plain whole number.

    Non-breaking spaces, a literal ``&nbsp;`` and comma thousands separators
    are removed *before* the digit check, so a value such as "1,234" is
    parsed as 1234 instead of being rejected.
    """
    cleaned = (
        text.replace('&nbsp;', '')
        .replace('\u00a0', '')
        .replace(',', '')
        .strip()
    )
    # isdecimal() accepts exactly the characters int() can convert.
    return int(cleaned) if cleaned.isdecimal() else None


def fetch_ranking(ranking_id, timeout=30):
    """
    Fetch ranking data for a specific ranking ID.

    Downloads the ranking history page for ``ranking_id``, locates the table
    with ``border="1"`` and turns every four-cell row into a dictionary.

    Args:
        ranking_id (int): Ranking period ID.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block its worker
            thread forever.

    Returns:
        list: List of dictionaries with the keys ``Rank``, ``PlayerName``,
        ``PlayerID``, ``Nation``, ``Points`` and ``Date``. The list is empty
        when the page has no ranking table or no date header. Errors are
        reported with ``print`` rather than raised; rows collected before an
        error are still returned.
    """
    rankings = []
    try:
        # Construct the URL for the given ranking ID
        url = f"https://stiga.trefik.cz/ithf/ranking/history.aspx?id={ranking_id}"
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the ranking table
        table = soup.find('table', {'border': '1'})
        if table is None:
            print(f"No ranking table found for ID {ranking_id}")
            return rankings

        # Extract the date from the page header; report a missing header
        # explicitly instead of failing with an opaque AttributeError.
        title = soup.find('span', {'id': 'LabTitle'})
        if title is None:
            print(f"No ranking date header found for ID {ranking_id}")
            return rankings
        date_str = title.get_text().split('as on')[-1].strip()
        ranking_date = datetime.strptime(date_str, "%d.%m.%Y")

        # Skip the first two rows (headers and spacer rows)
        for row in table.find_all('tr')[2:]:
            cols = row.find_all('td')
            if len(cols) != 4:  # Expect Rank, Player, Nation, Points
                continue

            link = cols[1].find('a')
            flag = cols[2].find('img')
            rank = cols[0].get_text(strip=True)

            rankings.append({
                'Rank': int(rank.rstrip('.')),
                'PlayerName': cols[1].get_text(strip=True),
                # The player ID is whatever follows the last '=' in the link.
                'PlayerID': (
                    _parse_int(link.get('href', '').split('=')[-1])
                    if link else None
                ),
                'Nation': flag.get('alt') if flag else None,
                'Points': _parse_int(cols[3].get_text(strip=True)),
                'Date': ranking_date
            })
    except Exception as e:
        # Boundary of a thread-pool worker: report the failure and return
        # what was collected so one bad page does not abort the whole run.
        print(f"Error fetching data for ID {ranking_id}: {e}")
    return rankings


def extract_ranking_data_parallel(start_id=249, end_id=417, max_workers=10):
    """
    Download a range of ranking periods concurrently and merge them.

    Every ID from ``start_id`` to ``end_id`` (inclusive) is handed to
    ``fetch_ranking`` on a thread pool; the per-period row lists are
    concatenated in ID order while a tqdm progress bar tracks completion.

    Args:
        start_id (int): First ranking period ID (e.g., 249 for Jan 2010).
        end_id (int): Last ranking period ID, included in the range.
        max_workers (int): Size of the thread pool.

    Returns:
        pd.DataFrame: One row per ranking entry across all fetched periods.
    """
    period_ids = range(start_id, end_id + 1)
    collected = []

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # map() yields results in submission order, so rows stay sorted by ID
        per_period = pool.map(fetch_ranking, period_ids)
        for period_rows in tqdm(per_period, total=len(period_ids)):
            collected += period_rows

    return pd.DataFrame(collected)

# Scrape every ranking period in the function's default ID range, using its
# default thread count; progress is shown with a tqdm bar. The result is a
# single DataFrame holding the rows of all periods that could be fetched.
ranking_data = extract_ranking_data_parallel()

  0%|          | 0/169 [00:00<?, ?it/s]

No ranking table found for ID 288
No ranking table found for ID 389


In [14]:
# Persist the scraped rankings to 'ranking_history.parquet' (relative path),
# written with the fastparquet engine and zstd compression.
ranking_data.to_parquet('ranking_history.parquet', engine='fastparquet', compression='zstd')