In [1]:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import re
import numpy as np
import os
from typing import List, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [2]:
base_url = 'https://th.sportscorpion.com/eng/tournament/archive/?page='

# File that remembers every tournament URL collected by earlier runs
URLS_FILE = 'tournament_urls.txt'

# Seconds to wait for the server; without a timeout a stalled connection
# would block this cell forever
REQUEST_TIMEOUT = 30

# Start from page 1
page_num = 1

# Every known URL: the ones saved previously plus the ones found in this run
all_urls = set()

# Load the existing URLs from the file if it exists (blank lines are ignored
# so that an empty string never ends up in the set)
if os.path.exists(URLS_FILE):
    with open(URLS_FILE, 'r') as f:
        all_urls.update(line.strip() for line in f if line.strip())

# URLs seen for the first time during this run
new_urls = set()

while True:
    # Get the HTML content of the page
    response = requests.get(base_url + str(page_num), timeout=REQUEST_TIMEOUT)

    # If the page doesn't exist, stop paging
    if response.status_code != 200:
        break

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the archive table; a page without one marks the end of the archive
    table = soup.find('table', {'class': 'sTable'})
    if table is None:
        break

    # Keep hrefs whose second-to-last character is a digit.
    # NOTE(review): presumably tournament links end in "<id>/" - confirm.
    # Anchors without an href, or with a very short one, are skipped instead
    # of raising TypeError / IndexError.
    hrefs = (link.get('href') for link in table.find_all('a'))
    urls = {href for href in hrefs if href and len(href) >= 2 and href[-2].isdigit()}

    # Drop everything that is already known
    urls -= all_urls

    # Append the new URLs to the file (sorted so the write order is deterministic)
    if urls:
        with open(URLS_FILE, 'a') as f:
            f.writelines(url + '\n' for url in sorted(urls))

    # Remember the URLs for the rest of the run
    all_urls.update(urls)
    new_urls.update(urls)

    # Go to the next page
    page_num += 1

# Now new_urls contains all the new unique URLs
print(f"Collected {len(new_urls)} new unique URLs.")


Collected 16 new unique URLs.


In [3]:
# Number of known tournament URLs: those loaded from the file plus the new ones
len(all_urls)

5237

In [4]:
# Number of tournament URLs that were not in the saved file before this run
len(new_urls)

16

In [5]:
# Site root; prepended to tournament and stage paths to build absolute URLs
BASE_URL = "https://th.sportscorpion.com"

def fetch_page(session, url: str, timeout: float = 30) -> BeautifulSoup:
    """Download ``url`` through ``session`` and parse the body with lxml.

    Args:
        session: Object with a requests-style ``get(url, timeout=...)`` method.
        url: Address of the page to download.
        timeout: Seconds to wait for the server before the request is
            abandoned; prevents a stalled connection from blocking a worker
            indefinitely.

    Returns:
        The response text parsed into a ``BeautifulSoup`` tree.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = session.get(url, timeout=timeout)
    # Surface HTTP errors instead of parsing an error page as if it were data
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')

# Decorations that may surround the "goals:goals" part of a score cell
_SCORE_NOISE = ('(OT)', '(W.O)', '\xa0', '*', '\n')


def _clean_score(raw_score: str) -> str:
    """Strip overtime/walk-over markers and stray characters from a score cell."""
    cleaned = raw_score
    for token in _SCORE_NOISE:
        cleaned = cleaned.replace(token, '')
    return cleaned


def _parse_score(raw_score: str) -> Tuple[int, int, str]:
    """Turn a score cell into ``(goals_1, goals_2, overtime)``; ValueError if malformed."""
    goals_1, goals_2 = map(int, _clean_score(raw_score).split(':'))
    overtime = 'Yes' if '(OT)' in raw_score else 'No'
    return goals_1, goals_2, overtime


def get_match_info(session, url: str) -> List[Tuple[str, str, str, int, int, str, str]]:
    """Scrape every played match from one stage page.

    The page is treated as a playoff stage when it contains at least one
    ``tr.series-container`` row, otherwise as a round-robin stage.

    Args:
        session: Session object handed through to ``fetch_page``.
        url: Address of the stage page; also stored as the first tuple field.

    Returns:
        A list of ``(url, player_1, player_2, goals_1, goals_2, overtime,
        stage)`` tuples, where ``overtime`` is ``'Yes'``/``'No'`` and ``stage``
        is ``'Playoff'`` or ``'Round-Robin'``. Cells without a ``:`` are
        ignored. Unparsable playoff scores are skipped silently, unparsable
        round-robin scores are reported with ``print``.
    """
    soup = fetch_page(session, url)
    match_info = []

    series = soup.select('tr.series-container')
    if series:
        # Playoff stage: every series row holds two players and several games
        for serie in series:
            players = serie.select('td[class^="ma_name"] a')
            if len(players) < 2:
                # A series without two linked players cannot be attributed;
                # skip it instead of failing the whole page with IndexError
                continue
            player_1 = players[0].text.strip()
            player_2 = players[1].text.strip()

            scores = serie.select('td[class^="ma_result_"]')
            # Ignore the last score which represents the total score of the match series
            for score in scores[:-1]:
                if ':' not in score.text:
                    continue
                try:
                    goals_player_1, goals_player_2, overtime = _parse_score(score.text)
                except ValueError:
                    continue
                match_info.append((url, player_1, player_2, goals_player_1, goals_player_2, overtime, 'Playoff'))
    else:
        # Round-robin stage: one table row per match
        for table in soup.select('table.grTable'):
            for row in table.select('tr[id^="match"]'):
                name_1 = row.select_one('td.ma_name1')
                name_2 = row.select_one('td.ma_name2')
                score = row.select_one('td[class^="ma_result_"]')
                # Rows lacking a name or score cell are skipped rather than
                # raising AttributeError on ``None.text``
                if name_1 is None or name_2 is None or score is None:
                    continue
                if ':' not in score.text:
                    continue
                try:
                    goals_player_1, goals_player_2, overtime = _parse_score(score.text)
                except ValueError:
                    print(f"Unable to parse score '{_clean_score(score.text)}' from match {url}")
                    continue
                match_info.append((url, name_1.text.strip(), name_2.text.strip(), goals_player_1, goals_player_2, overtime, 'Round-Robin'))

    return match_info

# Function to scrape matches from tournaments concurrently
def get_tournament_matches(tournament_urls: List[str], existing_stage_ids: set[str], max_workers: int = 10) -> pd.DataFrame:
    """Scrape all stage matches of the given tournaments into one DataFrame.

    Each tournament is handled by a worker thread with its own
    ``requests.Session``. A tournament whose worker raises is reported and
    left out; the remaining tournaments are still returned.

    Args:
        tournament_urls: Tournament links; the tournament id is read from the
            second-to-last ``/``-separated segment of each one.
        existing_stage_ids: Stage ids to skip (their pages are not fetched).
        max_workers: Size of the thread pool.

    Returns:
        A DataFrame with the columns ``StageID, Player1, Player2,
        GoalsPlayer1, GoalsPlayer2, Overtime, Stage, Date, TournamentName``,
        sorted by ``Date`` with a fresh 0..n-1 index. Dates that do not match
        ``%d.%m.%Y`` become ``NaT``. Playoff rows with equal goals are dropped.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    def fetch_tournament_data(url):
        """Return the match rows of every not-yet-known stage of one tournament."""
        with requests.Session() as session:
            session.headers.update(headers)
            tournament_id = url.split('/')[-2]
            tournament_url = f"{BASE_URL}/eng/tournament/id/{tournament_id}/"
            tournament_soup = fetch_page(session, tournament_url)

            tournament_name_element = tournament_soup.select_one("h1#header")
            tournament_name = tournament_name_element.text.strip() if tournament_name_element else 'Unknown'

            # NOTE(review): newer soupsieve releases spell this pseudo-class
            # ':-soup-contains' - confirm the installed version before switching.
            date_element = tournament_soup.select_one("th:contains('Date of the tournament') + td")
            date = date_element.text.strip() if date_element else 'Unknown'

            result_links = tournament_soup.select('a:contains("Schedule and results")')
            # Anchors without an href cannot be followed, so they are skipped
            stages_urls = [f"{BASE_URL}{link['href']}?print" for link in result_links if link.get('href')]

            stage_matches = []
            for stage_url in stages_urls:
                # The trailing '?print' adds one segment, so the id is third from the end
                stage_id = stage_url.split('/')[-3]
                if stage_id in existing_stage_ids:
                    continue

                # Swap the stage URL (first field) for the stage id and append
                # the tournament-level date and name
                for match in get_match_info(session, stage_url):
                    stage_matches.append((stage_id, *match[1:], date, tournament_name))
            return stage_matches

    all_matches = []
    processed_tournaments = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(fetch_tournament_data, url): url for url in tournament_urls}
        for future in tqdm(as_completed(future_to_url), total=len(future_to_url), desc="Processing tournaments", unit="tournament"):
            url = future_to_url[future]
            try:
                matches = future.result()
            except Exception as exc:
                # tqdm.write keeps the message from corrupting the progress bar
                tqdm.write(f'{url} generated an exception: {exc}')
                continue
            all_matches.extend(matches)
            processed_tournaments += 1
            tqdm.write(f"\rProcessed tournaments: {processed_tournaments}", end='')

    df = pd.DataFrame(
        all_matches,
        columns=['StageID', 'Player1', 'Player2', 'GoalsPlayer1', 'GoalsPlayer2', 'Overtime', 'Stage', 'Date', 'TournamentName']
    )

    df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%Y', errors='coerce')
    df['GoalsPlayer1'] = pd.to_numeric(df['GoalsPlayer1'], errors='coerce')
    df['GoalsPlayer2'] = pd.to_numeric(df['GoalsPlayer2'], errors='coerce')

    # A stable sort keeps matches that share a date in the order they were scraped
    df = df.sort_values(by="Date", kind="mergesort")

    # Drop playoff rows with equal goals
    drawn_playoff = (df['Stage'] == 'Playoff') & (df['GoalsPlayer1'] == df['GoalsPlayer2'])
    return df[~drawn_playoff].reset_index(drop=True)

In [6]:
# Scrape the matches of the tournaments discovered in this run; the set of
# stage ids to skip is empty, so every stage of those tournaments is fetched
df = get_tournament_matches(
    tournament_urls=list(new_urls),
    existing_stage_ids=set(),
)

Processing tournaments:   6%|▋         | 1/16 [00:00<00:14,  1.02tournament/s]

Processed tournaments: 2

Processing tournaments:  25%|██▌       | 4/16 [00:01<00:04,  2.91tournament/s]

Processed tournaments: 4

Processing tournaments:  31%|███▏      | 5/16 [00:01<00:02,  3.76tournament/s]

Processed tournaments: 5

Processing tournaments:  44%|████▍     | 7/16 [00:02<00:02,  3.87tournament/s]

Processed tournaments: 7

Processing tournaments:  50%|█████     | 8/16 [00:02<00:01,  4.19tournament/s]

Processed tournaments: 10

Processing tournaments:  69%|██████▉   | 11/16 [00:03<00:01,  4.95tournament/s]

Processed tournaments: 11

Processing tournaments:  75%|███████▌  | 12/16 [00:03<00:00,  4.77tournament/s]

Processed tournaments: 12

Processing tournaments:  88%|████████▊ | 14/16 [00:03<00:00,  3.89tournament/s]

Processed tournaments: 14

Processing tournaments: 100%|██████████| 16/16 [00:04<00:00,  3.31tournament/s]


Processed tournaments: 16

In [7]:
# Append the scraped matches to th_matches.csv without a header row or index
# NOTE(review): running this cell twice appends the same rows twice
df.to_csv(
    'th_matches.csv',
    mode='a',
    header=False,
    index=False,
    encoding='utf-8-sig',
)