## Function to web-scrape the match dataset

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random  

def _extract_player_ids(cells):
    """Return one comma-joined string of player IDs per data cell.

    The first matched cell is skipped because the column selector also
    matches the header cell. For every remaining cell, the ID is taken as
    the text after the last '=' in each link's href.
    """
    ids_per_row = []
    for cell in cells[1:]:
        hrefs = (link.get_attribute('href') for link in cell.find_elements(By.TAG_NAME, 'a'))
        # get_attribute returns None for anchors without an href; skip those
        # instead of crashing on None.split().
        ids_per_row.append(', '.join(href.split('=')[-1] for href in hrefs if href))
    return ids_per_row


def scrape(tour_ids, output_file):
    """Scrape match rows for each tournament and export them to a CSV file.

    For every ID in ``tour_ids`` the tournament page is opened in a headless
    Chrome browser, each page of the results table is read, and the rows are
    accumulated. Tournaments whose table or pager cannot be read are skipped
    with a printed message.

    Args:
        tour_ids: Iterable of tournament IDs used as the ``id`` query
            parameter of the tournament page URL.
        output_file: Path of the CSV file to write.

    Returns:
        pandas.DataFrame with the columns Tournament_ID, Type, Round,
        Winner, Loser and Score (the same data that is written to CSV).
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)

    # Column accumulators; kept equal in length so the DataFrame can be built.
    all_tournament_ids = []
    all_rounds = []
    all_scores = []
    all_types = []
    all_winner_ids = []
    all_loser_ids = []

    # try/finally guarantees the browser process is closed even when an
    # unexpected error aborts the scrape.
    try:
        for tour_id in tour_ids:
            browser.get(f"https://badmintonranks.com/tournament?id={tour_id}")
            try:
                WebDriverWait(browser, 30).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".el-table"))
                )
                # Randomised pause after the table appears, before reading it.
                time.sleep(random.uniform(10, 30))
            except Exception as e:
                print(f"Error loading data for tournament ID {tour_id}: {e}")
                continue
            try:
                # The text of the last pager item is used as the page count.
                last_page_item = browser.find_element(By.CSS_SELECTOR, ".el-pager li:last-child")
                max_pages = int(last_page_item.text)
            except Exception as e:
                print(f"Error finding the number of pages for tournament ID {tour_id}: {e}")
                continue

            for page in range(max_pages):
                time.sleep(random.uniform(10, 20))

                rounds = [el.text for el in browser.find_elements(By.CSS_SELECTOR, "tbody .el-table_1_column_2")]
                scores = [el.text for el in browser.find_elements(By.CSS_SELECTOR, "tbody .el-table_1_column_5")]
                types = [el.text for el in browser.find_elements(By.CSS_SELECTOR, "tbody .el-table_1_column_3")]
                winner_ids = _extract_player_ids(
                    browser.find_elements(By.CSS_SELECTOR, ".el-table_1_column_4")
                )
                loser_ids = _extract_player_ids(
                    browser.find_elements(By.CSS_SELECTOR, '.el-table_1_column_6')
                )

                page_columns = (rounds, scores, types, winner_ids, loser_ids)
                if not all(page_columns):
                    print(f"No data found for tournament ID {tour_id}")
                elif len({len(column) for column in page_columns}) != 1:
                    # Unequal columns would make pd.DataFrame raise at the very
                    # end and lose the whole run, so drop only this page.
                    print(
                        f"Column length mismatch on page {page + 1} "
                        f"for tournament ID {tour_id}; page skipped"
                    )
                else:
                    all_tournament_ids.extend([tour_id] * len(rounds))
                    all_rounds.extend(rounds)
                    all_scores.extend(scores)
                    all_types.extend(types)
                    all_winner_ids.extend(winner_ids)
                    all_loser_ids.extend(loser_ids)

                if page == max_pages - 1:
                    print(f"Max page for tournament ID {tour_id} reached")
                    break
                try:
                    # The pager item right after the active one is the next page.
                    browser.find_element(By.CSS_SELECTOR, ".el-pager .number.active+li").click()
                except Exception as e:
                    # Staying on the same page would re-scrape it and duplicate
                    # rows, so stop paging through this tournament.
                    print(f"Could not advance past page {page + 1} for tournament ID {tour_id}: {e}")
                    break
    finally:
        browser.quit()

    print(len(all_tournament_ids), len(all_rounds), len(all_scores), len(all_types), len(all_winner_ids), len(all_loser_ids))

    data = {
        "Tournament_ID": all_tournament_ids,
        "Type": all_types,
        "Round": all_rounds,
        "Winner": all_winner_ids,
        "Loser": all_loser_ids,
        "Score": all_scores
    }
    df = pd.DataFrame(data)
    df.to_csv(output_file, index=False)
    print(f"Data exported to {output_file}")

    return df

## Full list of tournament IDs for the web scraper to visit

In [None]:
# Tournament IDs for the scraper to visit; intended as the `tour_ids` argument
# of scrape(), which uses each value as the `id` query parameter of a
# tournament page URL.
# NOTE(review): nothing in the visible cells passes this list to scrape();
# confirm where the full run is launched.
tour_id=[10700,10426,10073,13659,10786,12160,12485,12084,11125,13052,13352,13382,10318,14007,12770,
12614,10395,12867,12398,10827,13984,11220,10198,12257,11301,10745,13514,11272,10665,10557,
13212,12282,11287,13450,12553,12856,14173,13751,12788,10516,12136,12930,11755,11154,10342,
14003,12691,11376,13615,13286,11470,13278,10767,11677,13046,12658,11865,14055,13990,11785,
12229,14042,13876,13111,12073,10489,13494,13950,10641,11433,12241,11054,13183,12891,13735,
10777,14041,11262,12838,10583,11424,10533,12873,13115,13826,10348,11617,13887,13445,13483,
12727,10231,12852,14044,12094,11932,11193,12970,13035,13893,12063,13173,13216,11133,10817,
14174,13925,10614,12491,11858,11040,10001,12077,12523,13570,13056,13878,12117,10711,14030,
13890,12494,10361,12876,11806,13592,13811,11171,13584,10355,13200,13602,10682,13874,10107,
13830,13673,12149,10304,11851,12462,12591,13010,13102,11603,12908,11751,11291,12497,10417,
14018,11849,10148,13203,10260,10646,13595,12605,13174,11443,12057,14317,13500,11323,13008,
12281,12388,12025,14211,11289,14188,10385,14111,13070,11412,10128,14328,12550,13498,12786,
12482,13300,11695,12866,10365,13949,10564,13892,13065,11855,13710,14337,10108,13201,12725,
10685,13339,13132,10754,10094,12585,10692,11127,13583,10618,11654,14357,10050,10281,11675,
11582,10197,12697,11538,10581,11545,14124,11361,12815,13230,10722,12342,11208,11961,12316,
11622,11286,11246,11816,11610,10740,12988,10057,10410,11371,11704,13007,12730,12005,13944,
10118,12340,13952,12251,10936,10297,10324,12578,12932,13736,10640,10982,10022,12677,14303,
13868,10321,12756,13847,11696,12937,12631,10507,11030,13971,13466,13449,10054,13096,14309,
11227,13222,13376,13083,12085,11401,13692,11837,11628,12274,10161,11632,11024,10121,10613,
13732,11644,11575,13647,10149,10945,12664,12637,10312,12740,12940,13556,12881,10097,11519,
10597,12672,11877,13706,12280,10087,13075,11222,13899,10173,12750,10429,10363,11131,12466,
11904,14056,12738,10278,10903,10019,13614,13258,11126,11112,12777,13885,12519,11269,10496,
12434,13279,10553,11596,12029,11670,14054,13404,11967,13120,12558,12134,11502,11149,11107,
10127,11129,14048,10483,10601,10369,12590,10368,13272,12423,11699,12444,12936,10606,12075,
10404,10549,11836,13002,11957,13758,10460,11438,11352,11068,13600,10237,11325,11769,12662,
13616,12703,12758,14119,12246,11623,10792,12789,13733,14012,10882,13154,11756,13223,11067,
10205,11868,11927,12414,12883,12304,13639,12966,14155,12507,13418,13067,13231,10812,12561,
11754,11591,11386,11179,13759,11520,13064,12757,12574,11261,12235,13749,14011,12395,10701,
10590,13412,12372,12830,13432,13772,10505,11419,11372,11803,11086,11574,13121,14217,13040,
11113,11840,14066,10901,13997,13740,11752,12682,10220,10967,10911,14004,11234,14240,14083,
10067,12031,12511,12652,13552,12236,11275,10183,10452,12468,10298,12625,13821,14262,10287,
11867,11843,12825,10524,12675,11944,11991,11528,10033,12071,12111,12021,13426,14277,13959,
12486,10236,12096,11884,10289,13565,11077,12683,10930,11189,12836,10939,11489,14289,10998,
11841,11954,10923,13972,11860,13983,12748,10931,14029,11671,11953,13221,13795,10710,13001,
13663,13125,12037,10520,12178,13170,10381,12624,11425,11377,10353,12205,13645,13579,10088,
11160,11267,12047,10155,13652,13495,12896,11492,13051,12514,12291,12365,11692,10712,12232,
12292,13044,12967,13588,13988,11791,13918,13934,13699,12437,12736,11996,11878,11159,12287,
13575,14069,10244,10357,10316,10335,11239,13392,11719,11772,13431,10034,10039,13532,13232,
12979,13596,13995,14117,12559,10229,12215,10598,11167,13502,12557,12004,13684,13986,13881,
14205,11564,10380,12010,11727,13662,12320,13061,12698,10803,13140,11608,13249,13079,11743,
12183,10474,12983,11871,10623,13982,12821,13108,10938,12214,10014,13473,14167,10394,12144,
11428,13357,13049,14096,10181,11351,12912,12646,10430,10423,11214,13916,13837,12140,11190,
11621,12038,13366,12331,13110,13274,12828,13621,14377,14380,10214,12396,11387,10586,11612,
13669,14179,11034,12204,10860,12643,13867,11911,10279,13536,14046,12145,12457,12950,12710,
12622,13905,10857,10977,10082,10401,11123,10758,13238,13695]

## Example run

In [None]:
# Example run: scrape a single tournament and write its rows to a CSV file.
example = [14410]
scrape(tour_ids=example, output_file="RAW_example.csv")