In [1]:
"""
Module to set up Chrome Web Driver for Scraping.

This module provides a function to set up the Chrome Web Driver with specific options
for automated web scraping tasks on the NRL website.

This script was taken from https://github.com/beauhobba/NRL-Data/.

Future work may be needed to make this more efficient as it takes ~8 minutes to run each year.
"""

import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chromedriver_autoinstaller.install()

def set_up_driver():
    """Build the Chrome WebDriver used for scraping.

    Chrome is launched with certificate errors ignored, in headless mode,
    with ``log-level=3`` and with the ``enable-logging`` switch excluded.

    :return: WebDriver object for Chrome
    """
    chrome_options = Options()

    # Switches are added in this order:
    #   1. ignore certificate errors (annoying messages from the NRL website)
    #   2. run Selenium in headless mode
    #   3. set Chrome's log level to 3
    for switch in ('--ignore-certificate-errors', '--headless', 'log-level=3'):
        chrome_options.add_argument(switch)

    # Exclude logging to assist with errors caused by the NRL website
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

    return webdriver.Chrome(options=chrome_options)

In [2]:
"""
Webscraper for finding NRL data related to team statistics
"""
from bs4 import BeautifulSoup
#from utilities.set_up_driver import set_up_driver

import sys
sys.path.append('..')
sys.path.append('..')
#import ENVIRONMENT_VARIABLES as EV

def get_nrl_data(round=1, year=1):
    """Scrape the NRL draw page for one round and return its matches.

    The draw page for ``round``/``year`` is loaded in a fresh Chrome session,
    and every match box found on it is converted into a dictionary.

    :param round: round number placed in the ``round`` query parameter
    :param year: season placed in the ``season`` query parameter
    :return: ``{"<round>": [match, ...]}`` where each match is a dict with the
        keys Details, Date, Home, Home_Score, Away, Away_Score and Venue.
        A field whose HTML element is absent from a match box is returned as
        an empty string instead of aborting the whole round.
    """
    url = f"https://www.nrl.com/draw/?competition=111&round={round}&season={year}"

    # Webscrape the NRL website. The browser is closed in ``finally`` so a
    # failed page load does not leave a Chrome process behind.
    driver = set_up_driver()
    try:
        driver.get(url)
        page_source = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")
    # Each match on the draw page lives in one of these boxes
    match_elements = soup.find_all(
        "div", class_="match o-rounded-box o-shadowed-box")

    # (output key, html tag, css class) for every attribute taken from a match box
    field_selectors = [
        ("Details", "h3", "u-visually-hidden"),
        ("Date", "p", "match-header__title"),
        ("Home", "p", "match-team__name--home"),
        ("Home_Score", "div", "match-team__score--home"),
        ("Away", "p", "match-team__name--away"),
        ("Away_Score", "div", "match-team__score--away"),
        ("Venue", "p", "match-venue o-text"),
    ]

    def _field_text(match_element, tag, css_class):
        # ``find`` returns None when the element is missing from this box
        node = match_element.find(tag, class_=css_class)
        return node.text.strip() if node is not None else ""

    def _clean_score(raw_score):
        # Score elements carry filler words around the number
        return raw_score.replace("Scored", "").replace("points", "").strip()

    matches_json = []
    for match_element in match_elements:
        raw = {key: _field_text(match_element, tag, css_class)
               for key, tag, css_class in field_selectors}

        matches_json.append({
            "Details": raw["Details"].replace("Match: ", ""),
            "Date": raw["Date"],
            "Home": raw["Home"],
            "Home_Score": _clean_score(raw["Home_Score"]),
            "Away": raw["Away"],
            "Away_Score": _clean_score(raw["Away_Score"]),
            "Venue": raw["Venue"].replace("Venue:", "").strip(),
        })

    return {f"{round}": matches_json}

In [4]:
# Imports
#from nrl.scraping.utilities.get_nrl_data import get_nrl_data
import json

years = [2024]

if __name__ == "__main__":
    # One {"<year>": [round results]} entry is collected per season
    match_json_datas = []
    for year in years:
        year_json_data = []
        # NOTE: range(1, 2) visits round 1 only. Widen the range to cover a
        # full season (in 2024 the Grand Final was round 31).
        for round_nu in range(1, 2):
            try:
                # Fetch the round and keep its {"<round>": matches} result
                year_json_data.append(get_nrl_data(round_nu, year))
            except Exception as ex:
                # A failed round is reported and skipped
                print(f"Error: {ex}")
        match_json_datas.append({f"{year}": year_json_data})

    # Everything is nested under a single competition key
    overall_data = {"NRL": match_json_datas}

    # Serialise with indentation for readability, then save to disk
    overall_data_json = json.dumps(overall_data, indent=4)
    with open("D:\\Downloads\\match_data.json", "w") as file:
        file.write(overall_data_json)

In [None]:
import json
import re
# Convert JSON to table format

# Read the scraped match data back from disk
with open('D:\\Downloads\\match_data_2024.json', 'r') as file:
    json_data = json.loads(file.read())

# Function to remove special characters and new line characters from a string
def clean_text(text):
    """Reduce a scraped string to a single clean line of ASCII text.

    The steps run in this order:

    1. drop every run of non-ASCII characters;
    2. turn newlines into spaces and strip the ends;
    3. cut everything from the first run of five spaces onwards;
    4. cut everything from ``Home of the`` onwards;
    5. strip again, because steps 3 and 4 can leave the separating
       whitespace behind at the end of the string.

    :param text: raw string to clean
    :return: cleaned string without leading or trailing whitespace
    """
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    text = text.replace('\n', ' ').strip()  # Remove new line characters and strip leading/trailing spaces
    text = re.sub(r'     .*', '', text)  # Remove text after multiple spaces
    text = re.sub(r'Home of the.*', '', text)
    # The truncations above run after the first strip, so strip once more
    return text.strip()

# Flatten the nested JSON (competition -> year -> round -> matches) into rows
headers = ["Competition", "Year", "Round", "Details", "Date", "Home", "Home_Score", "Away", "Away_Score", "Venue"]
table_data = []

# The first three columns come from the nesting levels; the rest are match keys
match_columns = headers[3:]

for competition, season_list in json_data.items():
    for season_entry in season_list:
        for year, round_list in season_entry.items():
            for round_entry in round_list:
                for round_num, matches in round_entry.items():
                    for match in matches:
                        # Clean the 'Venue' item (stored back on the match record)
                        match['Venue'] = clean_text(match['Venue'])
                        table_data.append(
                            [competition, year, round_num]
                            + [match[column] for column in match_columns])

# Write the header line followed by one tab-separated line per match
with open("D:\\Downloads\\nrl_table.txt", "w") as file:
    file.write("\t".join(headers) + "\n")
    file.writelines("\t".join(row) + "\n" for row in table_data)

print("Table has been written to nrl_table.txt")


In [39]:
"""
Webscraper for detailed NRL data related to team statistics
"""

import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
#import sys  # noqa

chromedriver_autoinstaller.install()

def set_up_driver():
    """Build the Chrome WebDriver used for scraping.

    Chrome is launched with certificate errors ignored, in headless mode,
    with ``log-level=3`` and with the ``enable-logging`` switch excluded.

    :return: WebDriver object for Chrome
    """
    chrome_options = Options()

    # Switches are added in this order:
    #   1. ignore certificate errors (annoying messages from the NRL website)
    #   2. run Selenium in headless mode
    #   3. set Chrome's log level to 3
    for switch in ('--ignore-certificate-errors', '--headless', 'log-level=3'):
        chrome_options.add_argument(switch)

    # Exclude logging to assist with errors caused by the NRL website
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

    return webdriver.Chrome(options=chrome_options)


#from utilities.set_up_driver import set_up_driver


#sys.path.append('..')  # noqa
#sys.path.append('..')  # noqa
#import ENVIRONMENT_VARIABLES as EV  # noqa

# Bar-chart statistics, every one starting at the placeholder value -1.
# The key order matters: get_detailed_nrl_data pairs these keys positionally
# with the bar-chart labels it finds on the page.
BARS_DATA: dict = dict.fromkeys(
    (
        'time_in_possession',
        'all_runs',
        'all_run_metres',
        'post_contact_metres',
        'line_breaks',
        'tackle_breaks',
        'average_set_distance',
        'kick_return_metres',
        'offloads',
        'receipts',
        'total_passes',
        'dummy_passes',
        'kicks',
        'kicking_metres',
        'forced_drop_outs',
        'bombs',
        'grubbers',
        'tackles_made',
        'missed_tackles',
        'intercepts',
        'ineffective_tackles',
        'errors',
        'penalties_conceded',
        'ruck_infringements',
        'inside_10_metres',
        'interchanges_used',
    ),
    -1,
)

# Donut-chart statistics, also defaulting to -1 and filled in key order
DONUT_DATA = dict.fromkeys(
    (
        'Completion Rate',
        'Average_Play_Ball_Speed',
        'Kick_Defusal',
        'Effective_Tackle',
    ),
    -1,
)

def _fetch_page_soup(url):
    """Load ``url`` in a new Chrome session and return its HTML parsed by BeautifulSoup."""
    driver = set_up_driver()
    try:
        driver.get(url)
        page_source = driver.page_source
    finally:
        # Close the browser even when the page load raises, so Chrome does not leak
        driver.quit()
    return BeautifulSoup(page_source, "html.parser")


def _extract_try_scorers(soup, side):
    """Return ``(names, minutes)`` parsed from the first summary list for ``side``.

    ``side`` is ``"home"`` or ``"away"``. Each ``<li>`` is split on whitespace:
    the last token is kept as the minute label and the rest as the name.
    Parsing stops at the first problem and whatever was collected is returned.
    """
    names, minutes = [], []
    try:
        # presumably the first list of this class is the try-scorer list - TODO confirm
        li_elements = soup.find(
            "ul", class_=f"match-centre-summary-group__list--{side}").find_all("li")
        for li in li_elements:
            parts = li.get_text(strip=True).split()
            # Both values are computed before either list is touched so the
            # two lists always stay the same length.
            name, minute = ' '.join(parts[:-1]), parts[-1]
            names.append(name)
            minutes.append(minute)
    except Exception:
        print(f"error in {side} try scorers")
    return names, minutes


def _leading_minute(minute_text):
    """Return the first run of ASCII digits in ``minute_text`` as an int, or None."""
    digits = ""
    for char in str(minute_text):
        if char in "0123456789":
            digits += char
        elif digits:
            break
    return int(digits) if digits else None


def _pick_first_try(home, away, home_scorer, home_minute, away_scorer, away_minute):
    """Return ``(scorer, minute, team)`` for the side whose first try came earlier.

    All three values are None when neither side has a scorer. When the minutes
    are equal, or cannot be ordered, the home side is chosen.
    """
    if home_scorer is None and away_scorer is None:
        return None, None, None
    if away_minute is None:
        return home_scorer, home_minute, home
    if home_minute is None:
        return away_scorer, away_minute, away

    # Compare numerically: as strings, "9'" would sort after "12'"
    home_value, away_value = _leading_minute(home_minute), _leading_minute(away_minute)
    if home_value is not None and away_value is not None:
        away_scored_first = away_value < home_value
    else:
        # No usable number on one side: fall back to comparing the raw labels
        away_scored_first = away_minute < home_minute

    if away_scored_first:
        return away_scorer, away_minute, away
    return home_scorer, home_minute, home


def get_detailed_nrl_data(input_years, round_range, home_team, away_team, output_destination=None):
    """Scrape detailed match-centre statistics from nrl.com and write them to JSON files.

    For every year in ``input_years`` and every round in ``round_range`` the
    fixtures ``home_team[i]`` v ``away_team[i]`` are requested. Note that the
    same fixture list is used for every round and year.

    Three files are written, each nested as
    ``{"NRL": [{"<year>": [{"<round>": [match, ...]}]}]}``:

    * ``<output_destination>_match_statistics.json`` - per-team statistics
    * ``<output_destination>_match_details.json`` - first try, referee, conditions
    * ``<output_destination>_match_try.json`` - try scorers and minutes per team

    :param input_years: iterable of seasons to scrape
    :param round_range: iterable of round numbers to scrape
    :param home_team: home team names (a single name is also accepted); spaces
        are replaced by hyphens when the URL is built
    :param away_team: away team names, paired with ``home_team`` by position
    :param output_destination: path prefix of the output files. When omitted,
        the module-level variable ``output_destination`` is used, which is how
        earlier callers supplied it.
    :return: None
    :raises ValueError: if no output destination is given or defined
    """
    if output_destination is None:
        # Backward compatibility with callers that set a module-level variable
        output_destination = globals().get("output_destination")
    if output_destination is None:
        raise ValueError(
            "output_destination must be passed as an argument or defined at module level")

    # Accept a single team name as well as a list of names
    if isinstance(home_team, str):
        home_team = [home_team]
    if isinstance(away_team, str):
        away_team = [away_team]

    # Materialised so they can be iterated again for every year and round
    fixtures = list(zip(home_team, away_team))
    round_numbers = list(round_range)

    # Summary statistics and the headings that identify them on the page;
    # the keys are the lower-cased headings with spaces replaced by underscores.
    summary_defaults = {'tries': -1, 'conversions': -1, 'penalty_goals': -1, 'sin_bins': -1,
                        '1_point_field_goals': -1, '2_point_field_goals': -1, 'half_time': -1}
    summary_words = ['TRIES', 'CONVERSIONS', 'PENALTY GOALS', 'SIN BINS',
                     '1 POINT FIELD GOALS', '2 POINT FIELD GOALS', 'HALF TIME']

    # Competition level data
    competition_team_data = []
    competition_match_data = []
    competition_try_data = []

    for year in input_years:
        print('Scraping data from the ' + str(year) + ' season')

        # Year level data
        year_team_data = []
        year_match_data = []
        year_try_data = []

        for round_number in round_numbers:

            # Round level data
            round_team_data = []
            round_match_data = []
            round_try_data = []

            for home, away in fixtures:
                home, away = [x.replace(" ", "-") for x in [home, away]]

                url = f"https://www.nrl.com/draw/nrl-premiership/{year}/round-{round_number}/{home}-v-{away}/"
                print(f"Scraping Round {round_number} - {url}")

                soup = _fetch_page_soup(url)

                # --- Possession ---
                home_possession, away_possession = None, None
                try:
                    home_possession = soup.find(
                        'p', class_='match-centre-card-donut__value--home').text.strip()
                    away_possession = soup.find(
                        'p', class_='match-centre-card-donut__value--away').text.strip()
                except Exception as error:
                    print(f"Error in home possession {error}")

                # --- Bar-chart statistics ---
                home_bar_labels = soup.find_all(
                    'dd',
                    class_=[
                        "stats-bar-chart__label stats-bar-chart__label--home u-font-weight-700",
                        "stats-bar-chart__label stats-bar-chart__label--home"])
                away_bar_labels = soup.find_all(
                    'dd',
                    class_=[
                        "stats-bar-chart__label stats-bar-chart__label--away u-font-weight-700",
                        "stats-bar-chart__label stats-bar-chart__label--away"])

                home_bars, away_bars = BARS_DATA.copy(), BARS_DATA.copy()
                try:
                    # Labels are matched to BARS_DATA keys purely by position.
                    # NOTE(review): assumes the page lists every bar in that order - confirm
                    for item, bar_name in zip(home_bar_labels, BARS_DATA):
                        home_bars[bar_name] = item.get_text(strip=True)
                    for item, bar_name in zip(away_bar_labels, BARS_DATA):
                        away_bars[bar_name] = item.get_text(strip=True)
                except Exception:
                    print("Error with home bars")

                # --- Donut-chart statistics ---
                home_donut = DONUT_DATA.copy()
                away_donut = DONUT_DATA.copy()
                try:
                    elements = soup.find_all("p", class_="donut-chart-stat__value")
                    # Keep only digits and decimal points from each value
                    numbers = [
                        ''.join(char for char in element.get_text()
                                if char.isdigit() or char == '.')
                        for element in elements]
                    # Values alternate home, away, home, away, ...
                    for key, value in zip(DONUT_DATA, numbers[::2]):
                        home_donut[key] = value
                    for key, value in zip(DONUT_DATA, numbers[1::2]):
                        away_donut[key] = value
                except Exception:
                    print("error in donuts")

                # --- Try scorers ---
                home_try_names_list, home_try_minute_list = _extract_try_scorers(soup, "home")
                away_try_names_list, away_try_minute_list = _extract_try_scorers(soup, "away")

                home_first_try_scorer = home_try_names_list[0] if home_try_names_list else None
                home_first_minute_scorer = home_try_minute_list[0] if home_try_minute_list else None
                away_first_try_scorer = away_try_names_list[0] if away_try_names_list else None
                away_first_minute_scorer = away_try_minute_list[0] if away_try_minute_list else None

                overall_first_try_scorer, overall_first_try_minute, overall_first_scorer_team = _pick_first_try(
                    home, away,
                    home_first_try_scorer, home_first_minute_scorer,
                    away_first_try_scorer, away_first_minute_scorer)

                # --- Match summary statistics ---
                # Start from fresh defaults for every match so that a heading
                # missing on one page does not mark the stat missing on later ones.
                match_summary = dict(summary_defaults)
                name_spans = soup.find_all('span', class_='match-centre-summary-group__name')
                for word in summary_words:
                    exists = any(span.text.strip().upper() == word for span in name_spans)
                    if not exists:
                        # -10 marks a statistic that is not shown on this page
                        match_summary[word.lower().replace(' ', '_')] = -10

                home_game_stats, away_game_stats = match_summary.copy(), match_summary.copy()

                numbers = []
                try:
                    value_spans = soup.find_all(
                        "span", class_="match-centre-summary-group__value")
                    for value_span in value_spans:
                        numbers.append(value_span.span.get_text(strip=True))

                    # Only statistics present on the page consume a value pair
                    present_stats = [key for key, value in match_summary.items() if value != -10]

                    for key, value in zip(present_stats, numbers[::2]):
                        home_game_stats[key] = value
                    for key, value in zip(present_stats, numbers[1::2]):
                        away_game_stats[key] = value
                except Exception as error:
                    print(f"Error with match top data {error}")

                # --- Referees ---
                # ref_names / ref_positions are collected but only the first
                # name is exported at the moment.
                main_ref_name, ref_names, ref_positions = None, [], []
                try:
                    for card in soup.find_all("a", class_="card-team-mate"):
                        name = card.find(
                            "h3", class_="card-team-mate__name").get_text(strip=True)
                        ref_names.append(name)

                        position = card.find(
                            "p", class_="card-team-mate__position").get_text(strip=True)
                        ref_positions.append(position)
                    main_ref_name = ref_names[0]
                except Exception:
                    print("error with ref data")

                # --- Ground and weather conditions ---
                ground_condition, weather_condition = "", ""
                try:
                    for p_element in soup.find_all("p", class_="match-weather__text"):
                        # Text before the colon names the condition; the inner
                        # <span> holds its value.
                        condition_type = p_element.get_text(
                            strip=True).split(":")[0].strip()
                        condition_value = p_element.span.get_text(strip=True)

                        if condition_type == "Ground Conditions":
                            ground_condition = condition_value
                        elif condition_type == "Weather":
                            weather_condition = condition_value
                except Exception:
                    print("error with conditions")

                # --- Assemble the three records for this match ---
                match_key = home + '.v.' + away

                round_team_data.append({
                    match_key: {
                        home: {
                            'home/away': 'home',
                            'possession': home_possession,
                            'first_try_scorer': home_first_try_scorer,
                            'first_try_time': home_first_minute_scorer,
                            **home_bars,
                            **home_donut,
                            **home_game_stats
                        },
                        away: {
                            'home/away': 'away',
                            'possession': away_possession,
                            'first_try_scorer': away_first_try_scorer,
                            'first_try_time': away_first_minute_scorer,
                            **away_bars,
                            **away_donut,
                            **away_game_stats
                        }
                    }
                })

                round_match_data.append({
                    match_key: {
                        'overall_first_try_scorer': overall_first_try_scorer,
                        'overall_first_try_minute': overall_first_try_minute,
                        # Holds the team that scored first; the key name is
                        # kept so the output schema does not change.
                        'overall_first_try_round': overall_first_scorer_team,
                        'main_ref': main_ref_name,
                        'ground_condition': ground_condition,
                        'weather_condition': weather_condition
                    }
                })

                round_try_data.append({
                    match_key: {
                        home: {
                            'home/away': 'home',
                            'try_names': home_try_names_list,
                            'try_minutes': home_try_minute_list,
                        },
                        away: {
                            'home/away': 'away',
                            'try_names': away_try_names_list,
                            'try_minutes': away_try_minute_list,
                        }
                    }
                })

            # Add every round (not just the last one) to the year data
            year_team_data.append({f"{round_number}": round_team_data})
            year_match_data.append({f"{round_number}": round_match_data})
            year_try_data.append({f"{round_number}": round_try_data})

        # Add every year (not just the last one) to the competition data
        competition_team_data.append({f"{year}": year_team_data})
        competition_match_data.append({f"{year}": year_match_data})
        competition_try_data.append({f"{year}": year_try_data})

    # Write each data set to its own JSON file
    outputs = [
        ("match_statistics", {"NRL": competition_team_data}),
        ("match_details", {"NRL": competition_match_data}),
        ("match_try", {"NRL": competition_try_data}),
    ]
    for suffix, master_data in outputs:
        with open(f"{output_destination}_{suffix}.json", "w") as file:
            json.dump(master_data, file, indent=4)

    print(f"Tables has been written as {output_destination}")

if __name__ == "__main__":
    # Round 1 of the 2024 season; fixtures are paired by list position
    years, rounds = [2024], [1]
    home_team_input = ['sea eagles', 'roosters']
    away_team_input = ['rabbitohs', 'broncos']

    # Path prefix for the JSON files; get_detailed_nrl_data reads this
    # module-level name when it writes its output.
    output_destination = "D:\\Downloads\\2024_rd1"

    get_detailed_nrl_data(years, rounds, home_team_input, away_team_input)


Scraping data from the 2024 season
Scraping Round 1: 1 - https://www.nrl.com/draw/nrl-premiership/2024/round-1/sea-eagles-v-rabbitohs/
Scraping Round 1: 1 - https://www.nrl.com/draw/nrl-premiership/2024/round-1/roosters-v-broncos/
Tables has been written as D:\Downloads\2024_rd1
