In [1]:
"""
Module to set up Chrome Web Driver for Scraping.

This module provides a function to set up the Chrome Web Driver with specific options
for automated web scraping tasks.

This script was taken from https://github.com/beauhobba/NRL-Data/.

Future work may be needed to make this more efficient as it takes ~8 minutes to run each year.
"""

import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Runs once when the cell executes, before any driver is created.
# Presumably makes a chromedriver matching the local Chrome available —
# see the chromedriver_autoinstaller documentation to confirm.
chromedriver_autoinstaller.install()

def set_up_driver():
    """Create a headless Chrome WebDriver configured for scraping.

    The browser is started with certificate errors ignored, in headless
    mode, with ``log-level=3``, and with the ``enable-logging`` switch
    excluded so console noise from the scraped site is not echoed.

    :return: a new ``selenium.webdriver.Chrome`` instance; the caller is
        responsible for calling ``quit()`` on it.
    """
    chrome_options = Options()

    # Command-line switches, applied in the same order as before:
    # certificate tolerance, headless mode, then log verbosity.
    for switch in ('--ignore-certificate-errors', '--headless', 'log-level=3'):
        chrome_options.add_argument(switch)

    # Drop Chrome's own "enable-logging" switch to keep the output quiet.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

    return webdriver.Chrome(options=chrome_options)

In [2]:
"""
Webscraper for finding NRL data related to team statistics
"""
from bs4 import BeautifulSoup
#from utilities.set_up_driver import set_up_driver

import sys
sys.path.append('..')
sys.path.append('..')
#import ENVIRONMENT_VARIABLES as EV

def get_nrl_data(round=1, year=1):
    """Scrape the NRL draw page for one round and return its match results.

    Opens ``https://www.nrl.com/draw/?competition=111&round=...&season=...``
    in a fresh web driver, parses the rendered HTML, and extracts one record
    per match box found on the page.

    :param round: Round number placed in the URL; its string form is also the
        single key of the returned dict. (The name shadows the built-in
        ``round`` inside this function; it is kept for caller compatibility.)
    :param year: Season placed in the URL.
    :return: ``{str(round): [match, ...]}`` where every ``match`` is a dict
        with the string-valued keys ``Details``, ``Date``, ``Home``,
        ``Home_Score``, ``Away``, ``Away_Score`` and ``Venue``. The list is
        empty when no match boxes are found.
    :raises AttributeError: if a match box lacks one of the expected
        elements (``find`` returns ``None`` and ``.text`` fails).
    """
    url = f"https://www.nrl.com/draw/?competition=111&round={round}&season={year}"

    # Fetch the rendered page. The driver is always shut down, even when the
    # page load fails, so a failed round does not leave a Chrome process behind.
    driver = set_up_driver()
    try:
        driver.get(url)
        page_source = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")
    # Each match on the draw page sits in one of these boxes.
    match_elements = soup.find_all(
        "div", class_="match o-rounded-box o-shadowed-box")

    # Parallel lists: tag name and CSS class of the element holding each field,
    # in the order details, date, home team, home score, away team, away score,
    # venue.
    # NOTE(review): the home-team selector uses the "match-stats__" prefix while
    # the away-team selector uses "match-team__" — confirm against the page markup.
    find_data = ["h3", "p", "p", "div", "p", "div", "p"]
    class_data = ["u-visually-hidden", "match-header__title", "match-stats__name--home",
                  "match-team__score--home", "match-team__name--away", "match-team__score--away", "match-venue o-text"]

    matches_json = []
    for match_element in match_elements:
        # One stripped text value per (tag, class) pair, unpacked positionally.
        match_details, match_date, home_team, home_score, away_team, away_score, venue = [
            match_element.find(html_val, class_=class_val).text.strip()
            for html_val, class_val in zip(find_data, class_data)]

        # Remove the fixed label text surrounding the actual values.
        match = {
            "Details": match_details.replace("Match: ", ""),
            "Date": match_date,
            "Home": home_team,
            "Home_Score": home_score.replace("Scored", "").replace("points", "").strip(),
            "Away": away_team,
            "Away_Score": away_score.replace("Scored", "").replace("points", "").strip(),
            "Venue": venue.replace("Venue:", "").strip()
        }
        matches_json.append(match)

    round_data = {
        f"{round}": matches_json
    }
    return round_data

In [4]:
# Imports
#from nrl.scraping.utilities.get_nrl_data import get_nrl_data
import json

# Seasons to scrape.
years = [2024]

if __name__ == "__main__":
    match_json_datas = []  # one {year: [round dicts]} entry per season
    for year in years:
        year_json_data = []  # round dicts collected for this season
        # range(1, 2) visits round 1 only; widen the range to cover a whole
        # season (the original note says the 2024 Grand Final was round 31).
        for round_nu in range(1, 2):
            try:
                match_json = get_nrl_data(round_nu, year)
            except Exception as ex:
                # Best effort: report the failure and carry on with the next round.
                print(f"Error: {ex}")
            else:
                year_json_data.append(match_json)

        # Key the season's rounds by the year (as a string).
        year_data = {f"{year}": year_json_data}
        match_json_datas.append(year_data)

    # Top-level document: {"NRL": [ {year: [...]}, ... ]}, pretty-printed.
    overall_data = {"NRL": match_json_datas}
    overall_data_json = json.dumps(overall_data, indent=4)

    # NOTE(review): the later table-conversion cell reads
    # 'match_data_2024.json', not this file name — confirm which is intended.
    with open("D:\\Downloads\\match_data.json", "w") as file:
        file.write(overall_data_json)

In [None]:
import json
import re
# Load the scraped match JSON from disk so it can be flattened into a table below.
# NOTE(review): this reads 'match_data_2024.json', whereas the scraping cell above
# writes 'match_data.json' — confirm the file is renamed by hand or align the paths.

with open('D:\\Downloads\\match_data_2024.json', 'r') as file:
    json_data = json.load(file)

# Function to remove special characters and new line characters from a string
def clean_text(text: str) -> str:
    """Normalise a scraped text value into a single clean line.

    Steps, in order:
      1. drop every run of non-ASCII characters;
      2. turn newlines into spaces and trim the ends;
      3. cut everything from the first run of five spaces onwards;
      4. cut everything from the first ``Home of the`` onwards;
      5. trim again, because steps 3 and 4 can expose trailing whitespace
         (e.g. ``"X Home of the Y"`` would otherwise become ``"X "``).

    :param text: raw string to clean.
    :return: the cleaned string, without leading or trailing whitespace.
    """
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    text = text.replace('\n', ' ').strip()  # Flatten to one line
    text = re.sub(r'     .*', '', text)  # Remove text after multiple spaces
    text = re.sub(r'Home of the.*', '', text)  # Remove the "Home of the ..." suffix
    return text.strip()

# Flatten the nested JSON (competition -> years -> rounds -> matches) into rows.
# The first three headers come from the nesting keys; the rest are match fields.
headers = ["Competition", "Year", "Round", "Details", "Date", "Home", "Home_Score", "Away", "Away_Score", "Venue"]
table_data = []

for competition, years in json_data.items():
    for year_data in years:
        for year, rounds in year_data.items():
            for round_data in rounds:
                for round_num, matches in round_data.items():
                    for match in matches:
                        # The venue is cleaned in place, so json_data itself is updated too.
                        match['Venue'] = clean_text(match['Venue'])
                        row = [competition, year, round_num] + [match[header] for header in headers[3:]]
                        table_data.append(row)

# Write a tab-separated file: one header line, then one line per match.
with open("D:\\Downloads\\nrl_table.txt", "w") as file:
    file.write("\t".join(headers) + "\n")
    file.writelines("\t".join(row) + "\n" for row in table_data)

print("Table has been written to nrl_table.txt")


In [4]:
"""
Webscraper for detailed NRL match data. This includes team statistics, match details, and try scorers.
"""

import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
#import sys  # noqa

# Runs once when the cell executes, before any driver is created.
# Presumably makes a chromedriver matching the local Chrome available —
# see the chromedriver_autoinstaller documentation to confirm.
chromedriver_autoinstaller.install()

# NOTE(review): this repeats the set_up_driver definition from the first cell
# of the notebook; whichever cell ran last is the one in effect.
def set_up_driver():
    """Build and return a headless Chrome WebDriver for scraping.

    Options applied: certificate errors are ignored, Chrome runs headless
    with ``log-level=3``, and the ``enable-logging`` switch is excluded to
    suppress console noise from the scraped site.

    :return: a new ``selenium.webdriver.Chrome`` instance; the caller must
        ``quit()`` it when finished.
    """
    browser_options = Options()

    # Same switches, same order: certificates, headless, log verbosity.
    switches = ['--ignore-certificate-errors', '--headless', 'log-level=3']
    for switch in switches:
        browser_options.add_argument(switch)

    # Remove Chrome's "enable-logging" switch so its log output stays hidden.
    browser_options.add_experimental_option('excludeSwitches', ['enable-logging'])

    chrome = webdriver.Chrome(options=browser_options)
    return chrome


#from utilities.set_up_driver import set_up_driver

#sys.path.append('..')  # noqa
#sys.path.append('..')  # noqa
#import ENVIRONMENT_VARIABLES as EV  # noqa

# Template of per-team statistics, keyed by snake_case stat name.
# Every entry starts at -1 — presumably a "not scraped yet" placeholder;
# confirm against the code that fills it in (not visible in this cell).
BARS_DATA: dict = dict.fromkeys(
    (
        'time_in_possession',
        'all_runs',
        'all_run_metres',
        'post_contact_metres',
        'line_breaks',
        'tackle_breaks',
        'average_set_distance',
        'kick_return_metres',
        'offloads',
        'receipts',
        'total_passes',
        'dummy_passes',
        'kicks',
        'kicking_metres',
        'forced_drop_outs',
        'bombs',
        'grubbers',
        'tackles_made',
        'missed_tackles',
        'intercepts',
        'ineffective_tackles',
        'errors',
        'penalties_conceded',
        'ruck_infringements',
        'inside_10_metres',
        'interchanges_used',
    ),
    -1,
)

# Second template with the same -1 starting value.
# Note that 'Completion Rate' uses a space while the other keys use
# underscores; the keys are kept exactly as they were.
DONUT_DATA = dict.fromkeys(
    (
        'Completion Rate',
        'Average_Play_Ball_Speed',
        'Kick_Defusal',
        'Effective_Tackle',
    ),
    -1,
)



# Entry point for the detailed scrape: choose the fixtures, then run the scraper.
# NOTE(review): get_detailed_nrl_data is not defined in the visible part of this
# notebook — confirm the cell that defines it is present and runs before this one.
if __name__ == "__main__":
    # Home and away team names as parallel lists; the recorded output pairs
    # them by position (e.g. "sea eagles v rabbitohs").
    home_team_input = ['sea eagles', 'roosters']
    away_team_input = ['rabbitohs', 'broncos']
    # Seasons and round numbers to scrape.
    years = [2024]
    rounds = [1]
    # Not passed to the call below — presumably read as a global by
    # get_detailed_nrl_data; verify. (The f-string has no placeholders.)
    output_destination = f"D:\\Downloads\\"
    get_detailed_nrl_data(years, rounds, home_team_input, away_team_input)


Scraping data from the 2024 season
Scraping Round 1: sea eagles v rabbitohs
Match scraping complete
Scraping Round 1: roosters v broncos
Match scraping complete
Tables has been written as D:\Downloads\
  Competition  Year Round                    Game Home/Away Possession  \
0         NRL  2024     1  sea-eagles.v.rabbitohs      home        51%   
1         NRL  2024     1  sea-eagles.v.rabbitohs      away        49%   
2         NRL  2024     1      roosters.v.broncos      home        52%   
3         NRL  2024     1      roosters.v.broncos      away        48%   

     First Try Scorer First Try Time Time In Possession All Runs  ...  \
0  Haumole Olakau'atu             22              26:17      208  ...   
1      Richard Kennar              6              25:45      206  ...   
2         Joseph Manu              4              31:14      213  ...   
3       Deine Mariner             35              27:42      207  ...   

  Average Play Ball Speed Kick Defusal Effective Tackle Tries