In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [2]:
# create url for specified league and year
league = 'urc'
year = 2022

if league in ['urc', 'premiership', 'top-14', 'champions-cup']:
    if year < 2025:
        url = f"https://all.rugby/tournament/{league}-{str(year)}/fixtures-results"
    elif year == 2025:
        url = f"https://all.rugby/tournament/{league}/fixtures-results"
    print(url)
else: 
    print("league not recognised")

print(league, year)

https://all.rugby/tournament/urc-2022/fixtures-results
urc 2022


In [3]:
# links_df containing the end of the url for each match
# scores is a list containing the score or the time of fixture if the hasnt happened yet
def get_links_df(url):

    # text of fixtures page
    fixture_text = requests.get(url, headers={ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
                                                AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15"})
    fixture_soup = BeautifulSoup(fixture_text.text)
    li = fixture_soup.find_all("li", class_="clearfix")

    fixture_links = [] # list of links to each fixture
    scores = [] # scores of matches

    # iterate over fixture li elements, getting url and score for match
    for i in li:
        scores.append(i.text.split("\n")[3])
        fix_link = i.find("a")['href']
        fixture_links.append(fix_link)
        
    # create df with all fixture links in it
    links_df = pd.DataFrame(fixture_links,columns=['links'])

    return links_df, scores

links_df, scores = get_links_df(url)

print(links_df.shape)

(151, 1)


In [4]:
# parse the mins of events in the match
def parse_mins(messy_mins):
    if messy_mins is np.nan:
        mins=''
        n=0
    else:
        mins = re.findall(r'[\d]+', messy_mins)
        n = len(mins)
        mins = '_'.join(mins)
    return mins, n


# list to store data from each fixture
list_of_match_dataframes = []
list_of_player_dataframes = []

count_ = 0
# loop over links and get match data
for link in links_df['links']:
    count_+=1
    if count_ % 25 == 0:
        print(count_)
        
    match_link = "https://all.rugby" + link
    match_html_text = requests.get(match_link, 
       headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) \
                        AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15" }).text

    # date, time, venue
    match_soup = BeautifulSoup(match_html_text)
    match_meta = match_soup.find("div", class_="txtcenter").text
    match_meta_str = match_meta.replace("\t", "").replace("\n", " ")

    # time of match on website, french time by default
    french_time = re.findall('\d\d:\d\d', match_meta_str)[0]

    # stadium
    stadium_start_index = re.search('Venue : ', match_meta_str).span()[1]
    stadium_end_index = re.search(' Tournament :', match_meta_str).span()[0]
    stadium = match_meta_str[stadium_start_index:stadium_end_index]

    # date
    date_start_index = re.search('Date :  ', match_meta_str).span()[1]
    date_end_index = re.search(' Kick Off :', match_meta_str).span()[0]
    match_date = match_meta_str[date_start_index:date_end_index]

    dfs = pd.read_html(match_html_text)
    # print(f"Number of dataframes: {len(dfs)}")

    # fixture, we dont know team news - just grab meta and home + away team
    if len(dfs) in [5,6,7]:
        # print("Fixture - no team news")
        home_team = dfs[0].columns[0]
        away_team = dfs[0].columns[2]
        match_df = pd.DataFrame([{"Home team": home_team,
                                "Away team": away_team,
                                 "Stadium": stadium,
                                 "Match Date": match_date,
                                 "Match Time French": french_time}])
        list_of_match_dataframes.append(match_df)
    
    # matches called off by covid, ignore
    if 'Coronavirus' in match_meta_str:
        print("Covid postponed")
        continue

    if 'blessures' in match_meta_str:
        print("Blessures postponed")
        continue

    # matches that havent happened yet but we know team news
    elif len(dfs) == 9:
        # pack weight and age
        meta_df = dfs[2]

        home_team = meta_df.columns[0]
        away_team = meta_df.columns[2]

        cols = meta_df['VS'].tolist()
        cols.append('team')
        data = meta_df[[home_team, away_team]].T

        data['team'] = data.index
        data.columns = cols

        data_home = data.head(1)
        data_away = data.tail(1)

        data_home = data_home.add_prefix("Home ").reset_index(drop=True)
        data_away = data_away.add_prefix("Away ").reset_index(drop=True)

        match_df = pd.concat([data_home, data_away], axis=1)

        match_df["Stadium"] = stadium
        match_df["Match Date"] = match_date,
        match_df["Match Time French"] = french_time
        
        list_of_match_dataframes.append(match_df)
    
    # if the match is a past result
    elif len(dfs) < 5:
        # pack weight and age
        meta_df = dfs[1]

        home_team = meta_df.columns[0]
        away_team = meta_df.columns[2]

        cols = meta_df['VS'].tolist()
        cols.append('team')
        data = meta_df[[home_team, away_team]].T

        data['team'] = data.index
        data.columns = cols

        data_home = data.head(1)
        data_away = data.tail(1)

        data_home = data_home.add_prefix("Home ").reset_index(drop=True)
        data_away = data_away.add_prefix("Away ").reset_index(drop=True)

        match_df = pd.concat([data_home, data_away], axis=1)
        # match_df['meta'] = match_meta_str

        match_df["Stadium"] = stadium
        match_df["Match Date"] = match_date,
        match_df["Match Time French"] = french_time

        # Players in match with events and substitutions 
        # dfs[0]

        match_events = dfs[0]

        # n_tries - must account for penalty tries appearing a row after the players if one is present.
        if match_events['Try'][24] == 'Replacements':
            last_index = 24
            home_pen_try_str = ''
            home_n_pen_tries = 0
            
            away_pen_try_str = ''
            away_n_pen_tries = 0
            
        else: # pen try scored
            last_index = 25
            home_pen_try_mins_messy = match_events['Try'].iloc[24]
            home_pen_try_str, home_n_pen_tries = parse_mins(home_pen_try_mins_messy)

            away_pen_try_mins_messy = match_events['Try.1'].iloc[24]
            away_pen_try_str, away_n_pen_tries = parse_mins(away_pen_try_mins_messy)

        # tries
        home_try_mins_messy = ' '.join(match_events['Try'].iloc[:last_index].dropna().values)
        home_tries_str, home_n_tries = parse_mins(home_try_mins_messy)

        away_try_mins_messy = ' '.join(match_events['Try.1'].iloc[:last_index].dropna().values)
        away_tries_str, away_n_tries = parse_mins(away_try_mins_messy)

        # n_penalties
        home_pen_kicks_mins_messy = ' '.join(match_events['Penalty'].iloc[:24].dropna().values)
        home_pen_kicks_str, home_n_pen_kicks = parse_mins(home_pen_kicks_mins_messy)

        away_pen_kicks_mins_messy = ' '.join(match_events['Penalty.1'].iloc[:24].dropna().values)
        away_pen_kicks_str, away_n_pen_kicks = parse_mins(away_pen_kicks_mins_messy)

        # n_conversions
        home_conversions_mins_messy = ' '.join(match_events['Conversion'].iloc[:24].dropna().values)
        home_conversions_str, home_n_conversions = parse_mins(home_conversions_mins_messy)

        away_conversions_mins_messy = ' '.join(match_events['Conversion.1'].iloc[:24].dropna().values)
        away_conversions_str, away_n_conversions = parse_mins(away_conversions_mins_messy)


        match_df['home_n_tries'] = home_n_tries
        match_df['home_n_conversions'] = home_n_conversions
        match_df['home_n_pen_kicks'] = home_n_pen_kicks
        match_df['home_n_pen_tries'] = home_n_pen_tries

        match_df['away_n_tries'] = away_n_tries
        match_df['away_n_conversions'] = away_n_conversions
        match_df['away_n_pen_kicks'] = away_n_pen_kicks
        match_df['away_n_pen_tries'] = away_n_pen_tries

        match_df['mins_of_home_tries'] = home_tries_str
        match_df['mins_of_home_conversions'] = home_conversions_str
        match_df['mins_of_home_pen_kicks'] = home_pen_kicks_str
        match_df['mins_of_home_pen_tries'] = home_pen_try_str

        match_df['mins_of_away_tries'] = away_tries_str
        match_df['mins_of_away_conversions'] = away_conversions_str
        match_df['mins_of_away_pen_kicks'] = away_pen_kicks_str
        match_df['mins_of_away_pen_tries'] = away_pen_try_str

        list_of_match_dataframes.append(match_df)
        



25
50
75
100
125
150


In [7]:
matches_df = pd.concat(list_of_match_dataframes)

matches_df.head(2)

Unnamed: 0,Home Pack weight (average),Home Forwards average age,Home Backs average age,Home Tallest player,Home Differents nationalities for starters,Home Differents nationalities for all the team,Home team,Away Pack weight (average),Away Forwards average age,Away Backs average age,...,away_n_pen_kicks,away_n_pen_tries,mins_of_home_tries,mins_of_home_conversions,mins_of_home_pen_kicks,mins_of_home_pen_tries,mins_of_away_tries,mins_of_away_conversions,mins_of_away_pen_kicks,mins_of_away_pen_tries
0,924 kg (115.50 kg),26 ans,26 ans,198 cm,Italy : 87% (13/15) England : 7% (1/15) Samoa ...,Italy : 91% (21/23) England : 4% (1/23) Samoa ...,Zebre,919 kg (114.88 kg),27 ans,25 ans,...,1,0,51_61_64_57,51_62,,57.0,32_19_21_15_10,11_16_19_22_33,42,
0,929 kg (116.13 kg),28 ans,27 ans,203 cm,Wales : 80% (12/15) Moldova : 7% (1/15) Tonga ...,Wales : 87% (20/23) Moldova : 4% (1/23) Tonga ...,Cardiff,892 kg (111.50 kg),26 ans,26 ans,...,3,0,38_50_69_9_73,40_51_70_10,,,12_80,13,3_36_57,


In [8]:
matches_df.columns

Index(['Home Pack weight (average)', 'Home Forwards average age',
       'Home Backs average age', 'Home Tallest player',
       'Home Differents nationalities for starters',
       'Home Differents nationalities for all the team', 'Home team',
       'Away Pack weight (average)', 'Away Forwards average age',
       'Away Backs average age', 'Away Tallest player',
       'Away Differents nationalities for starters',
       'Away Differents nationalities for all the team', 'Away team',
       'Stadium', 'Match Date', 'Match Time French', 'home_n_tries',
       'home_n_conversions', 'home_n_pen_kicks', 'home_n_pen_tries',
       'away_n_tries', 'away_n_conversions', 'away_n_pen_kicks',
       'away_n_pen_tries', 'mins_of_home_tries', 'mins_of_home_conversions',
       'mins_of_home_pen_kicks', 'mins_of_home_pen_tries',
       'mins_of_away_tries', 'mins_of_away_conversions',
       'mins_of_away_pen_kicks', 'mins_of_away_pen_tries'],
      dtype='object')

In [9]:
def format_cols(col_name):
    col_name = re.sub(r'[()]', '', col_name)
    col_name = col_name.lower()
    col_name = col_name.replace(' ', '_')
    return col_name
    
matches_df.columns = [format_cols(c) for c in matches_df.columns]
matches_df.columns

Index(['home_pack_weight_average', 'home_forwards_average_age',
       'home_backs_average_age', 'home_tallest_player',
       'home_differents_nationalities_for_starters',
       'home_differents_nationalities_for_all_the_team', 'home_team',
       'away_pack_weight_average', 'away_forwards_average_age',
       'away_backs_average_age', 'away_tallest_player',
       'away_differents_nationalities_for_starters',
       'away_differents_nationalities_for_all_the_team', 'away_team',
       'stadium', 'match_date', 'match_time_french', 'home_n_tries',
       'home_n_conversions', 'home_n_pen_kicks', 'home_n_pen_tries',
       'away_n_tries', 'away_n_conversions', 'away_n_pen_kicks',
       'away_n_pen_tries', 'mins_of_home_tries', 'mins_of_home_conversions',
       'mins_of_home_pen_kicks', 'mins_of_home_pen_tries',
       'mins_of_away_tries', 'mins_of_away_conversions',
       'mins_of_away_pen_kicks', 'mins_of_away_pen_tries'],
      dtype='object')

In [10]:
import os
os.chdir('..')
os.getcwd()

'/Users/benmurphy/Projects/rugby/rugby'

In [11]:
len(scores)

151

In [12]:
scores = [i for i in scores if i != 'cancelled']
matches_df['match_result'] = scores

out_path = f"0_data/match_data/{league}_{year}.csv"
out_path

'0_data/match_data/urc_2022.csv'

In [13]:
matches_df.to_csv(out_path, index=False)