# Player Stats Scraper
This notebook is used to scrape player stats from rugbypass.com

There are two steps to getting player data.

1.   Scrape the roster of each team from the players section of the team's page
2.   Scrape the stats of every player on each team's roster

In [None]:
# Packages needed to request pages from rugbypass.com, parse URLs for player identifiers, and extract data from HTML with Beautiful Soup
import numpy as np
import pandas as pd
import regex as re
import requests
from bs4 import BeautifulSoup

## Functions to scrape rosters and player stats

The two defined functions for this task are:

1.   `get_roster` takes in the team and returns a list of players with their rugbypass.com identifiers
2.   `player_stats` takes a list of teams and a year, and returns the stats of the players on those rosters


In [None]:
def get_roster(country):
  """
  Scrape the rugbypass.com team page of a country or team for its players.

  Arguments:
    country: string matching the rugbypass team name used in the team URL
      (it is lower-cased before the request is made)

  Returns:
    list of tuples (name, position, team, href, team_id, player_id), one per
    player. The list is empty when the page has no players section or when an
    id cannot be read from a player's image URL.

  """
  country = country.lower()
  response = requests.get('https://www.rugbypass.com/teams/' + country + "/")
  soup = BeautifulSoup(response.content, "html.parser")
  sections = soup.find_all('div', class_='players')

  try:
    roster = sections[0]
    names = [span.text for span in roster.find_all('span', class_='name')]
    positions = [span.text for span in roster.find_all('span', class_='position')]
    links = [anchor['href'] for anchor in roster.find_all('a', href=True)]
    # Both ids are read from the src of the image inside each player link.
    anchors = roster.find_all('a')
    team_ids = [re.findall(r'players\/(\d+)\/', anchor.img['src'])[0] for anchor in anchors]
    player_ids = [re.findall(r'24x24\/(\d+)\.jpg', anchor.img['src'])[0] for anchor in anchors]
  except IndexError:
    return []

  # Every row carries the (lower-cased) team it was scraped from.
  team_column = [country] * len(names)
  return list(zip(names, positions, team_column, links, team_ids, player_ids))

In [None]:
def player_scrape(url,team_id,player_id,year,comp_id='107'):
  """
  Request a player's statistics for one season from their rugbypass player page.

  A multipart form (action=statistics plus season, competition, team and player
  ids) is POSTed to the player page and the response body is parsed. The
  headers and cookies appear to have been captured from a browser request.

  Arguments:
    url: string url of player page on rugbypass
    team_id: id of player's team (string or int)
    player_id: id of player (string or int)
    year: season of the player stats (string or int)
    comp_id: value sent in the form's "comp_id" field; defaults to '107',
      the value this function has always sent

  Returns:
    BeautifulSoup object built from the response body; callers search it for
    the statistics markup.

  """
  # NOTE(review): these cookies were copied from a browser session (they include
  # a PHPSESSID). They are kept so the request stays the same as before, but they
  # may be stale and should not live in source - confirm whether the endpoint
  # needs them at all.
  cookies = {
      '_rdt_uuid': '1673218491479.fc86f9fc-020f-4aa4-9a67-c700c8507634',
      '_gcl_au': '1.1.1833555129.1673218492',
      '_fbp': 'fb.1.1673218494782.250599399',
      '_omappvp': 'HAzlgdNEdBOnII4sq4oRBg3xUiRKdI4GGlA11LFWZ32jo2DAenaDqgbrkLSSjyPbuou58KS1xbLKq2UsdIK6aW1aWM4DaSan',
      '_scid': 'f3b613a0-8d10-4577-a9c9-036c28b16802',
      '_pcid': '%7B%22browserId%22%3A%22lcnz39rcrowhy3bv%22%7D',
      '__pid': '.rugbypass.com',
      '_pbjs_userid_consent_data': '6683316680106290',
      'sharedid': '5b3db131-3ce3-4e0e-8653-0d948073d976',
      '__pat': '-18000000',
      '_pctx': '%7Bu%7DN4IgrgzgpgThIC4B2YA2qA05owMoBcBDfSREQpAeyRCwgEt8oBJAE0RXQF8g',
      '__qca': 'P0-1914388938-1673218494976',
      '__gads': 'ID=ace5f38a52b07e2a:T=1673218500:S=ALNI_MaiwYxpJGR-PqMkAgdQybs3qscCtg',
      '_lr_env_src_ats': 'false',
      '_cc_id': '2f2f5e599f87a95586ec1538e90baf49',
      '__pil': 'en_US',
      'xbc': '%7Bkpex%7DUN1jhsSkgV3bgWhrxVfnCCkB98wFe26T4e82OfXeXPzBHVFQNOVfvkcVfniVNyqt9dkX0HYTrfZaPlu4Qgr3nQ',
      'PHPSESSID': 'tp2tvhs9t9kkued1ssm865on3s',
      '_sctr': '1|1674720000000',
      '__gpi': 'UID=000009268e210f08:T=1673218500:RT=1674785179:S=ALNI_MbTM5RGKYgy_Un98yQP9ZkU_8VJnw',
      'panoramaId': 'e32489e37580f24bb56dc3d355814945a702955f0da999787fda0357a0ea7dd9',
      'panoramaId_expiry': '1675389982816',
      'pbjs-id5id': '%5Bobject%20Object%5D',
      'geo-toggle': 'world',
      '_awl': '2.1674786553.5-539853763e7dd2a7f75bb71f964e1ec5-6763652d75732d7765737431-0',
      'cto_bundle': 'A7IkbF8xRG9EazRRaXhKbGZTUWZnWXBYcTBOS2F0c3NVJTJGYm02UllaRnVWSlFOdnN2d092JTJGUWJ1ZG11VEZsQ00lMkJkJTJGRDN4Z3dqRkZ2JTJCRHZXR3Z2SjhCU211RGFpZ1VJVklhYnp1SDJxenBUY1N6aEpDOXEyalV2NHlXa1BKR25qekVSSktkViUyRjBXMVU2eCUyQnVUZUlWTklEQmY3USUzRCUzRA',
      'cto_bidid': 'FphoSl9lQXdzMVNqbyUyQjhRNFZsMXZMJTJCJTJGTUhpVDFSY05ZVHhuc0hkQmVjRUtsdWp1Tk9CcXduc1pqaHZ5cXpTTVVRQ1JhT09ITXkyV0lRJTJCZE1HVHpQZ3hFVnpqR0tCV0xHamw2JTJGVnJVZUpnOVZUbGslM0Q',
      'ki_r': '',
      '_clck': 'yviytw|1|f8q|0',
      '_gid': 'GA1.2.1915472814.1675139759',
      '_ga': 'GA1.2.272962789.1673218492',
      '_uetsid': 'c2934a30a12011edb7373721ec20393d',
      '_uetvid': '77cb7ef08fa711ed8f98717a6bf77631',
      '__pvi': 'eyJpZCI6InYtMjAyMy0wMS0zMC0yMC0zNS01OS04NDUtbnZWWGU1akN4SlpPZEJ5WC04NDk2YjQ3YjVjM2Y2OWUxMWI0MDFjOGE4MjFkZGViYiIsImRvbWFpbiI6Ii5ydWdieXBhc3MuY29tIiwidGltZSI6MTY3NTE0MDQxMTI4Mn0%3D',
      'ki_t': '1673218511447%3B1675139761531%3B1675140411495%3B5%3B67',
      '__tbc': '%7Bkpex%7Dp-EzmDrzoCBX5XR8bre0AkAqEYEbU7jPs4BANstSyeYbvmDcd22fy9WnW7Z51i6T',
      '_ga_B5LC1CX3WN': 'GS1.1.1675139757.12.1.1675140542.0.0.0',
      '_clsk': 'otfbmr|1675141429365|10|1|l.clarity.ms/collect',
      '_gali': 'player-stats-season',
  }

  # The boundary named in Content-Type must be the one used to delimit the body.
  boundary = '----WebKitFormBoundaryJU1aGo7nGMh72eUT'

  # No explicit 'Cookie' header: it repeated the cookies dict above entry for
  # entry, and requests builds the header from `cookies=` when none is set.
  headers = {
      'Accept': '*/*',
      'Accept-Language': 'en-US,en;q=0.9',
      'Connection': 'keep-alive',
      'Content-Type': 'multipart/form-data; boundary=' + boundary,
      'Origin': 'https://www.rugbypass.com',
      'Referer': url,
      'Sec-Fetch-Dest': 'empty',
      'Sec-Fetch-Mode': 'cors',
      'Sec-Fetch-Site': 'same-origin',
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
      'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
      'sec-ch-ua-mobile': '?0',
      'sec-ch-ua-platform': '"Windows"',
  }

  # Form fields in the order they are sent; str() lets callers pass ints.
  form_fields = [
      ('action', 'statistics'),
      ('season', str(year)),
      ('comp_id', str(comp_id)),
      ('team_id', str(team_id)),
      ('player_id', str(player_id)),
      ('game_id', '0'),
      ('isContent', '1'),
  ]
  data = ''.join(
      '--' + boundary + '\r\nContent-Disposition: form-data; name="' + name + '"\r\n\r\n' + value + '\r\n'
      for name, value in form_fields
  ) + '--' + boundary + '--\r\n'

  page = requests.post(url, cookies=cookies, headers=headers, data=data)
  return BeautifulSoup(page.content, "html.parser")

In [None]:
def get_awh(player_url):
  """
  Scrape a player's age, weight and height from their rugbypass player page.

  Arguments:
    player_url: string url of player page on rugbypass

  Returns:
    tuple (age, weight, height). Each value is the first whole number found in
    the matching element, as a string. If any of the three cannot be found,
    all three are returned as the integer 0.

  """
  response = requests.get(player_url)
  parsed = BeautifulSoup(response.content, "html.parser")
  panels = parsed.find_all('div', class_='a-h-w clearfix')

  def first_number(css_class):
    # str() of the element is searched, i.e. its markup and not only its text.
    element = panels[0].find_all('div', class_=css_class)[0]
    return re.findall(r'\b(\d+)\b', str(element))[0]

  try:
    age, height, weight = [first_number(css_class) for css_class in ("a", "h", "w")]
  except IndexError:
    # Missing panel, missing element or no number: fall back to zeros for all three.
    age = height = weight = 0
  return age, weight, height
  
def player_stats(teams,year):
  """
  Scrape the statistics of every player on the rosters of the given teams.

  Arguments:
    teams: iterable of strings matching rugbypass teams (passed to get_roster)
    year: year of the stats to request (converted to a string for the request)

  Returns:
    dictionary keyed by player name. Each value holds position, age, height,
    weight, team and year plus one entry per scraped statistic label. When a
    player's page has no statistics block, the value holds only position, team
    and year. A name that occurs on several rosters keeps the entry scraped
    last.

  """
  stats = dict()
  # Every team is walked (not just the last one), and an empty `teams` simply
  # yields an empty result.
  for team in teams:
    for player in get_roster(team):
      # The try is per player so that one player without statistics no longer
      # ends the scrape for everybody after them.
      try:
        age, weight, height = get_awh(player[3])
        response = player_scrape(player[3],player[4],player[5],str(year)) ## get stats
        blocks = response.find_all('div', class_="col-full all-stats-col")
        labels = [x.text for x in blocks[0].find_all('div', class_='label')]
        values = [x.text for x in blocks[0].find_all('div', class_='value')]
      except (IndexError,KeyError):
        stats[player[0]] = {
            'position': player[1],
            'team': player[2],
            'year':year}
        continue

      stats[player[0]] = {
          'position': player[1],
          'age': age,
          'height': height,
          'weight': weight,
          'team': player[2],
          'year':year}
      stats[player[0]].update(dict(zip(labels,values)))
  return stats

## 2023 Qualified Teams and Player Stats

We get the list of qualified teams from https://www.rugbyworldcup.com/2023/teams

Starting from that list, we looked up each team's URL on rugbypass.com so that the names below match the website exactly. We iterate through the teams list to get each roster, and then loop through every player of every team for a given year to scrape stats. The stats are assigned to a dictionary, and a dataframe is created from it for export.

In [None]:
# Team names as they are used in rugbypass.com team URLs (see get_roster).
# NOTE(review): 'bordeux', 'glasglow' and 'wasp' look misspelled - confirm against
# the site; get_roster returns an empty list when a page has no players section.
teams = [
    'france', 'new-zealand', 'italy', 'uruguay', 'namibia', 'south-africa',
    'ireland', 'scotland', 'tonga', 'romania', 'wales', 'australia',
    'fiji', 'georgia', 'england', 'japan', 'argentina', 'samoa',
    'chile', 'portugal', 'brazil', 'canada', 'germany', 'hong-kong',
    'ireland-a', 'kenya', 'maori-all-blacks', 'netherlands', 'russia', 'south-africa-a',
    'spain', 'usa', 'agen', 'barbarians', 'bath', 'bay-of-plenty',
    'bayonne', 'benetton', 'biarritz', 'blue-bulls', 'blues', 'bordeux',
    'bristol', 'lions', 'brive', 'brumbies', 'bulls', 'canterbury',
    'cardiff-blues', 'castres', 'cheetahs-1', 'cheetahs', 'chiefs', 'clermont',
    'connacht', 'crusaders', 'edinburgh', 'exeter-chiefs', 'glasglow', 'gloucester',
    'grenoble', 'harlequins', 'highlanders', 'hurricanes', 'jaguares', 'la-rochelle',
    'leicester', 'leinster', 'london-irish', 'lyon', 'montpellier', 'munster',
    'racing-92', 'rebels', 'sale', 'saracens', 'toulon', 'toulouse',
    'ulster', 'waratahs', 'wasp', 'worcester',
]

# One request per team page; the cells below read rosters from this mapping.
team_rosters = {}
for team in teams:
    team_rosters[team] = get_roster(team)

In [None]:
# Spot-check one roster entry: (name, position, team, href, team_id, player_id).
team_rosters['france'][0]

('Anthony Jelonch',
 'Flanker 7',
 'france',
 'https://www.rugbypass.com/players/anthony-jelonch/',
 '806',
 '2403')

In [None]:
# Age / weight / height per player, keyed by player name.
players = dict()

for team in teams:
  for player in team_rosters[team]:
    # `players` is keyed by name while `player` is the whole roster tuple, so the
    # name is what has to be tested; a name already scraped is not requested again.
    if player[0] not in players:
      try:
        age,weight,height = get_awh(player[3])
        players[player[0]] = {'age':age,
                              'weight':weight,
                              'height':height}
      except Exception:
        # Best effort: keep the player with missing values when the request or
        # parsing fails. Exception (not a bare except) so that interrupting the
        # kernel still stops the loop.
        players[player[0]] = {'age':np.nan,
                              'weight':np.nan,
                              'height':np.nan}

In [None]:
# One row per player name (the dict keys become the index); columns are age, weight, height.
awhdf = pd.DataFrame.from_dict(players, orient='index')

In [23]:
# Spot-check a few random rows; a row of 0, 0, 0 is get_awh's fallback when nothing could be parsed.
awhdf.sample(4)

Unnamed: 0,age,weight,height
Mathieu Tanguy,26,119,194
David Hawkshaw,23,85,175
Nic Dolly,0,0,0
Dylan Cretin,25,101,195


In [24]:
# Writes under /content/drive, so the drive.mount cell must have been run before this one.
# index=True keeps the player names (the frame's index) as the first CSV column.
awhdf.to_csv('/content/drive/Shareddrives/STUDENT-milestone-II/data/rugbypass-playerahw2022.csv',index=True)

In [None]:

# Roster tuples extended with (age, weight, height), one entry per roster row.
players = list()

for team in teams:
  for player in team_rosters[team]:
    try:
      awh = get_awh(player[3])
    except Exception:
      # Best effort: keep the row with missing values when the lookup fails.
      # Exception (not a bare except) so that interrupting the kernel still
      # stops the loop.
      awh = (np.nan,np.nan,np.nan)
    players.append(player + awh)

In [None]:
# Position plus age / height / weight for every rostered player, keyed by player name.
stats_awh = {}
for team in teams:
  for player in team_rosters[team]:
      player_name, player_position, player_url = player[0], player[1], player[3]
      age, weight, height = get_awh(player_url)
      stats_awh[player_name] = dict(
          position=player_position,
          age=age,
          height=height,
          weight=weight)

In [None]:
# One row per player name; columns are position, age, height, weight.
statsdf = pd.DataFrame.from_dict(stats_awh, orient='index')

In [None]:
# Summary of statsdf: its columns, their dtypes and non-null counts.
statsdf.info()

In [None]:
# Mount Google Drive at /content/drive (Colab only). The awhdf.to_csv export writes under
# this mount point, so this cell has to be executed before that export.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
