In [1]:
# standard library
import re
import time  # limit automated request rate to prevent banning from site
import urllib.request
from pathlib import Path  # create output directories before writing CSV files

# for data management
import numpy as np
import pandas as pd

# for web scraping
from bs4 import BeautifulSoup

In [2]:
# Site root. The per-game links scraped from the game log are site-relative
# and are appended to this prefix to build full box-score URLs.
URL_HEAD = 'https://www.hockey-reference.com'

In [3]:
# Download the TOR 2019 game log page and decode the response body to text.
with urllib.request.urlopen('https://www.hockey-reference.com/teams/TOR/2019_gamelog.html') as f:
    s = f.read().decode()

# Parse the page with the built-in HTML parser for interactive inspection.
# NOTE(review): no cell visible here reads this module-level `soup`; both
# scraping functions below build their own local one -- confirm it is still needed.
soup = BeautifulSoup(s, 'html.parser')

In [4]:
# Helper: scrape the regular-season game log table for one team and season.
def get_game_table(team_season_url):
    """
    Scrape the regular-season game log for one team and season.

    Parameters
    ----------
    team_season_url : str
        Full URL of the team's game log page.

    Returns
    -------
    games_df : pandas.DataFrame
        One row per game, indexed by the table's first header column.
        Cell values are the text scraped from the page (``None`` for cells
        without a single text value); columns that are empty for every game
        are dropped. Columns under the 'Opponent' header group carry an
        '_opp' suffix.
    url_suffix_list : list of (str, str)
        ``(game_date, url_suffix)`` for each game, where ``url_suffix`` is the
        href of the game's link as it appears in the date column.

    Raises
    ------
    ValueError
        If the page has no table captioned 'Regular Season Table'.
    """
    with urllib.request.urlopen(team_season_url) as f:
        soup = BeautifulSoup(f.read().decode(), 'html.parser')

    def cell_text(cell):
        # Convert to a plain str: a NavigableString keeps a reference to the
        # whole parse tree, which would otherwise live on inside the DataFrame.
        text = cell.string
        return None if text is None else str(text)

    def is_regular_season_table(tag):
        if tag.name != 'table':
            return False
        caption = tag.find('caption')
        # tables without a <caption> are skipped instead of dereferencing None
        return caption is not None and caption.string == 'Regular Season Table'

    # get table data for regular season
    table_tag = soup.find(is_regular_season_table)
    if table_tag is None:
        raise ValueError(
            f"no table captioned 'Regular Season Table' found at {team_season_url}")

    # the header has two rows: column groups on top, column names below
    table_top_cols_tag, table_cols_tag = table_tag.find('thead').find_all('tr')

    # positions of the columns sitting under the 'Opponent' group header
    opp_cols = set()
    ct = 0
    for tag in table_top_cols_tag.find_all('th'):
        span = int(tag.get('colspan', 1))
        if tag.string == 'Opponent':
            opp_cols.update(range(ct, ct + span))
        ct += span

    # bottom-level column names; unlabeled headers fall back to their data-stat key
    table_cols = [cell_text(col) or col['data-stat']
                  for col in table_cols_tag.find_all('th')]

    # indicate opponent columns in column names
    for i in opp_cols:
        table_cols[i] = table_cols[i] + '_opp'

    # raw string so that \d is a regex digit class, not a string escape
    row_id_pattern = re.compile(r'tm_gamelog_rs.\d+')

    def is_a_data_row(tag):
        """
        Determines whether or not a given html tag corresponds to a data row in the table.
        """
        return (tag.name == 'tr'
                and tag.has_attr('id')
                and row_id_pattern.search(tag['id']) is not None)

    data_rows_tag = table_tag.find('tbody').find_all(is_a_data_row)

    url_suffix_list = []   # (game date, site-relative box score url) per game
    row_names_list = []    # value of each row's <th> cell, used as the index
    row_data_list = []     # remaining cell values of each row
    for tag in data_rows_tag:
        row_data_list.append([cell_text(cell) for cell in tag.find_all('td')])
        row_names_list.append(int(tag.find('th').string))

        # the date cell holds both the game date and the link to the box score
        date_cell = tag.find('td', attrs={'data-stat': 'date_game'})
        url_suffix_list.append((cell_text(date_cell), date_cell.find('a')['href']))

    # create dataframe
    games_df = pd.DataFrame(index=pd.Index(data=row_names_list, name=table_cols[0]),
                            data=row_data_list,
                            columns=table_cols[1:])\
                 .dropna(how='all', axis=1)\
                 .rename(columns={'game_location': 'Home/Away',
                                  'game_outcome': 'W/L',
                                  'overtimes': 'OT'})

    return games_df, url_suffix_list

In [5]:
def get_player_level_data_for_game(game_url, team='Toronto Maple Leafs'):
    """
    Gets the player-level data for a given team in a particular game.

    Parameters
    ----------
    game_url : str
        Full URL of the game's box score page.
    team : str
        Team name as used in the table caption; the table captioned
        '<team> Table' is the one that is read.

    Returns
    -------
    pandas.DataFrame
        One row per entry in the table body, indexed by the table's first
        column. Columns are named after the headers' ``aria-label`` values and
        cell values are the scraped text (``None`` for cells without a single
        text value). The labels can repeat (e.g. EV / PP / SH appear twice),
        so the frame may contain duplicate column names.

    Raises
    ------
    ValueError
        If the page has no table captioned '<team> Table'.
    """
    with urllib.request.urlopen(game_url) as f:
        soup = BeautifulSoup(f.read().decode(), 'html.parser')

    wanted_caption = f'{team} Table'

    def cell_text(cell):
        # Convert to a plain str: a NavigableString keeps a reference to the
        # whole parse tree, which would otherwise live on inside the DataFrame.
        text = cell.string
        return None if text is None else str(text)

    def is_team_table(tag):
        if tag.name != 'table':
            return False
        caption = tag.find('caption')
        # tables without a <caption> are skipped instead of dereferencing None
        return caption is not None and caption.string == wanted_caption

    def is_column_header_row(tag):
        if tag.name != 'tr':
            return False
        first_th = tag.find('th')
        # header rows without any <th> are skipped instead of dereferencing None
        return first_th is not None and first_th.get('aria-label') == 'Rk'

    table_tag = soup.find(is_team_table)
    if table_tag is None:
        raise ValueError(f"no table captioned {wanted_caption!r} found at {game_url}")

    # get table column names from the header row that starts with 'Rk'
    table_col_tag = table_tag.find('thead').find(is_column_header_row)
    col_names = [tag.get('aria-label', '') for tag in table_col_tag.find_all('th')]

    # get table data: the <th> of each row is its index, the <td>s its values
    rownames_list = []
    rowdata_list = []
    for row in table_tag.find('tbody').find_all('tr'):
        rownames_list.append(int(row.find('th').string))
        rowdata_list.append(
            [cell_text(cell)
             for cell in row.find_all(lambda tag: tag.name == 'td'
                                      and tag.has_attr('data-stat'))])

    # create data frame
    players_df = pd.DataFrame(index=pd.Index(data=rownames_list, name=col_names[0]),
                              data=rowdata_list,
                              columns=col_names[1:])

    return players_df

In [6]:
# Scrape the season game log once: games_df holds the per-game team stats and
# url_suffix_list the (date, box score link) pairs used by the per-game loop.
games_df, url_suffix_list = get_game_table('https://www.hockey-reference.com/teams/TOR/2019_gamelog.html')
# pause before issuing further requests to keep the request rate low
time.sleep(5)

In [7]:
# Scrape the box score of every game; frames are stored in a dict keyed by game date.
player_df_map = {}
for i, (date, url_suffix) in enumerate(url_suffix_list):
    # the scraped link is site-relative, so prefix it with the site root
    url = f'{URL_HEAD}{url_suffix}'
    print(url)

    # fetch the player table and file it under the game's date
    players_df = get_player_level_data_for_game(url, team='Toronto Maple Leafs')
    player_df_map[date] = players_df

    print(f"collected data for game {i+1}/{len(url_suffix_list)} on {date}")
    print('-' * 40)

    # throttle: wait between requests so the site does not block us
    time.sleep(5)

https://www.hockey-reference.com/boxscores/201810030TOR.html
collected data for game 1/82 on 2018-10-03
----------------------------------------
https://www.hockey-reference.com/boxscores/201810060TOR.html
collected data for game 2/82 on 2018-10-06
----------------------------------------
https://www.hockey-reference.com/boxscores/201810070CHI.html
collected data for game 3/82 on 2018-10-07
----------------------------------------
https://www.hockey-reference.com/boxscores/201810090DAL.html
collected data for game 4/82 on 2018-10-09
----------------------------------------
https://www.hockey-reference.com/boxscores/201810110DET.html
collected data for game 5/82 on 2018-10-11
----------------------------------------
https://www.hockey-reference.com/boxscores/201810130WSH.html
collected data for game 6/82 on 2018-10-13
----------------------------------------
https://www.hockey-reference.com/boxscores/201810150TOR.html
collected data for game 7/82 on 2018-10-15
--------------------------

collected data for game 57/82 on 2019-02-14
----------------------------------------
https://www.hockey-reference.com/boxscores/201902160ARI.html
collected data for game 58/82 on 2019-02-16
----------------------------------------
https://www.hockey-reference.com/boxscores/201902190STL.html
collected data for game 59/82 on 2019-02-19
----------------------------------------
https://www.hockey-reference.com/boxscores/201902210TOR.html
collected data for game 60/82 on 2019-02-21
----------------------------------------
https://www.hockey-reference.com/boxscores/201902230TOR.html
collected data for game 61/82 on 2019-02-23
----------------------------------------
https://www.hockey-reference.com/boxscores/201902250TOR.html
collected data for game 62/82 on 2019-02-25
----------------------------------------
https://www.hockey-reference.com/boxscores/201902270TOR.html
collected data for game 63/82 on 2019-02-27
----------------------------------------
https://www.hockey-reference.com/boxsco

In [8]:
# Sanity check: display the scraped mapping of game date -> per-player DataFrame.
player_df_map

{'2018-10-03':                Player  G  A PTS +/- PIM EV PP SH GW EV PP SH  S    S% SHFT  \
 Rk                                                                           
 1        Connor Brown  0  0   0   0   0  0  0  0  0  0  0  0  1   0.0   23   
 2      Travis Dermott  0  1   1   1   0  0  0  0  0  1  0  0  1   0.0   24   
 3         Tyler Ennis  0  0   0  -1   0  0  0  0  0  0  0  0  2   0.0   24   
 4       Jake Gardiner  0  1   1   1   0  0  0  0  0  1  0  0  0  None   25   
 5         Ron Hainsey  0  0   0   0   2  0  0  0  0  0  0  0  1   0.0   28   
 6          Zach Hyman  0  0   0   0   2  0  0  0  0  0  0  0  2   0.0   24   
 7    Andreas Johnsson  0  0   0   0   0  0  0  0  0  0  0  0  0  None   13   
 8         Nazem Kadri  0  2   2   0   0  0  0  0  0  1  1  0  2   0.0   21   
 9     Kasperi Kapanen  0  0   0   0   0  0  0  0  0  0  0  0  2   0.0   16   
 10         Josh Leivo  0  0   0   1   0  0  0  0  0  0  0  0  1   0.0   17   
 11       Pär Lindholm  0  0   0   0  

In [19]:
# Persist the season-level game log. to_csv does not create missing
# directories, so make sure the output folder exists first.
Path('data_files').mkdir(parents=True, exist_ok=True)
games_df.to_csv('data_files/season-level-stats-2018.csv')

In [20]:
# Persist one CSV per game, named after the game date. to_csv does not create
# missing directories, so make sure the (nested) output folder exists first.
game_stats_dir = 'data_files/game-level-stats-2018'
Path(game_stats_dir).mkdir(parents=True, exist_ok=True)
for date, player_df in player_df_map.items():
    fname_out = f'{game_stats_dir}/game_{date}.csv'
    player_df.to_csv(fname_out)