In [1]:
import time
import pandas as pd
import numpy as np
import sklearn.metrics
from datetime import date
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import re
import requests
from bs4 import BeautifulSoup

from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Scraping data from hockey reference

#### 1) List of goalies

In [2]:
teams_list = ['COL','MIN','STL','DAL','NSH','CHI','ARI','WPG' # Central
              'SJS','SEA','VAN','EDM','ANA','CGY','VGK','LAK'# Pacific  
              'BUF','BOS','TOR','FLA','DET','OTT','TBL','MTL',# Atlantic  
              'CBJ','PIT','NJD','WSH','NYR','PHI','NYI','CAR'# Metro 
             ]
years_list = ['2022','2021','2019','2018','2017','2016','2015','2014','2013']
roster_df = pd.DataFrame(
        {
            'team':'xx',
            'player_name': 'xx',
            'player_url':'xx',
            'position':'xx',
            'A':'a'
        },index=[0]
    ) 


for year in years_list:
    start_time = time.time()
    print('Gathering Data from: {0}'.format(year))
    for team in teams_list:
        try:
            url = 'https://www.hockey-reference.com/teams/{0}/{1}.html'.format(team,year)
            req = requests.get(url).text
            soup = BeautifulSoup(req,'lxml')

            roster = soup.find('div',id='all_roster')

            body = roster.find('tbody')

            num_players = len(body.find_all('tr'))

            for n in range(0,num_players):
                player_detail = body.find_all('tr')[n]
                name = player_detail.find_all('td',class_='left')[0].text
                player_url = player_detail.find_all('td',class_='left')[0]['data-append-csv']
                position = player_detail.find_all('td',class_='center')[0].text

                temp = pd.DataFrame(
                    {
                        'team':team,
                        'player_name': name,
                        'player_url':player_url,
                        'position':position,
                        'A':'a'
                    },index=[0]
                ) 
                roster_df = roster_df.append(temp)
        except:
            pass 
    print('Seconds: {0}'.format(np.round(time.time()- start_time),1))
    print('')

roster_df = roster_df.loc[roster_df['player_name']!='xx']
goalies_list_df = roster_df.loc[roster_df['position']=='G'].reset_index()
goalies_list_cleaned = goalies_list_df[['player_name','player_url']].drop_duplicates()
num_goalies = len(goalies_list_cleaned)
print('Data Gathered on {0} goalies'.format(num_goalies))


Gathering Data from: 2022
Seconds: 19.0

Gathering Data from: 2021
Seconds: 19.0

Gathering Data from: 2019
Seconds: 19.0

Gathering Data from: 2018
Seconds: 19.0

Gathering Data from: 2017
Seconds: 20.0

Gathering Data from: 2016
Seconds: 19.0

Gathering Data from: 2015
Seconds: 19.0

Gathering Data from: 2014
Seconds: 18.0

Gathering Data from: 2013
Seconds: 13.0

Data Gathered on 208 goalies


#### 2) Goalie season stats

In [3]:
goalie_stats_df = pd.DataFrame(

    {
        'player_name':'xx',
        'season':'xx',
        'team':'xx',
        'age':'xx',
        'A':'xx',
        'games_goalie':'xx',
        'starts_goalie':'xx',
        'wins_goalie':'xx',
        'losses_goalie':'xx',
        'ties_goalie':'xx',
        'goals_against':'xx',
        'shots_against':'xx',
        'saves':'xx',
        'save_pct':'xx',
        'goals_against_avg':'xx',
        'shutouts':'xx',
        'min_goalie':'xx',
        'quality_starts_goalie':'xx',
        'quality_start_goalie_pct':'xx',
        'really_bad_starts_goalie':'xx',
        'ga_pct_minus':'xx',
        'gs_above_avg':'xx',
        'goals_against_avg_adjusted':'xx',
        'gps':'xx',
        'goals':'xx',
        'assists':'xx',
        'points':'xx',
        'pen_min':'xx'
    }, index=[0]

)

start_time = time.time()
num_goalies = len(goalies_list_cleaned)
for n in range(0,num_goalies):
#     print(n) 
    try:
        player_name = goalies_list_cleaned.loc[n]['player_name']
        player_url = goalies_list_cleaned.loc[n]['player_url']
        first_letter_url = player_url[0]

        url = 'https://www.hockey-reference.com/players/{0}/{1}.html'.format(first_letter_url,player_url)
        req = requests.get(url).text
        soup = BeautifulSoup(req,'lxml')

        body = soup.find('tbody')

        try:
            num_seasons = len(body.find_all('tr'))

            for n in range(0,num_seasons):
                season_stats = body.find_all('tr')[n]
                season = season_stats.find_all('th')[0].text    
                team = season_stats.find_all('td',class_='left')[0].text
                age = season_stats.find_all('td',class_='center')[0].text  
                num_stat_columns = len(season_stats.find_all('td',class_='right'))
                starter_temp = pd.DataFrame(
                    {
                        'player_name':player_name,
                        'season': season,
                        'team': team,
                        'age':age,
                        'A':'a'
                    },index=[0]
                ) 
                right = pd.DataFrame({'A': 'a'},index=[0])    
                for nu in range(0,num_stat_columns):
                    stat_name = season_stats.find_all('td',class_='right')[nu]['data-stat'] 
                    stat_detail = season_stats.find_all('td',class_='right')[nu].text
                    temp = pd.DataFrame(
                        {
                            str(stat_name): stat_detail,
                            'A':'a'
                        }, index=[0]
                    )
                    right = right.merge(temp,left_on='A',right_on='A')
                    all_df = starter_temp.merge(right,left_on='A',right_on='A')
                goalie_stats_df = goalie_stats_df.append(all_df)
        except:
            pass
    except:
        pass 

goalie_stats_df = goalie_stats_df.loc[goalie_stats_df['season']!='xx']
num_seasons = len(goalie_stats_df)
print('Data Gathered on {0} total goalie seasons'.format(num_seasons))
print('Minutes: {0}'.format(np.round(time.time()- start_time) / 60,1))
goalie_stats_df.head(10)


Data Gathered on 837 total goalie seasons
Minutes: 1.85


Unnamed: 0,player_name,season,team,age,A,games_goalie,starts_goalie,wins_goalie,losses_goalie,ties_goalie,goals_against,shots_against,saves,save_pct,goals_against_avg,shutouts,min_goalie,quality_starts_goalie,quality_start_goalie_pct,really_bad_starts_goalie,ga_pct_minus,gs_above_avg,goals_against_avg_adjusted,gps,goals,assists,points,pen_min
0,Justus Annunen,2021-22,COL,21,a,2,1,1,0,1,7,51,44,0.863,4.34,0,97,0,0.0,1,,,4.34,0.1,0,1,1,0
0,Pavel Francouz,2018-19,COL,28,a,2,0,0,2,0,2,35,33,0.943,1.96,0,61,0,,0,,,1.96,0.3,0,0,0,0
0,Pavel Francouz,2019-20,COL,29,a,34,30,21,7,4,77,996,919,0.923,2.41,1,1914,17,0.567,5,85.0,13.1,2.57,7.0,0,0,0,2
0,Pavel Francouz,2021-22,COL,31,a,21,18,15,5,1,51,608,557,0.916,2.55,2,1200,6,0.333,1,90.0,5.6,2.6,3.9,0,0,0,0
0,Alexandar Georgiev,2017-18,NYR,21,a,10,9,4,4,1,27,331,304,0.918,3.15,0,515,5,0.556,1,93.0,2.1,3.38,2.1,0,0,0,0
0,Alexandar Georgiev,2018-19,NYR,22,a,33,30,14,13,4,91,1057,966,0.914,2.91,2,1874,15,0.5,3,96.0,4.3,3.11,6.3,0,2,2,2
0,Alexandar Georgiev,2019-20,NYR,23,a,34,32,17,14,2,96,1063,967,0.91,3.04,2,1892,17,0.531,5,100.0,0.1,3.24,6.0,0,1,1,0
0,Alexandar Georgiev,2020-21,NYR,24,a,19,18,8,7,2,44,465,421,0.905,2.71,2,974,9,0.5,3,103.0,-1.2,2.96,2.6,0,1,1,0
0,Alexandar Georgiev,2021-22,NYR,25,a,33,28,15,10,2,85,832,747,0.898,2.92,2,1746,13,0.464,9,110.0,-7.6,2.99,3.8,0,0,0,0
0,Hunter Miska,2018-19,ARI,23,a,1,0,0,0,0,1,9,8,0.889,3.28,0,18,0,,0,,,3.28,0.0,0,0,0,0


In [22]:
wins_anchor = 10 

# Data from last season 
goalie_stats_df_21 = goalie_stats_df.loc[goalie_stats_df['season']== '2021-22']

# Avg save pct
goalie_stats_df_21['shots_against_int'] = goalie_stats_df_21['shots_against'].astype('int')
goalie_stats_df_21['goals_against_int'] = goalie_stats_df_21['goals_against'].astype('int')

total_shots = goalie_stats_df_21['shots_against_int'].sum()
total_goals = goalie_stats_df_21['goals_against_int'].sum()
avg_save_pct = (total_shots - total_goals) / total_shots

# Goalie expected saves above average 
goalie_stats_df_21['expected_saves'] = goalie_stats_df_21['shots_against_int'] * avg_save_pct
goalie_stats_df_21['expected_saves_above_avg'] = goalie_stats_df_21['saves'].astype('int') - goalie_stats_df_21['expected_saves']

# Sorted DF 
goalie_stats_df_21_sorted = goalie_stats_df_21.sort_values(by=['expected_saves_above_avg'], ascending=False)

# Assigning expected wins 
max_saves_above_average = goalie_stats_df_21_sorted['expected_saves_above_avg'].max()

goalie_stats_df_21_sorted['expected_wins'] = wins_anchor * (goalie_stats_df_21_sorted['expected_saves_above_avg'] /  max_saves_above_average) 

# print(max_saves_above_average)

goalie_stats_df_21_sorted.head(10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goalie_stats_df_21['shots_against_int'] = goalie_stats_df_21['shots_against'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goalie_stats_df_21['goals_against_int'] = goalie_stats_df_21['goals_against'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goalie_stats_df_21['e

Unnamed: 0,player_name,season,team,age,A,games_goalie,starts_goalie,wins_goalie,losses_goalie,ties_goalie,goals_against,shots_against,saves,save_pct,goals_against_avg,shutouts,min_goalie,quality_starts_goalie,quality_start_goalie_pct,really_bad_starts_goalie,ga_pct_minus,gs_above_avg,goals_against_avg_adjusted,gps,goals,assists,points,pen_min,shots_against_int,goals_against_int,expected_saves,expected_saves_above_avg,expected_wins
0,Igor Shesterkin,2021-22,NYR,26,a,53,52,36,13,4,106,1622,1516,0.935,2.07,6,3071,38,0.731,4,70,44.9,2.11,13.6,0,1,1,2,1622,106,1472.783621,43.216379,10.0
0,Ilya Sorokin,2021-22,NYI,26,a,52,52,26,18,8,123,1643,1520,0.925,2.4,7,3072,37,0.712,7,80,29.8,2.46,12.1,0,0,0,0,1643,123,1491.851719,28.148281,6.513336
0,Jacob Markstrom,2021-22,CGY,32,a,63,63,37,15,9,137,1754,1617,0.922,2.22,9,3696,38,0.603,6,84,26.1,2.27,12.4,0,3,3,10,1754,137,1592.640241,24.359759,5.636696
0,Darcy Kuemper,2021-22,COL,31,a,57,57,37,12,4,138,1754,1616,0.921,2.54,5,3259,32,0.561,7,85,25.1,2.6,12.3,0,0,0,2,1754,138,1592.640241,23.359759,5.405302
0,Juuse Saros,2021-22,NSH,26,a,67,67,38,25,3,173,2107,1934,0.918,2.64,4,3932,43,0.642,8,88,23.0,2.7,14.0,0,0,0,0,2107,173,1913.165899,20.834101,4.820881
0,Frederik Andersen,2021-22,CAR,32,a,52,51,35,14,3,111,1431,1320,0.922,2.17,4,3071,30,0.588,5,83,22.1,2.23,10.2,0,4,4,0,1431,111,1299.354723,20.645277,4.777188
0,Tristan Jarry,2021-22,PIT,26,a,58,56,34,18,6,138,1711,1573,0.919,2.42,4,3415,35,0.625,6,87,21.1,2.48,11.6,0,2,2,2,1711,138,1553.596039,19.403961,4.489955
0,Andrei Vasilevskiy,2021-22,TBL,27,a,63,63,39,18,5,156,1868,1712,0.916,2.49,2,3761,39,0.619,7,90,17.7,2.55,12.2,0,1,1,6,1868,156,1696.152776,15.847224,3.666948
0,Ville Husso,2021-22,STL,26,a,40,38,25,7,6,100,1236,1136,0.919,2.56,2,2341,23,0.605,3,87,15.0,2.64,8.4,0,0,0,0,1236,100,1122.293807,13.706193,3.171527
0,Anton Forsberg,2021-22,OTT,29,a,46,44,22,17,4,121,1457,1336,0.917,2.82,1,2571,29,0.659,5,89,14.5,2.89,9.6,0,0,0,2,1457,121,1322.962845,13.037155,3.016716


In [23]:
goalie_stats_df_21_sorted.to_csv('~/desktop/python/goalie_expected_wins.csv',header=None,index=False)