In [1]:
%config Completer.use_jedi = False

In [2]:
import logging
logger = logging.getLogger('fbprophet')
logger.setLevel(logging.ERROR)

In [3]:
import matplotlib.pyplot as plt
import matplotlib
import requests
import re
import pandas as pd
import sys
import warnings
import numpy as np
import datetime
import dateutil.parser
from bs4 import BeautifulSoup
from six import string_types
from fbprophet import Prophet
import time
warnings.filterwarnings('ignore')
%matplotlib inline

Importing plotly failed. Interactive plots will not work.


In [4]:
class BurgerForecaster:
    
    _BASE_URL = "https://www.hockey-reference.com"
    
    def __init__(self, player, player_abbr, y):
        print(f"forecaster intialized for {player}")
        self.player = player
        self.player_abbr = player_abbr
        self.goalie = False
        self.y = y
        
        self.player_log = self._get_training_data(self.player, self.player_abbr, y)
        self.player_log = self.player_log.reset_index(level=0)
        self.min_date = min(self.player_log['DATE'])
        self.max_date = max(self.player_log['DATE'])
        
        print(f"data covers {self.min_date.date()} to {self.max_date.date()}")
        
        self.max_y = np.max(self.player_log[self.y])
        self.min_y = np.min(self.player_log[self.y])
        self.min_y_date = self.player_log[self.player_log[self.y]== self.min_y]['DATE']
        self.min_y_date = self.min_y_date[self.min_y_date.index[0]]
        self.max_y_date = self.player_log[self.player_log[self.y] == self.max_y]['DATE']
        self.max_y_date = self.max_y_date[self.max_y_date.index[0]]
        self.most_recent = self.player_log[self.y].iloc[-1]
        print(f"max_points={self.max_y} min_points={self.min_y} most_recent={self.most_recent}")
        
    @property
    def BASE_URL(self):
        return self._BASE_URL

    def _get_training_data(self, player, extension, y):
        letter = extension[:1]
        player_url = f"{self.BASE_URL}/players/{letter}/{extension}.html"

        r = requests.get(player_url, verify=False)
        

        b = BeautifulSoup(r.text, "html.parser")

        uls = b.find_all("ul")
        
        game_logs = {}
        

        for ul in uls:
            a = ul.find_all("a", href=True)
            season = 0
            for link in a:
                regex = ".*/gamelog/(2018|2019|2020|2021)"
                if re.search(regex, link['href']):
                    if re.search("./2016", link['href']):
                        season = 2016
                    elif re.search("./2017", link['href']):
                        season = 2017
                    elif re.search("./2018", link['href']):
                        season = 2018
                    elif re.search("./2019", link['href']):
                        season = 2019
                    elif re.search("./2020", link['href']):
                        season = 2020
                    elif re.search("./2021", link['href']):
                        season = 2021
                    value = link['href']
                    key = season
                    game_logs[key] = value
                

        player_log = pd.DataFrame()

        for key in game_logs:
            url = f"{self.BASE_URL}{game_logs[key]}"
            try:
                buffer_df = self._get_log_data(self, url, key, player, self.goalie)
                player_log = player_log.append(buffer_df)
            except Exception:
                print(f"unexpected exception {sys.exc_info()[0]}\n ERROR: THIS IS NOT AN ACTIVE PLAYER")
                buffer_df = pd.DataFrame()
                break

        if not self.goalie:
            try:
                player_log.BLK[player_log.BLK == ''] = 0
            except:
                pass

        player_log = player_log.apply(pd.to_numeric, downcast = 'float', errors = 'ignore')
        player_log.fillna(0.0, inplace=True)
        player_log['DATE'] = pd.to_datetime(player_log['DATE'].apply(self.convert_to_date))
        player_log['ds'] = player_log['DATE']
        player_log['ESPN'] = player_log.apply(self._calculate_points, axis=1)
        player_log['y'] = player_log[y]
        player_log['TOI_CONV'] = player_log.apply(self._convert_total_on_ice, axis = 1)
        player_log['ESP_PG'] = (player_log['EV_G'].astype(float) + player_log['EV_A'].astype(float))
        player_log['PPP_PG'] = (player_log['PP_G'].astype(float))
        player_log['PPG'] = (player_log.PTS/player_log.TOI_CONV) * 60

        return player_log

    @staticmethod
    def convert_to_date(value):
        if isinstance(value, string_types) and re.search(r'^\d{4}-\d{2}-\d{2}$', value):
            return dateutil.parser.parse(value).date()
        elif isinstance(value, string_types) and re.search(r'^\d{4}-\d{2}-\d{2}T[\d:\.]+Z$', value):
            return dateutil.parser.parse(value)
        else:
            return value

    @staticmethod
    def _convert_total_on_ice(row):
        t = row.TOI
        m, s = t.split(':')

        return float((int(m) * 1) + (int(s)/60))


    @staticmethod
    def _calculate_points(row):
        return (row.G * 4.0) + (row.A * 2.0) + (row.S * .4) + (row.BLK * .2) + (row.SH_G * 2.0)
        + (row.PM_P * .6) - (row.PM_M * .6) + (row.HTS * .2) + (row.PPP * 1.0)

    @staticmethod
    def _get_log_data(self, gamelogs_url, season, player, goalie):
        url = gamelogs_url
        r = requests.get(url, verify = False)
        b = BeautifulSoup(r.text, 'html.parser')

        table = b.find_all('table', {'id' : 'gamelog'})
        rows = []
        for tbl in table:
            for tr in tbl.find_all('tr'):
                row = []
                for td in tr.find_all('td'):
                    row.append(td.text.replace('\n',''))
                if not goalie:
                    if len(row) == 28 and row[0] != "":
                        rows.append(row)
                else:
                    if len(row) == 15 and row[0] != "":
                        rows.append(row)
        if not goalie:
            log_data = pd.DataFrame(rows, columns=['DATE', 'GAME', 'AGE', 'TM', 'HOA',
                                                  'OPP', 'RSLT', 'G', 'A', 'PTS', 'PM', 'PIM',
                                                  'EV_G', 'PP_G', 'SH_G', 'GW_G', 'EV_A', 'PP_A',
                                                  'SH_A', 'S', 'S_PCT', 'SHFT', 'TOI', 'HIT', 'BLK',
                                                  'FOW', 'FOL', 'FO_PCT'])
        else:
            log_data = pd.DataFrame(rows, columns=['DATE', 'GAME', 'AGE', 'TM', 'HOA',
                                                  'OPP', 'RSLT', 'DEC', 'GA', 'SA', 'SV',
                                                  'SV_PCT', 'SO', 'PIM', 'TOI'])

        log_data['season'] = season
        log_data['name'] = player
        log_data.head()
        return log_data
            

        

In [5]:
def get_player_list(path):
    df_id = pd.read_excel(path, engine='openpyxl')
    df_id['id'] = df_id['CORRECTED_NAME'].str.strip()
    return df_id

def create_player_dict(df_merged):
    players = {}
    for idx, row in df_merged.iterrows():
        players[row['id']] = row['INDEX']
        
    return players

In [6]:
df_players = get_player_list("nhl_player_list.xlsx")
df_players

Unnamed: 0.1,Unnamed: 0,NAME,INDEX,CORRECTED_NAME,id
0,0,Valentin Zykov,zykovva01,Valentin Zykov,Valentin Zykov
1,1,Jason Zucker,zuckeja01,Jason Zucker,Jason Zucker
2,2,Mats Zuccarello,zuccama01,Mats Zuccarello,Mats Zuccarello
3,3,Dainius Zubrus,zubruda01,Dainius Zubrus,Dainius Zubrus
4,4,Harry Zolnierczyk,zolniha01,Harry Zolnierczyk,Harry Zolnierczyk
...,...,...,...,...,...
1145,1145,Jordan Kyrou,kyroujo01,Jordan Kyrou,Jordan Kyrou
1146,1146,Anthony Cirelli,cirelan01,Anthony Cirelli,Anthony Cirelli
1147,1147,Matt Dumba,dumbama01,Matt Dumba,Matt Dumba
1148,1148,Kirill Kaprizov,kapriki01,Kirill Kaprizov,Kirill Kaprizov


In [7]:
df_players_games = pd.DataFrame(['Brad Marchand'], columns=['id'])
df_merged = pd.merge(df_players, df_players_games, how='inner', left_on=['id'], right_on=['id'])
players = create_player_dict(df_merged)
players

{'Brad Marchand': 'marchbr03'}

In [8]:
predictions = []
for player in players:
    print("============================================")
    forecaster = BurgerForecaster(player, players[player], 'ESPN')
    df = forecaster.player_log
    if len(df) >= 30:
        df_train_all = pd.DataFrame()
        df_train_all["ds"] = df.DATE
        df_train_all["y"] = df.ESPN
        m = Prophet(changepoint_prior_scale=0.99)
        print(f"fitting the model for {player}...")
        forecast = m.fit(df_train_all).predict(m.make_future_dataframe(periods=1, freq = 'D'))
        predictions.append({"name": player, "prediction": forecast[-1:]['yhat'].values[0]})
        print(f"finished making predictions for {player}")
        time.sleep(2)
    else:
        print(f"not enough data to make accurate prediction for player {player}")
    print("============================================\n")
print("DONE")

forecaster intialized for Brad Marchand
data covers 2017-10-05 to 2021-04-27
max_points=18.599999999999998 min_points=0.0 most_recent=6.0


INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


fitting the model for Brad Marchand...
finished making predictions for Brad Marchand

DONE
