In [77]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from pandas import DataFrame, Series

In [25]:
# Download the NHL_2018 games page (schedule and scores) and parse it for scraping.
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.get('https://www.hockey-reference.com/leagues/NHL_2018_games.html', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page, which would
# only surface later as empty selections.
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')

In [198]:
# Build one row per game from the '#games' table.
# The cells are read row by row instead of selecting with '> .', which is not a
# valid CSS selector and is rejected by current Beautiful Soup releases. Reading
# per row also avoids depending on every row having exactly nine cells.
# Only the first six cells of each row are kept; all values stay as strings.
game_rows = [
    [cell.get_text() for cell in row.find_all(['th', 'td'], recursive=False)][:6]
    for row in soup.select('#games > tbody > tr')
    # Skip rows that carry no <td> data cells (e.g. repeated header rows, if any).
    if row.find('td') is not None
]
df = DataFrame(game_rows, columns=['date', 'away', 'away_goals', 'home', 'home_goals', 'type']) \
    .reset_index() \
    .rename(columns={'index': 'game_id'})
df.head()

Unnamed: 0,game_id,date,away,away_goals,home,home_goals,type
0,0,2017-10-04,Calgary Flames,0,Edmonton Oilers,3,
1,1,2017-10-04,St. Louis Blues,5,Pittsburgh Penguins,4,OT
2,2,2017-10-04,Philadelphia Flyers,5,San Jose Sharks,3,
3,3,2017-10-04,Toronto Maple Leafs,7,Winnipeg Jets,2,
4,4,2017-10-05,Arizona Coyotes,4,Anaheim Ducks,5,


In [180]:
# Team lookup table: one row per distinct home team, identified by the last
# word of its full name, with ids assigned in alphabetical order of that word.
nicknames = Series(df['home'].unique()).str.split(' ').str[-1]
teams = DataFrame({'teams': nicknames}) \
    .sort_values('teams') \
    .reset_index(drop=True) \
    .rename_axis('id') \
    .reset_index()
teams.head()

Unnamed: 0,id,teams
0,0,Avalanche
1,1,Blackhawks
2,2,Blues
3,3,Bruins
4,4,Canadiens


In [280]:
# Result lookup table: 1-based result id -> result label and standings points.
points = DataFrame({
    'id': [1, 2, 3, 4],
    'result': ['W', 'L', 'OTL', 'SOW'],
    'points': [2, 0, 1, 2],
})
points

Unnamed: 0,id,result,points
0,1,W,2
1,2,L,0
2,3,OTL,1
3,4,SOW,2


In [202]:
# Schedule keyed by numeric team ids: each game's away/home names are reduced
# to their last word and joined against the `teams` lookup, once per side.
team_ids = teams[['id', 'teams']]
games = df[['game_id', 'date']].assign(
    away=df['away'].str.split(' ').str[-1],
    home=df['home'].str.split(' ').str[-1],
)
# The first join resolves the away side, the second the home side; because both
# joins bring in an 'id' column, pandas suffixes them as id_x (away) / id_y (home).
with_away = games.merge(team_ids, left_on='away', right_on='teams')
with_both = with_away.merge(team_ids, left_on='home', right_on='teams')
schedule = with_both[['game_id', 'date', 'id_x', 'id_y']] \
    .rename(columns={'id_x': 'away_id', 'id_y': 'home_id'}) \
    .sort_values('game_id') \
    .reset_index(drop=True)
schedule.head()

Unnamed: 0,game_id,date,away_id,home_id
0,0,2017-10-04,10,20
1,1,2017-10-04,2,22
2,2,2017-10-04,11,27
3,3,2017-10-04,18,15
4,4,2017-10-05,7,9


In [240]:
def result_calc(df, *col):
    """Classify one game from the point of view of one side.

    Parameters
    ----------
    df : a single game row (anything indexable by column name), providing the
        two goal columns named in ``col`` and a ``'type'`` entry
        ('' / 'OT' / 'SO').
    *col : two column names: first the goals scored by the side being rated,
        then the goals scored by its opponent.

    Returns
    -------
    int
        1 = win (regulation or overtime), 2 = regulation loss,
        3 = overtime or shootout loss, 4 = shootout win.

    Raises
    ------
    ValueError
        If a goal value cannot be converted to an integer (e.g. a blank score).
    """
    # Scraped goal counts are strings; compare them numerically, otherwise
    # '10' > '9' is evaluated lexicographically and comes out False.
    goals_for = int(df[col[0]])
    goals_against = int(df[col[1]])
    game_type = df['type']

    if goals_for > goals_against:
        return 4 if game_type == 'SO' else 1
    # A loss beyond regulation, in overtime or in a shootout, is rated 3
    # rather than a plain loss.
    if game_type in ('OT', 'SO'):
        return 3
    return 2


# Rate every game from each side's point of view and keep the codes on `df`.
away_result = df.apply(result_calc, axis=1, args=('away_goals', 'home_goals'))
home_result = df.apply(result_calc, axis=1, args=('home_goals', 'away_goals'))
df = df.assign(away_result=away_result, home_result=home_result)

# One frame per side, renamed onto a shared (team, goals, result_id) layout.
away = df[['game_id', 'away', 'away_goals', 'away_result']] \
    .rename(columns={'away': 'team', 'away_goals': 'goals', 'away_result': 'result_id'})
home = df[['game_id', 'home', 'home_goals', 'home_result']] \
    .rename(columns={'home': 'team', 'home_goals': 'goals', 'home_result': 'result_id'})

# Stack both sides (two rows per game), replace the team name with its id from
# the `teams` lookup, and order by game.
stacked = pd.concat([away, home])
stacked = stacked.assign(team=stacked['team'].str.split(' ').str[-1])
results = stacked.merge(teams, left_on='team', right_on='teams') \
    .rename(columns={'id': 'team_id'}) \
    .sort_values('game_id') \
    .reset_index(drop=True) \
    [['game_id', 'team_id', 'goals', 'result_id']]
results

Unnamed: 0,game_id,team_id,goals,result_id
0,0,10,0,2
1,0,20,3,1
2,1,2,5,1
3,1,22,4,3
4,2,27,3,2
5,2,11,5,1
6,3,15,2,2
7,3,18,7,1
8,4,9,5,1
9,4,7,4,2


In [241]:
# Download the NHL_2018 league summary page and parse it; this rebinds `r` and `soup`.
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.get('https://www.hockey-reference.com/leagues/NHL_2018.html', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page, which would
# only surface later as empty selections.
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')


In [262]:
# Row-header cells (<th>) of the two standings tables, stacked into a single
# one-column frame (column label 0). Note this rebinds `df`.
df = pd.concat([
    DataFrame([ele.get_text() for ele in soup.select('#standings_EAS > tbody > tr > th')]),
    DataFrame([ele.get_text() for ele in soup.select('#standings_WES > tbody > tr > th')]),
])
# playoffs: 1 when the cell text ends with '*', else 0.
# team: last word of the cell text with any '*' removed. regex=False makes the
# literal replacement explicit, since '*' is a regex metacharacter and the
# default handling of `pat` has differed between pandas versions.
playoffs = df.assign(playoffs=lambda x: x.iloc[:, 0].str.endswith('*').astype(int),
                     year=2018,
                     team=lambda x: x.iloc[:, 0].str.split(' ').str[-1].str.replace('*', '', regex=False)) \
    .merge(teams, left_on='team', right_on='teams') \
    [['id', 'year', 'playoffs']]
playoffs

Unnamed: 0,id,year,playoffs
0,19,2018,1
1,3,2018,1
2,18,2018,1
3,21,2018,0
4,30,2018,0
5,4,2018,0
6,26,2018,0
7,25,2018,0
8,6,2018,1
9,22,2018,1


In [293]:
# Number of 'W' results per team within each team's first 41 games (by game_id).
# Only the 'W' label is counted; other labels from `points` (e.g. 'SOW') are not.
labelled = results \
    .merge(points[['id', 'result']], left_on='result_id', right_on='id') \
    .sort_values('game_id')
first_games = labelled[['team_id', 'result']].groupby('team_id').head(41)
wins = first_games[first_games['result'] == 'W'] \
    .groupby('team_id') \
    .count()
wins

Unnamed: 0_level_0,result
team_id,Unnamed: 1_level_1
0,21
1,20
2,22
3,21
4,15
5,16
6,22
7,8
8,19
9,17


In [262]:
# NOTE(review): this cell repeats the earlier `playoffs` cell verbatim; consider
# removing one of the two copies.
# Row-header cells (<th>) of the two standings tables, stacked into a single
# one-column frame (column label 0). Note this rebinds `df`.
df = pd.concat([
    DataFrame([ele.get_text() for ele in soup.select('#standings_EAS > tbody > tr > th')]),
    DataFrame([ele.get_text() for ele in soup.select('#standings_WES > tbody > tr > th')]),
])
# playoffs: 1 when the cell text ends with '*', else 0.
# team: last word of the cell text with any '*' removed. regex=False makes the
# literal replacement explicit, since '*' is a regex metacharacter and the
# default handling of `pat` has differed between pandas versions.
playoffs = df.assign(playoffs=lambda x: x.iloc[:, 0].str.endswith('*').astype(int),
                     year=2018,
                     team=lambda x: x.iloc[:, 0].str.split(' ').str[-1].str.replace('*', '', regex=False)) \
    .merge(teams, left_on='team', right_on='teams') \
    [['id', 'year', 'playoffs']]
playoffs

Unnamed: 0,id,year,playoffs
0,19,2018,1
1,3,2018,1
2,18,2018,1
3,21,2018,0
4,30,2018,0
5,4,2018,0
6,26,2018,0
7,25,2018,0
8,6,2018,1
9,22,2018,1


In [303]:
# Goal differential per team over each team's first 41 rows of `results`.
# Joining `results` to itself on game_id pairs every row of a game with every
# row of the same game; dropping the pairs where both sides are the same team
# leaves one (team, opponent) row per side.
paired = results.merge(results, on='game_id')
versus = paired[paired['team_id_x'] != paired['team_id_y']]
# Goals are converted to integers before subtracting.
margin = versus['goals_x'].astype(int) - versus['goals_y'].astype(int)
goal_diff = versus[['game_id', 'team_id_x']] \
    .assign(diff=margin) \
    .groupby('team_id_x') \
    .head(41) \
    .groupby('team_id_x') \
    ['diff'] \
    .sum()

goal_diff.head()

team_id_x
0    11
1    11
2    17
3    30
4   -24
Name: diff, dtype: int32

In [354]:
# Not used yet, but may end up being useful: for every game, the win counts
# (from `wins`) of the home team and of the away team, side by side.
home_side = teams \
    .merge(wins, left_on='id', right_on='team_id') \
    .merge(schedule, left_on='id', right_on='home_id') \
    [['game_id', 'home_id', 'result', 'away_id']] \
    .rename(columns={'result': 'home_wins'})
strength = home_side \
    .merge(wins, left_on='away_id', right_on='team_id') \
    .rename(columns={'result': 'away_wins'}) \
    .sort_values('game_id')
# The stray `strength.head()` that preceded the display was removed: its value
# was discarded because it was not the last expression of the cell.
strength

Unnamed: 0,game_id,home_id,home_wins,away_id,away_wins
353,0,20,17,10,19
110,1,22,18,2,22
1225,2,27,20,11,18
675,3,15,22,18,21
626,4,9,17,7,8
1031,5,3,21,23,21
934,6,25,10,4,15
575,7,1,20,22,18
819,8,30,14,29,20
1209,9,16,22,11,18


In [353]:
def strength_calc(team, df, n_games=41, team_wins=None):
    """Sum the win counts of a team's opponents over its last ``n_games`` games.

    Parameters
    ----------
    team : id of the team to evaluate.
    df : schedule frame with ``game_id``, ``away_id`` and ``home_id`` columns.
    n_games : number of the team's final games (ordered by ``game_id``) to
        include. Defaults to 41.
    team_wins : frame with a ``result`` column of win counts, keyed by
        ``team_id`` (index level or column). Defaults to the notebook-level
        ``wins`` frame, preserving the original two-argument behaviour.

    Returns
    -------
    The sum of ``result`` over the selected opponents. An opponent faced
    several times is counted once per game; opponents with no row in
    ``team_wins`` contribute nothing because the join is an inner join.
    """
    if team_wins is None:
        # Fall back to the global computed elsewhere in the notebook.
        team_wins = wins

    involved = df[(df['away_id'] == team) | (df['home_id'] == team)]
    # With `team` on exactly one side of a game (assumed: a team never plays
    # itself), away_id + home_id - team is the id of the other side.
    opponents = involved.assign(opponent=involved['away_id'] + involved['home_id'] - team)
    recent = opponents.sort_values('game_id').tail(n_games)[['game_id', 'opponent']]
    return recent.merge(team_wins, left_on='opponent', right_on='team_id')['result'].sum()


# Attach to every team the value strength_calc returns for it over `schedule`.
strength = teams.assign(
    strength=teams['id'].map(lambda team_id: strength_calc(team_id, df=schedule))
)
strength

Unnamed: 0,id,teams,strength
0,0,Avalanche,777
1,1,Blackhawks,787
2,2,Blues,773
3,3,Bruins,734
4,4,Canadiens,769
5,5,Canucks,771
6,6,Capitals,746
7,7,Coyotes,778
8,8,Devils,778
9,9,Ducks,758


In [None]:
# TODO: calculate points
# TODO: calculate blowouts