In [24]:
import requests
from bs4 import BeautifulSoup
from pandas import DataFrame
import pandas as pd
import numpy as np
import time

def result_type(df, *col):
    if df[col[0]] > df[col[1]]:
        if df['type'] == 'SO':
            return 4
        else:
            return 1
    elif df['type'] == 'OT':
        return 3
    return 2

df = DataFrame()

for year in range(2018, 2016, -1):
    r = requests.get(f'https://www.hockey-reference.com/leagues/NHL_{year}_games.html')
    soup = BeautifulSoup(r.content, 'html.parser')
    
    df = pd.concat([df, DataFrame(np.array([ele.get_text() for ele in soup.select('#games > tbody > tr > .')]).reshape(-1, 9)) \
        .drop(columns=[0, 6, 7, 8], axis=1) \
        .rename(columns={1: 'away', 2: 'away_goals', 3: 'home', 4: 'home_goals', 5: 'type'}) \
        .assign(away_result=lambda x: x.apply(result_type, args=['away_goals', 'home_goals'], axis=1),
                home_result=lambda x: x.apply(result_type, args=['home_goals', 'away_goals'], axis=1),
                year=f'{year}') \
        .drop(columns=['type'])])
    time.sleep(2)

In [41]:
df = df \
    .reset_index(drop=True) \
    .reset_index() \
    .rename(columns={'index': 'id'}) \
    .assign(id=lambda x: x['id'] + 1) \
    .loc[lambda x: x['away_goals'] != '']


In [5]:
teams = DataFrame(df['home'].unique(), columns=['team']) \
        .assign(team=lambda x: x['team'].str.split(' ').str[-1]) \
        .sort_values('team') \
        .reset_index(drop=True) \
        .reset_index() \
        .rename(columns={'index': 'id'}) \
        .assign(id=lambda x: x['id'] + 1)
teams

Unnamed: 0,id,team
0,1,Avalanche
1,2,Blackhawks
2,3,Blues
3,4,Bruins
4,5,Canadiens
5,6,Canucks
6,7,Capitals
7,8,Coyotes
8,9,Devils
9,10,Ducks


In [15]:
playoffs = DataFrame()

for year in range(2018, 2016, -1):
    r = requests.get(f'https://www.hockey-reference.com/leagues/NHL_2018.html')
    soup = BeautifulSoup(r.content, 'html.parser')
    
    playoffs = pd.concat([playoffs, 
                          pd.concat([DataFrame([ele.get_text() for ele in soup.select('#standings_EAS > tbody > tr > th')]),
                                     DataFrame([ele.get_text() for ele in soup.select('#standings_WES > tbody > tr > th')])
                                    ]) \
                            .assign(playoffs=lambda x: x.iloc[:, 0].str.endswith('*').astype(int),
                                    year=f'{year}',
                                    team=lambda x: x.iloc[:, 0].str.split(' ').str[-1].str.replace('*', '')) \
                            .merge(teams, left_on='team', right_on='team') \
                            .drop(columns=[0, 'team']) \
                            [['id', 'year', 'playoffs']]
              ])


Unnamed: 0,id,year,playoffs
0,20,2018,1
1,4,2018,1
2,19,2018,1
3,22,2018,0
4,31,2018,0
5,5,2018,0
6,27,2018,0
7,26,2018,0
8,7,2018,1
9,23,2018,1


In [33]:
playoffs \
    .reset_index(drop=True) \
    .reset_index() \
    .rename(columns={'index': 'id'}) \
    .assign(id=lambda x: x['id'] + 1)

Unnamed: 0,id,id.1,year,playoffs
0,1,21,2018,1
1,2,5,2018,1
2,3,20,2018,1
3,4,23,2018,0
4,5,32,2018,0
5,6,6,2018,0
6,7,28,2018,0
7,8,27,2018,0
8,9,8,2018,1
9,10,24,2018,1


In [42]:
schedule = df[['id', 'year', 'away', 'home']] \
        .assign(away=lambda x: x['away'].str.split(' ').str[-1]) \
        .assign(home=lambda x: x['home'].str.split(' ').str[-1]) \
        .merge(teams[['id', 'team']], left_on='away', right_on='team') \
        .merge(teams[['id', 'team']], left_on='home', right_on='team')  \
        [['id_x', 'year', 'id_y', 'id']] \
        .rename(columns={'id_x': 'id', 'id_y': 'away_id', 'id': 'home_id'}) \
        .sort_values('id') \
        .reset_index(drop=True)
schedule

Unnamed: 0,id,year,away_id,home_id
0,1,2018,11,21
1,2,2018,3,23
2,3,2018,12,28
3,4,2018,19,16
4,5,2018,8,10
5,6,2018,24,4
6,7,2018,5,26
7,8,2018,23,2
8,9,2018,30,31
9,10,2018,12,17


In [46]:
away = df.loc[:, ['id', 'year', 'away', 'away_goals', 'away_result']] \
        .rename(columns={'away': 'team', 'away_goals': 'goals', 'away_result': 'result_id'})
home = df.loc[:, ['id', 'year', 'home', 'home_goals', 'home_result']] \
    .rename(columns={'home': 'team', 'home_goals': 'goals', 'home_result': 'result_id'})

results = pd.concat([away, home]) \
    .assign(team=lambda x: x['team'].str.split(' ').str[-1]) \
    .merge(teams, left_on='team', right_on='team') \
    .drop(columns=['team']) \
    .rename(columns={'id_x': 'game_id', 'id_y': 'team_id'}) \
    .sort_values('game_id') \
    .reset_index() \
    .rename(columns={'index': 'id'}) \
    [['id', 'game_id', 'year', 'team_id', 'goals', 'result_id']] \
    .sort_values('id')
results

Unnamed: 0,id,game_id,year,team_id,goals,result_id
0,0,1,2018,11,0,2
62,1,32,2018,11,2,1
94,2,48,2018,11,4,1
149,3,75,2018,11,5,1
259,4,130,2018,11,3,4
273,5,137,2018,11,2,2
552,6,277,2018,11,2,2
594,7,298,2018,11,5,1
629,8,315,2018,11,4,1
645,9,323,2018,11,0,3
