In [13]:
import requests
from bs4 import BeautifulSoup
from pandas import DataFrame
import pandas as pd
import numpy as np
import time

def result_type(df, *col):
    if df[col[0]] > df[col[1]]:
        if df['type'] == 'SO':
            return 4
        else:
            return 1
    elif df['type'] == 'OT':
        return 3
    return 2

df = DataFrame()

for year in range(2008, 2019, 1):
    r = requests.get(f'https://www.hockey-reference.com/leagues/NHL_{year}_games.html')
    soup = BeautifulSoup(r.content, 'html.parser')
    
    df = pd.concat([df, DataFrame(np.array([ele.get_text() for ele in soup.select('#games > tbody > tr > .')]).reshape(-1, 9)) \
        .drop(columns=[0, 6, 7, 8], axis=1) \
        .rename(columns={1: 'away', 2: 'away_goals', 3: 'home', 4: 'home_goals', 5: 'type'}) \
        .assign(away_result=lambda x: x.apply(result_type, args=['away_goals', 'home_goals'], axis=1),
                home_result=lambda x: x.apply(result_type, args=['home_goals', 'away_goals'], axis=1),
                year=f'{year}') \
        .drop(columns=['type'])])
    time.sleep(2)

In [14]:
df = df \
    .loc[lambda x: x['away_goals'] != ''] \
    .reset_index(drop=True) \
    .reset_index() \
    .rename(columns={'index': 'id'}) \
    .assign(id=lambda x: x['id'] + 1)


In [46]:
teams = DataFrame(df['home'].str.split(' ').str[-1].unique(), columns=['team']) \
        .sort_values('team') \
        .reset_index(drop=True) \
        .reset_index() \
        .rename(columns={'index': 'id'}) \
        .assign(id=lambda x: x['id'] + 1)
teams

Unnamed: 0,id,team
0,1,Avalanche
1,2,Blackhawks
2,3,Blues
3,4,Bruins
4,5,Canadiens
5,6,Canucks
6,7,Capitals
7,8,Coyotes
8,9,Devils
9,10,Ducks


In [52]:
playoffs = DataFrame()

for year in range(2018, 2016, -1):
    r = requests.get(f'https://www.hockey-reference.com/leagues/NHL_2018.html')
    soup = BeautifulSoup(r.content, 'html.parser')
    
    playoffs = pd.concat([playoffs, 
                          pd.concat([DataFrame([ele.get_text() for ele in soup.select('#standings_EAS > tbody > tr > th')]),
                                     DataFrame([ele.get_text() for ele in soup.select('#standings_WES > tbody > tr > th')])
                                    ]) \
                            .assign(playoffs=lambda x: x.iloc[:, 0].str.endswith('*').astype(int),
                                    year=f'{year}',
                                    team=lambda x: x.iloc[:, 0].str.split(' ').str[-1].str.replace('*', '')) \
                            .merge(teams, left_on='team', right_on='team') \
                            .drop(columns=[0, 'team']) \
                            .rename(columns={'id': 'team_id'}) \
                            [['team_id', 'year', 'playoffs']]
                            
              ])
playoffs

Unnamed: 0,team_id,year,playoffs
0,20,2018,1
1,4,2018,1
2,19,2018,1
3,22,2018,0
4,32,2018,0
5,5,2018,0
6,27,2018,0
7,26,2018,0
8,7,2018,1
9,23,2018,1


In [53]:
playoffs = playoffs \
    .reset_index(drop=True) \
    .reset_index() \
    .rename(columns={'index': 'id'}) \
    .assign(id=lambda x: x['id'] + 1)
playoffs

Unnamed: 0,id,team_id,year,playoffs
0,1,20,2018,1
1,2,4,2018,1
2,3,19,2018,1
3,4,22,2018,0
4,5,32,2018,0
5,6,5,2018,0
6,7,27,2018,0
7,8,26,2018,0
8,9,7,2018,1
9,10,23,2018,1


In [18]:
schedule = df[['id', 'year', 'away', 'home']] \
        .assign(away=lambda x: x['away'].str.split(' ').str[-1]) \
        .assign(home=lambda x: x['home'].str.split(' ').str[-1]) \
        .merge(teams[['id', 'team']], left_on='away', right_on='team') \
        .merge(teams[['id', 'team']], left_on='home', right_on='team')  \
        [['id_x', 'year', 'id_y', 'id']] \
        .rename(columns={'id_x': 'id', 'id_y': 'away_id', 'id': 'home_id'}) \
        .sort_values('id') \
        .reset_index(drop=True)
schedule

Unnamed: 0,id,year,away_id,home_id
0,1,2008,10,17
1,2,2008,17,10
2,3,2008,5,13
3,4,2008,29,1
4,5,2008,10,32
5,6,2008,27,19
6,7,2008,12,11
7,8,2008,28,21
8,9,2008,2,31
9,10,2008,1,24


In [3]:
df

Unnamed: 0,id,away,away_goals,home,home_goals,away_result,home_result,year
0,1,Calgary Flames,0,Edmonton Oilers,3,2,1,2018
1,2,St. Louis Blues,5,Pittsburgh Penguins,4,1,3,2018
2,3,Philadelphia Flyers,5,San Jose Sharks,3,1,2,2018
3,4,Toronto Maple Leafs,7,Winnipeg Jets,2,1,2,2018
4,5,Arizona Coyotes,4,Anaheim Ducks,5,2,1,2018
5,6,Nashville Predators,3,Boston Bruins,4,2,1,2018
6,7,Montreal Canadiens,3,Buffalo Sabres,2,4,2,2018
7,8,Pittsburgh Penguins,1,Chicago Blackhawks,10,2,1,2018
8,9,Minnesota Wild,2,Detroit Red Wings,4,2,1,2018
9,10,Philadelphia Flyers,0,Los Angeles Kings,2,2,1,2018


In [19]:
away = df.loc[:, ['id', 'year', 'away', 'away_goals', 'away_result']] \
        .rename(columns={'away': 'team', 'away_goals': 'goals', 'away_result': 'result_id'})
home = df.loc[:, ['id', 'year', 'home', 'home_goals', 'home_result']] \
    .rename(columns={'home': 'team', 'home_goals': 'goals', 'home_result': 'result_id'})

results = pd.concat([away, home]) \
    .assign(team=lambda x: x['team'].str.split(' ').str[-1]) \
    .merge(teams, left_on='team', right_on='team') \
    .drop(columns=['team']) \
    .rename(columns={'id_x': 'game_id', 'id_y': 'team_id'}) \
    .sort_values('game_id') \
    .reset_index() \
    .rename(columns={'index': 'id'}) \
    [['id', 'game_id', 'year', 'team_id', 'goals', 'result_id']] \
    .sort_values('id')
results

Unnamed: 0,id,game_id,year,team_id,goals,result_id
0,0,1,2008,10,1,2
9,1,5,2008,10,2,2
34,2,18,2008,10,0,2
61,3,31,2008,10,4,2
207,4,104,2008,10,1,2
247,5,124,2008,10,2,2
390,6,196,2008,10,5,1
529,7,265,2008,10,6,1
571,8,286,2008,10,2,4
604,9,303,2008,10,1,2


In [25]:
results['game_id'].max()

13061

In [43]:
results[['game_id']].sort_values('game_id')

Unnamed: 0,game_id
0,1
1,1
2,2
3,2
4,3
5,3
6,4
7,4
8,5
9,5
