In [1]:
import requests
import os
import sys
import datetime
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, Comment

# Locate the repo's utils directory: keep everything in the working directory
# path that precedes the first 'sports_data_warehouse', then re-append the
# repo name and the utils folder so project-local modules become importable.
_path_prefix = os.getcwd().partition('sports_data_warehouse')[0]
utils_path = f"{_path_prefix}sports_data_warehouse/utils"
sys.path.append(utils_path)

from gcp import GCPUtils

In [2]:
class ScheduleAndResults():
    """Download a season's schedule-and-results table for one sport.

    On construction the instance looks up the sport's configuration in
    ``SPORT_ATTRS`` and computes the season identifier for today's date.

    Attributes:
        sport: Sport key passed to the constructor (a key of ``SPORT_ATTRS``).
        season_start_month: Calendar month configured for the sport.
        url_base: URL template containing a ``{year}`` placeholder.
        current_season: Season year computed by ``get_current_season``.
        curret_season: Misspelled alias of ``current_season``, kept so that
            existing callers continue to work.
    """

    # Per-sport configuration: season start month and schedule-page URL template.
    SPORT_ATTRS = {
        'nfl': {
            'season_start_month': 9,
            'url_base': 'https://www.pro-football-reference.com/years/{year}/games.htm'
        },
        'nhl': {
            'season_start_month': 10,
            'url_base': 'https://www.hockey-reference.com/leagues/NHL_{year}_games.html'
        }
    }

    # Sports whose season identifier is shifted forward by one year.
    # NOTE(review): presumably because the site labels a season by the year
    # in which it ends - confirm against the site's URLs.
    _NEXT_YEAR_LABELLED_SPORTS = ('nhl',)

    def __init__(self, sport):
        """Configure the scraper for ``sport``.

        Args:
            sport: Key of ``SPORT_ATTRS`` (``'nfl'`` or ``'nhl'``).

        Raises:
            KeyError: If ``sport`` is not a key of ``SPORT_ATTRS``.
        """
        self.sport = sport

        self.get_sport_attrs()
        self.get_current_season()

    @property
    def curret_season(self):
        """Backward-compatible (misspelled) alias of ``current_season``."""
        return self.current_season

    @curret_season.setter
    def curret_season(self, value):
        self.current_season = value

    def get_sport_attrs(self):
        """Copy the configuration for ``self.sport`` onto the instance.

        Sets ``self.season_start_month`` and ``self.url_base``.

        Raises:
            KeyError: If ``self.sport`` is not a key of ``SPORT_ATTRS``.
        """
        attrs = self.SPORT_ATTRS[self.sport]

        self.season_start_month = attrs['season_start_month']
        self.url_base = attrs['url_base']

    def get_current_season(self, now=None):
        """Compute the season identifier and store it in ``self.current_season``.

        Args:
            now: Optional ``datetime`` to evaluate instead of the current
                time; lets the calculation be reproduced for a fixed date.
        """
        dt = now if now is not None else datetime.datetime.now()

        # From the month before the configured start month onwards the season
        # is labelled with this calendar year, otherwise with the previous one.
        if dt.month >= self.season_start_month - 1:
            season = dt.year
        else:
            season = dt.year - 1

        if self.sport in self._NEXT_YEAR_LABELLED_SPORTS:
            season += 1

        self.current_season = season

    @staticmethod
    def _cell_value(cell):
        """Return a cell's text, or ``(text, href)`` when it contains a link."""
        link = cell.find('a')
        if link is None:
            return cell.getText()
        return (cell.getText(), link['href'])

    def get_table(self, year=None, timeout=30):
        """Fetch and parse the ``games`` table for one season.

        Args:
            year: Season identifier substituted into ``url_base``; a falsy
                value (the default ``None``) selects ``self.current_season``.
            timeout: Seconds to wait for the HTTP response before giving up.

        Returns:
            pandas.DataFrame with one column per ``data-stat`` header of the
            table. Cells containing a link hold a ``(text, href)`` tuple,
            other cells hold their text. Empty strings become NaN and rows
            with fewer than two non-null values are dropped.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If the page contains no table with id ``games``.
        """
        year = year if year else self.current_season

        url = self.url_base.format(year=year)

        # Fail fast on a hung connection or an error page instead of trying
        # to parse whatever HTML came back.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        table = soup.find("table", {"id": "games"})
        if table is None:
            raise ValueError(f"No table with id 'games' found at {url}")

        # Column names come from the data-stat attribute of the header cells.
        thead = table.find("thead")
        columns = [th.attrs['data-stat'] for th in thead.find_all("th", {"scope": "col"})]

        # Skip rows whose class is "thead" (header rows inside the body).
        tbody = table.find("tbody")
        rows = tbody.find_all("tr", class_=lambda c: c != "thead")

        data = [
            [self._cell_value(cell) for cell in row.find_all(name=['th', 'td'])]
            for row in rows
        ]

        df = pd.DataFrame(data, columns=columns).replace('', np.nan).dropna(axis=0, thresh=2)

        return df

In [3]:
# def main():

In [4]:
# Build the GCP helper from the service-account key stored under the utils directory.
gcp_tool = GCPUtils(key_path=f"{utils_path}/gcp_keys/sports-warehouse.json")

In [5]:
# Show which project the storage client is bound to.
project_id = gcp_tool.storage_client.project
print('Project:', project_id)

Project: sports-warehouse-442719


In [6]:
# Scraper configured for the 'nhl' entry of the sport settings.
res_obj = ScheduleAndResults(sport='nhl')

In [7]:
# Bucket name passed to gcp_tool.pandas_to_gcs for every upload in the loop below.
bucket_name = "nhl_data_storage"

In [8]:
# Upload one CSV per season. range() excludes its end value, so the season
# equal to res_obj.curret_season is not uploaded.
seasons = range(2015, res_obj.curret_season)

for year in seasons:
    df = res_obj.get_table(year=year)
    blob_name = "schedule_and_results/{}.csv".format(year)
    gcp_tool.pandas_to_gcs(df=df, blob_name=blob_name, bucket_name=bucket_name)

DataFrame saved to nhl_data_storage/schedule_and_results/2015.csv
DataFrame saved to nhl_data_storage/schedule_and_results/2016.csv
DataFrame saved to nhl_data_storage/schedule_and_results/2017.csv
DataFrame saved to nhl_data_storage/schedule_and_results/2018.csv
DataFrame saved to nhl_data_storage/schedule_and_results/2019.csv
DataFrame saved to nhl_data_storage/schedule_and_results/2020.csv
DataFrame saved to nhl_data_storage/schedule_and_results/2021.csv
DataFrame saved to nhl_data_storage/schedule_and_results/2022.csv
DataFrame saved to nhl_data_storage/schedule_and_results/2023.csv
DataFrame saved to nhl_data_storage/schedule_and_results/2024.csv
