In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import uuid
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
%matplotlib inline

In [46]:
def load_month(year, month):
    """
    Loads the html page for the overview of all games in a week
    :param year:
    :param week:
    :return:
    """
    url = 'https://www.basketball-reference.com/leagues/NBA_{}_games-{}.html'.format(year, month)
    print(url)
    page = requests.get(url).text
    return page

In [51]:
def extract_game_info(game, scores=True):
    """
    Get the relevant information for a single game from the html elements
    :param game:
    :param scores:
    :return:
    """
    cells = game.find_all('td')
    date = game.find('th').get_text()

    time = cells[0].get_text()
    dt = datetime.datetime.strptime(date + ' ' + time, '%a, %b %d, %Y %I:%M %p')
    away_team = cells[1].get_text()
    away_points = cells[2].get_text()
    home_team = cells[3].get_text()
    home_points = cells[4].get_text()

    
    return dt, away_team, away_points, home_team, home_points

In [52]:
def get_games(years: list = None, months: list = None):
    """
    Wrapper function to get data on all games in a set of years and weeks
    :param years:
    :param weeks:
    :return:
    """
    output = dict(date=[], away_team=[], away_points=[], home_team=[], home_points=[], season=[], uuid=[])
    if years is None:
        years = range(2010, 2018)
    if months is None:
        months = ['october', 'november', 'december', 'january', 'february', 'march', 'april', 'may']

    for year in tqdm_notebook(years):
        for month in tqdm_notebook(months):
            page = load_month(year, month)
            soup = BeautifulSoup(page)
            try:
                games = soup.find('table', id='schedule').find('tbody').find_all('tr')
                for game in games:
                    try:
                        _uuid = uuid.uuid1()
                        date, away_team, away_points, home_team, home_points = extract_game_info(game)
                        output['date'].append(date)
                        output['away_team'].append(away_team)
                        output['away_points'].append(away_points)
                        output['home_team'].append(home_team)
                        output['home_points'].append(home_points)
                        output['season'].append(year)
                        output['uuid'].append(_uuid)
                    except IndexError:
                        pass
            except AttributeError:
                pass

    df = pd.DataFrame.from_dict(output, orient='columns')
    return df

In [53]:
games = get_games()

https://www.basketball-reference.com/leagues/NBA_2010_games-october.html




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


https://www.basketball-reference.com/leagues/NBA_2010_games-november.html
https://www.basketball-reference.com/leagues/NBA_2010_games-december.html
https://www.basketball-reference.com/leagues/NBA_2010_games-january.html
https://www.basketball-reference.com/leagues/NBA_2010_games-february.html
https://www.basketball-reference.com/leagues/NBA_2010_games-march.html
https://www.basketball-reference.com/leagues/NBA_2010_games-april.html
https://www.basketball-reference.com/leagues/NBA_2010_games-may.html
https://www.basketball-reference.com/leagues/NBA_2011_games-october.html
https://www.basketball-reference.com/leagues/NBA_2011_games-november.html
https://www.basketball-reference.com/leagues/NBA_2011_games-december.html
https://www.basketball-reference.com/leagues/NBA_2011_games-january.html
https://www.basketball-reference.com/leagues/NBA_2011_games-february.html
https://www.basketball-reference.com/leagues/NBA_2011_games-march.html
https://www.basketball-reference.com/leagues/NBA_2011_g

In [57]:
games.tail()

Unnamed: 0,away_points,away_team,date,home_points,home_team,season,uuid
10204,120,Golden State Warriors,2017-05-20 21:00:00,108,San Antonio Spurs,2017,05122c2a-fdf1-11e7-a0f9-60e327966de9
10205,111,Boston Celtics,2017-05-21 20:30:00,108,Cleveland Cavaliers,2017,05122ffe-fdf1-11e7-a0f9-60e327966de9
10206,129,Golden State Warriors,2017-05-22 21:00:00,115,San Antonio Spurs,2017,051233d2-fdf1-11e7-a0f9-60e327966de9
10207,99,Boston Celtics,2017-05-23 20:30:00,112,Cleveland Cavaliers,2017,051237a6-fdf1-11e7-a0f9-60e327966de9
10208,135,Cleveland Cavaliers,2017-05-25 20:30:00,102,Boston Celtics,2017,05123b7a-fdf1-11e7-a0f9-60e327966de9


In [55]:
len(games)

10209

In [56]:
pd.to_pickle(games, 'nba_games.pkl')