In [103]:
from datetime import datetime
from csv import writer
import requests, bs4
import re
import pandas as pd
import os


In [104]:
def get_4fac15(team, season=2015):
    """Scrape one team's regular-season game log and per-game four-factor stats.

    Downloads the team's schedule page from basketball-reference.com, then
    downloads one box-score page per game listed in its ``games`` table and
    reads the values from that page's ``div_four_factors`` block.  This makes
    one HTTP request per game plus one for the schedule.

    Parameters
    ----------
    team : str
        Team abbreviation as used in basketball-reference URLs (e.g. ``'CLE'``).
    season : int or str, optional
        Season identifier placed in the schedule URL
        (``.../teams/{team}/{season}_games.html``).  Defaults to 2015, the
        value this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per game with columns ``Team``, ``Location`` (1 = home,
        0 = away), ``Game Number`` (1-based), ``Team Points``, ``Opp Points``,
        ``Result`` (1 = win, 0 = loss), ``Date`` (``YYYYMMDD`` string),
        ``Opponent`` (abbreviation) and the float columns ``EFG%``, ``TOV%``,
        ``ORB%``, ``FTR``, ``Pace``, ``OEff`` and ``DEff``.  ``TOV%`` and
        ``ORB%`` are divided by 100 after conversion.

    Raises
    ------
    requests.HTTPError
        If any page request returns an error status.
    KeyError
        If an opponent name is missing from the abbreviation table.
    AttributeError
        If a page lacks the ``games`` table or the ``div_four_factors`` block.
    """
    # Full team names as they appear in the schedule table -> URL abbreviations.
    team_abbrevs = {
        'Cleveland Cavaliers': 'CLE',
        'Boston Celtics': 'BOS',
        'Washington Wizards': 'WAS',
        'Charlotte Hornets': 'CHO',
        'Minnesota Timberwolves': 'MIN',
        'Dallas Mavericks': 'DAL',
        'Milwaukee Bucks': 'MIL',
        'Philadelphia 76ers': 'PHI',
        'Phoenix Suns': 'PHO',
        'Los Angeles Lakers': 'LAL',
        'Utah Jazz': 'UTA',
        'Sacramento Kings': 'SAC',
        'New York Knicks': 'NYK',
        'New Orleans Pelicans': 'NOP',
        'Detroit Pistons': 'DET',
        'Atlanta Hawks': 'ATL',
        'Chicago Bulls': 'CHI',
        'Miami Heat': 'MIA',
        'Memphis Grizzlies': 'MEM',
        'Golden State Warriors': 'GSW',
        'Denver Nuggets': 'DEN',
        'Brooklyn Nets': 'BRK',
        'Los Angeles Clippers': 'LAC',
        'Portland Trail Blazers': 'POR',
        'Indiana Pacers': 'IND',
        'San Antonio Spurs': 'SAS',
        'Houston Rockets': 'HOU',
        'Oklahoma City Thunder': 'OKC',
        'Toronto Raptors': 'TOR',
        'Orlando Magic': 'ORL',
    }

    def column_text(table, stat, keep):
        """Return the text of every ``data-stat == stat`` cell that ``keep`` accepts."""
        texts = (cell.get_text() for cell in table.find_all(attrs={'data-stat': stat}))
        return [text for text in texts if keep(text)]

    # Get team page.  Only the URL is passed: a second positional argument to
    # requests.get() is treated as query-string params, not as a parser name.
    url = f'https://www.basketball-reference.com/teams/{team}/{season}_games.html'
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Get just regular season stats
    reg_season = soup.find('table', {'id': 'games'})

    # Game dates: skip cells whose text is the 'Date' column heading.
    gamedates = column_text(reg_season, 'date_game', lambda text: text != 'Date')
    game_dates = [datetime.strptime(text, '%a, %b %d, %Y') for text in gamedates]

    # Game locations: '' -> 1 (home), '@' -> 0 (away); any other text is skipped.
    locs = column_text(reg_season, 'game_location', lambda text: text in ('', '@'))
    game_locs = [1 if loc == '' else 0 for loc in locs]

    # Opponent names, translated to URL abbreviations.
    opps = column_text(reg_season, 'opp_name', lambda text: text != 'Opponent')
    opp_abbrev = [team_abbrevs[opp] for opp in opps]

    # Results: 'W' -> 1, 'L' -> 0; any other text is skipped.
    results = column_text(reg_season, 'game_result', lambda text: text in ('W', 'L'))
    rslts = [1 if result == 'W' else 0 for result in results]

    # Scores: cells starting with 'T' / 'O' are skipped (presumably the
    # column headings); everything else must parse as an integer.
    teampoints = [int(text) for text in
                  column_text(reg_season, 'pts', lambda text: not text.startswith('T'))]
    opppoints = [int(text) for text in
                 column_text(reg_season, 'opp_pts', lambda text: not text.startswith('O'))]

    dates = []
    efgs = []
    tovpct = []
    orbpct = []
    ftr = []
    pace = []
    OEff = []
    DEff = []

    # Get game stats
    for i, gamedate in enumerate(game_dates):
        date = gamedate.strftime('%Y%m%d')
        dates.append(date)

        # Box-score URLs are keyed by the home team's abbreviation.
        is_home = game_locs[i] == 1
        home_abbrev = team if is_home else opp_abbrev[i]
        url = f'https://www.basketball-reference.com/boxscores/{date}0{home_abbrev}.html'
        res = requests.get(url)
        res.raise_for_status()

        # Strip the HTML comment markers so commented-out markup is parsed too.
        soup = bs4.BeautifulSoup(re.sub("<!--|-->", "", res.text), 'lxml')
        fourfac = soup.find('div', {'id': 'div_four_factors'})

        def stat(name, row):
            """Text of the ``row``-th four-factors cell tagged ``data-stat == name``."""
            return fourfac.find_all('td', {'data-stat': name})[row].get_text()

        # Row 1 is read when this team is at home, row 0 when it is away;
        # the other row is the opponent.
        own_row = 1 if is_home else 0
        opp_row = 1 - own_row

        efgs.append(stat('efg_pct', own_row))
        tovpct.append(stat('tov_pct', own_row))
        orbpct.append(stat('orb_pct', own_row))
        ftr.append(stat('ft_rate', own_row))
        pace.append(stat('pace', own_row))
        OEff.append(stat('off_rtg', own_row))
        # Defensive efficiency is taken from the opponent's off_rtg cell.
        DEff.append(stat('off_rtg', opp_row))

    # Size the constant columns from the scraped schedule instead of assuming 82 games.
    n_games = len(game_dates)
    data = {
        'Team': [team] * n_games,
        "Location": game_locs,
        "Game Number": list(range(1, n_games + 1)),
        "Team Points": teampoints,
        "Opp Points": opppoints,
        "Result": rslts,
        "Date": dates,
        "Opponent": opp_abbrev,
        "EFG%": efgs,
        "TOV%": tovpct,
        "ORB%": orbpct,
        "FTR": ftr,
        "Pace": pace,
        "OEff": OEff,
        "DEff": DEff,
    }
    df = pd.DataFrame.from_dict(data)

    # Scraped stats arrive as strings; convert them to floats in one step.
    float_cols = ['EFG%', 'TOV%', 'ORB%', 'FTR', 'Pace', 'OEff', 'DEff']
    df[float_cols] = df[float_cols].astype(float)
    df['TOV%'] = df['TOV%'] / 100.0
    df['ORB%'] = df['ORB%'] / 100.0

    return df

In [105]:
def get_4fac16(team, season=2016):
    """Scrape one team's regular-season game log and per-game four-factor stats.

    Downloads the team's schedule page from basketball-reference.com, then
    downloads one box-score page per game listed in its ``games`` table and
    reads the values from that page's ``div_four_factors`` block.  This makes
    one HTTP request per game plus one for the schedule.

    Parameters
    ----------
    team : str
        Team abbreviation as used in basketball-reference URLs (e.g. ``'CLE'``).
    season : int or str, optional
        Season identifier placed in the schedule URL
        (``.../teams/{team}/{season}_games.html``).  Defaults to 2016, the
        value this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per game with columns ``Team``, ``Location`` (1 = home,
        0 = away), ``Game Number`` (1-based), ``Team Points``, ``Opp Points``,
        ``Result`` (1 = win, 0 = loss), ``Date`` (``YYYYMMDD`` string),
        ``Opponent`` (abbreviation) and the float columns ``EFG%``, ``TOV%``,
        ``ORB%``, ``FTR``, ``Pace``, ``OEff`` and ``DEff``.  ``TOV%`` and
        ``ORB%`` are divided by 100 after conversion.

    Raises
    ------
    requests.HTTPError
        If any page request returns an error status.
    KeyError
        If an opponent name is missing from the abbreviation table.
    AttributeError
        If a page lacks the ``games`` table or the ``div_four_factors`` block.
    """
    # Full team names as they appear in the schedule table -> URL abbreviations.
    team_abbrevs = {
        'Cleveland Cavaliers': 'CLE',
        'Boston Celtics': 'BOS',
        'Washington Wizards': 'WAS',
        'Charlotte Hornets': 'CHO',
        'Minnesota Timberwolves': 'MIN',
        'Dallas Mavericks': 'DAL',
        'Milwaukee Bucks': 'MIL',
        'Philadelphia 76ers': 'PHI',
        'Phoenix Suns': 'PHO',
        'Los Angeles Lakers': 'LAL',
        'Utah Jazz': 'UTA',
        'Sacramento Kings': 'SAC',
        'New York Knicks': 'NYK',
        'New Orleans Pelicans': 'NOP',
        'Detroit Pistons': 'DET',
        'Atlanta Hawks': 'ATL',
        'Chicago Bulls': 'CHI',
        'Miami Heat': 'MIA',
        'Memphis Grizzlies': 'MEM',
        'Golden State Warriors': 'GSW',
        'Denver Nuggets': 'DEN',
        'Brooklyn Nets': 'BRK',
        'Los Angeles Clippers': 'LAC',
        'Portland Trail Blazers': 'POR',
        'Indiana Pacers': 'IND',
        'San Antonio Spurs': 'SAS',
        'Houston Rockets': 'HOU',
        'Oklahoma City Thunder': 'OKC',
        'Toronto Raptors': 'TOR',
        'Orlando Magic': 'ORL',
    }

    def column_text(table, stat, keep):
        """Return the text of every ``data-stat == stat`` cell that ``keep`` accepts."""
        texts = (cell.get_text() for cell in table.find_all(attrs={'data-stat': stat}))
        return [text for text in texts if keep(text)]

    # Get team page.  Only the URL is passed: a second positional argument to
    # requests.get() is treated as query-string params, not as a parser name.
    url = f'https://www.basketball-reference.com/teams/{team}/{season}_games.html'
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'lxml')

    # Get just regular season stats
    reg_season = soup.find('table', {'id': 'games'})

    # Game dates: skip cells whose text is the 'Date' column heading.
    gamedates = column_text(reg_season, 'date_game', lambda text: text != 'Date')
    game_dates = [datetime.strptime(text, '%a, %b %d, %Y') for text in gamedates]

    # Game locations: '' -> 1 (home), '@' -> 0 (away); any other text is skipped.
    locs = column_text(reg_season, 'game_location', lambda text: text in ('', '@'))
    game_locs = [1 if loc == '' else 0 for loc in locs]

    # Opponent names, translated to URL abbreviations.
    opps = column_text(reg_season, 'opp_name', lambda text: text != 'Opponent')
    opp_abbrev = [team_abbrevs[opp] for opp in opps]

    # Results: 'W' -> 1, 'L' -> 0; any other text is skipped.
    results = column_text(reg_season, 'game_result', lambda text: text in ('W', 'L'))
    rslts = [1 if result == 'W' else 0 for result in results]

    # Scores: cells starting with 'T' / 'O' are skipped (presumably the
    # column headings); everything else must parse as an integer.
    teampoints = [int(text) for text in
                  column_text(reg_season, 'pts', lambda text: not text.startswith('T'))]
    opppoints = [int(text) for text in
                 column_text(reg_season, 'opp_pts', lambda text: not text.startswith('O'))]

    dates = []
    efgs = []
    tovpct = []
    orbpct = []
    ftr = []
    pace = []
    OEff = []
    DEff = []

    # Get game stats
    for i, gamedate in enumerate(game_dates):
        date = gamedate.strftime('%Y%m%d')
        dates.append(date)

        # Box-score URLs are keyed by the home team's abbreviation.
        is_home = game_locs[i] == 1
        home_abbrev = team if is_home else opp_abbrev[i]
        url = f'https://www.basketball-reference.com/boxscores/{date}0{home_abbrev}.html'
        res = requests.get(url)
        res.raise_for_status()

        # Strip the HTML comment markers so commented-out markup is parsed too.
        soup = bs4.BeautifulSoup(re.sub("<!--|-->", "", res.text), 'lxml')
        fourfac = soup.find('div', {'id': 'div_four_factors'})

        def stat(name, row):
            """Text of the ``row``-th four-factors cell tagged ``data-stat == name``."""
            return fourfac.find_all('td', {'data-stat': name})[row].get_text()

        # Row 1 is read when this team is at home, row 0 when it is away;
        # the other row is the opponent.
        own_row = 1 if is_home else 0
        opp_row = 1 - own_row

        efgs.append(stat('efg_pct', own_row))
        tovpct.append(stat('tov_pct', own_row))
        orbpct.append(stat('orb_pct', own_row))
        ftr.append(stat('ft_rate', own_row))
        pace.append(stat('pace', own_row))
        OEff.append(stat('off_rtg', own_row))
        # Defensive efficiency is taken from the opponent's off_rtg cell.
        DEff.append(stat('off_rtg', opp_row))

    # Size the constant columns from the scraped schedule instead of assuming 82 games.
    n_games = len(game_dates)
    data = {
        'Team': [team] * n_games,
        "Location": game_locs,
        "Game Number": list(range(1, n_games + 1)),
        "Team Points": teampoints,
        "Opp Points": opppoints,
        "Result": rslts,
        "Date": dates,
        "Opponent": opp_abbrev,
        "EFG%": efgs,
        "TOV%": tovpct,
        "ORB%": orbpct,
        "FTR": ftr,
        "Pace": pace,
        "OEff": OEff,
        "DEff": DEff,
    }
    df = pd.DataFrame.from_dict(data)

    # Scraped stats arrive as strings; convert them to floats in one step.
    float_cols = ['EFG%', 'TOV%', 'ORB%', 'FTR', 'Pace', 'OEff', 'DEff']
    df[float_cols] = df[float_cols].astype(float)
    df['TOV%'] = df['TOV%'] / 100.0
    df['ORB%'] = df['ORB%'] / 100.0

    return df

In [106]:
def get_4fac17(team, season=2017):
    """Scrape one team's regular-season game log and per-game four-factor stats.

    Downloads the team's schedule page from basketball-reference.com, then
    downloads one box-score page per game listed in its ``games`` table and
    reads the values from that page's ``div_four_factors`` block.  This makes
    one HTTP request per game plus one for the schedule.

    Parameters
    ----------
    team : str
        Team abbreviation as used in basketball-reference URLs (e.g. ``'CLE'``).
    season : int or str, optional
        Season identifier placed in the schedule URL
        (``.../teams/{team}/{season}_games.html``).  Defaults to 2017, the
        value this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per game with columns ``Team``, ``Location`` (1 = home,
        0 = away), ``Game Number`` (1-based), ``Team Points``, ``Opp Points``,
        ``Result`` (1 = win, 0 = loss), ``Date`` (``YYYYMMDD`` string),
        ``Opponent`` (abbreviation) and the float columns ``EFG%``, ``TOV%``,
        ``ORB%``, ``FTR``, ``Pace``, ``OEff`` and ``DEff``.  ``TOV%`` and
        ``ORB%`` are divided by 100 after conversion.

    Raises
    ------
    requests.HTTPError
        If any page request returns an error status.
    KeyError
        If an opponent name is missing from the abbreviation table.
    AttributeError
        If a page lacks the ``games`` table or the ``div_four_factors`` block.
    """
    # Full team names as they appear in the schedule table -> URL abbreviations.
    team_abbrevs = {
        'Cleveland Cavaliers': 'CLE',
        'Boston Celtics': 'BOS',
        'Washington Wizards': 'WAS',
        'Charlotte Hornets': 'CHO',
        'Minnesota Timberwolves': 'MIN',
        'Dallas Mavericks': 'DAL',
        'Milwaukee Bucks': 'MIL',
        'Philadelphia 76ers': 'PHI',
        'Phoenix Suns': 'PHO',
        'Los Angeles Lakers': 'LAL',
        'Utah Jazz': 'UTA',
        'Sacramento Kings': 'SAC',
        'New York Knicks': 'NYK',
        'New Orleans Pelicans': 'NOP',
        'Detroit Pistons': 'DET',
        'Atlanta Hawks': 'ATL',
        'Chicago Bulls': 'CHI',
        'Miami Heat': 'MIA',
        'Memphis Grizzlies': 'MEM',
        'Golden State Warriors': 'GSW',
        'Denver Nuggets': 'DEN',
        'Brooklyn Nets': 'BRK',
        'Los Angeles Clippers': 'LAC',
        'Portland Trail Blazers': 'POR',
        'Indiana Pacers': 'IND',
        'San Antonio Spurs': 'SAS',
        'Houston Rockets': 'HOU',
        'Oklahoma City Thunder': 'OKC',
        'Toronto Raptors': 'TOR',
        'Orlando Magic': 'ORL',
    }

    def column_text(table, stat, keep):
        """Return the text of every ``data-stat == stat`` cell that ``keep`` accepts."""
        texts = (cell.get_text() for cell in table.find_all(attrs={'data-stat': stat}))
        return [text for text in texts if keep(text)]

    # Get team page.  Only the URL is passed: a second positional argument to
    # requests.get() is treated as query-string params, not as a parser name.
    url = f'https://www.basketball-reference.com/teams/{team}/{season}_games.html'
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'lxml')

    # Get just regular season stats
    reg_season = soup.find('table', {'id': 'games'})

    # Game dates: skip cells whose text is the 'Date' column heading.
    gamedates = column_text(reg_season, 'date_game', lambda text: text != 'Date')
    game_dates = [datetime.strptime(text, '%a, %b %d, %Y') for text in gamedates]

    # Game locations: '' -> 1 (home), '@' -> 0 (away); any other text is skipped.
    locs = column_text(reg_season, 'game_location', lambda text: text in ('', '@'))
    game_locs = [1 if loc == '' else 0 for loc in locs]

    # Opponent names, translated to URL abbreviations.
    opps = column_text(reg_season, 'opp_name', lambda text: text != 'Opponent')
    opp_abbrev = [team_abbrevs[opp] for opp in opps]

    # Results: 'W' -> 1, 'L' -> 0; any other text is skipped.
    results = column_text(reg_season, 'game_result', lambda text: text in ('W', 'L'))
    rslts = [1 if result == 'W' else 0 for result in results]

    # Scores: cells starting with 'T' / 'O' are skipped (presumably the
    # column headings); everything else must parse as an integer.
    teampoints = [int(text) for text in
                  column_text(reg_season, 'pts', lambda text: not text.startswith('T'))]
    opppoints = [int(text) for text in
                 column_text(reg_season, 'opp_pts', lambda text: not text.startswith('O'))]

    dates = []
    efgs = []
    tovpct = []
    orbpct = []
    ftr = []
    pace = []
    OEff = []
    DEff = []

    # Get game stats
    for i, gamedate in enumerate(game_dates):
        date = gamedate.strftime('%Y%m%d')
        dates.append(date)

        # Box-score URLs are keyed by the home team's abbreviation.
        is_home = game_locs[i] == 1
        home_abbrev = team if is_home else opp_abbrev[i]
        url = f'https://www.basketball-reference.com/boxscores/{date}0{home_abbrev}.html'
        res = requests.get(url)
        res.raise_for_status()

        # Strip the HTML comment markers so commented-out markup is parsed too.
        soup = bs4.BeautifulSoup(re.sub("<!--|-->", "", res.text), 'lxml')
        fourfac = soup.find('div', {'id': 'div_four_factors'})

        def stat(name, row):
            """Text of the ``row``-th four-factors cell tagged ``data-stat == name``."""
            return fourfac.find_all('td', {'data-stat': name})[row].get_text()

        # Row 1 is read when this team is at home, row 0 when it is away;
        # the other row is the opponent.
        own_row = 1 if is_home else 0
        opp_row = 1 - own_row

        efgs.append(stat('efg_pct', own_row))
        tovpct.append(stat('tov_pct', own_row))
        orbpct.append(stat('orb_pct', own_row))
        ftr.append(stat('ft_rate', own_row))
        pace.append(stat('pace', own_row))
        OEff.append(stat('off_rtg', own_row))
        # Defensive efficiency is taken from the opponent's off_rtg cell.
        DEff.append(stat('off_rtg', opp_row))

    # Size the constant columns from the scraped schedule instead of assuming 82 games.
    n_games = len(game_dates)
    data = {
        'Team': [team] * n_games,
        "Location": game_locs,
        "Game Number": list(range(1, n_games + 1)),
        "Team Points": teampoints,
        "Opp Points": opppoints,
        "Result": rslts,
        "Date": dates,
        "Opponent": opp_abbrev,
        "EFG%": efgs,
        "TOV%": tovpct,
        "ORB%": orbpct,
        "FTR": ftr,
        "Pace": pace,
        "OEff": OEff,
        "DEff": DEff,
    }
    df = pd.DataFrame.from_dict(data)

    # Scraped stats arrive as strings; convert them to floats in one step.
    float_cols = ['EFG%', 'TOV%', 'ORB%', 'FTR', 'Pace', 'OEff', 'DEff']
    df[float_cols] = df[float_cols].astype(float)
    df['TOV%'] = df['TOV%'] / 100.0
    df['ORB%'] = df['ORB%'] / 100.0

    return df

In [107]:
def get_4fac18(team, season=2018):
    """Scrape one team's regular-season game log and per-game four-factor stats.

    Downloads the team's schedule page from basketball-reference.com, then
    downloads one box-score page per game listed in its ``games`` table and
    reads the values from that page's ``div_four_factors`` block.  This makes
    one HTTP request per game plus one for the schedule.

    Parameters
    ----------
    team : str
        Team abbreviation as used in basketball-reference URLs (e.g. ``'CLE'``).
    season : int or str, optional
        Season identifier placed in the schedule URL
        (``.../teams/{team}/{season}_games.html``).  Defaults to 2018, the
        value this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per game with columns ``Team``, ``Location`` (1 = home,
        0 = away), ``Game Number`` (1-based), ``Team Points``, ``Opp Points``,
        ``Result`` (1 = win, 0 = loss), ``Date`` (``YYYYMMDD`` string),
        ``Opponent`` (abbreviation) and the float columns ``EFG%``, ``TOV%``,
        ``ORB%``, ``FTR``, ``Pace``, ``OEff`` and ``DEff``.  ``TOV%`` and
        ``ORB%`` are divided by 100 after conversion.

    Raises
    ------
    requests.HTTPError
        If any page request returns an error status.
    KeyError
        If an opponent name is missing from the abbreviation table.
    AttributeError
        If a page lacks the ``games`` table or the ``div_four_factors`` block.
    """
    # Full team names as they appear in the schedule table -> URL abbreviations.
    team_abbrevs = {
        'Cleveland Cavaliers': 'CLE',
        'Boston Celtics': 'BOS',
        'Washington Wizards': 'WAS',
        'Charlotte Hornets': 'CHO',
        'Minnesota Timberwolves': 'MIN',
        'Dallas Mavericks': 'DAL',
        'Milwaukee Bucks': 'MIL',
        'Philadelphia 76ers': 'PHI',
        'Phoenix Suns': 'PHO',
        'Los Angeles Lakers': 'LAL',
        'Utah Jazz': 'UTA',
        'Sacramento Kings': 'SAC',
        'New York Knicks': 'NYK',
        'New Orleans Pelicans': 'NOP',
        'Detroit Pistons': 'DET',
        'Atlanta Hawks': 'ATL',
        'Chicago Bulls': 'CHI',
        'Miami Heat': 'MIA',
        'Memphis Grizzlies': 'MEM',
        'Golden State Warriors': 'GSW',
        'Denver Nuggets': 'DEN',
        'Brooklyn Nets': 'BRK',
        'Los Angeles Clippers': 'LAC',
        'Portland Trail Blazers': 'POR',
        'Indiana Pacers': 'IND',
        'San Antonio Spurs': 'SAS',
        'Houston Rockets': 'HOU',
        'Oklahoma City Thunder': 'OKC',
        'Toronto Raptors': 'TOR',
        'Orlando Magic': 'ORL',
    }

    def column_text(table, stat, keep):
        """Return the text of every ``data-stat == stat`` cell that ``keep`` accepts."""
        texts = (cell.get_text() for cell in table.find_all(attrs={'data-stat': stat}))
        return [text for text in texts if keep(text)]

    # Get team page.  Only the URL is passed: a second positional argument to
    # requests.get() is treated as query-string params, not as a parser name.
    url = f'https://www.basketball-reference.com/teams/{team}/{season}_games.html'
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'lxml')

    # Get just regular season stats
    reg_season = soup.find('table', {'id': 'games'})

    # Game dates: skip cells whose text is the 'Date' column heading.
    gamedates = column_text(reg_season, 'date_game', lambda text: text != 'Date')
    game_dates = [datetime.strptime(text, '%a, %b %d, %Y') for text in gamedates]

    # Game locations: '' -> 1 (home), '@' -> 0 (away); any other text is skipped.
    locs = column_text(reg_season, 'game_location', lambda text: text in ('', '@'))
    game_locs = [1 if loc == '' else 0 for loc in locs]

    # Opponent names, translated to URL abbreviations.
    opps = column_text(reg_season, 'opp_name', lambda text: text != 'Opponent')
    opp_abbrev = [team_abbrevs[opp] for opp in opps]

    # Results: 'W' -> 1, 'L' -> 0; any other text is skipped.
    results = column_text(reg_season, 'game_result', lambda text: text in ('W', 'L'))
    rslts = [1 if result == 'W' else 0 for result in results]

    # Scores: cells starting with 'T' / 'O' are skipped (presumably the
    # column headings); everything else must parse as an integer.
    teampoints = [int(text) for text in
                  column_text(reg_season, 'pts', lambda text: not text.startswith('T'))]
    opppoints = [int(text) for text in
                 column_text(reg_season, 'opp_pts', lambda text: not text.startswith('O'))]

    dates = []
    efgs = []
    tovpct = []
    orbpct = []
    ftr = []
    pace = []
    OEff = []
    DEff = []

    # Get game stats
    for i, gamedate in enumerate(game_dates):
        date = gamedate.strftime('%Y%m%d')
        dates.append(date)

        # Box-score URLs are keyed by the home team's abbreviation.
        is_home = game_locs[i] == 1
        home_abbrev = team if is_home else opp_abbrev[i]
        url = f'https://www.basketball-reference.com/boxscores/{date}0{home_abbrev}.html'
        res = requests.get(url)
        res.raise_for_status()

        # Strip the HTML comment markers so commented-out markup is parsed too.
        soup = bs4.BeautifulSoup(re.sub("<!--|-->", "", res.text), 'lxml')
        fourfac = soup.find('div', {'id': 'div_four_factors'})

        def stat(name, row):
            """Text of the ``row``-th four-factors cell tagged ``data-stat == name``."""
            return fourfac.find_all('td', {'data-stat': name})[row].get_text()

        # Row 1 is read when this team is at home, row 0 when it is away;
        # the other row is the opponent.
        own_row = 1 if is_home else 0
        opp_row = 1 - own_row

        efgs.append(stat('efg_pct', own_row))
        tovpct.append(stat('tov_pct', own_row))
        orbpct.append(stat('orb_pct', own_row))
        ftr.append(stat('ft_rate', own_row))
        pace.append(stat('pace', own_row))
        OEff.append(stat('off_rtg', own_row))
        # Defensive efficiency is taken from the opponent's off_rtg cell.
        DEff.append(stat('off_rtg', opp_row))

    # Size the constant columns from the scraped schedule instead of assuming 82 games.
    n_games = len(game_dates)
    data = {
        'Team': [team] * n_games,
        "Location": game_locs,
        "Game Number": list(range(1, n_games + 1)),
        "Team Points": teampoints,
        "Opp Points": opppoints,
        "Result": rslts,
        "Date": dates,
        "Opponent": opp_abbrev,
        "EFG%": efgs,
        "TOV%": tovpct,
        "ORB%": orbpct,
        "FTR": ftr,
        "Pace": pace,
        "OEff": OEff,
        "DEff": DEff,
    }
    df = pd.DataFrame.from_dict(data)

    # Scraped stats arrive as strings; convert them to floats in one step.
    float_cols = ['EFG%', 'TOV%', 'ORB%', 'FTR', 'Pace', 'OEff', 'DEff']
    df[float_cols] = df[float_cols].astype(float)
    df['TOV%'] = df['TOV%'] / 100.0
    df['ORB%'] = df['ORB%'] / 100.0

    return df

In [108]:
def get_4fac19(team, season=2019):
    """Scrape one team's regular-season game log and per-game four-factor stats.

    Downloads the team's schedule page from basketball-reference.com, then
    downloads one box-score page per game listed in its ``games`` table and
    reads the values from that page's ``div_four_factors`` block.  This makes
    one HTTP request per game plus one for the schedule.

    Parameters
    ----------
    team : str
        Team abbreviation as used in basketball-reference URLs (e.g. ``'CLE'``).
    season : int or str, optional
        Season identifier placed in the schedule URL
        (``.../teams/{team}/{season}_games.html``).  Defaults to 2019, the
        value this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per game with columns ``Team``, ``Location`` (1 = home,
        0 = away), ``Game Number`` (1-based), ``Team Points``, ``Opp Points``,
        ``Result`` (1 = win, 0 = loss), ``Date`` (``YYYYMMDD`` string),
        ``Opponent`` (abbreviation) and the float columns ``EFG%``, ``TOV%``,
        ``ORB%``, ``FTR``, ``Pace``, ``OEff`` and ``DEff``.  ``TOV%`` and
        ``ORB%`` are divided by 100 after conversion.

    Raises
    ------
    requests.HTTPError
        If any page request returns an error status.
    KeyError
        If an opponent name is missing from the abbreviation table.
    AttributeError
        If a page lacks the ``games`` table or the ``div_four_factors`` block.
    """
    # Full team names as they appear in the schedule table -> URL abbreviations.
    team_abbrevs = {
        'Cleveland Cavaliers': 'CLE',
        'Boston Celtics': 'BOS',
        'Washington Wizards': 'WAS',
        'Charlotte Hornets': 'CHO',
        'Minnesota Timberwolves': 'MIN',
        'Dallas Mavericks': 'DAL',
        'Milwaukee Bucks': 'MIL',
        'Philadelphia 76ers': 'PHI',
        'Phoenix Suns': 'PHO',
        'Los Angeles Lakers': 'LAL',
        'Utah Jazz': 'UTA',
        'Sacramento Kings': 'SAC',
        'New York Knicks': 'NYK',
        'New Orleans Pelicans': 'NOP',
        'Detroit Pistons': 'DET',
        'Atlanta Hawks': 'ATL',
        'Chicago Bulls': 'CHI',
        'Miami Heat': 'MIA',
        'Memphis Grizzlies': 'MEM',
        'Golden State Warriors': 'GSW',
        'Denver Nuggets': 'DEN',
        'Brooklyn Nets': 'BRK',
        'Los Angeles Clippers': 'LAC',
        'Portland Trail Blazers': 'POR',
        'Indiana Pacers': 'IND',
        'San Antonio Spurs': 'SAS',
        'Houston Rockets': 'HOU',
        'Oklahoma City Thunder': 'OKC',
        'Toronto Raptors': 'TOR',
        'Orlando Magic': 'ORL',
    }

    def column_text(table, stat, keep):
        """Return the text of every ``data-stat == stat`` cell that ``keep`` accepts."""
        texts = (cell.get_text() for cell in table.find_all(attrs={'data-stat': stat}))
        return [text for text in texts if keep(text)]

    # Get team page.  Only the URL is passed: a second positional argument to
    # requests.get() is treated as query-string params, not as a parser name.
    url = f'https://www.basketball-reference.com/teams/{team}/{season}_games.html'
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'lxml')

    # Get just regular season stats
    reg_season = soup.find('table', {'id': 'games'})

    # Game dates: skip cells whose text is the 'Date' column heading.
    gamedates = column_text(reg_season, 'date_game', lambda text: text != 'Date')
    game_dates = [datetime.strptime(text, '%a, %b %d, %Y') for text in gamedates]

    # Game locations: '' -> 1 (home), '@' -> 0 (away); any other text is skipped.
    locs = column_text(reg_season, 'game_location', lambda text: text in ('', '@'))
    game_locs = [1 if loc == '' else 0 for loc in locs]

    # Opponent names, translated to URL abbreviations.
    opps = column_text(reg_season, 'opp_name', lambda text: text != 'Opponent')
    opp_abbrev = [team_abbrevs[opp] for opp in opps]

    # Results: 'W' -> 1, 'L' -> 0; any other text is skipped.
    results = column_text(reg_season, 'game_result', lambda text: text in ('W', 'L'))
    rslts = [1 if result == 'W' else 0 for result in results]

    # Scores: cells starting with 'T' / 'O' are skipped (presumably the
    # column headings); everything else must parse as an integer.
    teampoints = [int(text) for text in
                  column_text(reg_season, 'pts', lambda text: not text.startswith('T'))]
    opppoints = [int(text) for text in
                 column_text(reg_season, 'opp_pts', lambda text: not text.startswith('O'))]

    dates = []
    efgs = []
    tovpct = []
    orbpct = []
    ftr = []
    pace = []
    OEff = []
    DEff = []

    # Get game stats
    for i, gamedate in enumerate(game_dates):
        date = gamedate.strftime('%Y%m%d')
        dates.append(date)

        # Box-score URLs are keyed by the home team's abbreviation.
        is_home = game_locs[i] == 1
        home_abbrev = team if is_home else opp_abbrev[i]
        url = f'https://www.basketball-reference.com/boxscores/{date}0{home_abbrev}.html'
        res = requests.get(url)
        res.raise_for_status()

        # Strip the HTML comment markers so commented-out markup is parsed too.
        soup = bs4.BeautifulSoup(re.sub("<!--|-->", "", res.text), 'lxml')
        fourfac = soup.find('div', {'id': 'div_four_factors'})

        def stat(name, row):
            """Text of the ``row``-th four-factors cell tagged ``data-stat == name``."""
            return fourfac.find_all('td', {'data-stat': name})[row].get_text()

        # Row 1 is read when this team is at home, row 0 when it is away;
        # the other row is the opponent.
        own_row = 1 if is_home else 0
        opp_row = 1 - own_row

        efgs.append(stat('efg_pct', own_row))
        tovpct.append(stat('tov_pct', own_row))
        orbpct.append(stat('orb_pct', own_row))
        ftr.append(stat('ft_rate', own_row))
        pace.append(stat('pace', own_row))
        OEff.append(stat('off_rtg', own_row))
        # Defensive efficiency is taken from the opponent's off_rtg cell.
        DEff.append(stat('off_rtg', opp_row))

    # Size the constant columns from the scraped schedule instead of assuming 82 games.
    n_games = len(game_dates)
    data = {
        'Team': [team] * n_games,
        "Location": game_locs,
        "Game Number": list(range(1, n_games + 1)),
        "Team Points": teampoints,
        "Opp Points": opppoints,
        "Result": rslts,
        "Date": dates,
        "Opponent": opp_abbrev,
        "EFG%": efgs,
        "TOV%": tovpct,
        "ORB%": orbpct,
        "FTR": ftr,
        "Pace": pace,
        "OEff": OEff,
        "DEff": DEff,
    }
    df = pd.DataFrame.from_dict(data)

    # Scraped stats arrive as strings; convert them to floats in one step.
    float_cols = ['EFG%', 'TOV%', 'ORB%', 'FTR', 'Pace', 'OEff', 'DEff']
    df[float_cols] = df[float_cols].astype(float)
    df['TOV%'] = df['TOV%'] / 100.0
    df['ORB%'] = df['ORB%'] / 100.0

    return df

In [109]:
# Team abbreviations to scrape, in processing order (six per row).
teams = (
    'CLE BOS WAS CHO MIN DAL '
    'MIL PHI PHO LAL UTA SAC '
    'NYK NOP DET ATL CHI MIA '
    'MEM GSW DEN BRK LAC POR '
    'IND SAS HOU OKC TOR ORL'
).split()

In [99]:
# Scrape every team's 2015 season and write the combined table to CSV.
year = '2015'
team_frames = []
for team in teams:
    team_db = get_4fac15(team)
    team_frames.append(team_db)

# Concatenate once: growing a DataFrame inside the loop re-copies all earlier
# rows on every pass and seeds the result with an empty frame.
df = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

path = '/Users/ahelgeso/Documents/GitHub/bball-master/Stat Sheets'
# Changes the kernel's working directory; the CSV below is written relative to it.
os.chdir(path)

df.to_csv(f'{year}stats.csv', index = False)

In [None]:
# Scrape every team's 2016 season and write the combined table to CSV.
year = '2016'
team_frames = []
for team in teams:
    team_db = get_4fac16(team)
    team_frames.append(team_db)

# Concatenate once: growing a DataFrame inside the loop re-copies all earlier
# rows on every pass and seeds the result with an empty frame.
df = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

path = '/Users/ahelgeso/Documents/GitHub/bball-master/Stat Sheets'
# Changes the kernel's working directory; the CSV below is written relative to it.
os.chdir(path)

df.to_csv(f'{year}stats.csv', index = False)

In [None]:
# Scrape every team's 2017 season and write the combined table to CSV.
year = '2017'
team_frames = []
for team in teams:
    team_db = get_4fac17(team)
    team_frames.append(team_db)

# Concatenate once: growing a DataFrame inside the loop re-copies all earlier
# rows on every pass and seeds the result with an empty frame.
df = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

path = '/Users/ahelgeso/Documents/GitHub/bball-master/Stat Sheets'
# Changes the kernel's working directory; the CSV below is written relative to it.
os.chdir(path)

df.to_csv(f'{year}stats.csv', index = False)

In [110]:
# Scrape every team's 2018 season and write the combined table to CSV.
year = '2018'
team_frames = []
for team in teams:
    team_db = get_4fac18(team)
    team_frames.append(team_db)

# Concatenate once: growing a DataFrame inside the loop re-copies all earlier
# rows on every pass and seeds the result with an empty frame.
df = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

path = '/Users/ahelgeso/Documents/GitHub/bball-master/Stat Sheets'
# Changes the kernel's working directory; the CSV below is written relative to it.
os.chdir(path)

df.to_csv(f'{year}stats.csv', index = False)

In [111]:
# Scrape every team's 2019 season and write the combined table to CSV.
year = '2019'
team_frames = []
for team in teams:
    team_db = get_4fac19(team)
    team_frames.append(team_db)

# Concatenate once: growing a DataFrame inside the loop re-copies all earlier
# rows on every pass and seeds the result with an empty frame.
df = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

path = '/Users/ahelgeso/Documents/GitHub/bball-master/Stat Sheets'
# Changes the kernel's working directory; the CSV below is written relative to it.
os.chdir(path)

df.to_csv(f'{year}stats.csv', index = False)