## Import Dependencies

In [1]:
import requests
from bs4 import BeautifulSoup, Comment
from time import sleep
import re
import pandas as pd

## Create a list with the NFL Summaries Pages from Pro Football Reference
### List contains years 2012 through 2021

In [2]:
mainPages = ['https://www.pro-football-reference.com/years/2018/#all_week_games',
             'https://www.pro-football-reference.com/years/2019/#all_week_games',
             'https://www.pro-football-reference.com/years/2020/#all_week_games',
             'https://www.pro-football-reference.com/years/2021/#all_week_games',
             'https://www.pro-football-reference.com/years/2022/#all_week_games']

## Scrape each page to get the URL for every week of the 10 seasons

In [3]:
weekURLs = []
baseURL = 'https://www.pro-football-reference.com'
for summary in mainPages:
    mainPage = requests.get(summary)
    sleep(2)
    mainSoup = BeautifulSoup(mainPage.content, 'html.parser')
    summaries = mainSoup.find(id='all_week_games')
    weeks = list(summaries.children)[4]
    weeksData = BeautifulSoup(weeks, 'html.parser')
    for a in weeksData.find_all('a',href=True):
        weekURLs.append(a['href'])
weekURLs = [baseURL + week for week in weekURLs]
weekURLs

['https://www.pro-football-reference.com/years/2020/week_1.htm',
 'https://www.pro-football-reference.com/years/2020/week_2.htm',
 'https://www.pro-football-reference.com/years/2020/week_3.htm',
 'https://www.pro-football-reference.com/years/2020/week_4.htm',
 'https://www.pro-football-reference.com/years/2020/week_5.htm',
 'https://www.pro-football-reference.com/years/2020/week_6.htm',
 'https://www.pro-football-reference.com/years/2020/week_7.htm',
 'https://www.pro-football-reference.com/years/2020/week_8.htm',
 'https://www.pro-football-reference.com/years/2020/week_9.htm',
 'https://www.pro-football-reference.com/years/2020/week_10.htm',
 'https://www.pro-football-reference.com/years/2020/week_11.htm',
 'https://www.pro-football-reference.com/years/2020/week_12.htm',
 'https://www.pro-football-reference.com/years/2020/week_13.htm',
 'https://www.pro-football-reference.com/years/2020/week_14.htm',
 'https://www.pro-football-reference.com/years/2020/week_15.htm',
 'https://www.pro-f

## Scrape each week to get the URL for every game of every week

In [4]:
# Scrape the URL for each game of each week of every season
gameURLs = []
for week in weekURLs:
    gamePage = requests.get(week)
    sleep(3)
    gameSoup = BeautifulSoup(gamePage.content, 'html.parser')
    summaries = gameSoup.find(class_='game_summaries')
    for a in summaries.find_all('a',href=True):
        href = str(a['href'])
        if 'box' in href:
            gameURLs.append(href)
gameURLs = [baseURL + game for game in gameURLs]
gameURLs

['https://www.pro-football-reference.com/boxscores/202009100kan.htm',
 'https://www.pro-football-reference.com/boxscores/202009130atl.htm',
 'https://www.pro-football-reference.com/boxscores/202009130buf.htm',
 'https://www.pro-football-reference.com/boxscores/202009130car.htm',
 'https://www.pro-football-reference.com/boxscores/202009130det.htm',
 'https://www.pro-football-reference.com/boxscores/202009130jax.htm',
 'https://www.pro-football-reference.com/boxscores/202009130min.htm',
 'https://www.pro-football-reference.com/boxscores/202009130nwe.htm',
 'https://www.pro-football-reference.com/boxscores/202009130rav.htm',
 'https://www.pro-football-reference.com/boxscores/202009130was.htm',
 'https://www.pro-football-reference.com/boxscores/202009130cin.htm',
 'https://www.pro-football-reference.com/boxscores/202009130nor.htm',
 'https://www.pro-football-reference.com/boxscores/202009130sfo.htm',
 'https://www.pro-football-reference.com/boxscores/202009130ram.htm',
 'https://www.pro-fo

## Create an empty dataframe for the scraped data

In [5]:
df = pd.DataFrame(columns=['wonToss','roof','surface','attendance','weather','vegasLine','over/under',
                           'homeScore','awayScore','homeRecord','awayRecord','date','time',
                           'stadium','teams(away:home)','firstDowns','rush-yds-tds','cmp-att-yd-td-int',
                           'sacked-yrds','passYards','totalYards','fumbles-lost','turnovers','penalties-yrds',
                           'thirdDownConv.','fourthDownConv.','timeOfPossession','playerPassing'])

## Loop through all of the pages and scrape the data
### Then insert into dataframe

In [None]:
# Create empty lists for values to scrape
gameInfo = []
teamStats = []
scores = []
records = []
gameDetails = []
passingStats = []
# Begin scraping the pages
for game in gameURLs:
    request = requests.get(game)
    sleep(3)
    # Generate the soup
    soup = BeautifulSoup(request.content, 'html.parser')
    # Get Game Info Table
    game_info = soup.find(id='all_game_info')
    gi_children = list(game_info.children)
    gi_table = gi_children[4]
    gi_data = BeautifulSoup(gi_table, 'html.parser')
    for tr in gi_data.select("tr"):
        gameInfo.append(tr.get_text(strip=True,separator=":"))
    # Get Team Stats Table
    stats = soup.find(id="all_team_stats")
    statsResults = list(stats.children)
    statsTable = statsResults[4]
    statsData = BeautifulSoup(statsTable, 'html.parser')
    for tr in statsData.select("tr"):
        teamStats.append(tr.get_text(strip=True, separator=':'))
    # Find the Score of the Games
    for score in soup.find_all(class_="score"):
        scores.append(score.get_text())
    # Find the teams records
    scorebox = soup.find(class_="scorebox")
    homeTeam = scorebox.select("div")[5]
    records.append(homeTeam.get_text())
    awayTeam = scorebox.select("div")[13]
    records.append(awayTeam.get_text())
    # Get the Game Details
    details = soup.find(class_="scorebox_meta")
    for detail in details.select("div")[0:3]:
        gameDetails.append(detail.get_text())
    # Get the Player Stats Table
    passingTable = soup.find(id='player_offense')
    for tr in passingTable.select('tr'):
        passingStats.append(tr.get_text(strip=True,separator=':'))
    # Append data to the dataframe
    if len(gameInfo) == 9:
        df = df.append({'wonToss':gameInfo[1],'roof':gameInfo[2],'surface':gameInfo[3],'attendance':gameInfo[5],
                   'weather':gameInfo[6],'vegasLine':gameInfo[7],'over/under':gameInfo[8],'homeScore':scores[0],
                   'awayScore':scores[1],'homeRecord':records[0],'awayRecord':records[1],'date':gameDetails[0],
                   'time':gameDetails[1],'stadium':gameDetails[2],'teams(away:home)':teamStats[0],
                   'firstDowns':teamStats[1],'rush-yds-tds':teamStats[2],'cmp-att-yd-td-int':teamStats[3],
                   'sacked-yrds':teamStats[4],'passYards':teamStats[5],'totalYards':teamStats[6],
                   'fumbles-lost':teamStats[7],'turnovers':teamStats[8],'penalties-yrds':teamStats[9],
                   'thirdDownConv.':teamStats[10],'fourthDownConv.':teamStats[11],
                   'timeOfPossession':teamStats[12],'playerPassing':passingStats},ignore_index=True)
    elif len(gameInfo) == 8:
        df = df.append({'wonToss':gameInfo[1],'roof':gameInfo[2],'surface':gameInfo[3],'attendance':gameInfo[5],
                   'weather':'','vegasLine':gameInfo[6],'over/under':gameInfo[7],'homeScore':scores[0],
                   'awayScore':scores[1],'homeRecord':records[0],'awayRecord':records[1],'date':gameDetails[0],
                   'time':gameDetails[1],'stadium':gameDetails[2],'teams(away:home)':teamStats[0],
                   'firstDowns':teamStats[1],'rush-yds-tds':teamStats[2],'cmp-att-yd-td-int':teamStats[3],
                   'sacked-yrds':teamStats[4],'passYards':teamStats[5],'totalYards':teamStats[6],
                   'fumbles-lost':teamStats[7],'turnovers':teamStats[8],'penalties-yrds':teamStats[9],
                   'thirdDownConv.':teamStats[10],'fourthDownConv.':teamStats[11],
                   'timeOfPossession':teamStats[12],'playerPassing':passingStats},ignore_index=True)
    # Empty lists for new rows of data
    gameInfo = []
    teamStats = []
    scores = []
    records = []
    gameDetails = []
    passingStats = []

## Show dataframe

In [7]:
df

Unnamed: 0,wonToss,roof,surface,attendance,weather,vegasLine,over/under,homeScore,awayScore,homeRecord,...,cmp-att-yd-td-int,sacked-yrds,passYards,totalYards,fumbles-lost,turnovers,penalties-yrds,thirdDownConv.,fourthDownConv.,timeOfPossession
0,Won Toss:Patriots (deferred),Roof:outdoors,Surface:fieldturf,"Attendance:66,829","Weather:65 degrees, relative humidity 0%, wind...",Vegas Line:New England Patriots -7.0,Over/Under:52.0:(under),28,21,1-0,...,Cmp-Att-Yd-TD-INT:26-38-351-1-1:25-32-288-4-0,Sacked-Yards:3-21:2-7,Net Pass Yards:330:281,Total Yards:464:361,Fumbles-Lost:0-0:1-0,Turnovers:1:0,Penalties-Yards:8-77:7-64,Third Down Conv.:7-15:7-11,Fourth Down Conv.:2-2:0-0,Time of Possession:32:05:27:55
1,Won Toss:Packers (deferred),Roof:outdoors,Surface:grass,"Attendance:62,442","Weather:72 degrees, relative humidity 39%, win...",Vegas Line:Green Bay Packers -6.5,Over/Under:49.0:(over),23,31,0-1,...,Cmp-Att-Yd-TD-INT:18-23-189-3-0:18-36-225-1-1,Sacked-Yards:0-0:2-12,Net Pass Yards:189:213,Total Yards:322:402,Fumbles-Lost:0-0:0-0,Turnovers:0:1,Penalties-Yards:10-74:6-64,Third Down Conv.:6-10:11-17,Fourth Down Conv.:1-1:2-3,Time of Possession:28:08:31:52
2,Won Toss:Rams (deferred),Won OT Toss:Rams,Roof:dome,Duration:3:30,"Attendance:51,792",Vegas Line:Seattle Seahawks -4.0,Over/Under:40.5:(over),34,31,1-0,...,Cmp-Att-Yd-TD-INT:32-41-251-1-1:18-27-297-1-0,Sacked-Yards:6-32:2-21,Net Pass Yards:219:276,Total Yards:343:352,Fumbles-Lost:0-0:3-3,Turnovers:1:3,Penalties-Yards:7-46:4-30,Third Down Conv.:8-19:6-11,Fourth Down Conv.:1-2:0-0,Time of Possession:37:28:28:32
3,Won Toss:Texans (deferred),Roof:retractable roof (closed),Surface:grass,"Attendance:71,776",,Vegas Line:Houston Texans -1.0,Over/Under:41.0:(over),20,27,0-1,...,Cmp-Att-Yd-TD-INT:22-33-243-3-0:26-47-334-2-1,Sacked-Yards:2-10:5-36,Net Pass Yards:233:298,Total Yards:330:396,Fumbles-Lost:1-0:2-1,Turnovers:0:2,Penalties-Yards:2-25:6-39,Third Down Conv.:3-13:3-14,Fourth Down Conv.:0-0:0-1,Time of Possession:35:19:24:41
4,Won Toss:Dolphins (deferred),Roof:outdoors,Surface:grass,"Attendance:76,512","Weather:69 degrees, relative humidity 58%, win...",Vegas Line:Miami Dolphins -3.0,Over/Under:43.5:(under),10,17,0-1,...,Cmp-Att-Yd-TD-INT:22-34-226-1-0:21-31-196-1-2,Sacked-Yards:3-44:1-8,Net Pass Yards:182:188,Total Yards:256:349,Fumbles-Lost:2-1:1-0,Turnovers:1:2,Penalties-Yards:6-39:11-88,Third Down Conv.:5-12:6-14,Fourth Down Conv.:0-1:0-1,Time of Possession:22:06:37:54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,Won Toss:Colts,Roof:outdoors,Surface:fieldturf,"Attendance:68,756","Weather:57 degrees, relative humidity 100%, wi...",Vegas Line:New England Patriots -7.0,Over/Under:51.0:(over),43,22,13-4,...,Cmp-Att-Yd-TD-INT:20-41-331-2-4:13-25-198-0-0,Sacked-Yards:3-14:2-13,Net Pass Yards:317:185,Total Yards:386:419,Fumbles-Lost:0-0:3-0,Turnovers:4:0,Penalties-Yards:4-42:4-35,Third Down Conv.:6-15:11-18,Fourth Down Conv.:1-1:0-1,Time of Possession:25:00:35:00
764,Won Toss:Panthers (deferred),Roof:outdoors,Surface:grass,"Attendance:73,784","Weather:54 degrees, relative humidity 42%, win...",Vegas Line:San Francisco 49ers -1.0,Over/Under:41.0:(under),10,23,12-5,...,Cmp-Att-Yd-TD-INT:15-30-196-1-0:16-25-267-1-2,Sacked-Yards:1-7:5-35,Net Pass Yards:189:232,Total Yards:315:325,Fumbles-Lost:1-0:0-0,Turnovers:0:2,Penalties-Yards:5-40:8-73,Third Down Conv.:6-14:6-10,Fourth Down Conv.:0-1:0-1,Time of Possession:29:55:30:05
765,Won Toss:Broncos (deferred),Roof:outdoors,Surface:grass,"Attendance:76,969","Weather:41 degrees, relative humidity 20%, win...",Vegas Line:Denver Broncos -7.5,Over/Under:55.0:(under),24,17,14-3,...,Cmp-Att-Yd-TD-INT:18-27-217-2-0:25-36-230-2-1,Sacked-Yards:4-23:0-0,Net Pass Yards:194:230,Total Yards:259:363,Fumbles-Lost:0-0:1-1,Turnovers:0:2,Penalties-Yards:8-63:6-50,Third Down Conv.:4-12:9-13,Fourth Down Conv.:1-1:0-0,Time of Possession:24:33:35:27
766,Won Toss:Broncos (deferred),Roof:outdoors,Surface:grass,"Attendance:77,110","Weather:63 degrees, relative humidity 10%, win...",Vegas Line:Denver Broncos -5.0,Over/Under:56.5:(under),26,16,15-3,...,Cmp-Att-Yd-TD-INT:24-38-277-1-0:32-43-400-2-0,Sacked-Yards:2-21:0-0,Net Pass Yards:256:400,Total Yards:320:507,Fumbles-Lost:0-0:0-0,Turnovers:0:0,Penalties-Yards:2-15:4-34,Third Down Conv.:6-12:7-13,Fourth Down Conv.:1-2:1-1,Time of Possession:24:16:35:44


## Save CSV to desktop

In [8]:
nflData = df.to_csv(r'Desktop\nflData.csv')