# NFL Historic to Present Matches - Data Extraction

In [2]:
import pandas as pd
import time
import re

## Data Collection and Parsing

### Prepare table of past scores to use

In [64]:
# Scrape the game-scores page; read_html hands back a list with one DataFrame per HTML table.
all_games = pd.read_html(
    "https://www.pro-football-reference.com/boxscores/game-scores.htm"
)
# Keep only the first table found on the page, wrapped as a DataFrame.
all_games_df = pd.DataFrame(data=all_games[0])

In [72]:
# Report the row count of the scraped table; each row is treated as one unique score combination.
print(f"Total number of unique score combinations: {all_games_df.shape[0]}")

Total number of unique score combinations: 1069


In [67]:
# Extract the winning ("PtsW") and losing ("PtsL") point columns as plain Python lists.
# Both lists keep the table's row order, so entry i of each belongs to the same score line.
WinScores, LossScores = (
    all_games_df[column].tolist() for column in ("PtsW", "PtsL")
)

In [69]:
# Preview the first five entries of each score list as a quick sanity check.
print(f"Sample Win Score: {WinScores[:5]}")
print(f"Sample Loss Score: {LossScores[:5]}")

Sample Win Score: [20, 27, 17, 23, 24]
Sample Loss Score: [17, 24, 14, 20, 17]


### Obtain all games for each scoreline

In [None]:
# Column headers used to seed the accumulator frame. The "Unnamed: N" entries are
# the placeholder names pandas assigns to header-less table columns.
default_names = [
    "Rk",
    "Week",
    "Day",
    "Date",
    "Unnamed: 4",
    "Winner/tie",
    "Unnamed: 6",
    "Loser/tie",
    "Unnamed: 8",
    "PtsW",
    "PtsL",
    "YdsW",
    "TOW",
    "YdsL",
    "TOL",
]

# Empty frame carrying only the headers; scraped rows are added to it later.
out_df = pd.DataFrame(columns=default_names)

In [None]:
# Fetch the table of games for every (winning score, losing score) pair.
#
# The per-scoreline frames are collected in a list and concatenated once at the
# end. DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and appending inside the loop re-copied the whole accumulated frame on every
# iteration.
scraped_frames = []
for W, L in zip(WinScores, LossScores):
    # The first table on the result page is taken as the list of matching games.
    temp_df = pd.read_html(f"https://www.pro-football-reference.com/boxscores/game_scores_find.cgi?pts_win={W}&pts_lose={L}")[0]
    scraped_frames.append(temp_df)
    time.sleep(1) # attempt to avoid ip blocking

# out_df goes first so its column order leads the result. Index labels are kept
# as-is (no ignore_index), matching what the former append call produced.
out_df = pd.concat([out_df, *scraped_frames])

In [68]:
# Replace the placeholder header "Unnamed: 6" with a descriptive name.
# NOTE(review): presumably this column holds the site's home/away marker — confirm.
out_df = out_df.rename(mapper={"Unnamed: 6": "AtHome"}, axis="columns")

# Write the combined table to disk; the index is written too (to_csv default).
out_df.to_csv(path_or_buf="Data/historic_match_scores.csv")