In [None]:
"""
Fetch NFL player names and profile links from pro-football-reference.com,
cache them in text files, and explore per-player stat tables.
"""

from bs4 import BeautifulSoup as soup
import requests
import time

site_url = 'https://www.pro-football-reference.com'
base_url = 'https://www.pro-football-reference.com/players/'

def get_html(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang the notebook.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
    return response.text

def extract_names_links(html, names, links):
    """Collect bold player links from one index page.

    Looks inside the ``<div id="div_players">`` element of ``html`` and, for
    every anchor with an ``href`` that is rendered in bold, appends the anchor
    text to ``names`` and the ``href`` to ``links``. Both lists are mutated in
    place, so they stay index-aligned.

    Args:
        html: Raw HTML of the page to parse.
        names: List that receives the player names.
        links: List that receives the matching ``href`` values.

    Returns:
        The ``names`` list (``links`` is only updated in place). If the
        container div is absent, ``names`` is returned unchanged.
    """
    parsed_page = soup(html, 'html.parser')
    container = parsed_page.find('div', {'id': 'div_players'})
    if not container:
        return names

    for anchor in container.find_all('a', href=True):
        player_name = anchor.text.strip()
        target = anchor['href']
        # Skip anchors that have no visible text or an empty href.
        if not (player_name and target):
            continue

        # Keep only bold entries: either the anchor wraps a <b>/<strong>,
        # or the anchor itself sits directly inside one.
        # NOTE(review): presumably bold marks a subset of players (e.g. active
        # ones) on the site — confirm against the page.
        wraps_bold = anchor.find(['b', 'strong'])
        inside_bold = anchor.parent and anchor.parent.name in ('b', 'strong')
        if not (wraps_bold or inside_bold):
            continue

        names.append(player_name)
        links.append(target)
    return names

In [None]:
# Cache files for the scraped player index.
# NOTE(review): these are absolute paths on one machine; change DATA_DIR when
# running the notebook elsewhere.
DATA_DIR = r'C:\Users\bengu\Documents\Sports Analysis Project\clairvoyent-raven-sports-analysis\data'
NAMES_PATH = DATA_DIR + r'\nfl_players.txt'
LINKS_PATH = DATA_DIR + r'\nfl_links.txt'


def scrape_names_links(names_path=NAMES_PATH, links_path=LINKS_PATH, delay_seconds=7):
    """Scrape the A-Z player index pages and write the results to disk.

    For each capital letter the page ``{base_url}{letter}/`` is downloaded and
    passed to ``extract_names_links``. The collected names and links are then
    written one per line to two text files, in matching order.

    Args:
        names_path: Destination file for player names.
        links_path: Destination file for the corresponding links.
        delay_seconds: Pause between consecutive requests, to be polite and
            avoid hammering the server.

    Returns:
        None. The results are only written to ``names_path`` / ``links_path``.
    """
    names = []
    links = []

    for position, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
        # Pause between requests only; no pointless wait after the last page.
        if position:
            time.sleep(delay_seconds)
        url = f"{base_url}{letter}/"
        print(f"Fetching links from: {url}")
        extract_names_links(get_html(url), names, links)

    # Explicit UTF-8 so the files do not depend on the platform's default
    # encoding (player names may contain non-ASCII characters).
    with open(names_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{name}\n" for name in names)

    with open(links_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{link}\n" for link in links)


# Uncomment to (re)build the cache files; this issues 26 requests with a pause
# between each. Re-run it once if the cache files were written before the
# encoding was made explicit.
# scrape_names_links()

with open(NAMES_PATH, 'r', encoding='utf-8') as f:
    names = [line.strip() for line in f]

with open(LINKS_PATH, 'r', encoding='utf-8') as f:
    links = [line.strip() for line in f]



In [3]:
def find_link(name):
    """Return the link stored for ``name``, or ``None`` if it is unknown.

    The comparison against the module-level ``names`` list is
    case-insensitive. Only the first matching entry is used, so players who
    share a name are not disambiguated. The link is read from the same index
    of the module-level ``links`` list.
    """
    matching_positions = (
        position
        for position, candidate in enumerate(names)
        if candidate.lower() == name.lower()
    )
    first_match = next(matching_positions, None)
    if first_match is None:
        return None
    return links[first_match]

In [4]:
# names and links are parallel lists; check that before indexing into them.
assert len(names) == len(links), "names and links must have the same length"

# One sample player per stat table listed in table_names below.
# NOTE(review): rb and defender are picked by position in the cached list, so
# who they are depends on scrape order — presumably a running back and a
# defensive player; verify after re-scraping.
rb = names[4]
kicker = "Cameron Dicker"
qb = "Josh Allen"
defender = names[7]

sample_players = [rb, kicker, qb, defender]
assorted_links = [find_link(player) for player in sample_players]

# find_link returns None for unknown names; fail here with a clear message
# instead of later with a TypeError when the link is joined onto site_url.
missing = [player for player, link in zip(sample_players, assorted_links) if link is None]
assert not missing, f"No profile link found for: {missing}"

# Table ids, in the same order as assorted_links.
table_names = ["rushing_and_receiving", "kicking", "passing", "defense"]

In [5]:
# For each sample player's stat table, map every cell's "data-stat" attribute
# to the visible column header found at the same position in the header row.
# Result shape: {table_name: {data_stat: header_text}}.
data_dictionary = {}

for table_name, link in zip(table_names, assorted_links):
    page = get_html(site_url + link)
    page_soup = soup(page, 'html.parser')

    table = page_soup.find("table", id=table_name)
    if table is None:
        # Without this check the next line fails with an opaque
        # "'NoneType' object has no attribute 'find_all'".
        raise ValueError(f"Table {table_name!r} not found at {site_url + link}")

    # The column-header row is the first <tr> for "passing" and the second for
    # every other table.
    # NOTE(review): presumably the other tables carry an extra grouping row
    # above the column names — confirm against the page markup.
    all_rows = table.find_all("tr")
    header_row = all_rows[0] if table_name == "passing" else all_rows[1]
    table_headers = [header_cell.text for header_cell in header_row.find_all("th")]

    tbody = table.find("tbody")
    aliases = {}
    for row in tbody.find_all("tr"):
        # The row's <th> lines up with the first header; rows without one
        # contribute no entry for it.
        th = row.find("th")
        if th is not None and th.get("data-stat") not in aliases:
            aliases[th.get("data-stat")] = table_headers[0]

        # Each <td> lines up with the header one position further right.
        for column, td in enumerate(row.find_all("td"), start=1):
            if td.get("data-stat") not in aliases:
                aliases[td.get("data-stat")] = table_headers[column]
    data_dictionary[table_name] = aliases


In [13]:
# Build {season: {data-stat: cell text}} from the kicking table
# (table_names[1]) on the page behind assorted_links[1].
cameron_dicker = {}
page = get_html(site_url + assorted_links[1])
page_soup = soup(page, 'html.parser')
table_name = table_names[1]
table = page_soup.find("table", id=table_name)

# Column-header row: first <tr> for "passing", second for any other table.
if table_name != "passing":
    tr = table.find_all("tr")[1]
else:
    tr = table.find_all("tr")[0]
ths = tr.find_all("th")
tbody = table.find("tbody")
table_headers = [header_cell.text for header_cell in ths]

trs = tbody.find_all("tr")
for tr in trs:
    # The row's <th> holds the season label (the year).
    th = tr.find("th")
    season = th.text.strip()

    tds = tr.find_all("td")

    # Every cell is stored under its raw data-stat key; no filtering against
    # data_dictionary[table_name] is applied here.
    cameron_dicker[season] = {td.get("data-stat"): td.text.strip() for td in tds}

print(cameron_dicker)

{'2022': {'age': '22', 'team_name_abbr': 'LAC', 'comp_name_abbr': 'NFL', 'pos': 'K', 'games': '10', 'games_started': '0', 'fga1': '0', 'fgm1': '0', 'fga2': '6', 'fgm2': '6', 'fga3': '7', 'fgm3': '7', 'fga4': '6', 'fgm4': '6', 'fga5': '1', 'fgm5': '0', 'fga': '20', 'fgm': '19', 'fg_long': '48', 'fg_pct': '95.0', 'xpa': '22', 'xpm': '22', 'xp_pct': '100.0', 'kickoff': '50', 'kickoff_yds': '3192', 'kickoff_tb': '42', 'kickoff_tb_pct': '84.0', 'kickoff_yds_avg': '64', 'av': '3', 'awards': ''}, '2023': {'age': '23', 'team_name_abbr': 'LAC', 'comp_name_abbr': 'NFL', 'pos': 'K', 'games': '17', 'games_started': '0', 'fga1': '0', 'fgm1': '0', 'fga2': '7', 'fgm2': '7', 'fga3': '8', 'fgm3': '8', 'fga4': '9', 'fgm4': '9', 'fga5': '9', 'fgm5': '7', 'fga': '33', 'fgm': '31', 'fg_long': '55', 'fg_pct': '93.9', 'xpa': '35', 'xpm': '35', 'xp_pct': '100.0', 'kickoff': '81', 'kickoff_yds': '5168', 'kickoff_tb': '68', 'kickoff_tb_pct': '84.0', 'kickoff_yds_avg': '64', 'av': '5', 'awards': ''}, '2024': {'a

In [14]:
import pprint

# Same per-season stats, one key per line, for easier reading.
formatted_stats = pprint.pformat(cameron_dicker)
print(formatted_stats)

{'2022': {'age': '22',
          'av': '3',
          'awards': '',
          'comp_name_abbr': 'NFL',
          'fg_long': '48',
          'fg_pct': '95.0',
          'fga': '20',
          'fga1': '0',
          'fga2': '6',
          'fga3': '7',
          'fga4': '6',
          'fga5': '1',
          'fgm': '19',
          'fgm1': '0',
          'fgm2': '6',
          'fgm3': '7',
          'fgm4': '6',
          'fgm5': '0',
          'games': '10',
          'games_started': '0',
          'kickoff': '50',
          'kickoff_tb': '42',
          'kickoff_tb_pct': '84.0',
          'kickoff_yds': '3192',
          'kickoff_yds_avg': '64',
          'pos': 'K',
          'team_name_abbr': 'LAC',
          'xp_pct': '100.0',
          'xpa': '22',
          'xpm': '22'},
 '2023': {'age': '23',
          'av': '5',
          'awards': '',
          'comp_name_abbr': 'NFL',
          'fg_long': '55',
          'fg_pct': '93.9',
          'fga': '33',
          'fga1': '0',
          '

In [15]:
import pandas as pd

# Tabular view of the nested dict: one column per season, one row per stat key.
cameron_dicker_frame = pd.DataFrame(cameron_dicker)
cameron_dicker_frame

Unnamed: 0,2022,2023,2024,2025
age,22,23,24,25
team_name_abbr,LAC,LAC,LAC,LAC
comp_name_abbr,NFL,NFL,NFL,NFL
pos,K,K,K,K
games,10,17,17,3
games_started,0,0,0,0
fga1,0,0,1,0
fgm1,0,0,1,0
fga2,6,7,9,2
fgm2,6,7,9,2
