In [6]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv
import unicodedata



In [4]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import unicodedata

# Configuration
# Site root: roster page URLs are built from it and relative image paths are
# resolved against it with urljoin.
base_url = "https://pennathletics.com"
# Directory that downloaded player images are written into.
output_dir = "penn_football_images"
# Create the directory up front; exist_ok=True keeps re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

# Clean and normalize names for filenames
def clean_name(name):
    """Turn a display name into a lowercase, ASCII-only, hyphenated slug.

    Args:
        name: The name as a string (e.g. "José  Núñez Jr.").

    Returns:
        The slug, e.g. "jose-nunez-jr". Accents are stripped, each run of
        whitespace becomes a single hyphen, and every character other than
        word characters and hyphens is removed. May be empty.
    """
    # Normalize BEFORE touching whitespace: NFKD splits accented letters into
    # base letter + combining mark and maps compatibility characters such as
    # the non-breaking space to their plain equivalents. Doing this after the
    # space replacement would let those late-appearing spaces be deleted,
    # gluing two words together.
    name = unicodedata.normalize("NFKD", name)
    # Drop everything that has no ASCII representation (the combining marks).
    name = name.encode("ascii", "ignore").decode("ascii")
    # Collapse any run of whitespace into one hyphen so double spaces or tabs
    # do not produce "--" in the filename.
    name = re.sub(r"\s+", "-", name.strip().lower())
    return re.sub(r"[^\w\-]", "", name)

# Loop over years, oldest to newest. Files are always overwritten, so a player
# who appears on several rosters ends up with the photo from the latest year.
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever

for year in range(2019, 2026):
    print(f"Scraping {year}...")
    roster_url = f"{base_url}/sports/football/roster/{year}?view=3"
    try:
        response = requests.get(roster_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # A network failure for one season should not abort the whole run.
        print(f"⚠️ Skipping {year}, request failed: {e}")
        continue
    if response.status_code != 200:
        print(f"⚠️ Skipping {year}, page not found.")
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    players = soup.select("li.sidearm-list-card-item")

    for player in players:
        # Get first + last name; cards missing either one are skipped
        first = player.select_one("span.sidearm-roster-player-first-name")
        last = player.select_one("span.sidearm-roster-player-last-name")
        if not first or not last:
            continue

        first_name = clean_name(first.text.strip())
        last_name = clean_name(last.text.strip())
        filename = f"{first_name}-{last_name}.jpg"
        filepath = os.path.join(output_dir, filename)

        # Get image URL from the inline style's url(...) value
        image_div = player.select_one("div.sidearm-roster-player-image")
        if not image_div:
            continue

        style = image_div.get("style", "")
        match = re.search(r"url\(['\"]?(.*?)['\"]?\)", style)
        if not match:
            continue

        image_path = match.group(1)
        if not image_path:
            # An empty url() would resolve to the site root and save HTML as a .jpg.
            continue
        image_url = urljoin(base_url, image_path)

        # Download the image, overwriting any file saved from an earlier year
        try:
            img_response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
            # Treat 4xx/5xx as failures instead of writing an error page to disk.
            img_response.raise_for_status()
            with open(filepath, "wb") as f:
                f.write(img_response.content)
            print(f"✅ Saved {filename} ({year})")
        except Exception as e:
            # Best effort: report the failure and continue with the next player.
            print(f"❌ Error saving {filename} from {year}: {e}")


Scraping 2019...
✅ Saved vhito-decapria.jpg (2019)
✅ Saved ryan-glover.jpg (2019)
✅ Saved taheeb-sonekan.jpg (2019)
✅ Saved abe-willows.jpg (2019)
✅ Saved mac-humble.jpg (2019)
✅ Saved khalil-weathers.jpg (2019)
✅ Saved ben-gerber.jpg (2019)
✅ Saved eric-markes.jpg (2019)
✅ Saved mohammed-diakite.jpg (2019)
✅ Saved john-quinnelly.jpg (2019)
✅ Saved winston-britton.jpg (2019)
✅ Saved garrett-morris.jpg (2019)
✅ Saved hunter-hayes.jpg (2019)
✅ Saved nick-robinson.jpg (2019)
✅ Saved lewis-gibson.jpg (2019)
✅ Saved shane-sweitzer.jpg (2019)
✅ Saved kolton-huber.jpg (2019)
✅ Saved tyler-herrick.jpg (2019)
✅ Saved sire-woods.jpg (2019)
✅ Saved tanner-long.jpg (2019)
✅ Saved mason-quandt.jpg (2019)
✅ Saved sam-philippi.jpg (2019)
✅ Saved rory-starkey-jr.jpg (2019)
✅ Saved gavin-geib.jpg (2019)
✅ Saved david-perkins.jpg (2019)
✅ Saved malcolm-strickland.jpg (2019)
✅ Saved jason-mccleod-jr.jpg (2019)
✅ Saved jaden-key.jpg (2019)
✅ Saved preston-norwood.jpg (2019)
✅ Saved nance-hill.jpg (2019)
✅

In [7]:
# Normalize names for dictionary keys
def clean_key(name):
    """Build a comparison key from a name.

    The name is trimmed and lowercased, decomposed with NFKD and reduced to
    its ASCII characters, and finally stripped of everything except word
    characters, whitespace and hyphens.
    """
    lowered = name.strip().lower()
    ascii_bytes = unicodedata.normalize("NFKD", lowered).encode("ascii", "ignore")
    ascii_text = ascii_bytes.decode("utf-8")
    disallowed = re.compile(r"[^\w\s\-]")
    return disallowed.sub("", ascii_text)

# Store player info in a dictionary keyed by cleaned (first, last) name.
# Years are visited oldest to newest and entries are overwritten, so a player
# listed on several rosters keeps the values from the most recent one.
players = {}

ROSTER_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever


def _card_text(card, selector):
    """Return the stripped text of the first element matching selector, or "" if none."""
    node = card.select_one(selector)
    return node.text.strip() if node else ""


for year in range(2019, 2026):
    print(f"Scraping {year}...")
    url = f"{base_url}/sports/football/roster/{year}?view=3"
    try:
        res = requests.get(url, timeout=ROSTER_TIMEOUT)
    except requests.RequestException as e:
        # A network failure for one season should not abort the whole run.
        print(f"⚠️  Skipping {year} - Request failed: {e}")
        continue
    if res.status_code != 200:
        print(f"⚠️  Skipping {year} - Page not found.")
        continue

    soup = BeautifulSoup(res.text, "html.parser")
    cards = soup.select("li.sidearm-list-card-item")

    for card in cards:
        first = card.select_one("span.sidearm-roster-player-first-name")
        last = card.select_one("span.sidearm-roster-player-last-name")
        if not first or not last:
            # Without both name parts there is no usable key for this card.
            continue

        first_name = first.text.strip()
        last_name = last.text.strip()
        key = (clean_key(first_name), clean_key(last_name))

        # Overwrite previous entry if one exists; missing fields become "".
        players[key] = {
            "First": first_name,
            "Last": last_name,
            "Position": _card_text(card, "div.sidearm-roster-player-position-short"),
            "Year": _card_text(card, "span.sidearm-roster-player-academic-year"),
            "Number": _card_text(card, "div.sidearm-roster-player-jersey span"),
            "Hometown": _card_text(card, "span.sidearm-roster-player-hometown"),
        }

# Write to CSV: one row per unique player, in the order they were first seen.
output_path = "penn_football_roster_2019_2025.csv"
# encoding is explicit so accented names are written the same way on every
# platform instead of depending on the locale's default codec; newline="" is
# what the csv module expects so it can control line endings itself.
with open(output_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["First", "Last", "Position", "Year", "Number", "Hometown"])
    writer.writeheader()
    writer.writerows(players.values())

print(f"✅ CSV saved to {output_path} with {len(players)} players.")

Scraping 2019...
Scraping 2020...
Scraping 2021...
Scraping 2022...
Scraping 2023...
Scraping 2024...
Scraping 2025...
✅ CSV saved to penn_football_roster_2019_2025.csv with 317 players.
