In [1]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv
import unicodedata
from urllib.parse import urljoin



# Site root; the roster URLs and the player profile/image URLs in later cells are built from it.
base_url = "https://pennathletics.com"
# Local folder that receives the downloaded player images.
output_dir = "penn_football_images_fullsize"
# Create the folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

In [3]:
# base_url and output_dir are assigned in the first cell, next to the imports
# this cell also relies on, so they are not re-assigned here. Only the
# idempotent directory creation is kept, so the download loop in this cell
# always has a target folder even if it was removed since the first cell ran.
os.makedirs(output_dir, exist_ok=True)

def clean_name(name):
    """Turn a name into a lowercase, ASCII-only, hyphen-separated slug.

    Surrounding whitespace is dropped and every internal run of whitespace
    becomes a single hyphen, so padding around scraped text cannot leak into
    the result as stray leading, trailing, or doubled hyphens. Accented
    letters are reduced to their base ASCII letter, and every character other
    than word characters and hyphens is removed.

    Args:
        name: Raw name text.

    Returns:
        The slug, e.g. "jose-nunez" for "  José Núñez ". The result is an
        empty string when the input contains no usable characters.
    """
    # split() with no arguments trims both ends and collapses whitespace runs.
    name = "-".join(name.lower().split())
    # NFKD separates base letters from their combining accents; encoding to
    # ASCII with errors="ignore" then drops the accents and anything else
    # that has no ASCII form.
    name = unicodedata.normalize("NFKD", name).encode("ascii", "ignore").decode("utf-8")
    return re.sub(r"[^\w\-]", "", name)

# Download each football player's profile photo for the 2019-2025 rosters into
# output_dir, saved as "<first>-<last>.jpg".
#
# NOTE: the file name is built from the player's name only, so a player listed
# in several seasons is fetched again for each year and the later download
# replaces the earlier file; two different players with the same name would
# also end up sharing one file.

# Seconds to wait on any single HTTP request, so one stalled connection cannot
# hang the whole scrape.
REQUEST_TIMEOUT = 30

for year in range(2019, 2026):
    print(f"üìÖ Scraping roster for {year}")
    roster_url = f"{base_url}/sports/football/roster/{year}?view=3"
    try:
        r = requests.get(roster_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # A network failure on one season should not abort the other seasons.
        print(f"Could not load year {year}: {e}")
        continue
    if r.status_code != 200:
        print(f"‚ö†Ô∏è Could not load year {year}")
        continue

    soup = BeautifulSoup(r.text, "html.parser")
    player_cards = soup.select("li.sidearm-list-card-item")

    for card in player_cards:
        first = card.select_one("span.sidearm-roster-player-first-name")
        last = card.select_one("span.sidearm-roster-player-last-name")
        link = card.select_one("a.sidearm-roster-player-name")

        # Cards missing a name part or a profile link cannot be processed.
        if not (first and last and link):
            continue

        first_name = clean_name(first.text)
        last_name = clean_name(last.text)
        filename = f"{first_name}-{last_name}.jpg"
        profile_url = urljoin(base_url, link["href"])

        try:
            profile_res = requests.get(profile_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"    Could not load profile for {filename}: {e}")
            continue
        if profile_res.status_code != 200:
            continue

        profile_soup = BeautifulSoup(profile_res.text, "html.parser")
        img_tag = profile_soup.select_one("div.sidearm-roster-player-image img")

        if not img_tag or not img_tag.get("src"):
            continue

        img_url = urljoin(base_url, img_tag["src"].split("?")[0])  # Remove any ?width= params

        try:
            img_res = requests.get(img_url, timeout=REQUEST_TIMEOUT)
            # Fail on 4xx/5xx instead of saving an error page under a .jpg name.
            img_res.raise_for_status()
            img_data = img_res.content
            with open(os.path.join(output_dir, filename), "wb") as f:
                f.write(img_data)
        except Exception as e:
            print(f"    ‚ùå Error saving {filename}: {e}")


üìÖ Scraping roster for 2019
üìÖ Scraping roster for 2020
üìÖ Scraping roster for 2021
üìÖ Scraping roster for 2022
üìÖ Scraping roster for 2023
üìÖ Scraping roster for 2024
üìÖ Scraping roster for 2025


In [4]:
# Build the normalized form of a name that is used inside dictionary keys
def clean_key(name):
    """Return ``name`` trimmed, lowercased and reduced to plain ASCII.

    Accents are dropped via NFKD decomposition followed by an ASCII encode
    that ignores anything unrepresentable. Afterwards only word characters,
    whitespace and hyphens are kept; internal whitespace is left as it is.
    """
    lowered = name.strip().lower()
    decomposed = unicodedata.normalize("NFKD", lowered)
    ascii_only = decomposed.encode("ascii", "ignore").decode("utf-8")
    return re.sub(r"[^\w\s\-]", "", ascii_only)

# Store player info in a dictionary keyed by the cleaned (first, last) name pair
players = {}

# Seconds to wait on any single HTTP request, so one stalled connection cannot
# hang the whole scrape.
REQUEST_TIMEOUT = 30

for year in range(2019, 2026):
    print(f"Scraping {year}...")
    url = f"{base_url}/sports/football/roster/{year}?view=3"
    try:
        res = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # A network failure on one season should not abort the other seasons.
        print(f"Skipping {year} - request failed: {e}")
        continue
    if res.status_code != 200:
        print(f"‚ö†Ô∏è  Skipping {year} - Page not found.")
        continue

    soup = BeautifulSoup(res.text, "html.parser")
    cards = soup.select("li.sidearm-list-card-item")

    for card in cards:
        first = card.select_one("span.sidearm-roster-player-first-name")
        last = card.select_one("span.sidearm-roster-player-last-name")
        if not first or not last:
            continue

        first_name = first.text.strip()
        last_name = last.text.strip()
        key = (clean_key(first_name), clean_key(last_name))

        position = card.select_one("div.sidearm-roster-player-position-short")
        pos_text = position.text.strip() if position else ""

        jersey = card.select_one("div.sidearm-roster-player-jersey span")
        jersey_text = jersey.text.strip() if jersey else ""

        # Skip players without jersey numbers (not athletes)
        if not jersey_text:
            continue

        acad_year = card.select_one("span.sidearm-roster-player-academic-year")
        year_text = acad_year.text.strip() if acad_year else ""

        hometown = card.select_one("span.sidearm-roster-player-hometown")
        hometown_text = hometown.text.strip() if hometown else ""

        # Seasons are visited in ascending order and each assignment replaces
        # any earlier entry for the same name, so the stored row reflects the
        # latest roster the player appears on.
        # NOTE(review): grad_year is the roster year minus one and does not use
        # the academic year text, so everyone on the same final roster gets the
        # same value - confirm this offset is intended.
        players[key] = {
            "First": first_name,
            "Last": last_name,
            "Position": pos_text,
            "Year": year_text,
            "Number": jersey_text,
            "Hometown": hometown_text,
            "grad_year": year - 1  # Roster year minus one
        }

# Write one CSV row per player collected in `players`
output_path = "penn_football_roster_2019_2025.csv"
# The encoding is pinned to UTF-8 so names with accented characters are written
# the same way on every platform instead of depending on the locale default.
# newline="" lets the csv module control line endings itself.
with open(output_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["First", "Last", "Position", "Year", "Number", "Hometown", "grad_year"])
    writer.writeheader()
    writer.writerows(players.values())

print(f"‚úÖ CSV saved to {output_path} with {len(players)} players.")

# Optional: Print some stats about graduation years
grad_year_counts = {}
for player in players.values():
    grad_year = player["grad_year"]
    grad_year_counts[grad_year] = grad_year_counts.get(grad_year, 0) + 1

print("\nGraduation year distribution:")
# A dedicated loop name is used so the roster loop's `year` is not overwritten.
for grad_year in sorted(grad_year_counts):
    print(f"  {grad_year}: {grad_year_counts[grad_year]} players")

Scraping 2019...
Scraping 2020...
Scraping 2021...
Scraping 2022...
Scraping 2023...
Scraping 2024...
Scraping 2025...
‚úÖ CSV saved to penn_football_roster_2019_2025.csv with 258 players.

Graduation year distribution:
  2018: 32 players
  2019: 22 players
  2020: 29 players
  2021: 29 players
  2022: 37 players
  2023: 30 players
  2024: 79 players
