In [1]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv
import unicodedata
from urllib.parse import urljoin



base_url = "https://pennathletics.com"
output_dir = "penn_football_images_fullsize"
os.makedirs(output_dir, exist_ok=True)

In [3]:
base_url = "https://pennathletics.com"
output_dir = "penn_football_images_fullsize"
os.makedirs(output_dir, exist_ok=True)

def clean_name(name):
    name = name.lower().replace(" ", "-")
    name = unicodedata.normalize("NFKD", name).encode("ascii", "ignore").decode("utf-8")
    name = re.sub(r"[^\w\-]", "", name)
    return name

for year in range(2019, 2026):
    print(f"📅 Scraping roster for {year}")
    roster_url = f"{base_url}/sports/football/roster/{year}?view=3"
    r = requests.get(roster_url)
    if r.status_code != 200:
        print(f"⚠️ Could not load year {year}")
        continue

    soup = BeautifulSoup(r.text, "html.parser")
    player_cards = soup.select("li.sidearm-list-card-item")

    for card in player_cards:
        first = card.select_one("span.sidearm-roster-player-first-name")
        last = card.select_one("span.sidearm-roster-player-last-name")
        link = card.select_one("a.sidearm-roster-player-name")

        if not (first and last and link):
            continue

        first_name = clean_name(first.text)
        last_name = clean_name(last.text)
        filename = f"{first_name}-{last_name}.jpg"
        profile_url = urljoin(base_url, link["href"])

        profile_res = requests.get(profile_url)
        if profile_res.status_code != 200:
            continue

        profile_soup = BeautifulSoup(profile_res.text, "html.parser")
        img_tag = profile_soup.select_one("div.sidearm-roster-player-image img")

        if not img_tag or not img_tag.get("src"):
            continue

        img_url = urljoin(base_url, img_tag["src"].split("?")[0])  # Remove any ?width= params

        try:
            img_data = requests.get(img_url).content
            with open(os.path.join(output_dir, filename), "wb") as f:
                f.write(img_data)
        except Exception as e:
            print(f"    ❌ Error saving {filename}: {e}")


📅 Scraping roster for 2019
📅 Scraping roster for 2020
📅 Scraping roster for 2021
📅 Scraping roster for 2022
📅 Scraping roster for 2023
📅 Scraping roster for 2024
📅 Scraping roster for 2025


In [5]:
# Normalize names for dictionary keys
def clean_key(name):
    name = name.strip().lower()
    name = unicodedata.normalize("NFKD", name).encode("ascii", "ignore").decode("utf-8")
    return re.sub(r"[^\w\s\-]", "", name)

# Remove suffixes from names
def remove_suffix(name):
    # Common suffixes to remove
    suffixes = ['jr', 'jr.', 'sr', 'sr.', 'ii', 'iii', 'iv', 'v', 'vi']
    name_parts = name.strip().split()
    
    # Check if last part is a suffix
    if len(name_parts) > 1 and name_parts[-1].lower().rstrip('.') in suffixes:
        return ' '.join(name_parts[:-1])
    return name

# Calculate graduation year based on academic year status
def calculate_grad_year(academic_year, roster_year):
    academic_year = academic_year.lower().strip()
    
    if 'freshman' in academic_year or 'fr' == academic_year:
        return roster_year + 3
    elif 'sophomore' in academic_year or 'so' == academic_year:
        return roster_year + 2
    elif 'junior' in academic_year or 'jr' == academic_year:
        return roster_year + 1
    elif 'senior' in academic_year or 'sr' == academic_year:
        return roster_year
    else:
        # Default case if we can't determine academic year
        return roster_year

# Standardize status names
def standardize_status(academic_year):
    academic_year = academic_year.strip()  # Don't convert to lowercase yet
    
    if academic_year in ['Fr.', 'Fr', 'Freshman']:
        return "Freshman"
    elif academic_year in ['So.', 'So', 'Sophomore']:
        return "Sophomore"
    elif academic_year in ['Jr.', 'Jr', 'Junior']:
        return "Junior"
    elif academic_year in ['Sr.', 'Sr', 'Senior']:
        return "Senior"
    else:
        # Return original if we can't determine
        return academic_year if academic_year else ""

# Store player info in a dictionary keyed by cleaned first+last name
players = {}

for year in range(2019, 2026):
    print(f"Scraping {year}...")
    url = f"{base_url}/sports/football/roster/{year}?view=3"
    res = requests.get(url)
    if res.status_code != 200:
        print(f"⚠️  Skipping {year} - Page not found.")
        continue
    
    soup = BeautifulSoup(res.text, "html.parser")
    cards = soup.select("li.sidearm-list-card-item")
    
    for card in cards:
        first = card.select_one("span.sidearm-roster-player-first-name")
        last = card.select_one("span.sidearm-roster-player-last-name")
        if not first or not last:
            continue
            
        first_name = first.text.strip()
        last_name = last.text.strip()
        key = (clean_key(first_name), clean_key(last_name))
        
        position = card.select_one("div.sidearm-roster-player-position-short")
        pos_text = position.text.strip() if position else ""
        
        jersey = card.select_one("div.sidearm-roster-player-jersey span")
        jersey_text = jersey.text.strip() if jersey else ""
        
        # Skip players without jersey numbers (not athletes)
        if not jersey_text:
            continue
        
        acad_year = card.select_one("span.sidearm-roster-player-academic-year")
        year_text = acad_year.text.strip() if acad_year else ""
        
        hometown = card.select_one("span.sidearm-roster-player-hometown")
        hometown_text = hometown.text.strip() if hometown else ""
        
        # Overwrite previous entry if one exists, updating grad_year based on academic status
        players[key] = {
            "first": first_name.lower(),
            "last": last_name.lower(),
            "position": pos_text,
            "year": year_text,
            "number": jersey_text,
            "hometown": hometown_text,
            "status": standardize_status(year_text),
            "grad_year": calculate_grad_year(year_text, year),
            "name_last_first_initial": f"{last_name.lower()}, {first_name.lower()[0]}" if first_name else "",
            "name_last_first": f"{last_name.lower()}, {first_name.lower()}" if first_name else "",
            "name": f"{first_name.lower()} {last_name.lower()}"
        }

# Write to CSV
output_path = "roster_19_25.csv"
with open(output_path, "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["first", "last", "position", "year", "number", "hometown", "status", "grad_year", "name_last_first_initial", "name_last_first", "name"])
    writer.writeheader()
    for player in players.values():
        writer.writerow(player)

print(f"✅ CSV saved to {output_path} with {len(players)} players.")

# Optional: Print some stats about graduation years
grad_year_counts = {}
for player in players.values():
    grad_year = player["grad_year"]
    grad_year_counts[grad_year] = grad_year_counts.get(grad_year, 0) + 1

print("\nGraduation year distribution:")
for year in sorted(grad_year_counts.keys()):
    print(f"  {year}: {grad_year_counts[year]} players")

Scraping 2019...
Scraping 2020...
Scraping 2021...
Scraping 2022...
Scraping 2023...
Scraping 2024...
Scraping 2025...
✅ CSV saved to penn_football_roster_2019_2025.csv with 258 players.

Graduation year distribution:
  2019: 32 players
  2020: 22 players
  2021: 29 players
  2022: 29 players
  2023: 37 players
  2024: 30 players
  2025: 79 players
