In [9]:
import os
import random
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Directory that holds every CSV this notebook reads and writes.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine where this folder exists.
base_dir = r"C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files"

# Load the existing team data CSV (one row per team and season, with the
# "Team", "Year" and "Team ID" columns used by the scraping loop below).
team_data_file = os.path.join(base_dir, "cfbstats_teams_2022_2024.csv")
if not os.path.exists(team_data_file):
    # Raise instead of calling exit(): in a notebook, exit() asks the kernel to
    # shut down (discarding all state) rather than just failing this cell.
    raise FileNotFoundError(
        "Team data file not found! Make sure the first scraping step is completed. "
        f"(expected: {team_data_file})"
    )

df_teams = pd.read_csv(team_data_file)

# Browser User-Agent strings; one is picked at random for every request.
USER_AGENTS = [
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0.0.0 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
]

# Referer header values; one is picked at random for every request.
REFERERS = [
    "https://www.google.com/", "https://www.bing.com/",
    "https://www.yahoo.com/", "https://duckduckgo.com/",
]

# Rewrites heights such as "6-2" as 6'2" so the value stays text instead of being read as a date when the CSV is opened
def convert_height_to_string(height):
    """Turn a "feet-inches" height such as "6-2" into the text 6'2".

    The input is returned unchanged when it does not consist of exactly two
    dash-separated parts, or when either part is not an integer.
    """
    pieces = height.split('-')
    if len(pieces) != 2:
        return height

    try:
        feet, inches = (int(piece) for piece in pieces)
    except ValueError:
        # At least one side is not a whole number; leave the text alone.
        return height

    return f"{feet}'{inches}\""

# Serializes appends to the per-year roster CSVs. scrape_roster runs on several
# worker threads, and the "does the file exist yet?" check must happen together
# with the write, otherwise two threads can both emit a header row or
# interleave their rows in the same file.
_ROSTER_CSV_LOCK = threading.Lock()


# Scrapes team rosters
def scrape_roster(team_name, year, team_id):
    """Fetch one team's roster page and append its players to that year's CSV.

    Downloads ``https://cfbstats.com/{year}/team/{team_id}/roster.html`` and
    reads the first ``<table>`` on the page. For every data row with at least
    six cells, the third, fifth and sixth cells are taken as position, height
    and weight. Rows without a height or weight are dropped. The remaining rows
    are sorted by team and position and appended to
    ``cfbstats_team_rosters_{year}.csv`` inside ``base_dir``.

    Args:
        team_name: Team name written to the "Team" column and used in log output.
        year: Season; selects both the URL and the output file.
        team_id: Team identifier used in the URL and written to "Team ID".

    Returns:
        None. A failed request or a page without a table is reported with
        ``print`` and the function returns without writing anything.

    Note:
        The CSV is opened in append mode, so scraping the same team and year a
        second time adds the same rows again.
    """
    team_url = f"https://cfbstats.com/{year}/team/{team_id}/roster.html"
    print(f"Scraping roster for {team_name} ({year}) - {team_url}")

    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": random.choice(REFERERS),
    }

    try:
        response = requests.get(team_url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch {team_url} for {team_name} ({year}) (Error: {e})")
        return

    soup = BeautifulSoup(response.content, "html.parser")

    # Locate the roster table (Adjust selector if needed)
    table = soup.find("table")
    if table is None:
        print(f"No roster table found for {team_name} ({year})")
        return

    roster_data = []
    for row in table.find_all("tr")[1:]:  # Skip header row
        cols = row.find_all("td")
        if len(cols) < 6:  # Skip malformed rows
            continue

        pos = cols[2].text.strip()
        ht = convert_height_to_string(cols[4].text.strip())
        wt = cols[5].text.strip()

        # Skip players with missing height or weight. Both values are already
        # stripped, so a whitespace-only cell shows up here as "".
        if not ht or not wt or ht == "-" or wt == "-":
            continue

        roster_data.append({
            "Team": team_name,
            "Year": year,
            "Team ID": team_id,
            "Pos": pos,
            "Ht": ht,
            "Wt": wt,
        })

    if not roster_data:
        return

    # Sort by Team and then by Pos
    df = pd.DataFrame(roster_data).sort_values(by=["Team", "Pos"])

    # Save to year-specific CSV; the header is written only by whichever call
    # creates the file, so the existence check and the write share one lock.
    output_file = os.path.join(base_dir, f"cfbstats_team_rosters_{year}.csv")
    with _ROSTER_CSV_LOCK:
        df.to_csv(output_file, mode="a", header=not os.path.exists(output_file), index=False)
    print(f"Saved {len(roster_data)} players for {team_name} ({year}) in {output_file}")

# Scrape every (team, season) row of df_teams on a small pool of worker threads
with ThreadPoolExecutor(max_workers=3) as executor:
    # Map each submitted job to its team name so failures can be attributed
    futures = {
        executor.submit(scrape_roster, rec["Team"], rec["Year"], rec["Team ID"]): rec["Team"]
        for _, rec in df_teams.iterrows()
    }

    # Surface anything a job raised, in completion order, without stopping the rest
    for finished in as_completed(futures):
        try:
            finished.result()
        except Exception as err:
            print(f"Error scraping {futures[finished]}: {err}")

print("\nRoster scraping complete! Data saved to individual year files.")


Scraping roster for Air Force (2024) - https://cfbstats.com/2024/team/721/roster.html
Scraping roster for Air Force (2023) - https://cfbstats.com/2023/team/721/roster.html
Scraping roster for Air Force (2022) - https://cfbstats.com/2022/team/721/roster.html
Saved 121 players for Air Force (2023) in C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files\cfbstats_team_rosters_2023.csv
Scraping roster for Akron (2024) - https://cfbstats.com/2024/team/5/roster.html
Saved 115 players for Air Force (2022) in C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files\cfbstats_team_rosters_2022.csv
Scraping roster for Akron (2023) - https://cfbstats.com/2023/team/5/roster.html
Saved 110 players for Air Force (2024) in C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files\cfbstats_team_rosters_2024.csv
Scraping roster for Akron (2022) - https:/

In [11]:
# Verify each team has an entry for each season and contains data for each season
# Load the existing team data CSV
team_data_file = os.path.join(base_dir, "cfbstats_teams_2022_2024.csv")
if not os.path.exists(team_data_file):
    # Raise instead of calling exit(): in a notebook, exit() asks the kernel to
    # shut down (discarding all state) rather than just failing this cell.
    raise FileNotFoundError(
        "Team data file not found! Make sure the first scraping step is completed. "
        f"(expected: {team_data_file})"
    )

df_teams = pd.read_csv(team_data_file)

# List of years to check
years = [2022, 2023, 2024]

# Function to check if data exists for each team and season
def check_data_exists(year, team_name):
    """Report whether the roster CSV for ``year`` has rows for ``team_name``.

    Returns True when ``cfbstats_team_rosters_{year}.csv`` exists in
    ``base_dir`` and its 'Team' column contains ``team_name``. Otherwise
    prints which of the two checks failed and returns False.
    """
    file_path = os.path.join(base_dir, f"cfbstats_team_rosters_{year}.csv")

    if os.path.exists(file_path):
        teams_in_file = pd.read_csv(file_path)['Team'].values
        if team_name in teams_in_file:
            return True
        problem = f"Team {team_name} not found for {year} in {file_path}"
    else:
        problem = f"File for {team_name} ({year}) not found: {file_path}"

    print(problem)
    return False

# Every (team, season) pair listed in the team data file
team_seasons_data = [(rec["Team"], rec["Year"]) for _, rec in df_teams.iterrows()]

# Pairs for which the matching year file is absent or has no rows for the team
missing_data = [
    (team, season)
    for team, season in team_seasons_data
    if not check_data_exists(season, team)
]

# Distinct teams and distinct seasons seen in the team data file
unique_teams = {team for team, _ in team_seasons_data}
unique_seasons = {season for _, season in team_seasons_data}

print(f"Unique Teams: {len(unique_teams)}")
print(f"Unique Seasons: {len(unique_seasons)}")

# Report the gaps, if any
if not missing_data:
    print("All teams and years are accounted for!")
else:
    print("\nMissing data for the following teams and years:")
    for team, year in missing_data:
        print(f"{team} - {year}")

# Show how many roster rows each team has in every year's file
print("\nRows per team in each season:")
for year in years:
    file_path = os.path.join(base_dir, f"cfbstats_team_rosters_{year}.csv")
    if not os.path.exists(file_path):
        print(f"File for {year} not found.")
        continue

    df = pd.read_csv(file_path)
    team_counts = df['Team'].value_counts()
    print(f"\n{year} Data:")
    print(team_counts)

print("\nCheck complete.")


Unique Teams: 127
Unique Seasons: 3
All teams and years are accounted for!

Rows per team in each season:

2022 Data:
Team
Navy              190
Army              170
Oklahoma State    155
Nebraska          151
Louisiana         147
                 ... 
Boston College    110
Minnesota         110
Northwestern      109
Rice              108
Kent State        101
Name: count, Length: 127, dtype: int64

2023 Data:
Team
Army                179
Navy                166
Michigan            143
Alabama             138
Nebraska            136
                   ... 
Western Michigan    104
Nevada-Las Vegas    103
Northwestern        103
Old Dominion         99
Kent State           92
Name: count, Length: 127, dtype: int64

2024 Data:
Team
Navy              181
Army              179
Nebraska          150
Texas A&M         143
Oklahoma State    139
                 ... 
Kent State        107
Akron             107
Bowling Green     106
Syracuse          104
San Jose State    102
Name: count, Leng