In [10]:
import requests
import pandas as pd
import random
import time
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import os

# Define output directory (machine-specific absolute path; adjust when running elsewhere)
base_dir = r"C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files"

# Load the existing team data CSV for list of teams, IDs, etc
team_data_file = os.path.join(base_dir, "cfbstats_teams_2022_2024.csv")
if not os.path.exists(team_data_file):
    # Raise instead of calling exit(): inside a notebook exit() takes the
    # kernel session down, whereas an exception only stops this cell (and a
    # "Run All") and reports which path was expected.
    raise FileNotFoundError(f"Team data file not found: {team_data_file}")

# Master list of teams; later code reads its "Team", "Year" and "Team ID" columns
df_teams = pd.read_csv(team_data_file)

# User-Agent list
# Browser-style User-Agent strings; scrape_defensive_stats picks one at random
# (random.choice) for the "User-Agent" header of every request it sends.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]

# Candidate values for the "Referer" request header; one is chosen at random
# per request, alongside the User-Agent.
REFERERS = [
    "https://www.google.com/",
    "https://www.bing.com/",
    "https://www.yahoo.com/",
    "https://duckduckgo.com/"
]

# CSV files that hold the defensive passing / rushing stats
passing_file = os.path.join(base_dir, "cfbstats_team_passing_stats_defense_2022_2024.csv")
rushing_file = os.path.join(base_dir, "cfbstats_team_rushing_stats_defense_2022_2024.csv")

# Read whatever has already been scraped; a file that does not exist yet is
# represented by an empty DataFrame so every team counts as missing.
if os.path.exists(passing_file):
    df_passing = pd.read_csv(passing_file)
else:
    df_passing = pd.DataFrame()

if os.path.exists(rushing_file):
    df_rushing = pd.read_csv(rushing_file)
else:
    df_rushing = pd.DataFrame()

# Function to determine missing teams, if any, from initial scrape
def get_missing_teams(df, stat_type, teams_df=None):
    """Return the (Team, Year) pairs that are expected but absent from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Already-scraped stats. May be empty; otherwise it must have "Team"
        and "Year" columns.
    stat_type : str
        Label used only in the printed summary line (e.g. "passing").
    teams_df : pandas.DataFrame, optional
        Master team list with "Team" and "Year" columns. Defaults to the
        module-level ``df_teams`` so existing two-argument calls keep working.

    Returns
    -------
    set of tuple
        ``(team, year)`` pairs present in the master list but not in ``df``.
    """
    if teams_df is None:
        # Fall back to the notebook-wide team list loaded at the top of the cell
        teams_df = df_teams

    # An empty frame has no "Team"/"Year" columns to zip, so short-circuit it
    existing_teams = set(zip(df["Team"], df["Year"])) if not df.empty else set()
    all_teams = set(zip(teams_df["Team"], teams_df["Year"]))
    missing_teams = all_teams - existing_teams  # Teams not yet scraped
    print(f"✅ {stat_type.capitalize()} - Missing teams: {len(missing_teams)}")
    return missing_teams

# Work out, per stat category, which (Team, Year) pairs still need scraping
missing_passing_teams = get_missing_teams(df=df_passing, stat_type="passing")
missing_rushing_teams = get_missing_teams(df=df_rushing, stat_type="rushing")

def scrape_defensive_stats(team_name, year, team_id, stat_type):
    """Download one team's defensive split page and return its first data row.

    Parameters
    ----------
    team_name : str
        Team name; used in log messages and stored under "Team".
    year : int
        Season; part of the URL and stored under "Year".
    team_id : int
        Team identifier used in the URL; stored under "Team ID".
    stat_type : str
        URL path segment for the stat category (callers pass "passing" or
        "rushing").

    Returns
    -------
    dict or None
        Mapping of table column name -> cell text for the first data row of
        the first table on the page, plus "Team", "Year" and "Team ID".
        ``None`` when the request fails, the page has no table, or the table
        has no data row; a message is printed in each of those cases.
    """
    # Construct URL based on stat type
    url = f"https://cfbstats.com/{year}/team/{team_id}/{stat_type}/defense/split.html"
    print(f"Scraping {stat_type} defense stats for {team_name} ({year}) - {url}")

    # Named separately from the table's column names further down
    request_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": random.choice(REFERERS),
    }

    try:
        response = requests.get(url, headers=request_headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Failed to fetch {url} for {team_name} ({year}) (Error: {e})")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # Locate the stats table (first <table> on the page)
    table = soup.find("table")
    if table is None:
        print(f"❌ No stats table found for {team_name} ({year}) in {stat_type} defense")
        return None

    rows = table.find_all("tr")

    # Only the first data row (second <tr>) is used. Checking the row count
    # before touching rows[0] avoids an IndexError on a table with no rows.
    first_data_row = rows[1].find_all("td") if len(rows) > 1 else []

    if not first_data_row:
        print(f"❌ No valid data row found for {team_name} ({year}) in {stat_type} defense")
        return None

    # Header row is guaranteed to exist here because len(rows) > 1
    column_names = [th.text.strip() for th in rows[0].find_all("th")]
    row_values = [td.text.strip() for td in first_data_row]

    # Pair column names with values; zip stops at the shorter of the two lists
    data_dict = dict(zip(column_names, row_values))
    data_dict["Team"] = team_name
    data_dict["Year"] = year
    data_dict["Team ID"] = team_id

    return data_dict

# Collected result rows, one list per stat category
passing_data = []
rushing_data = []

# Run the scrapes on a small thread pool so page downloads overlap
with ThreadPoolExecutor(max_workers=3) as executor:
    futures = {}
    # Passing jobs are queued first, then rushing jobs
    for category, wanted in (("passing", missing_passing_teams), ("rushing", missing_rushing_teams)):
        for team_name, year in wanted:
            # Look up the team's ID from the master team list
            row_match = (df_teams["Team"] == team_name) & (df_teams["Year"] == year)
            team_id = df_teams[row_match]["Team ID"].values[0]
            job = executor.submit(scrape_defensive_stats, team_name, year, team_id, category)
            # Remember which category/team each future belongs to
            futures[job] = (category, team_name)

    # Gather results in completion order
    for future in as_completed(futures):
        try:
            result = future.result()
            if not result:
                # The scraper already reported why nothing came back
                continue
            stat_type, team_name = futures[future]
            target = passing_data if stat_type == "passing" else rushing_data
            target.append(result)
        except Exception as e:
            print(f"❌ Error scraping {futures[future][1]}: {e}")

# Append new data to existing CSVs if teams were missed in initial scraping
for stat_type, data, file_path in [("passing", passing_data, passing_file), ("rushing", rushing_data, rushing_file)]:
    if not data:
        # Nothing was scraped for this category; leave its file untouched
        continue

    df_new = pd.DataFrame(data)

    # Ensure consistent column ordering: identifying columns first
    key_cols = ["Team", "Year", "Team ID"]
    col_order = key_cols + [col for col in df_new.columns if col not in key_cols]
    df_new = df_new[col_order]

    # Sort by Team (A-Z) and Year (Descending)
    df_new = df_new.sort_values(by=["Team", "Year"], ascending=[True, False])

    file_exists = os.path.exists(file_path)
    if file_exists:
        # Rows are appended without a header, so they must follow the column
        # order already in the file; otherwise values land under the wrong
        # column names. Read only the header row to get that order.
        existing_cols = pd.read_csv(file_path, nrows=0).columns.tolist()
        dropped_cols = [col for col in df_new.columns if col not in existing_cols]
        if dropped_cols:
            print(f"⚠️ {stat_type.capitalize()} - columns not present in {file_path} were skipped: {dropped_cols}")
        # Columns the file has but the new rows lack are written as empty cells
        df_new = df_new.reindex(columns=existing_cols)

    # Append new data to existing file (write a header only for a brand-new file)
    df_new.to_csv(file_path, mode="a", header=not file_exists, index=False)
    print(f"✅ Added {len(df_new)} new records to {file_path}")

print("\n✅ Missing teams have been scraped and added to the respective files.")


✅ Passing - Missing teams: 3
✅ Rushing - Missing teams: 6
Scraping passing defense stats for Washington (2024) - https://cfbstats.com/2024/team/756/passing/defense/split.html
Scraping passing defense stats for Virginia Tech (2022) - https://cfbstats.com/2022/team/742/passing/defense/split.html
Scraping passing defense stats for South Alabama (2023) - https://cfbstats.com/2023/team/646/passing/defense/split.html
Scraping rushing defense stats for Wake Forest (2023) - https://cfbstats.com/2023/team/749/rushing/defense/split.html
Scraping rushing defense stats for UAB (2024) - https://cfbstats.com/2024/team/9/rushing/defense/split.html
Scraping rushing defense stats for Virginia Tech (2022) - https://cfbstats.com/2022/team/742/rushing/defense/split.html
Scraping rushing defense stats for Tulsa (2023) - https://cfbstats.com/2023/team/719/rushing/defense/split.html
Scraping rushing defense stats for Texas Tech (2022) - https://cfbstats.com/2022/team/700/rushing/defense/split.html
Scraping r

In [11]:
# Verification that each team has one entry per season.
# Each defense CSV is located and then read back from disk before checking.

# Passing defense
passing_file = os.path.join(base_dir, "cfbstats_team_passing_stats_defense_2022_2024.csv")
df_passing = pd.read_csv(passing_file)

# Rushing defense
rushing_file = os.path.join(base_dir, "cfbstats_team_rushing_stats_defense_2022_2024.csv")
df_rushing = pd.read_csv(rushing_file)

# Function to validate the dataset
def validate_file(df, file_name, expected_years=(2024, 2023, 2022)):
    """Print a check that every team has exactly one row per expected season.

    Parameters
    ----------
    df : pandas.DataFrame
        Stats table with "Team" and "Year" columns.
    file_name : str
        Label used as the prefix of every printed line.
    expected_years : iterable of int, optional
        Seasons each team must have exactly one row for. Defaults to
        2024, 2023, 2022.

    Returns
    -------
    None
        Results are printed. Nothing is returned, so a call on the last line
        of a notebook cell produces no extra output.
    """
    # Count unique teams
    unique_teams = df["Team"].nunique()
    print(f"✅ {file_name}: Total unique teams: {unique_teams}")

    expected_years = list(dict.fromkeys(expected_years))  # de-duplicate, keep order
    n_expected = len(expected_years)
    years_text = ", ".join(str(y) for y in expected_years)

    # Rows outside the expected seasons are ignored
    in_scope = df[df["Year"].isin(expected_years)]
    by_team = in_scope.groupby("Team")["Year"]

    # "entries" counts rows while "seasons" counts distinct years; checking both
    # catches a duplicated (Team, Year) row, which distinct-year counting alone
    # would let through.
    summary = pd.DataFrame({"entries": by_team.size(), "seasons": by_team.nunique()})

    # Include teams that have no rows at all in the expected seasons
    all_teams = sorted(df["Team"].dropna().unique())
    summary = summary.reindex(all_teams, fill_value=0)
    summary.index.name = "Team"

    incorrect_teams = summary[(summary["entries"] != n_expected) | (summary["seasons"] != n_expected)]

    if incorrect_teams.empty:
        print(f"✅ {file_name}: All teams have exactly {n_expected} entries (one per year for {years_text}).")
    else:
        print(f"❌ {file_name}: The following teams do not have exactly {n_expected} entries:")
        print(incorrect_teams)

# Run the same check on the passing and the rushing file, in that order
for checked_df, checked_name in (
    (df_passing, "cfbstats_team_passing_stats_defense_2022_2024.csv"),
    (df_rushing, "cfbstats_team_rushing_stats_defense_2022_2024.csv"),
):
    validate_file(checked_df, checked_name)

✅ cfbstats_team_passing_stats_defense_2022_2024.csv: Total unique teams: 127
✅ cfbstats_team_passing_stats_defense_2022_2024.csv: All teams have exactly 3 entries (one per year for 2024, 2023, 2022).
✅ cfbstats_team_rushing_stats_defense_2022_2024.csv: Total unique teams: 127
✅ cfbstats_team_rushing_stats_defense_2022_2024.csv: All teams have exactly 3 entries (one per year for 2024, 2023, 2022).
