In [12]:
import os
import random
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Define output file
# NOTE(review): this is an absolute, machine-specific path; anyone else running
# the notebook has to edit it first.
base_dir = r"C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files"
output_file = os.path.join(base_dir, "cfbstats_teams_2022_2024.csv")

# Create the target folder if needed; pandas refuses to write a CSV into a
# directory that does not exist, so the header write below would otherwise fail.
os.makedirs(base_dir, exist_ok=True)

# Ensure the output file exists with proper headers
if not os.path.exists(output_file):
    pd.DataFrame(columns=["Team", "Year", "Team ID", "Team URL"]).to_csv(output_file, index=False)

# Base URLs for scraping: season year -> team index page for that season
base_urls = {
    2024: "https://cfbstats.com/2024/team/index.html",
    2023: "https://cfbstats.com/2023/team/index.html",
    2022: "https://cfbstats.com/2022/team/index.html"
}

# User-Agent list and Referers; one of each is picked at random per request
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]

REFERERS = [
    "https://www.google.com/",
    "https://www.bing.com/",
    "https://www.yahoo.com/",
    "https://duckduckgo.com/"
]

# Track scraped teams to avoid duplicates.
# Holds (team name, year) tuples and starts empty every time this cell runs,
# so rows already present in the CSV from an earlier run are not known here.
scraped_teams = set()

# scrape_teams runs in several worker threads at once; this lock serializes
# access to the shared `scraped_teams` set and to the shared output CSV.
_scrape_lock = threading.Lock()

# Function to scrape a single year's teams
def scrape_teams(year, url):
    """Scrape the team index page for one season and append the teams to the CSV.

    Every anchor whose href contains '/team/' and '/index.html' (but does not
    end with '/team/index.html') and matches '/team/<digits>/index.html' is
    treated as a team link. Anchors whose text is all digits are skipped.

    Args:
        year: Season year stored in the "Year" column and used in log messages.
        url: Address of that season's team index page.

    Returns:
        A list of dicts with keys "Team", "Year", "Team ID" and "Team URL" for
        the teams recorded by this call. The list is empty when the request
        fails or no new team link is found.

    Side effects:
        Adds (team name, year) to the module-level `scraped_teams` set, appends
        the new rows to `output_file`, and prints progress messages.
    """
    print(f"\nProcessing Year: {year} - URL: {url}")

    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": random.choice(REFERERS),
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise exception for bad status codes
    except requests.RequestException as e:
        print(f"Failed to fetch {url} for {year} (Error: {e})")
        return []  # same (empty list) type as the success path

    soup = BeautifulSoup(response.content, "html.parser")
    team_links = soup.find_all("a", href=True)

    teams_data = []
    for link in team_links:
        team_url = link["href"]

        # Extract only valid team links: '/team/{id}/index.html' (ignore "/team/index.html")
        if "/team/" not in team_url or "/index.html" not in team_url:
            continue
        if team_url.endswith("/team/index.html"):
            continue

        team_name = link.text.strip()

        # Ensure it's not mistakenly pulling a year as a team name
        if team_name.isdigit():
            continue  # Skip bad data like "2024", "2023", etc.

        match = re.search(r"/team/(\d+)/index\.html", team_url)
        if match is None:
            print(f"Warning: Could not extract Team ID from {team_url}")
            continue
        team_id = match.group(1)

        # urljoin resolves root-relative hrefs against the page URL and leaves
        # already-absolute hrefs intact (plain concatenation would mangle those).
        full_link = urljoin(url, team_url)

        # Skip duplicates; check and insert under the lock so they act as one step
        with _scrape_lock:
            already_seen = (team_name, year) in scraped_teams
            if not already_seen:
                scraped_teams.add((team_name, year))
        if already_seen:
            print(f"Skipping duplicate: {team_name} ({year})")
            continue

        teams_data.append({"Team": team_name, "Year": year, "Team ID": team_id, "Team URL": full_link})

    # Save valid data
    if teams_data:
        df = pd.DataFrame(teams_data)
        # Only one thread may append at a time, otherwise rows from different
        # seasons can interleave in the file.
        with _scrape_lock:
            # Write the header when the file is missing or still completely empty.
            needs_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0
            df.to_csv(output_file, mode="a", header=needs_header, index=False)
        print(f"Saved {len(teams_data)} teams for {year}")

    return teams_data

# Scrape every season concurrently, one worker thread per configured year
with ThreadPoolExecutor(max_workers=3) as executor:
    # Map each submitted job back to the season it belongs to
    future_to_year = dict(
        (executor.submit(scrape_teams, season, season_url), season)
        for season, season_url in base_urls.items()
    )

    # Handle jobs in the order they finish, not the order they were submitted
    for future in as_completed(future_to_year):
        year = future_to_year[future]
        try:
            # result() re-raises anything the worker raised
            future.result()
        except Exception as exc:
            print(f"Error scraping {year}: {exc}")

print(f"\nScraping complete. Data saved to {output_file}")



Processing Year: 2024 - URL: https://cfbstats.com/2024/team/index.html

Processing Year: 2023 - URL: https://cfbstats.com/2023/team/index.html

Processing Year: 2022 - URL: https://cfbstats.com/2022/team/index.html
Saved 134 teams for 2024
Saved 131 teams for 2022
Saved 133 teams for 2023

Scraping complete. Data saved to C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files\cfbstats_teams_2022_2024.csv


In [29]:
# Error checking and file data verification
# Define file paths
base_dir = r"C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files"
output_file = os.path.join(base_dir, "cfbstats_teams_2022_2024.csv")
reference_file = os.path.join(base_dir, "fbs_teams_links.csv")  # Reference file for verification

# Load both CSV files
df_main = pd.read_csv(output_file)
df_ref = pd.read_csv(reference_file)

# *ADDED LATER* Optional clean-up: teams that were scraped by mistake can be
# dropped by listing them here and un-commenting the two lines below.
#teams_to_remove = {"add team needing removed here"}
#df_main = df_main[~df_main["Team"].isin(teams_to_remove)]

# Verify that each unique team has ONE instance per year
team_year_counts = df_main.groupby(["Team", "Year"]).size().reset_index(name="Count")
duplicates = team_year_counts[team_year_counts["Count"] > 1]

if not duplicates.empty:
    print("\nWARNING: The following teams have multiple entries for a single year:")
    print(duplicates)
else:
    print("\nVerification passed: Each team has exactly one entry per year.")

# Extract unique team names for comparison.
# Missing values are dropped and names are cast to str so that the "\n".join
# calls below cannot fail on a NaN (float) entry.
teams_main = set(df_main["Team"].dropna().astype(str))
teams_ref = set(df_ref["School"].dropna().astype(str))

# Find discrepancies, if any, in the files created
missing_in_main = teams_ref - teams_main  # Teams in reference but missing in main
missing_in_ref = teams_main - teams_ref  # Teams in main but missing in reference

# Print results (sorted, so the output is the same on every run)
if missing_in_main:
    print("\nTeams present in reference file but MISSING in main file:")
    print("\n".join(sorted(missing_in_main)))
else:
    print("\nAll teams from the reference file are present in the main file.")

if missing_in_ref:
    print("\nTeams present in main file but MISSING in reference file:")
    print("\n".join(sorted(missing_in_ref)))
else:
    print("\nAll teams from the main file are present in the reference file.")

# Sort the file alphabetically by team name, then by year (descending)
df_main = df_main.sort_values(by=["Team", "Year"], ascending=[True, False])

# Save the sorted file back over the scraped CSV.
# Duplicates reported above are only flagged, not removed.
df_main.to_csv(output_file, index=False)
print(f"\nUpdated file saved at: {output_file}")


Verification passed: Each team has exactly one entry per year.

All teams from the reference file are present in the main file.

All teams from the main file are present in the reference file.

Updated file saved at: C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files\cfbstats_teams_2022_2024.csv


In [33]:
# Second verification of all teams, counts, and any missing or unwanted extra
# teams, because of the mistakes made in the earlier cells.
# Relies on `base_dir` having been defined by an earlier cell.
output_file = os.path.join(base_dir, "cfbstats_teams_2022_2024.csv")
reference_file = os.path.join(base_dir, "fbs_teams_links.csv")  # Reference file for verification

# Load both CSV files
df_main = pd.read_csv(output_file)
df_ref = pd.read_csv(reference_file)

# `name_corrections` (old name -> corrected name) is not defined in any cell
# visible here; fall back to "no corrections" instead of failing with a
# NameError on a fresh kernel.
try:
    name_corrections
except NameError:
    name_corrections = {}
    print("NOTE: name_corrections is not defined; no team names were changed.")

# Apply the name corrections (some teams are named differently on the two
# sites, so they are mapped manually). This must happen BEFORE the name sets
# are built, otherwise corrected and uncorrected names get compared with each
# other and a corrected team is reported as missing.
df_main["Team"] = df_main["Team"].replace(name_corrections)
df_ref["School"] = df_ref["School"].replace(name_corrections)

# Count the total number of unique teams in both CSV files (after corrections)
total_teams_main = df_main["Team"].nunique()
total_teams_ref = df_ref["School"].nunique()

# Print the total number of unique teams in each file
print(f"Total number of unique teams in main file: {total_teams_main}")
print(f"Total number of unique teams in reference file: {total_teams_ref}")

# Extract unique team names for comparison; NaN entries are dropped and names
# cast to str so the "\n".join calls below cannot fail on a float.
teams_main = set(df_main["Team"].dropna().astype(str))
teams_ref = set(df_ref["School"].dropna().astype(str))

# Find discrepancies (teams missing in one file or another)
missing_in_main = teams_ref - teams_main
missing_in_ref = teams_main - teams_ref

# Print results (sorted, so the output is the same on every run)
if missing_in_main:
    print("\nTeams present in reference file but MISSING in main file (after name corrections):")
    print("\n".join(sorted(missing_in_main)))
else:
    print("\nAll teams from the reference file are present in the main file.")

if missing_in_ref:
    print("\nTeams present in main file but MISSING in reference file (after name corrections):")
    print("\n".join(sorted(missing_in_ref)))
else:
    print("\nAll teams from the main file are present in the reference file.")

# Sort the file alphabetically by team name, then by year (descending)
df_main = df_main.sort_values(by=["Team", "Year"], ascending=[True, False])

# Save the corrected and sorted file back over the scraped CSV
df_main.to_csv(output_file, index=False)
print(f"\nUpdated file saved at: {output_file}")


Total number of unique teams in main file: 127
Total number of unique teams in reference file: 127

All teams from the reference file are present in the main file.

All teams from the main file are present in the reference file.

Updated file saved at: C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files\cfbstats_teams_2022_2024.csv
