In [1]:
import pandas as pd
import requests
import time
import random
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

# Define base directory for saving files
base_dir = r"C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files"

# Load CSV containing team URLs
fbs_teams_df = pd.read_csv(base_dir + "\\fbs_teams_record_by_season_2022_2024.csv")

# Load existing CSV
file_path = base_dir + "\\fbs_total_team_stats_per_game_average_by_season_2022_2024.csv"
try:
    existing_df = pd.read_csv(file_path)
    existing_df['Year'] = existing_df['Year'].astype(str)  # Ensure Year is str
    scraped_teams = set(existing_df['Team'] + existing_df['Year'])
except FileNotFoundError:
    existing_df = pd.DataFrame()
    scraped_teams = set()

# Filter teams that still need scraping, if any
fbs_teams_df['Year'] = fbs_teams_df['Year'].astype(str)  # Ensure Year is str
teams_to_scrape = fbs_teams_df[~(fbs_teams_df['School'] + fbs_teams_df['Year']).isin(scraped_teams)]

print(f"Found existing file with {len(scraped_teams)} records.")
print(f"Teams remaining to scrape: {len(teams_to_scrape)}")

# List of different User-Agents and Referrers
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]

REFERERS = [
    "https://www.google.com/",
    "https://www.bing.com/",
    "https://www.yahoo.com/",
    "https://duckduckgo.com/"
]

# Function to fetch and scrape the team stats table
def scrape_team_data(row):
    team_name = row['School']
    season_url = row['Year Link']
    year = row['Year']

    # Skip if already scraped
    if (team_name + year) in scraped_teams:
        print(f"Skipping {team_name} {year}, already scraped.")
        return None

    print(f"\nProcessing: {team_name} ({year}) - URL: {season_url}")
    
    max_retries = 2
    for attempt in range(max_retries):
        headers = {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': random.choice(REFERERS)
        }

        response = requests.get(season_url, headers=headers)

        if response.status_code == 200:
            break
        elif response.status_code == 429:
            retry_delay = random.randint(15, 30)
            print(f"Rate limited: {season_url} for {team_name} (Retry {attempt+1}/{max_retries}, Waiting {retry_delay}s)")
            time.sleep(retry_delay)
        else:
            print(f"Failed to retrieve {season_url} for {team_name} (Status Code: {response.status_code})")
            return None
    else:
        print(f"Giving up on {season_url} for {team_name} after {max_retries} retries")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all tables
    tables = soup.find_all('table')

    if not tables:
        print(f"No tables found for {team_name} ({season_url})")
        return None

    team_stats_table = tables[0]  
    rows = team_stats_table.find_all('tr')

    if len(rows) < 2:
        print(f"Not enough header rows for {team_name}")
        return None

    secondary_headers = [th.get_text(strip=True) for th in rows[1].find_all('th')]
    team_stats = []
    
    for row in rows[2:]:
        columns = row.find_all(['td', 'th'])  
        stats = [col.get_text(strip=True) for col in columns]
        if stats and stats[0]:  
            team_stats.append(stats)

    if team_stats:
        print(f"Successfully scraped: {team_name} ({year})")
        
        for stats in team_stats:
            if stats:
                data_dict = {'Team': team_name, 'Year': year}
                for i, header in enumerate(secondary_headers):
                    data_dict[header] = stats[i] if i < len(stats) else None
                
                # Append to CSV immediately to avoid data loss
                pd.DataFrame([data_dict]).to_csv(file_path, mode='a', header=not pd.io.common.file_exists(file_path), index=False)
        
        return True
    return None

with ThreadPoolExecutor(max_workers=3) as executor:
    futures = [executor.submit(scrape_team_data, row) for _, row in teams_to_scrape.iterrows()]
    for future in futures:
        future.result()

print("Scraping complete! Data is saved as it goes.")


Found existing file with 0 records.
Teams remaining to scrape: 381

Processing: Air Force (2024) - URL: https://www.sports-reference.com/cfb/schools/air-force/2024.html

Processing: Air Force (2023) - URL: https://www.sports-reference.com/cfb/schools/air-force/2023.html

Processing: Air Force (2022) - URL: https://www.sports-reference.com/cfb/schools/air-force/2022.html
Successfully scraped: Air Force (2022)
Successfully scraped: Air Force (2023)

Processing: Akron (2024) - URL: https://www.sports-reference.com/cfb/schools/akron/2024.html

Processing: Akron (2023) - URL: https://www.sports-reference.com/cfb/schools/akron/2023.html
Successfully scraped: Akron (2024)

Processing: Akron (2022) - URL: https://www.sports-reference.com/cfb/schools/akron/2022.html
Successfully scraped: Akron (2023)

Processing: Alabama (2024) - URL: https://www.sports-reference.com/cfb/schools/alabama/2024.html
Successfully scraped: Air Force (2024)

Processing: Alabama (2023) - URL: https://www.sports-refere

In [4]:
# Organize file by School/Team and then Year/Season
# Define base directory for saving files
base_dir = r"C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files"

# Load existing CSV
file_path = base_dir + "\\fbs_total_team_stats_per_game_average_by_season_2022_2024.csv"
existing_df = pd.read_csv(file_path)

# Remove any duplicate rows
existing_df = existing_df.drop_duplicates()

# Remove rows where the 'Team' and 'Year' columns have the same header values (to avoid duplicated headers)
existing_df = existing_df[~existing_df['Team'].str.contains("Team", case=False, na=False)]

# Ensure the 'Year' column is a string for sorting
existing_df['Year'] = existing_df['Year'].astype(str)

# Sort the DataFrame by 'Team' alphabetically and 'Year' in descending order
existing_df = existing_df.sort_values(by=['Team', 'Year'], ascending=[True, False])

# Reset the index after sorting
existing_df.reset_index(drop=True, inplace=True)

# Save the cleaned and sorted DataFrame back to the same CSV
existing_df.to_csv(file_path, index=False)

# Display the cleaned and sorted DataFrame
print("Duplicate rows removed, data sorted, and saved successfully.")
display(existing_df)


Duplicate rows removed, data sorted, and saved successfully.


Unnamed: 0,Team,Year,Split,G,Cmp,Att,Pct,Yds,TD,Avg,Plays,Pass,Rush,Pen,Tot,No.,Fum,Int
0,Air Force,2024,Offense,12,5.2,56.1,44.9,28.8,2.0,4.6,67.6,3.3,13.3,1.4,1.3,3.2,0.4,0.8
1,Air Force,2024,Defense,12,15.6,33.6,64.7,48.4,1.9,5.9,57.7,8.5,7.4,1.1,1.0,5.3,0.7,0.3
2,Air Force,2024,Difference,,-10.4,+22.5,-19.8,-19.6,+0.1,-1.3,+9.9,-5.2,+5.9,+0.3,+0.3,-2.1,-0.3,+0.5
3,Air Force,2023,Offense,13,4.2,55.3,52.4,24.8,2.8,5.8,63.4,3.3,14.5,1.8,1.3,3.0,0.8,0.5
4,Air Force,2023,Defense,13,16.6,29.8,61.7,48.4,0.8,4.9,56.8,8.0,5.7,1.0,1.1,5.3,0.3,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,Wyoming,2023,Defense,13,19.5,34.8,58.4,40.8,0.9,5.2,68.2,11.0,7.8,1.0,1.7,4.6,0.8,0.8
1139,Wyoming,2023,Difference,,-5.0,+1.2,+3.8,-7.4,+0.6,+0.3,-8.8,-3.1,+1.0,+0.8,-0.9,-0.7,-0.4,-0.3
1140,Wyoming,2022,Offense,13,12.0,36.8,51.3,37.5,1.2,5.2,60.2,6.2,8.6,1.2,1.2,4.3,0.3,0.9
1141,Wyoming,2022,Defense,13,20.1,35.9,60.3,48.8,1.3,5.4,69.2,9.3,8.8,1.5,1.1,5.2,0.6,0.5


In [10]:
# Count unique teams to verify total number teams as well as verify each team has 3 entries per season/year
# Define base directory for saving files
base_dir = r"C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files"

# Load existing CSV
file_path = base_dir + "\\fbs_total_team_stats_per_game_average_by_season_2022_2024.csv"
existing_df = pd.read_csv(file_path)

# Remove any duplicate rows
existing_df = existing_df.drop_duplicates()

# Remove rows where the 'Team' and 'Year' columns have the same header values (to avoid duplicated headers)
existing_df = existing_df[~existing_df['Team'].str.contains("Team", case=False, na=False)]

# Count the number of times each team appears per year
team_year_counts = existing_df.groupby(['Team', 'Year']).size()

# Check for teams that appear more or less than 3 times for any given year
teams_with_issues = team_year_counts[team_year_counts != 3]

# If there are teams with issues, print them
if not teams_with_issues.empty:
    print("\nTeams that do not appear exactly 3 times for each year (2022, 2023, 2024):")
    print(teams_with_issues)
else:
    print("\nAll teams appear exactly 3 times for each year (2022, 2023, and 2024).")


All teams appear exactly 3 times for each year (2022, 2023, and 2024).
