In [2]:
import os
import random
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Folder that holds every CSV this notebook reads or writes
base_dir = r"C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files"

# Team/season table; each row carries the 'School', 'Year' and 'Year Link' fields used by the scraper
fbs_teams_df = pd.read_csv(f"{base_dir}\\fbs_teams_record_by_season_2022_2024.csv")

# Output CSV for the passing totals. If it already exists, remember which
# team-seasons it contains so a re-run can skip them.
file_path = f"{base_dir}\\fbs_total_team_passing_stats_by_season_2022_2024.csv"
try:
    existing_df = pd.read_csv(file_path)
except FileNotFoundError:
    # First run: nothing scraped yet
    existing_df = pd.DataFrame()
    scraped_teams = set()
else:
    # Year must be text so it can be concatenated with the team name into a lookup key
    existing_df['Year'] = existing_df['Year'].astype(str)
    scraped_teams = set(existing_df['Team'] + existing_df['Year'])

# Request headers are picked at random from these pools on every attempt
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]

REFERERS = [
    "https://www.google.com/",
    "https://www.bing.com/",
    "https://www.yahoo.com/",
    "https://duckduckgo.com/",
]

# Column names for the "Team Totals" stats, in the order the cells are read
# (after the two leading cells of the row have been dropped).
_PASSING_STAT_COLUMNS = (
    'G', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%',
    'Int', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate',
)

# scrape_team_data runs on several worker threads and every call appends to the
# same CSV. The lock keeps the "does the file exist?" header decision and the
# append itself atomic, so two threads cannot both write a header.
_CSV_WRITE_LOCK = threading.Lock()


# Function to fetch and scrape the Passing table
def scrape_team_data(row_tuple, max_retries=2, timeout=30):
    """Fetch one team-season page and append its passing "Team Totals" row to the CSV.

    Args:
        row_tuple: An ``(index, row)`` pair as produced by ``DataFrame.iterrows()``.
            ``row`` must provide the keys ``'School'``, ``'Year Link'`` and ``'Year'``.
        max_retries: Maximum number of HTTP attempts; only a 429 response
            triggers another attempt.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block a worker thread forever.

    Returns:
        ``(team_name, year, success)``. ``success`` is True only when a row was
        appended to ``file_path``. It is False both for failures and for
        team-seasons skipped because they are already in ``scraped_teams``.
    """
    _, row = row_tuple

    team_name = row['School']
    season_url = row['Year Link']
    year = row['Year']

    # Skip if already scraped
    if (team_name + str(year)) in scraped_teams:
        print(f"Skipping {team_name} {year}, already scraped.")
        return (team_name, year, False)

    print(f"\nProcessing: {team_name} ({year}) - URL: {season_url}")

    for attempt in range(max_retries):
        headers = {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': random.choice(REFERERS)
        }

        try:
            response = requests.get(season_url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # An exception escaping a worker would be re-raised while the caller
            # iterates executor.map() results and abort the whole run, so report
            # it as an ordinary failure instead.
            print(f"Request error for {season_url} for {team_name}: {exc}")
            return (team_name, year, False)

        if response.status_code == 200:
            break
        elif response.status_code == 429:
            retry_delay = random.randint(15, 30)
            print(f"Rate limited: {season_url} for {team_name} (Retry {attempt+1}/{max_retries}, Waiting {retry_delay}s)")
            time.sleep(retry_delay)
        else:
            print(f"Failed to retrieve {season_url} for {team_name} (Status Code: {response.status_code})")
            return (team_name, year, False)
    else:
        # Loop finished without a 200 response
        print(f"Giving up on {season_url} for {team_name} after {max_retries} retries")
        return (team_name, year, False)

    soup = BeautifulSoup(response.content, 'html.parser')

    # Check if the Passing table exists in the correct div/get table by ID
    passing_div = soup.find('div', {'id': 'div_passing_standard'})
    if passing_div is None:
        print(f"Failed to find the div containing Passing table for {team_name} ({year})")
        return (team_name, year, False)

    passing_table = passing_div.find('table', {'id': 'passing_standard'})
    if passing_table is None:
        print(f"Failed to find Passing table for {team_name} ({year})")
        return (team_name, year, False)

    print(f"Successfully found Passing table for {team_name} ({year})")

    # Find the "Team Totals" row (identified by the text of its first <td>)
    team_totals_cells = None
    for table_row in passing_table.find_all('tr'):
        cells = table_row.find_all('td')
        if cells and cells[0].text.strip() == "Team Totals":
            team_totals_cells = cells
            break

    if not team_totals_cells:
        print(f"Failed to find 'Team Totals' row for {team_name} ({year})")
        return (team_name, year, False)

    # Drop the "Team Totals" label cell and the cell after it, so that the
    # first remaining value lines up with the 'G' column.
    stats = [cell.get_text(strip=True) for cell in team_totals_cells[2:]]

    # A changed page layout must surface as a reported failure, not an IndexError
    if len(stats) < len(_PASSING_STAT_COLUMNS):
        print(
            f"Unexpected 'Team Totals' layout for {team_name} ({year}): "
            f"expected at least {len(_PASSING_STAT_COLUMNS)} stat cells, found {len(stats)}"
        )
        return (team_name, year, False)

    # Create the data dictionary for the team totals row, including team and year
    totals_dict = {'Team': team_name, 'Year': year}
    totals_dict.update(zip(_PASSING_STAT_COLUMNS, stats))

    # Save to CSV, appending the data to the file; header only when creating it
    with _CSV_WRITE_LOCK:
        write_header = not os.path.exists(file_path)
        pd.DataFrame([totals_dict]).to_csv(file_path, mode='a', header=write_header, index=False)

    print(f"Data saved for {team_name} ({year})")
    return (team_name, year, True)

# Function to handle the team data scraping with threading
def scrape_all_teams():
    """Run ``scrape_team_data`` over every row of ``fbs_teams_df`` and print a summary.

    Rows are processed on a pool of 3 worker threads. Each result is a
    ``(team_name, year, success)`` tuple. A non-successful result whose
    team+year key is in ``scraped_teams`` is counted as skipped rather than
    failed: nothing in this notebook adds to that set after it is loaded, so
    membership means the scraper skipped the team-season as already on disk.

    Returns:
        None. The summary is printed.
    """
    scraped = 0
    skipped = 0
    failed_teams = []

    with ThreadPoolExecutor(max_workers=3) as executor:
        results = executor.map(scrape_team_data, fbs_teams_df.iterrows())

        for team_name, year, success in results:
            if success:
                scraped += 1
            elif (team_name + str(year)) in scraped_teams:
                # Already present in the output CSV; not a failure
                skipped += 1
            else:
                failed_teams.append(f"{team_name} ({year})")

    failed = len(failed_teams)

    # Print summary of results
    print("\nScraping complete.")
    print(f"Successfully scraped {scraped} teams.")
    print(f"Skipped {skipped} teams that were already scraped.")
    print(f"Failed to scrape {failed} teams.")
    if failed:
        print("Failed teams: " + ", ".join(failed_teams))

# Run the scrape now; per-team progress and the final summary are printed below
scrape_all_teams()


Skipping Air Force 2024, already scraped.
Skipping Air Force 2023, already scraped.
Skipping Air Force 2022, already scraped.
Skipping Akron 2024, already scraped.
Skipping Akron 2023, already scraped.
Skipping Akron 2022, already scraped.
Skipping Alabama 2024, already scraped.
Skipping Alabama 2023, already scraped.
Skipping Alabama 2022, already scraped.
Skipping Appalachian State 2024, already scraped.
Skipping Appalachian State 2023, already scraped.
Skipping Appalachian State 2022, already scraped.
Skipping Arizona 2024, already scraped.
Skipping Arizona 2023, already scraped.
Skipping Arizona 2022, already scraped.
Skipping Arizona State 2024, already scraped.
Skipping Arizona State 2023, already scraped.
Skipping Arizona State 2022, already scraped.
Skipping Arkansas 2024, already scraped.
Skipping Arkansas 2023, already scraped.
Skipping Arkansas 2022, already scraped.
Skipping Arkansas State 2024, already scraped.
Skipping Arkansas State 2023, already scraped.
Skipping Arkans

In [3]:
# Load the data
df = pd.read_csv(file_path)

# Ensure Year is a string type for correct sorting and filtering
df['Year'] = df['Year'].astype(str)

# Organize the data: First alphabetically by Team, then by Year descending
df_sorted = df.sort_values(by=['Team', 'Year'], ascending=[True, False])

# Save the sorted dataframe back to CSV (overwrites the scraped file in place)
df_sorted.to_csv(file_path, index=False)
print(f"Data sorted and saved to {file_path}")

# Number of DISTINCT seasons present for each team (2022, 2023, 2024 expected)
team_counts = df_sorted.groupby('Team')['Year'].nunique()

# Number of ROWS for each team. nunique() above collapses repeated
# (Team, Year) rows, so it can never exceed the number of seasons and cannot
# reveal duplicates; the raw row count can.
record_counts = df_sorted.groupby('Team').size()

# Teams missing any of the 2022, 2023, or 2024 records
missing_teams = team_counts[team_counts < 3]

# Teams with more than 3 records
duplicate_teams = record_counts[record_counts > 3]

# Print team counts, missing teams, and duplicate teams
print("\nTeam count for each team:")
print(team_counts)

if len(missing_teams) > 0:
    print("\nTeams missing records for one or more seasons:")
    print(missing_teams)
else:
    print("\nAll teams have records for all three seasons.")

if len(duplicate_teams) > 0:
    print("\nTeams with more than 3 records (potential duplicates):")
    print(duplicate_teams)
else:
    print("\nNo teams have more than 3 records.")


Data sorted and saved to C:\Users\Christopher\OneDrive - Syracuse University\PythonSportAnalytics\Section_8\Final_Project\CSV_Files\fbs_total_team_passing_stats_by_season_2022_2024.csv

Team count for each team:
Team
Air Force            3
Akron                3
Alabama              3
Appalachian State    3
Arizona              3
                    ..
West Virginia        3
Western Kentucky     3
Western Michigan     3
Wisconsin            3
Wyoming              3
Name: Year, Length: 127, dtype: int64

All teams have records for all three seasons.

No teams have more than 3 records.
