In [1]:
import pandas as pd
import sqlite3
import requests
import os
import csv
from bs4 import BeautifulSoup, Comment
import time
from time import sleep
import random
from requests.exceptions import Timeout, RequestException

Data from https://github.com/nflverse/nflverse-data/releases/

In [2]:
!rm -rf data
!mkdir data
!mkdir data/rosters
!mkdir data/player-stats
!mkdir data/scoring-tables

In [3]:
!rm nfl.db

In [4]:
### Create 'Teams' in nfl.db

teams = [
    ['ARI', 'Arizona Cardinals', 'NFC West'],
    ['ATL', 'Atlanta Falcons', 'NFC South'],
    ['BAL', 'Baltimore Ravens', 'AFC North'],
    ['BUF', 'Buffalo Bills', 'AFC East'],
    ['CAR', 'Carolina Panthers', 'NFC South'],
    ['CHI', 'Chicago Bears', 'NFC North'],
    ['CIN', 'Cincinnati Bengals', 'AFC North'],
    ['CLE', 'Cleveland Browns', 'AFC North'],
    ['DAL', 'Dallas Cowboys', 'NFC East'],
    ['DEN', 'Denver Broncos', 'AFC West'],
    ['DET', 'Detroit Lions', 'NFC North'],
    ['GB', 'Green Bay Packers', 'NFC North'],
    ['HOU', 'Houston Texans', 'AFC South'],
    ['IND', 'Indianapolis Colts', 'AFC South'],
    ['JAX', 'Jacksonville Jaguars', 'AFC South'],
    ['KC', 'Kansas City Chiefs', 'AFC West'],
    ['LAC', 'Los Angeles Chargers', 'AFC West'],
    ['LAR', 'Los Angeles Rams', 'NFC West'],
    ['LVR', 'Las Vegas Raiders', 'AFC West'],
    ['MIA', 'Miami Dolphins', 'AFC East'],
    ['MIN', 'Minnesota Vikings', 'NFC North'],
    ['NE', 'New England Patriots', 'AFC East'],
    ['NO', 'New Orleans Saints', 'NFC South'],
    ['NYG', 'New York Giants', 'NFC East'],
    ['NYJ', 'New York Jets', 'AFC East'],
    ['PHI', 'Philadelphia Eagles', 'NFC East'],
    ['PIT', 'Pittsburgh Steelers', 'AFC North'],
    ['SEA', 'Seattle Seahawks', 'NFC West'],
    ['SF', 'San Francisco 49ers', 'NFC West'],
    ['TB', 'Tampa Bay Buccaneers', 'NFC South'],
    ['TEN', 'Tennessee Titans', 'AFC South'],
    ['WAS', 'Washington Commanders', 'NFC East']
]

df_teams = pd.DataFrame(teams, columns=['TeamID', 'Team', 'Division'])
with sqlite3.connect('nfl.db') as conn:
    df_teams.to_sql('Teams', conn, if_exists='replace', index=False)

In [5]:
# Create 'Games' in nfl.db

url = 'https://raw.githubusercontent.com/nflverse/nfldata/master/data/games.csv'
response = requests.get(url)
if response.ok:
    with open('./data/games.csv', 'wb') as file:
        file.write(response.content)
else:
    raise Exception(f"Failed to download the file. Status code: {response.status_code}")

df = pd.read_csv('./data/games.csv')
df = df[df['season'] >= 2010]

# Standardize team names
standardize_mapping = {
    'OAK': 'LVR',  # Oakland Raiders to Las Vegas Raiders
    'SD': 'LAC',   # San Diego Chargers to Los Angeles Chargers
    'STL': 'LAR',  # St. Louis Rams to Los Angeles Rams
    'LA': 'LAR',   # Los Angeles Rams
    'LV': 'LVR'    # Las Vegas Raiders
}
df['away_team'] = df['away_team'].replace(standardize_mapping)
df['home_team'] = df['home_team'].replace(standardize_mapping)
df.rename(columns={'gameday': 'date'}, inplace=True)
df = df[df['season'] != 1999]

# Standardize the 'game_id' column
df['game_id'] = df['game_id'].apply(lambda x: f"{x.split('_')[0]}_{x.split('_')[1]}_{standardize_mapping.get(x.split('_')[2], x.split('_')[2])}_{standardize_mapping.get(x.split('_')[3], x.split('_')[3])}")

# Convert datetime and create new columns
df['date'] = pd.to_datetime(df['date'])
df['week'] = df['week'].apply(lambda x: f'{x:02d}')
df['game_id_simple'] = df['season'].astype(str) + "_" + df['week']
df['game_id_team1'] = df['game_id_simple'] + "_" + df['home_team']
df['game_id_team2'] = df['game_id_simple'] + "_" + df['away_team']

# Select columns 
selected_columns = [
    'game_id', 'season', 'week', 'game_type', 'date', 'weekday', 'gametime', 
    'away_team', 'away_score', 'home_team', 'home_score', 'location', 'result',	'total', 'overtime', 
    'spread_line', 'total_line', 'away_rest', 'home_rest', 'roof', 'surface', 'temp', 'wind', 
    'away_qb_id', 'home_qb_id', 'away_qb_name', 'home_qb_name', 'away_coach', 'home_coach', 'referee',
    'stadium_id', 'stadium', 'game_id_simple', 'game_id_team1', 'game_id_team2', 'pfr'
]
df_selected = df[selected_columns]

# Save
db_path = 'nfl.db'
conn = sqlite3.connect(db_path)
df_selected.to_sql('Games', conn, if_exists='replace', index=False)
conn.close()
df_selected.to_csv('./data/games_modified.csv', index=False)

In [6]:
# # Delete all games before 2015 season

# # Path to your SQLite database
# db_path = 'nfl.db'  # Replace this with the correct path

# # Connect to the database
# conn = sqlite3.connect(db_path)
# cursor = conn.cursor()

# # SQL query to delete all games before 2015
# delete_query = """
# DELETE FROM Games
# WHERE season < 2015;
# """

# # Execute the query
# cursor.execute(delete_query)

# # Commit the changes
# conn.commit()
# conn.close()

In [None]:
### Old interceptions code
### Old sacks code

In [7]:
# Create 'Players' in nfl.db

dataframes = []

# for year in range(2015, 2025):
for year in range(2023, 2025):
    url = f"https://github.com/nflverse/nflverse-data/releases/download/player_stats/player_stats_{year}.csv"
    response = requests.get(url)
    if response.ok:
        file_path = os.path.join('./data/player-stats/', f"player_stats_{year}.csv")
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded and saved player_stats_{year}.csv")
        
        # Load and append the downloaded file to the list of DataFrames
        df = pd.read_csv(file_path)
        if 'opponent_team' in df.columns:
            df = df.drop(columns=['opponent_team'])
        dataframes.append(df)
    else:
        print(f"Failed to download data for the year {year}")

# Merge all player stats DataFrames
merged_df = pd.concat(dataframes, ignore_index=True, sort=False)

# Standardize team abbreviations and format weeks
# standardize_mapping = {'LA': 'LAR', 'LV': 'LVR'}
standardize_mapping = {
    'OAK': 'LVR',  # Oakland Raiders to Las Vegas Raiders
    'SD': 'LAC',   # San Diego Chargers to Los Angeles Chargers
    'STL': 'LAR',  # St. Louis Rams to Los Angeles Rams
    'LA': 'LAR',   # Los Angeles Rams
    'LV': 'LVR'    # Las Vegas Raiders
}
merged_df['recent_team'] = merged_df['recent_team'].replace(standardize_mapping)
merged_df['week'] = merged_df['week'].apply(lambda x: f'{x:02d}')

# Create game IDs
merged_df['game_id_team'] = merged_df['season'].astype(str) + '_' + merged_df['week'].astype(str) + '_' + merged_df['recent_team']
merged_df['game_id_simple'] = merged_df['season'].astype(str) + '_' + merged_df['week'].astype(str)

# Save
merged_df.to_csv('./data/player_stats.csv', index=False)
print("Merged and cleaned player stats saved to './data/player_stats.csv'")

# Merge with game data from games.csv
# games_df = pd.read_csv('./data/games.csv')
# game_id_map = pd.concat([
#     games_df[['game_id_team1', 'game_id', 'home_team', 'away_team']].rename(columns={'game_id_team1': 'game_id_team'}),
#     games_df[['game_id_team2', 'game_id', 'home_team', 'away_team']].rename(columns={'game_id_team2': 'game_id_team'})
# ]).drop_duplicates(subset=['game_id_team'])
# merged_df = merged_df.merge(game_id_map, on='game_id_team', how='left')
games_df = pd.read_csv('./data/games_modified.csv')
game_id_map = pd.concat([
    games_df[['game_id_team1', 'game_id', 'home_team', 'away_team']].rename(columns={'game_id_team1': 'game_id_team'}),
    games_df[['game_id_team2', 'game_id', 'home_team', 'away_team']].rename(columns={'game_id_team2': 'game_id_team'})
]).drop_duplicates(subset=['game_id_team'])
merged_df = merged_df.merge(game_id_map, on='game_id_team', how='left')


# Clean the DataFrame
position_groups_to_remove = ['SPEC', 'LB', 'DB', 'OL', 'DL']
df_cleaned = merged_df[~merged_df['position_group'].isin(position_groups_to_remove)].dropna(subset=['position_group'])

df_cleaned.to_csv('./data/player_stats.csv', index=False)
print("Final cleaned player stats saved to './data/player_stats.csv'")

# Save the cleaned player stats to SQLite database
conn = sqlite3.connect('nfl.db')
cursor = conn.cursor()

# SQL command to create the 'PlayerStats' table
# player_current_team TEXT,  -- Renamed column
create_table_sql = '''
CREATE TABLE IF NOT EXISTS PlayerStats (
    player_display_name TEXT,
    game_id TEXT,
    season INTEGER,
    week INTEGER,
    position TEXT,
    headshot_url TEXT,
    completions INTEGER,
    attempts INTEGER,
    passing_yards INTEGER,
    passing_tds INTEGER,
    interceptions INTEGER,
    sacks INTEGER,
    carries INTEGER,
    rushing_yards INTEGER,
    rushing_tds INTEGER,
    rushing_fumbles INTEGER,
    receptions INTEGER,
    targets INTEGER,
    receiving_yards INTEGER,
    receiving_tds INTEGER,
    receiving_fumbles INTEGER,
    fantasy_points_ppr REAL,
    home_team TEXT,
    away_team TEXT,
    player_current_team TEXT
);
'''

# Execute the SQL command
cursor.execute(create_table_sql)

# Load the cleaned data into a DataFrame
df = pd.read_csv('./data/player_stats.csv')

# Rename the 'recent_team' column to 'player_current_team'
df.rename(columns={'recent_team': 'player_current_team'}, inplace=True)

# Select only the relevant columns to import
columns_to_import = ['player_display_name', 'player_current_team', 'game_id', 'season', 'week', 
                     'position', 'headshot_url', 'completions', 'attempts', 'passing_yards', 
                     'passing_tds', 'interceptions', 'sacks', 'carries', 'rushing_yards', 
                     'rushing_tds', 'rushing_fumbles', 'receptions', 'targets', 'receiving_yards', 
                     'receiving_tds', 'receiving_fumbles', 'fantasy_points_ppr', 'home_team', 'away_team']

df_to_import = df[columns_to_import]

# Import data into the 'PlayerStats' table
df_to_import.to_sql('PlayerStats', conn, if_exists='replace', index=False)

# Close the database connection
conn.close()
print("Player stats saved to 'PlayerStats' table in nfl.db")

Downloaded and saved player_stats_2023.csv
Downloaded and saved player_stats_2024.csv
Merged and cleaned player stats saved to './data/player_stats.csv'
Final cleaned player stats saved to './data/player_stats.csv'
Player stats saved to 'PlayerStats' table in nfl.db


In [8]:
# Create 'Rosters' in nfl.db (2015-2025)

# Iterate through the years 2015 to 2025
# for year in range(2015, 2025):
for year in range(2023, 2025):
    # Construct the URL for the CSV file of the specific year
    url = f"https://github.com/nflverse/nflverse-data/releases/download/rosters/roster_{year}.csv"
    
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Open the file in write-binary mode and save the CSV
        with open(f"./data/rosters/roster_{year}.csv", 'wb') as file:
            file.write(response.content)
        print(f"Downloaded and saved roster_{year}.csv")
    else:
        print(f"Failed to download data for the year {year}")

        
# Combine roster data years
dataframes = []

for year in range(2023, 2025):
    file_path = f'./data/rosters/roster_{year}.csv'
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        dataframes.append(df)

merged_data = pd.concat(dataframes, ignore_index=True)

# Make pfr_id's
base_url = "https://www.pro-football-reference.com/players/"
merged_data['url'] = merged_data['pfr_id'].apply(lambda x: f"{base_url}{x[0]}/{x}.htm" if pd.notna(x) else None)

merged_data.to_csv('./data/rosters.csv', index=False)
print("Final file saved to ./data/rosters.csv")

conn = sqlite3.connect('nfl.db')
cursor = conn.cursor()
cursor.execute("DROP TABLE IF EXISTS rosters")

# SQL command to create the 'Rosters' table without any primary key
create_table_sql = '''
CREATE TABLE IF NOT EXISTS Rosters (
    season INTEGER,
    team TEXT,
    position TEXT,
    depth_chart_position TEXT,
    status TEXT,
    full_name TEXT,
    first_name TEXT,
    last_name TEXT,
    birth_date TEXT,
    height REAL,
    weight REAL,
    college TEXT,
    pfr_id TEXT,
    years_exp REAL,
    headshot_url TEXT,
    week INTEGER,
    game_type TEXT,
    entry_year REAL,
    rookie_year REAL,
    draft_club TEXT,
    draft_number REAL,
    url TEXT
);
'''

# Execute the SQL command to create the table
cursor.execute(create_table_sql)

# Load the CSV data into a DataFrame
df = pd.read_csv('data/rosters.csv')

# Insert the DataFrame data into the 'rosters' table
df.to_sql('Rosters', conn, if_exists='replace', index=False)

# Close the database connection
conn.close()

print("Rosters table created and data inserted successfully.")


Downloaded and saved roster_2023.csv
Downloaded and saved roster_2024.csv
Final file saved to ./data/rosters.csv
Rosters table created and data inserted successfully.


In [9]:
# Standardize Team Names in Rosters table

standardize_mapping = {
    'ARZ': 'ARI',  # Arizona Cardinals
    'BLT': 'BAL',  # Baltimore Ravens
    'CLV': 'CLE',  # Cleveland Browns
    'HST': 'HOU',  # Houston Texans
    'LA': 'LAR',   # Los Angeles Rams
    'LV': 'LVR',   # Las Vegas Raiders
    'OAK': 'LVR',  # Oakland Raiders to Las Vegas Raiders
    'SD': 'LAC',   # San Diego Chargers to Los Angeles Chargers
    'SL': 'LAR'    # St. Louis Rams to Los Angeles Rams
}

# Connect to the SQLite database
conn = sqlite3.connect('nfl.db')
cursor = conn.cursor()

# Load the Rosters data into a DataFrame
df = pd.read_sql_query("SELECT * FROM Rosters", conn)

# Standardize team abbreviations in the 'team' and 'draft_club' columns
df['team'] = df['team'].replace(standardize_mapping)
df['draft_club'] = df['draft_club'].replace(standardize_mapping)

# Save the updated DataFrame back to the database
df.to_sql('Rosters', conn, if_exists='replace', index=False)

# Close the database connection
conn.close()
print("Rosters table standardized and updated successfully.")

Rosters table standardized and updated successfully.


In [10]:
# Standardize Team Names in rosters.csv

file_path = 'data/rosters.csv'
rosters_df = pd.read_csv(file_path)

# Standardize mapping for team names
standardize_mapping = {
    'ARZ': 'ARI',  # Arizona Cardinals
    'BLT': 'BAL',  # Baltimore Ravens
    'CLV': 'CLE',  # Cleveland Browns
    'HST': 'HOU',  # Houston Texans
    'LA': 'LAR',   # Los Angeles Rams
    'LV': 'LVR',   # Las Vegas Raiders
    'OAK': 'LVR',  # Oakland Raiders to Las Vegas Raiders
    'SD': 'LAC',   # San Diego Chargers to Los Angeles Chargers
    'SL': 'LAR'    # St. Louis Rams to Los Angeles Rams
}

# Apply the standardization mapping to the 'team' column
rosters_df['team'] = rosters_df['team'].replace(standardize_mapping)

# Extract and print the unique standardized team names in a numbered list
standardized_teams = rosters_df['team'].unique()
standardized_team_list_sorted = sorted(list(standardized_teams))

# Print the standardized team names
for idx, team in enumerate(standardized_team_list_sorted, 1):
    print(f"{idx}. {team}")

# If you want to save the standardized data back to a CSV file
rosters_df.to_csv('data/rosters.csv', index=False)

1. ARI
2. ATL
3. BAL
4. BUF
5. CAR
6. CHI
7. CIN
8. CLE
9. DAL
10. DEN
11. DET
12. GB
13. HOU
14. IND
15. JAX
16. KC
17. LAC
18. LAR
19. LVR
20. MIA
21. MIN
22. NE
23. NO
24. NYG
25. NYJ
26. PHI
27. PIT
28. SEA
29. SF
30. TB
31. TEN
32. WAS


In [None]:
# Box Scores 2000-2025
# Not in nfl.db currently

In [2]:
# # # Box Scores 2015-2025
# # # Not in nfl.db currently

# # df = pd.read_csv('./data/games.csv')

# # # Create the 'pfr_url' column
# # df['pfr_url'] = 'https://www.pro-football-reference.com/boxscores/' + df['pfr'] + '.htm'

# # df.to_csv('./data/games.csv', index=False)

# # csv_file_path = 'data/box_scores.csv'
# # games_csv_path = 'data/games.csv'

# # # Define the headers (for reference)
# # headers = ['URL', 'Team', '1', '2', '3', '4', 'OT1', 'OT2', 'OT3', 'OT4', 'Final']

# # # Open the CSV file for writing
# # with open(csv_file_path, 'w', newline='') as csvfile:
# #     score_writer = csv.writer(csvfile)
# #     score_writer.writerow(headers)  # Write the headers to the CSV file

# #     # Loop through each year from 2022 to 2025
# #     for year_to_scrape in range(2015, 2025):
# #         # Read the URLs for the current season from 'games.csv'
# #         game_urls = []
# #         with open(games_csv_path, 'r') as csvfile:
# #             reader = csv.DictReader(csvfile)
# #             for row in reader:
# #                 if row['season'] == str(year_to_scrape):  # Filter for the current season
# #                     game_urls.append(row['pfr_url'])

# #         # Iterate over each URL and scrape data
# #         for url in game_urls:
# #             try:    
# #                 # Print the current game being scraped
# #                 print(f"Scraping game: {url}")
                
# #                 # Send a GET request to the URL
# #                 response = requests.get(url)
# #                 response.raise_for_status()

# #                 # Parse the content with BeautifulSoup
# #                 soup = BeautifulSoup(response.content, 'html.parser')

# #                 # Find the linescore table by its class
# #                 linescore_table = soup.find('table', class_='linescore')

# #                 if linescore_table:
# #                     # Find all rows in the linescore table, skip the header row
# #                     rows = linescore_table.find_all('tr')[1:]

# #                     # Extract and write the data from each row
# #                     for row in rows:
# #                         cols = row.find_all('td')
# #                         team_name = cols[1].text.strip()
# #                         scores = [col.text.strip() for col in cols[2:]]

# #                         # Pad the scores list to match the headers length
# #                         scores += [''] * (len(headers) - 2 - len(scores))

# #                         score_writer.writerow([url, team_name] + scores)

# #                 # Sleep for 3 seconds before the next request
# #                 time.sleep(2)

# #             except Exception as e:
# #                 print(f"Error scraping {url}: {e}")

# #             time.sleep(1)

# # print(f"Scraping complete. The data has been saved to {csv_file_path}.")

# # Box Scores 2015-2025
# df = pd.read_csv('./data/games.csv')

# # Create the 'pfr_url' column
# df['pfr_url'] = 'https://www.pro-football-reference.com/boxscores/' + df['pfr'] + '.htm'

# df.to_csv('./data/games.csv', index=False)

# csv_file_path = 'data/box_scores.csv'
# games_csv_path = 'data/games.csv'

# # Define the headers (for reference)
# headers = ['URL', 'Team', '1', '2', '3', '4', 'OT1', 'OT2', 'OT3', 'OT4', 'Final']

# # Load existing box scores to avoid duplicates
# existing_urls = set()
# if os.path.exists(csv_file_path):
#     with open(csv_file_path, 'r') as csvfile:
#         reader = csv.DictReader(csvfile)
#         for row in reader:
#             existing_urls.add(row['URL'])

# # Open the CSV file for writing (append mode if file exists)
# with open(csv_file_path, 'a', newline='') as csvfile:
#     score_writer = csv.writer(csvfile)

#     # Write headers only if the file is newly created
#     if os.path.getsize(csv_file_path) == 0:
#         score_writer.writerow(headers)  # Write the headers to the CSV file

#     # Loop through each year from 2015 to 2025
#     # for year_to_scrape in range(2015, 2025):
#     for year_to_scrape in range(2018, 2025):
#         # Read the URLs for the current season from 'games.csv'
#         game_urls = []
#         with open(games_csv_path, 'r') as csvfile:
#             reader = csv.DictReader(csvfile)
#             for row in reader:
#                 if row['season'] == str(year_to_scrape):  # Filter for the current season
#                     game_urls.append(row['pfr_url'])

#         # Iterate over each URL and scrape data
#         for url in game_urls:
#             if url in existing_urls:
#                 print(f"Skipping already scraped game: {url}")
#                 continue

#             try:    
#                 # Print the current game being scraped
#                 print(f"Scraping game: {url}")
                
#                 # Send a GET request to the URL
#                 response = requests.get(url)
#                 response.raise_for_status()

#                 # Parse the content with BeautifulSoup
#                 soup = BeautifulSoup(response.content, 'html.parser')

#                 # Find the linescore table by its class
#                 linescore_table = soup.find('table', class_='linescore')

#                 if linescore_table:
#                     # Find all rows in the linescore table, skip the header row
#                     rows = linescore_table.find_all('tr')[1:]

#                     # Extract and write the data from each row
#                     for row in rows:
#                         cols = row.find_all('td')
#                         team_name = cols[1].text.strip()
#                         scores = [col.text.strip() for col in cols[2:]]

#                         # Pad the scores list to match the headers length
#                         scores += [''] * (len(headers) - 2 - len(scores))

#                         score_writer.writerow([url, team_name] + scores)

#                 # Sleep for 3 seconds before the next request
#                 time.sleep(1.3)

#             except Exception as e:
#                 print(f"Error scraping {url}: {e}")

#             time.sleep(1)

# print(f"Scraping complete. The data has been saved to {csv_file_path}.")







# Box Scores 2015-2025 - Organized by Year (UPDATED VERSION)
df = pd.read_csv('./data/games.csv')
df['pfr_url'] = 'https://www.pro-football-reference.com/boxscores/' + df['pfr'] + '.htm'
df.to_csv('./data/games.csv', index=False)
games_csv_path = 'data/games.csv'
headers = ['URL', 'Team', '1', '2', '3', '4', 'OT1', 'OT2', 'OT3', 'OT4', 'Final']
box_scores_dir = 'data/SR-box-scores'
os.makedirs(box_scores_dir, exist_ok=True)
for year_to_scrape in range(2022, 2025):
    csv_file_path = f'{box_scores_dir}/all_box_scores_{year_to_scrape}.csv'
    existing_urls = set()
    if os.path.exists(csv_file_path):
        with open(csv_file_path, 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                existing_urls.add(row['URL'])
    game_urls = []
    with open(games_csv_path, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if row['season'] == str(year_to_scrape):
                game_urls.append(row['pfr_url'])
    with open(csv_file_path, 'a', newline='') as csvfile:
        score_writer = csv.writer(csvfile)
        if os.path.getsize(csv_file_path) == 0:
            score_writer.writerow(headers)
        for url in game_urls:
            if url in existing_urls:
                print(f"Skipping already scraped game: {url}")
                continue
            try:
                print(f"Scraping game: {url}")
                response = requests.get(url)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                linescore_table = soup.find('table', class_='linescore')
                if linescore_table:
                    rows = linescore_table.find_all('tr')[1:]
                    for row in rows:
                        cols = row.find_all('td')
                        team_name = cols[1].text.strip()
                        scores = [col.text.strip() for col in cols[2:]]
                        scores += [''] * (len(headers) - 2 - len(scores))
                        score_writer.writerow([url, team_name] + scores)
                time.sleep(1.3)
            except Exception as e:
                print(f"Error scraping {url}: {e}")
            time.sleep(1)
    print(f"Scraping complete for {year_to_scrape}. The data has been saved to {csv_file_path}.")
print("All box scores scraping complete!")


Scraping game: https://www.pro-football-reference.com/boxscores/202209080ram.htm
Scraping game: https://www.pro-football-reference.com/boxscores/202209110atl.htm
Scraping game: https://www.pro-football-reference.com/boxscores/202209110car.htm
Scraping game: https://www.pro-football-reference.com/boxscores/202209110chi.htm
Scraping game: https://www.pro-football-reference.com/boxscores/202209110cin.htm
Scraping game: https://www.pro-football-reference.com/boxscores/202209110det.htm
Scraping game: https://www.pro-football-reference.com/boxscores/202209110htx.htm
Scraping game: https://www.pro-football-reference.com/boxscores/202209110mia.htm
Scraping game: https://www.pro-football-reference.com/boxscores/202209110nyj.htm
Scraping game: https://www.pro-football-reference.com/boxscores/202209110was.htm
Scraping game: https://www.pro-football-reference.com/boxscores/202209110crd.htm
Scraping game: https://www.pro-football-reference.com/boxscores/202209110sdg.htm
Scraping game: https://www.p

KeyboardInterrupt: 

In [None]:
# Fix OT columns in box scores
# Not in nfl.db currently

df = pd.read_csv('data/box_scores.csv')

# Function to shift the furthest right value to the 'Final' column
# def shift_to_final(row):
#     for col in reversed(row.index[:-1]):
#         if pd.notna(row[col]):
#             row['Final'] = row[col]
#             row[col] = None
#             break
#     return row
def shift_to_final(row):
    if pd.isna(row['Final']):  # Only shift if the 'Final' column is empty
        for col in reversed(row.index[:-1]):
            if pd.notna(row[col]):
                row['Final'] = row[col]
                row[col] = None
                break
    return row


# Apply the function to each row
df = df.apply(shift_to_final, axis=1)

# Save the modified DataFrame to a new CSV file
df.to_csv('data/box_scores.csv', index=False)

In [None]:
# Add box scores to nfl.db
# Not in nfl.db currently
# Haven't tested

# box_scores_path = '/mnt/data/box_scores.csv'
# box_scores_df = pd.read_csv(box_scores_path)

# # Mapping of full team names to their abbreviations
# team_name_mapping = {
#     'Buffalo Bills': 'BUF',
#     'Denver Broncos': 'DEN',
#     'Houston Texans': 'HOU',
#     'New York Jets': 'NYJ',
#     'Washington Redskins': 'WAS',
#     'Pittsburgh Steelers': 'PIT',
#     'Washington Football Team': 'WAS',
#     'Minnesota Vikings': 'MIN',
#     'Seattle Seahawks': 'SEA',
#     'Los Angeles Chargers': 'LAC',
#     'Dallas Cowboys': 'DAL',
#     'St. Louis Rams': 'LAR',
#     'Cincinnati Bengals': 'CIN',
#     'Washington Commanders': 'WAS',
#     'Baltimore Ravens': 'BAL',
#     'Jacksonville Jaguars': 'JAX',
#     'Green Bay Packers': 'GB',
#     'Detroit Lions': 'DET',z
#     'Atlanta Falcons': 'ATL',
#     'Tampa Bay Buccaneers': 'TB',
#     'Arizona Cardinals': 'ARI',
#     'Las Vegas Raiders': 'LVR',
#     'San Francisco 49ers': 'SF',
#     'Cleveland Browns': 'CLE',
#     'San Diego Chargers': 'LAC',
#     'Kansas City Chiefs': 'KC',
#     'Tennessee Titans': 'TEN',
#     'Carolina Panthers': 'CAR',
#     'Chicago Bears': 'CHI',
#     'New England Patriots': 'NE',
#     'Philadelphia Eagles': 'PHI',
#     'Los Angeles Rams': 'LAR',
#     'New Orleans Saints': 'NO',
#     'Oakland Raiders': 'LVR',
#     'Miami Dolphins': 'MIA',
#     'New York Giants': 'NYG',
#     'Indianapolis Colts': 'IND'
# }

# # Apply the mapping to the 'Team' column in box_scores_df
# box_scores_df['Team'] = box_scores_df['Team'].map(team_name_mapping)

# # Verify that the mapping has been applied correctly
# cleaned_team_names = box_scores_df['Team'].unique().tolist()
# print("Cleaned team names:", cleaned_team_names)

# # Now, let's merge the cleaned box scores data into your SQLite database
# db_path = '/mnt/data/nfl_updated.db'

# # Connect to the SQLite database
# conn = sqlite3.connect(db_path)
# cursor = conn.cursor()

# # Create a new table for BoxScores if it doesn't exist
# cursor.execute('''
#     CREATE TABLE IF NOT EXISTS BoxScores (
#         URL TEXT,
#         Team TEXT,
#         Q1 INTEGER,
#         Q2 INTEGER,
#         Q3 INTEGER,
#         Q4 INTEGER,
#         OT1 INTEGER,
#         OT2 INTEGER,
#         OT3 INTEGER,
#         OT4 INTEGER,
#         Final INTEGER
#     );
# ''')

# # Insert cleaned data into the BoxScores table
# box_scores_df.to_sql('BoxScores', conn, if_exists='replace', index=False)

# # Commit the changes and close the connection
# conn.commit()
# conn.close()

# print("Box scores data has been successfully merged into the database.")

In [13]:
# Scoring Tables (touchdown logs)
# Not in nfl.db currently

# for year_to_scrape in range(2015, 2025):
for year_to_scrape in range(2023, 2025):
    # Initialize output CSV file with the year and "scoring_tables" in its name
    output_filename = f'./data/scoring-tables/all_nfl_scoring_tables_{year_to_scrape}.csv'
    with open(output_filename, 'w', newline='') as output_csvfile:
        csvwriter = csv.writer(output_csvfile)
        csvwriter.writerow(['Quarter', 'Time', 'Team', 'Detail', 'Team_1', 'Team_2', 'Game_ID'])  # Added 'Game_ID'

        # Read the CSV file containing the game data
        with open('./data/games.csv', 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            rows = [row for row in reader if int(row['game_id'].split('_')[0]) == year_to_scrape]  # Filter rows for the year

            for row in rows:
                pfr_value = row['pfr']
                game_id = row['game_id']

                # Form the URL using the 'pfr' value
                url = f"https://www.pro-football-reference.com/boxscores/{pfr_value}.htm"

                try:
                    # Fetch the webpage
                    response = requests.get(url)
                    soup = BeautifulSoup(response.text, 'html.parser')

                    # Find the table containing box score data using its class
                    table = soup.find('table', {'id': 'scoring'})

                    last_quarter = None  # To keep track of the last available quarter value

                    # Loop through the rows to get the scoring data
                    for i, row in enumerate(table.find_all('tr')):
                        if i == 0:  # Skip the first header row
                            continue
                        cells = row.find_all(['td', 'th'])
                        if len(cells) > 0:
                            csv_row = [cell.text for cell in cells]

                            # Fill in missing quarter values
                            if csv_row[0]:
                                last_quarter = csv_row[0]
                            else:
                                csv_row[0] = last_quarter

                            csv_row.append(game_id)  # Append 'game_id' to each row
                            csvwriter.writerow(csv_row)

                    print(f"Successfully scraped scoring data for game ID: {game_id}, PFR: {pfr_value}")

                except Exception as e:
                    print(f"An error occurred while scraping {url}. Error: {e}")

                # Sleep for 3 seconds to avoid overloading the server
                time.sleep(2)

    print(f"Scraping completed for {year_to_scrape}. Scoring data saved to {output_filename}.")


Successfully scraped scoring data for game ID: 2023_01_DET_KC, PFR: 202309070kan
Successfully scraped scoring data for game ID: 2023_01_CAR_ATL, PFR: 202309100atl
Successfully scraped scoring data for game ID: 2023_01_HOU_BAL, PFR: 202309100rav
Successfully scraped scoring data for game ID: 2023_01_CIN_CLE, PFR: 202309100cle
Successfully scraped scoring data for game ID: 2023_01_JAX_IND, PFR: 202309100clt
Successfully scraped scoring data for game ID: 2023_01_TB_MIN, PFR: 202309100min
Successfully scraped scoring data for game ID: 2023_01_TEN_NO, PFR: 202309100nor
Successfully scraped scoring data for game ID: 2023_01_SF_PIT, PFR: 202309100pit
Successfully scraped scoring data for game ID: 2023_01_ARI_WAS, PFR: 202309100was
Successfully scraped scoring data for game ID: 2023_01_GB_CHI, PFR: 202309100chi
Successfully scraped scoring data for game ID: 2023_01_LV_DEN, PFR: 202309100den
Successfully scraped scoring data for game ID: 2023_01_MIA_LAC, PFR: 202309100sdg
Successfully scraped s

Successfully scraped scoring data for game ID: 2023_07_ARI_SEA, PFR: 202310220sea
Successfully scraped scoring data for game ID: 2023_07_GB_DEN, PFR: 202310220den
Successfully scraped scoring data for game ID: 2023_07_LAC_KC, PFR: 202310220kan
Successfully scraped scoring data for game ID: 2023_07_MIA_PHI, PFR: 202310220phi
Successfully scraped scoring data for game ID: 2023_07_SF_MIN, PFR: 202310230min
Successfully scraped scoring data for game ID: 2023_08_TB_BUF, PFR: 202310260buf
Successfully scraped scoring data for game ID: 2023_08_HOU_CAR, PFR: 202310290car
Successfully scraped scoring data for game ID: 2023_08_LA_DAL, PFR: 202310290dal
Successfully scraped scoring data for game ID: 2023_08_MIN_GB, PFR: 202310290gnb
Successfully scraped scoring data for game ID: 2023_08_NO_IND, PFR: 202310290clt
Successfully scraped scoring data for game ID: 2023_08_NE_MIA, PFR: 202310290mia
Successfully scraped scoring data for game ID: 2023_08_NYJ_NYG, PFR: 202310290nyg
Successfully scraped sco

Successfully scraped scoring data for game ID: 2023_14_SEA_SF, PFR: 202312100sfo
Successfully scraped scoring data for game ID: 2023_14_BUF_KC, PFR: 202312100kan
Successfully scraped scoring data for game ID: 2023_14_DEN_LAC, PFR: 202312100sdg
Successfully scraped scoring data for game ID: 2023_14_PHI_DAL, PFR: 202312100dal
Successfully scraped scoring data for game ID: 2023_14_TEN_MIA, PFR: 202312110mia
Successfully scraped scoring data for game ID: 2023_14_GB_NYG, PFR: 202312110nyg
Successfully scraped scoring data for game ID: 2023_15_LAC_LV, PFR: 202312140rai
Successfully scraped scoring data for game ID: 2023_15_MIN_CIN, PFR: 202312160cin
Successfully scraped scoring data for game ID: 2023_15_PIT_IND, PFR: 202312160clt
Successfully scraped scoring data for game ID: 2023_15_DEN_DET, PFR: 202312160det
Successfully scraped scoring data for game ID: 2023_15_ATL_CAR, PFR: 202312170car
Successfully scraped scoring data for game ID: 2023_15_CHI_CLE, PFR: 202312170cle
Successfully scraped

Successfully scraped scoring data for game ID: 2024_02_LV_BAL, PFR: 202409150rav
Successfully scraped scoring data for game ID: 2024_02_LAC_CAR, PFR: 202409150car
Successfully scraped scoring data for game ID: 2024_02_NO_DAL, PFR: 202409150dal
Successfully scraped scoring data for game ID: 2024_02_TB_DET, PFR: 202409150det
Successfully scraped scoring data for game ID: 2024_02_IND_GB, PFR: 202409150gnb
Successfully scraped scoring data for game ID: 2024_02_CLE_JAX, PFR: 202409150jax
Successfully scraped scoring data for game ID: 2024_02_SF_MIN, PFR: 202409150min
Successfully scraped scoring data for game ID: 2024_02_SEA_NE, PFR: 202409150nwe
Successfully scraped scoring data for game ID: 2024_02_NYJ_TEN, PFR: 202409150oti
Successfully scraped scoring data for game ID: 2024_02_NYG_WAS, PFR: 202409150was
Successfully scraped scoring data for game ID: 2024_02_LA_ARI, PFR: 202409150crd
Successfully scraped scoring data for game ID: 2024_02_PIT_DEN, PFR: 202409150den
Successfully scraped sc

Successfully scraped scoring data for game ID: 2024_08_CAR_DEN, PFR: 202410270den
Successfully scraped scoring data for game ID: 2024_08_KC_LV, PFR: 202410270rai
Successfully scraped scoring data for game ID: 2024_08_CHI_WAS, PFR: 202410270was
Successfully scraped scoring data for game ID: 2024_08_DAL_SF, PFR: 202410270sfo
Successfully scraped scoring data for game ID: 2024_08_NYG_PIT, PFR: 202410280pit
Successfully scraped scoring data for game ID: 2024_09_HOU_NYJ, PFR: 202410310nyj
Successfully scraped scoring data for game ID: 2024_09_DAL_ATL, PFR: 202411030atl
Successfully scraped scoring data for game ID: 2024_09_DEN_BAL, PFR: 202411030rav
Successfully scraped scoring data for game ID: 2024_09_MIA_BUF, PFR: 202411030buf
Successfully scraped scoring data for game ID: 2024_09_NO_CAR, PFR: 202411030car
Successfully scraped scoring data for game ID: 2024_09_LV_CIN, PFR: 202411030cin
Successfully scraped scoring data for game ID: 2024_09_LAC_CLE, PFR: 202411030cle
Successfully scraped 

Successfully scraped scoring data for game ID: 2024_15_TB_LAC, PFR: 202412150sdg
Successfully scraped scoring data for game ID: 2024_15_PIT_PHI, PFR: 202412150phi
Successfully scraped scoring data for game ID: 2024_15_GB_SEA, PFR: 202412150sea
Successfully scraped scoring data for game ID: 2024_15_CHI_MIN, PFR: 202412160min
Successfully scraped scoring data for game ID: 2024_15_ATL_LV, PFR: 202412160rai
Successfully scraped scoring data for game ID: 2024_16_DEN_LAC, PFR: 202412190sdg
Successfully scraped scoring data for game ID: 2024_16_HOU_KC, PFR: 202412210kan
Successfully scraped scoring data for game ID: 2024_16_PIT_BAL, PFR: 202412210rav
Successfully scraped scoring data for game ID: 2024_16_NYG_ATL, PFR: 202412220atl
Successfully scraped scoring data for game ID: 2024_16_ARI_CAR, PFR: 202412220car
Successfully scraped scoring data for game ID: 2024_16_DET_CHI, PFR: 202412220chi
Successfully scraped scoring data for game ID: 2024_16_CLE_CIN, PFR: 202412220cin
Successfully scraped

In [None]:
# # Add scoring tables (touchdown logs) to nfl.db
# # Not in nfl.db currently
# # Haven't tested

# touchdown_logs_path = '/mnt/data/touchdown_logs.csv'
# touchdown_logs_df = pd.read_csv(touchdown_logs_path)

# # Updated mapping of full team names to their abbreviations
# updated_team_name_mapping = {
#     'Patriots': 'NE',
#     'Steelers': 'PIT',
#     'Bills': 'BUF',
#     'Colts': 'IND',
#     'Bears': 'CHI',
#     'Packers': 'GB',
#     'Chiefs': 'KC',
#     'Texans': 'HOU',
#     'Panthers': 'CAR',
#     'Jaguars': 'JAX',
#     'Browns': 'CLE',
#     'Jets': 'NYJ',
#     'Seahawks': 'SEA',
#     'Rams': 'LAR',
#     'Redskins': 'WAS',
#     'Dolphins': 'MIA',
#     'Cardinals': 'ARI',
#     'Saints': 'NO',
#     'Lions': 'DET',
#     'Chargers': 'LAC',
#     'Broncos': 'DEN',
#     'Ravens': 'BAL',
#     'Bengals': 'CIN',
#     'Raiders': 'LVR',
#     'Titans': 'TEN',
#     'Buccaneers': 'TB',
#     'Cowboys': 'DAL',
#     'Giants': 'NYG',
#     'Falcons': 'ATL',
#     'Eagles': 'PHI',
#     '49ers': 'SF',
#     'Vikings': 'MIN',
#     'Washington': 'WAS',
#     'Football Team': 'WAS',
#     'Commanders': 'WAS'
# }

# # Apply the updated mapping to the 'Team' column in touchdown_logs_df
# touchdown_logs_df['Team'] = touchdown_logs_df['Team'].map(updated_team_name_mapping)

# # Connect to the SQLite database
# db_path = '/mnt/data/nfl_updated.db'
# conn = sqlite3.connect(db_path)
# cursor = conn.cursor()

# # Create a new table for TouchdownLogs if it doesn't exist
# cursor.execute('''
#     CREATE TABLE IF NOT EXISTS TouchdownLogs (
#         Quarter INTEGER,
#         Time TEXT,
#         Team TEXT,
#         Detail TEXT,
#         Team_1 INTEGER,
#         Team_2 INTEGER,
#         Game_ID TEXT
#     );
# ''')

# # Insert cleaned data into the TouchdownLogs table, replacing any existing data
# touchdown_logs_df.to_sql('TouchdownLogs', conn, if_exists='replace', index=False)

# # Commit the changes and close the connection
# conn.commit()
# conn.close()

# print("Touchdown logs data has been successfully cleaned and merged into the database.")


In [None]:
# --- Creating home_spread and away_spread columns in nfl.db --- #

In [None]:
# # QB game tables
# # Not in nfl.db currently

# df = pd.read_csv('./data/rosters.csv')

# # Drop rows where 'pfr_id' is missing
# df = df.dropna(subset=['pfr_id'])

# # Filter for Quarterbacks
# qbs = df[df['position'] == 'QB']

# # Open a CSV file to write the data
# with open('./data/game_logs_qb.csv', 'w', newline='') as file:
#     writer = csv.writer(file)
#     headers_written = False  # To track if headers have been written to the file

#     # Initialize a counter for progress tracking
#     total_qbs = len(qbs)
#     qb_counter = 0

#     # Iterate over each quarterback and scrape data
#     for index, qb in qbs.iterrows():
#         qb_counter += 1
#         url = qb['url']
#         print(f"Processing QB {qb_counter}/{total_qbs}: {qb['first_name']} {qb['last_name']}")
#         first_name = qb['first_name']  # Get the player's first name
#         last_name = qb['last_name']    # Get the player's last name
#         position = 'QB'  # Assuming position is always QB as per your filter

#         response = requests.get(url)
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.content, 'html.parser')
#             game_logs_table = soup.find('table', {'id': 'passing'})  # Adjust this ID if needed

#             if game_logs_table:  # Check if the table is found
#                 header_row = game_logs_table.find('thead').find_all('tr')[-1]
#                 data_rows = game_logs_table.find('tbody').find_all('tr')

#                 if not headers_written:  # Write headers only once
#                     headers = [header.text.strip() for header in header_row.find_all('th')]
#                     # headers.extend(['Player URL', 'Position', 'First Name', 'Last Name'])  # Add new headers
#                     # headers.extend(['Awards', 'Player URL', 'Position', 'First Name', 'Last Name'])  # Add new headers
#                     headers.append('Player URL')  # Add only the Player URL column at the very end
#                     writer.writerow(headers)
#                     headers_written = True

#                 for row in data_rows:
#                     cells = row.find_all(['th', 'td'])
#                     data = [cell.text.strip() for cell in cells]

#                     # # Extract special characters for the 'Awards' column
#                     # awards = ''
#                     # if '*' in data[0]:
#                     #     awards += 'Pro Bowl '
#                     # if '+' in data[0]:
#                     #     awards += 'All-Pro '
#                     # awards = awards.strip()
#                     # # Clean special characters from the 'Year' column (first column)
#                     # data[0] = data[0].replace('*', '').replace('+', '')
#                     # # Add the awards to the data
#                     # data.append(awards)

#                     # data.extend([url, position, first_name, last_name])  # Append additional data
#                     data.append(url)  # Append only the Player URL to the end of the data
#                     writer.writerow(data)
#                 print(f"Data written for {first_name} {last_name}")

#             else:
#                 print(f"No game logs table found for URL: {url}")
#         else:
#             print(f"Failed to retrieve URL: {url} with status code: {response.status_code}")

#         print(f'Processed URL: {url}')  # Print the URL being processed
#         time.sleep(2)  # Add a 3-second delay after processing each URL

# print('Data saved to game_logs_qb.csv')


In [14]:
# # QB game tables
# # Not in nfl.db currently

# df = pd.read_csv('./data/rosters.csv')

# # Drop rows where 'pfr_id' is missing
# df = df.dropna(subset=['pfr_id'])

# # Filter for Quarterbacks
# qbs = df[df['position'] == 'QB']

# # Hardcode the headers
headers = [
    'Player URL', 'Position', 'First Name', 'Last Name', 'Year', 'Age', 'Tm', 'Pos', 'No.', 'G', 'GS', 'QBrec', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 
    'TD%', 'Int', 'Int%', '1D', 'Succ%', 'Lng', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'QBR', 
    'Sk', 'Yds', 'Sk%', 'NY/A', 'ANY/A', '4QC', 'GWD', 'AV', 'Awards'
]

# # Open a CSV file to write the data
# with open('./data/game_logs_qb.csv', 'w', newline='') as file:
#     writer = csv.writer(file)
#     writer.writerow(headers)  # Write the hardcoded headers

#     # Initialize a counter for progress tracking
#     total_qbs = len(qbs)
#     qb_counter = 0

#     # Iterate over each quarterback and scrape data
#     for index, qb in qbs.iterrows():
#         qb_counter += 1
#         url = qb['url']
#         print(f"Processing QB {qb_counter}/{total_qbs}: {qb['first_name']} {qb['last_name']}")
#         first_name = qb['first_name']  # Get the player's first name
#         last_name = qb['last_name']    # Get the player's last name
#         position = 'QB'  # Assuming position is always QB as per your filter

#         response = requests.get(url)
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.content, 'html.parser')
#             game_logs_table = soup.find('table', {'id': 'passing'})  # Adjust this ID if needed

#             if game_logs_table:  # Check if the table is found
#                 data_rows = game_logs_table.find('tbody').find_all('tr')

#                 for row in data_rows:
#                     cells = row.find_all(['th', 'td'])
#                     data = [cell.text.strip() for cell in cells]
#                     data = [url, position, first_name, last_name] + data
#                     writer.writerow(data)

                
#                 print(f"Data written for {first_name} {last_name}")

#             else:
#                 print(f"No game logs table found for URL: {url}")
#         else:
#             print(f"Failed to retrieve URL: {url} with status code: {response.status_code}")

#         print(f'Processed URL: {url}')  # Print the URL being processed
#         time.sleep(2)  # Add a 3-second delay after processing each URL

# print('Data saved to game_logs_qb.csv')
import time
import csv
import requests
from bs4 import BeautifulSoup
from requests.exceptions import Timeout, RequestException
import pandas as pd

df = pd.read_csv('./data/rosters.csv')

# Drop rows where 'pfr_id' is missing
df = df.dropna(subset=['pfr_id'])

# Filter for Quarterbacks
qbs = df[df['position'] == 'QB']


# Load existing QB data from CSV
try:
    existing_qb_data = pd.read_csv('./data/game_logs_qb.csv')
    # Create a set of (Player URL, Year) tuples to check for duplicates
    existing_url_years = set(zip(existing_qb_data['Player URL'], existing_qb_data['Year']))
except FileNotFoundError:
    # If the file doesn't exist, initialize an empty set
    existing_qb_data = pd.DataFrame(columns=headers)
    existing_url_years = set()

# Open the CSV file in append mode
with open('./data/game_logs_qb.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    
    # If the file is empty, write the headers first
    if file.tell() == 0:
        writer.writerow(headers)

    # Initialize a counter for progress tracking
    total_qbs = len(qbs)
    qb_counter = 0
    max_retries = 3  # Set maximum number of retries

    # Iterate over each quarterback and scrape data
    for index, qb in qbs.iterrows():
        qb_counter += 1
        url = qb['url']
        first_name = qb['first_name']
        last_name = qb['last_name']
        position = 'QB'

        print(f"Processing QB {qb_counter}/{total_qbs}: {first_name} {last_name}")

        retries = 0  # Retry counter
        success = False  # Flag for successful request

        while retries < max_retries and not success:
            try:
                response = requests.get(url, timeout=10)  # Add timeout to avoid long waiting periods
                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    game_logs_table = soup.find('table', {'id': 'passing'})  # Adjust this ID if needed

                    if game_logs_table:  # Check if the table is found
                        data_rows = game_logs_table.find('tbody').find_all('tr')

                        for row in data_rows:
                            cells = row.find_all(['th', 'td'])
                            data = [cell.text.strip() for cell in cells]

                            # Extract 'Year' from the data (assumes year is in the correct column)
                            year = data[0]  # Adjust index if Year is in another column

                            # Check if the (url, year) combination already exists
                            if (url, year) in existing_url_years:
                                print(f"Data for {first_name} {last_name} in year {year} already exists. Skipping...")
                                continue

                            # If unique, append data with player information
                            data = [url, position, first_name, last_name] + data
                            writer.writerow(data)
                            existing_url_years.add((url, year))  # Add the new combination to the set

                        print(f"Data written for {first_name} {last_name}")
                        success = True  # Mark as successful
                    else:
                        print(f"No game logs table found for URL: {url}")
                        success = True  # No need to retry if there's no table
                else:
                    print(f"Failed to retrieve URL: {url} with status code: {response.status_code}")
                    retries += 1

            except (Timeout, RequestException) as e:
                retries += 1
                print(f"Error processing {url}: {e}. Retrying ({retries}/{max_retries})...")
                time.sleep(2 ** retries)  # Exponential backoff

        # If not successful after all retries, skip the QB
        if not success:
            print(f"Skipping {first_name} {last_name} after {max_retries} failed attempts.")

        print(f'Processed URL: {url}')
        time.sleep(2.25)  # Add a delay after processing each URL

print('Data saved to game_logs_qb.csv')


Processing QB 1/208: Aaron Rodgers
Data written for Aaron Rodgers
Processed URL: https://www.pro-football-reference.com/players/R/RodgAa00.htm
Processing QB 2/208: Joseph Flacco
Data written for Joseph Flacco
Processed URL: https://www.pro-football-reference.com/players/F/FlacJo00.htm
Processing QB 3/208: Joshua Johnson
Data for Joshua Johnson in year 2021 already exists. Skipping...
Data for Joshua Johnson in year 2021 already exists. Skipping...
Data for Joshua Johnson in year  already exists. Skipping...
Data written for Joshua Johnson
Processed URL: https://www.pro-football-reference.com/players/J/JohnJo05.htm
Processing QB 4/208: John Stafford
Data written for John Stafford
Processed URL: https://www.pro-football-reference.com/players/S/StafMa00.htm
Processing QB 5/208: Daniel McCoy
Data written for Daniel McCoy
Processed URL: https://www.pro-football-reference.com/players/M/McCoCo00.htm
Processing QB 6/208: Blaine Gabbert
Data written for Blaine Gabbert
Processed URL: https://www

Processing QB 52/208: Will Grier
Data written for Will Grier
Processed URL: https://www.pro-football-reference.com/players/G/GrieWi00.htm
Processing QB 53/208: Jarrett Stidham
Data written for Jarrett Stidham
Processed URL: https://www.pro-football-reference.com/players/S/StidJa00.htm
Processing QB 54/208: Easton Stick
Data written for Easton Stick
Processed URL: https://www.pro-football-reference.com/players/S/SticEa00.htm
Processing QB 55/208: Gardner Minshew
Data written for Gardner Minshew
Processed URL: https://www.pro-football-reference.com/players/M/MinsGa00.htm
Processing QB 56/208: Drew Lock
Data written for Drew Lock
Processed URL: https://www.pro-football-reference.com/players/L/LockDr00.htm
Processing QB 57/208: Daniel Jones
Data written for Daniel Jones
Processed URL: https://www.pro-football-reference.com/players/J/JoneDa05.htm
Processing QB 58/208: Tyler Huntley
Data written for Tyler Huntley
Processed URL: https://www.pro-football-reference.com/players/H/HuntTy01.htm
Pr

Processing QB 106/208: Matthew Ryan
Data written for Matthew Ryan
Processed URL: https://www.pro-football-reference.com/players/R/RyanMa00.htm
Processing QB 107/208: Joseph Flacco
Data for Joseph Flacco in year 2008 already exists. Skipping...
Data for Joseph Flacco in year 2009 already exists. Skipping...
Data for Joseph Flacco in year 2010 already exists. Skipping...
Data for Joseph Flacco in year 2011 already exists. Skipping...
Data for Joseph Flacco in year 2012 already exists. Skipping...
Data for Joseph Flacco in year 2013 already exists. Skipping...
Data for Joseph Flacco in year 2014 already exists. Skipping...
Data for Joseph Flacco in year 2015 already exists. Skipping...
Data for Joseph Flacco in year 2016 already exists. Skipping...
Data for Joseph Flacco in year 2017 already exists. Skipping...
Data for Joseph Flacco in year 2018 already exists. Skipping...
Data for Joseph Flacco in year 2019 already exists. Skipping...
Data for Joseph Flacco in year 2020 already exists. 

Processing QB 115/208: Theodore Bridgewater
Data for Theodore Bridgewater in year 2014 already exists. Skipping...
Data for Theodore Bridgewater in year 2015 already exists. Skipping...
Data for Theodore Bridgewater in year  already exists. Skipping...
Data for Theodore Bridgewater in year 2017 already exists. Skipping...
Data for Theodore Bridgewater in year 2018 already exists. Skipping...
Data for Theodore Bridgewater in year 2019 already exists. Skipping...
Data for Theodore Bridgewater in year 2020 already exists. Skipping...
Data for Theodore Bridgewater in year 2021 already exists. Skipping...
Data for Theodore Bridgewater in year 2022 already exists. Skipping...
Data for Theodore Bridgewater in year 2023 already exists. Skipping...
Data for Theodore Bridgewater in year 2024 already exists. Skipping...
Data written for Theodore Bridgewater
Processed URL: https://www.pro-football-reference.com/players/B/BridTe00.htm
Processing QB 116/208: Derek Carr
Data for Derek Carr in year 20

Processing QB 127/208: Jared Goff
Data for Jared Goff in year 2016 already exists. Skipping...
Data for Jared Goff in year 2017 already exists. Skipping...
Data for Jared Goff in year 2018 already exists. Skipping...
Data for Jared Goff in year 2019 already exists. Skipping...
Data for Jared Goff in year 2020 already exists. Skipping...
Data for Jared Goff in year 2021 already exists. Skipping...
Data for Jared Goff in year 2022 already exists. Skipping...
Data for Jared Goff in year 2023 already exists. Skipping...
Data for Jared Goff in year 2024 already exists. Skipping...
Data written for Jared Goff
Processed URL: https://www.pro-football-reference.com/players/G/GoffJa00.htm
Processing QB 128/208: Jacoby Brissett
Data for Jacoby Brissett in year 2016 already exists. Skipping...
Data for Jacoby Brissett in year 2017 already exists. Skipping...
Data for Jacoby Brissett in year 2018 already exists. Skipping...
Data for Jacoby Brissett in year 2019 already exists. Skipping...
Data for 

Processing QB 141/208: Mason Rudolph
Data for Mason Rudolph in year 2019 already exists. Skipping...
Data for Mason Rudolph in year 2020 already exists. Skipping...
Data for Mason Rudolph in year 2021 already exists. Skipping...
Data for Mason Rudolph in year  already exists. Skipping...
Data for Mason Rudolph in year 2023 already exists. Skipping...
Data for Mason Rudolph in year 2024 already exists. Skipping...
Data written for Mason Rudolph
Processed URL: https://www.pro-football-reference.com/players/R/RudoMa00.htm
Processing QB 142/208: Lamar Jackson
Data for Lamar Jackson in year 2018 already exists. Skipping...
Data for Lamar Jackson in year 2019 already exists. Skipping...
Data for Lamar Jackson in year 2020 already exists. Skipping...
Data for Lamar Jackson in year 2021 already exists. Skipping...
Data for Lamar Jackson in year 2022 already exists. Skipping...
Data for Lamar Jackson in year 2023 already exists. Skipping...
Data for Lamar Jackson in year 2024 already exists. Sk

Processing QB 159/208: Justin Herbert
Data for Justin Herbert in year 2020 already exists. Skipping...
Data for Justin Herbert in year 2021 already exists. Skipping...
Data for Justin Herbert in year 2022 already exists. Skipping...
Data for Justin Herbert in year 2023 already exists. Skipping...
Data for Justin Herbert in year 2024 already exists. Skipping...
Data written for Justin Herbert
Processed URL: https://www.pro-football-reference.com/players/H/HerbJu00.htm
Processing QB 160/208: Benjamin DiNucci
Data for Benjamin DiNucci in year 2020 already exists. Skipping...
Data written for Benjamin DiNucci
Processed URL: https://www.pro-football-reference.com/players/D/DiNuBe00.htm
Processing QB 161/208: Jalen Hurts
Data for Jalen Hurts in year 2020 already exists. Skipping...
Data for Jalen Hurts in year 2021 already exists. Skipping...
Data for Jalen Hurts in year 2022 already exists. Skipping...
Data for Jalen Hurts in year 2023 already exists. Skipping...
Data for Jalen Hurts in yea

Processing QB 185/208: Tommy DeVito
Data for Tommy DeVito in year 2023 already exists. Skipping...
Data for Tommy DeVito in year 2024 already exists. Skipping...
Data written for Tommy DeVito
Processed URL: https://www.pro-football-reference.com/players/D/DeViTo00.htm
Processing QB 186/208: Hendon Hooker
Data for Hendon Hooker in year 2024 already exists. Skipping...
Data written for Hendon Hooker
Processed URL: https://www.pro-football-reference.com/players/H/HookHe00.htm
Processing QB 187/208: Aidan O'Connell
Data for Aidan O'Connell in year 2023 already exists. Skipping...
Data for Aidan O'Connell in year 2024 already exists. Skipping...
Data written for Aidan O'Connell
Processed URL: https://www.pro-football-reference.com/players/O/OConAi00.htm
Processing QB 188/208: Clayton Tune
Data for Clayton Tune in year 2023 already exists. Skipping...
Data for Clayton Tune in year 2024 already exists. Skipping...
Data written for Clayton Tune
Processed URL: https://www.pro-football-reference

In [16]:
# # TE game tables
# # Not in nfl.db currently

# df = pd.read_csv('./data/rosters.csv')

# # Drop rows where 'pfr_id' is missing
# df = df.dropna(subset=['pfr_id'])

# # Filter for Tight Ends
# tes = df[df['position'].str.lower() == 'te']

# # Open a CSV file to write the data for tight ends
# with open('./data/game_logs_te.csv', 'w', newline='') as file:
#     writer = csv.writer(file)
#     headers_written = False  # To track if headers have been written to the file

#     # Initialize a counter for progress tracking
#     total_tes = len(tes)
#     te_counter = 0

#     # Iterate over each tight end and scrape data
#     for index, te in tes.iterrows():
#         te_counter += 1
#         print(f"Processing TE {te_counter}/{total_tes}: {te['first_name']} {te['last_name']}")

#         url = te['url']
#         first_name = te['first_name']  # Get the player's first name
#         last_name = te['last_name']    # Get the player's last name
#         position = 'TE'  # Set the position to 'TE'

#         response = requests.get(url)
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.content, 'html.parser')
#             game_logs_table = soup.find('table', {'id': 'receiving_and_rushing'})

#             if game_logs_table:  # Check if the table is found
#                 header_row = game_logs_table.find('thead').find_all('tr')[-1]
#                 data_rows = game_logs_table.find('tbody').find_all('tr')

#                 if not headers_written:  # Write headers only once
#                     headers = ['Player URL', 'Position', 'First Name', 'Last Name']  # Add new headers at the beginning
#                     headers.extend(header.text.strip() for header in header_row.find_all('th'))  # Add existing headers
#                     writer.writerow(headers)
#                     headers_written = True

#                 for row in data_rows:
#                     cells = row.find_all(['th', 'td'])
#                     data = [url, position, first_name, last_name]  # Start with additional data
#                     data.extend(cell.text.strip() for cell in cells)  # Append scraped data
#                     writer.writerow(data)
#                 print(f"Data written for {first_name} {last_name}")

#             else:
#                 print(f"No game logs table found for URL: {url}")
#         else:
#             print(f"Failed to retrieve URL: {url} with status code: {response.status_code}")

#         # print(f'Processed URL: {url}')  # Print the URL being processed
#         time.sleep(2)  # Add a 3-second delay after processing each URL

# print('Data saved to game_logs_te.csv')
import pandas as pd
import csv
import requests
from bs4 import BeautifulSoup
import time
import os

# TE game tables
# Not in nfl.db currently

df = pd.read_csv('./data/rosters.csv')

# Drop rows where 'pfr_id' is missing
df = df.dropna(subset=['pfr_id'])

# Filter for Tight Ends
tes = df[df['position'].str.lower() == 'te']

# Initialize processed players list
processed_urls = set()

# Check if the CSV file already exists to resume processing
if os.path.exists('./data/game_logs_te.csv'):
    with open('./data/game_logs_te.csv', 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the headers
        for row in reader:
            processed_urls.add(row[0])  # Assuming URL is the first column

# Open the CSV file in append mode to add new data
with open('./data/game_logs_te.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    headers_written = os.path.getsize('./data/game_logs_te.csv') > 0  # Check if headers are already written

    # Initialize a counter for progress tracking
    total_tes = len(tes)
    te_counter = 0

    # Iterate over each tight end and scrape data
    for index, te in tes.iterrows():
        url = te['url']
        
        # Skip if this URL has already been processed
        if url in processed_urls:
            print(f"Skipping already processed TE: {te['first_name']} {te['last_name']}")
            continue

        te_counter += 1
        print(f"Processing TE {te_counter}/{total_tes}: {te['first_name']} {te['last_name']}")

        first_name = te['first_name']  # Get the player's first name
        last_name = te['last_name']    # Get the player's last name
        position = 'TE'  # Set the position to 'TE'

        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            game_logs_table = soup.find('table', {'id': 'receiving_and_rushing'})

            if game_logs_table:  # Check if the table is found
                header_row = game_logs_table.find('thead').find_all('tr')[-1]
                data_rows = game_logs_table.find('tbody').find_all('tr')

                if not headers_written:  # Write headers only once
                    headers = ['Player URL', 'Position', 'First Name', 'Last Name']  # Add new headers at the beginning
                    headers.extend(header.text.strip() for header in header_row.find_all('th'))  # Add existing headers
                    writer.writerow(headers)
                    headers_written = True

                for row in data_rows:
                    cells = row.find_all(['th', 'td'])
                    data = [url, position, first_name, last_name]  # Start with additional data
                    data.extend(cell.text.strip() for cell in cells)  # Append scraped data
                    writer.writerow(data)
                print(f"Data written for {first_name} {last_name}")

            else:
                print(f"No game logs table found for URL: {url}")
        else:
            print(f"Failed to retrieve URL: {url} with status code: {response.status_code}")

        # Add the processed URL to the set to keep track
        processed_urls.add(url)

        time.sleep(2)  # Add a delay after processing each URL

print('Data saved to game_logs_te.csv')

Skipping already processed TE: Marcedes Lewis
Skipping already processed TE: Jimmy Graham
Skipping already processed TE: Zachary Ertz
Skipping already processed TE: Travis Kelce
Skipping already processed TE: Logan Thomas
Skipping already processed TE: Jordan Matthews
Skipping already processed TE: Chris Manhertz
Skipping already processed TE: MyCole Pruitt
Skipping already processed TE: Darren Waller
Skipping already processed TE: Eric Tomlinson
Skipping already processed TE: Blake Bell
Skipping already processed TE: Christopher Uzomah
Skipping already processed TE: Jesse James
Skipping already processed TE: Geoffrey Swaim
Skipping already processed TE: Tyler Kroft
Skipping already processed TE: Austin Hooper
Skipping already processed TE: Nicholas Vannett
Skipping already processed TE: Stephen Anderson
Skipping already processed TE: John Holtz
Skipping already processed TE: Hunter Henry
Skipping already processed TE: Tyler Higbee
Skipping already processed TE: Mo Alie-Cox
Skipping al

Processing TE 46/291: Payne Durham
Data written for Payne Durham
Processing TE 47/291: Sam LaPorta
Data written for Sam LaPorta
Processing TE 48/291: Michael Mayer
Data written for Michael Mayer
Processing TE 49/291: Davis Allen
Data written for Davis Allen
Processing TE 50/291: Luke Musgrave
Data written for Luke Musgrave
Skipping already processed TE: Marcedes Lewis
Skipping already processed TE: Zachary Ertz
Skipping already processed TE: Travis Kelce
Skipping already processed TE: Logan Thomas
Skipping already processed TE: Jordan Matthews
Skipping already processed TE: Chris Manhertz
Skipping already processed TE: MyCole Pruitt
Skipping already processed TE: Darren Waller
Skipping already processed TE: Eric Tomlinson
Skipping already processed TE: Christopher Uzomah
Skipping already processed TE: Geoffrey Swaim
Skipping already processed TE: Austin Hooper
Skipping already processed TE: Nicholas Vannett
Skipping already processed TE: Hunter Henry
Skipping already processed TE: Tyle

In [17]:
# WR game tables
# Not in nfl.db currently

# ### Loop through wide receivers
# df = pd.read_csv('./data/rosters.csv')

# # Drop rows where 'pfr_id' is missing
# df = df.dropna(subset=['pfr_id'])

# # Filter for Wide Receivers ('WR' or 'wr')
# wrs = df[df['position'].str.lower() == 'wr']

# # Open a CSV file to write the data for wide receivers
# with open('./data/game_logs_wr.csv', 'w', newline='') as file:
#     writer = csv.writer(file)
#     headers_written = False  # To track if headers have been written to the file

#     # Initialize a counter for progress tracking
#     total_wrs = len(wrs)
#     wr_counter = 0

#     # Iterate over each wide receiver and scrape data
#     for index, wr in wrs.iterrows():
#         wr_counter += 1
#         url = wr['url']
#         print(f"Processing WR {wr_counter}/{total_wrs}: {wr['first_name']} {wr['last_name']}")
#         first_name = wr['first_name']  # Get the player's first name
#         last_name = wr['last_name']    # Get the player's last name
#         position = 'WR'  # Set the position to 'WR'

#         response = requests.get(url)
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.content, 'html.parser')
#             game_logs_table = soup.find('table', {'id': 'receiving_and_rushing'})

#             if game_logs_table:  # Check if the table is found
#                 header_row = game_logs_table.find('thead').find_all('tr')[-1]
#                 data_rows = game_logs_table.find('tbody').find_all('tr')

#                 if not headers_written:  # Write headers only once
#                     headers = ['Player URL', 'Position', 'First Name', 'Last Name']  # Add new headers at the beginning
#                     headers.extend(header.text.strip() for header in header_row.find_all('th'))  # Add existing headers
#                     writer.writerow(headers)
#                     headers_written = True

#                 for row in data_rows:
#                     cells = row.find_all(['th', 'td'])
#                     data = [url, position, first_name, last_name]  # Start with additional data
#                     data.extend(cell.text.strip() for cell in cells)  # Append scraped data
#                     writer.writerow(data)
#                 print(f"Data written for {first_name} {last_name}")

#             else:
#                 print(f"No game logs table found for URL: {url}")
#         else:
#             print(f"Failed to retrieve URL: {url} with status code: {response.status_code}")

#         # print(f'Processed URL: {url}')  # Print the URL being processed
#         time.sleep(2)  # Add a 3-second delay after processing each URL

# print('Data saved to game_logs_wr.csv')


df = pd.read_csv('./data/rosters.csv')

# Drop rows where 'pfr_id' is missing
df = df.dropna(subset=['pfr_id'])

# Filter for Wide Receivers ('WR' or 'wr')
wrs = df[df['position'].str.lower() == 'wr']

# Read existing game logs if the file already exists
existing_data = pd.DataFrame()
try:
    existing_data = pd.read_csv('./data/game_logs_wr.csv')
    # Extract unique player URLs from the existing data
    existing_urls = existing_data['Player URL'].unique()
    # Filter out already processed wide receivers
    wrs = wrs[~wrs['url'].isin(existing_urls)]
except FileNotFoundError:
    # If the file does not exist, proceed with all wide receivers
    print('No existing game logs found, starting fresh.')

# Open a CSV file to write the data for wide receivers
with open('./data/game_logs_wr.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    headers_written = False  # To track if headers have been written to the file

    # Initialize a counter for progress tracking
    total_wrs = len(wrs)
    wr_counter = 0

    # Iterate over each wide receiver and scrape data
    for index, wr in wrs.iterrows():
        wr_counter += 1
        url = wr['url']
        print(f"Processing WR {wr_counter}/{total_wrs}: {wr['first_name']} {wr['last_name']}")
        first_name = wr['first_name']  # Get the player's first name
        last_name = wr['last_name']    # Get the player's last name
        position = 'WR'  # Set the position to 'WR'

        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            game_logs_table = soup.find('table', {'id': 'receiving_and_rushing'})

            if game_logs_table:  # Check if the table is found
                header_row = game_logs_table.find('thead').find_all('tr')[-1]
                data_rows = game_logs_table.find('tbody').find_all('tr')

                if not headers_written:  # Write headers only once
                    headers = ['Player URL', 'Position', 'First Name', 'Last Name']  # Add new headers at the beginning
                    headers.extend(header.text.strip() for header in header_row.find_all('th'))  # Add existing headers
                    writer.writerow(headers)
                    headers_written = True

                for row in data_rows:
                    cells = row.find_all(['th', 'td'])
                    data = [url, position, first_name, last_name]  # Start with additional data
                    data.extend(cell.text.strip() for cell in cells)  # Append scraped data
                    writer.writerow(data)
                print(f"Data written for {first_name} {last_name}")

            else:
                print(f"No game logs table found for URL: {url}")
        else:
            print(f"Failed to retrieve URL: {url} with status code: {response.status_code}")

        time.sleep(2)

print('Data saved to game_logs_wr.csv')

No existing game logs found, starting fresh.
Processing WR 1/582: Matthew Slater
No game logs table found for URL: https://www.pro-football-reference.com/players/S/SlatMa00.htm
Processing WR 2/582: Quintorris Jones
Data written for Quintorris Jones
Processing WR 3/582: Randall Cobb
Data written for Randall Cobb
Processing WR 4/582: Cole Beasley
Data written for Cole Beasley
Processing WR 5/582: Marvin Jones
Data written for Marvin Jones
Processing WR 6/582: Adam Thielen
Data written for Adam Thielen
Processing WR 7/582: Marquise Goodwin
Data written for Marquise Goodwin
Processing WR 8/582: Keenan Allen
Data written for Keenan Allen
Processing WR 9/582: Robert Woods
Data written for Robert Woods
Processing WR 10/582: DeAndre Hopkins
Data written for DeAndre Hopkins
Processing WR 11/582: Willie Snead
Data written for Willie Snead
Processing WR 12/582: Odell Beckham
Data written for Odell Beckham
Processing WR 13/582: Brandin Cooks
Data written for Brandin Cooks
Processing WR 14/582: Mar

Data written for Lil'Jordan Humphrey
Processing WR 118/582: Jalen Guyton
Data written for Jalen Guyton
Processing WR 119/582: Tyron Johnson
Data written for Tyron Johnson
Processing WR 120/582: Malik Taylor
Data written for Malik Taylor
Processing WR 121/582: Greg Dortch
Data written for Greg Dortch
Processing WR 122/582: Jeffrey Smith
Data written for Jeffrey Smith
Processing WR 123/582: Andrew Isabella
Data written for Andrew Isabella
Processing WR 124/582: Darius Slayton
Data written for Darius Slayton
Processing WR 125/582: Thomas Kennedy
Data written for Thomas Kennedy
Processing WR 126/582: Gary Jennings
No game logs table found for URL: https://www.pro-football-reference.com/players/J/JennGa00.htm
Processing WR 127/582: Juwann Winfree
Data written for Juwann Winfree
Processing WR 128/582: Nsimba Webster
Data written for Nsimba Webster
Processing WR 129/582: N'Keal Harry
Data written for N'Keal Harry
Processing WR 130/582: Parris Campbell
Data written for Parris Campbell
Processi

Data written for Kyle Philips
Processing WR 228/582: Jalen Nailor
Data written for Jalen Nailor
Processing WR 229/582: Braylon Sanders
Data written for Braylon Sanders
Processing WR 230/582: Brandon Johnson
Data written for Brandon Johnson
Processing WR 231/582: Jalen Virgil
Data written for Jalen Virgil
Processing WR 232/582: Kwamie Lassiter
No game logs table found for URL: https://www.pro-football-reference.com/players/L/LassKw20.htm
Processing WR 233/582: Johnny Johnson
No game logs table found for URL: https://www.pro-football-reference.com/players/J/JohnJo12.htm
Processing WR 234/582: Deven Thompkins
Data written for Deven Thompkins
Processing WR 235/582: Dovontavean Martin
Data written for Dovontavean Martin
Processing WR 236/582: Rashid Shaheed
Data written for Rashid Shaheed
Processing WR 237/582: Dennis Houston
Data written for Dennis Houston
Processing WR 238/582: Jared Bernhardt
No game logs table found for URL: https://www.pro-football-reference.com/players/B/BernJa00.htm


Data written for Jamal Agnew
Processing WR 337/582: David Moore
Data written for David Moore
Processing WR 338/582: Noah Brown
Data written for Noah Brown
Processing WR 339/582: John Smith-Schuster
Data written for John Smith-Schuster
Processing WR 340/582: Isaiah Jones
Data written for Isaiah Jones
Processing WR 341/582: Cooper Kupp
Data written for Cooper Kupp
Processing WR 342/582: Rod Godwin
Data written for Rod Godwin
Processing WR 343/582: Joshua Dedmon-Reynolds Reynolds
Data written for Joshua Dedmon-Reynolds Reynolds
Processing WR 344/582: Cam Sims
Data written for Cam Sims
Processing WR 345/582: Marquez Valdes-Scantling
Data written for Marquez Valdes-Scantling
Processing WR 346/582: Equanimeous St. Brown
Data written for Equanimeous St. Brown
Processing WR 347/582: Byron Pringle
Data written for Byron Pringle
Processing WR 348/582: Keith Kirkwood
Data written for Keith Kirkwood
Processing WR 349/582: Courtland Sutton
Data written for Courtland Sutton
Processing WR 350/582: An

Processing WR 451/582: Chatarius Atwell
Data written for Chatarius Atwell
Processing WR 452/582: Jalen Camp
Data written for Jalen Camp
Processing WR 453/582: Ben Skowronek
Data written for Ben Skowronek
Processing WR 454/582: Jaelon Darden
Data written for Jaelon Darden
Processing WR 455/582: Ja'Marr Chase
Data written for Ja'Marr Chase
Processing WR 456/582: DeVonta Smith
Data written for DeVonta Smith
Processing WR 457/582: Kadarius Toney
Data written for Kadarius Toney
Processing WR 458/582: Jacob Harris
Data written for Jacob Harris
Processing WR 459/582: Rondale Moore
Data written for Rondale Moore
Processing WR 460/582: Kawaan Baker
No game logs table found for URL: https://www.pro-football-reference.com/players/B/BakeKa00.htm
Processing WR 461/582: Terrace Marshall
Data written for Terrace Marshall
Processing WR 462/582: Michael Bandy
Data written for Michael Bandy
Processing WR 463/582: Amon-Ra St. Brown
Data written for Amon-Ra St. Brown
Processing WR 464/582: Elijah Moore
Da

No game logs table found for URL: https://www.pro-football-reference.com/players/P/PalmTe00.htm
Processing WR 558/582: Ryan Flournoy
Data written for Ryan Flournoy
Processing WR 559/582: Brenden Rice
No game logs table found for URL: https://www.pro-football-reference.com/players/R/RiceBr00.htm
Processing WR 560/582: Devaughn Vele
Data written for Devaughn Vele
Processing WR 561/582: Cornelius Johnson
No game logs table found for URL: https://www.pro-football-reference.com/players/J/JohnCo02.htm
Processing WR 562/582: Roman Wilson
No game logs table found for URL: https://www.pro-football-reference.com/players/W/WilsRo02.htm
Processing WR 563/582: Ainias Smith
Data written for Ainias Smith
Processing WR 564/582: Jordan Whittington
Data written for Jordan Whittington
Processing WR 565/582: Devontez Walker
Data written for Devontez Walker
Processing WR 566/582: Jermaine Burton
Data written for Jermaine Burton
Processing WR 567/582: Anthony Gould
Data written for Anthony Gould
Processing 

In [18]:
# # RB game tables
# # Not in nfl.db currently

# df = pd.read_csv('./data/rosters.csv')

# # Drop rows where 'pfr_id' is missing
# df = df.dropna(subset=['pfr_id'])

# # Filter for Running Backs ('RB' or 'rb')
# rbs = df[df['position'].str.lower() == 'rb']

# # Open a CSV file to write the data for running backs
# with open('./data/game_logs_rb.csv', 'w', newline='') as file:
#     writer = csv.writer(file)
#     headers_written = False  # To track if headers have been written to the file

#     # Initialize a counter for progress tracking
#     total_rbs = len(rbs)
#     rb_counter = 0

#     # Iterate over each running back and scrape data
#     for index, rb in rbs.iterrows():
#         rb_counter += 1
#         url = rb['url']
#         print(f"Processing RB {rb_counter}/{total_rbs}: {rb['first_name']} {rb['last_name']}")
#         first_name = rb['first_name']  # Get the player's first name
#         last_name = rb['last_name']    # Get the player's last name
#         position = 'RB'  # Set the position to 'RB'

#         response = requests.get(url)
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.content, 'html.parser')
#             game_logs_table = soup.find('table', {'id': 'rushing_and_receiving'})

#             if game_logs_table:  # Check if the table is found
#                 header_row = game_logs_table.find('thead').find_all('tr')[-1]
#                 data_rows = game_logs_table.find('tbody').find_all('tr')

#                 if not headers_written:  # Write headers only once
#                     headers = ['Player URL', 'Position', 'First Name', 'Last Name']  # Add new headers at the beginning
#                     headers.extend(header.text.strip() for header in header_row.find_all('th'))  # Add existing headers
#                     writer.writerow(headers)
#                     headers_written = True

#                 for row in data_rows:
#                     cells = row.find_all(['th', 'td'])
#                     data = [url, position, first_name, last_name]  # Start with additional data
#                     data.extend(cell.text.strip() for cell in cells)  # Append scraped data
#                     writer.writerow(data)
#                 print(f"Data written for {first_name} {last_name}")

#             else:
#                 print(f"No game logs table found for URL: {url}")
#         else:
#             print(f"Failed to retrieve URL: {url} with status code: {response.status_code}")

#         # print(f'Processed URL: {url}')  # Print the URL being processed
#         time.sleep(2)  # Add a 3-second delay after processing each URL

# print('Data saved to game_logs_rb.csv')

import pandas as pd
import requests
import os
import csv
from bs4 import BeautifulSoup
import time

# RB game tables
# Not in nfl.db currently

df = pd.read_csv('./data/rosters.csv')

# Drop rows where 'pfr_id' is missing
df = df.dropna(subset=['pfr_id'])

# Filter for Running Backs ('RB' or 'rb')
rbs = df[df['position'].str.lower() == 'rb']

# Load existing data from game_logs_rb.csv if it exists
existing_data = []
if os.path.exists('./data/game_logs_rb.csv'):
    with open('./data/game_logs_rb.csv', 'r', newline='') as existing_file:
        reader = csv.reader(existing_file)
        existing_data = list(reader)

# Extract the headers and existing player URLs
if existing_data:
    headers = existing_data[0]
    existing_player_urls = {row[headers.index('Player URL')] for row in existing_data[1:]}
else:
    headers = []
    existing_player_urls = set()

# Open a CSV file to write the data for running backs
with open('./data/game_logs_rb.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    
    # Write headers if the file is empty or headers are not present
    if not headers:
        headers = ['Player URL', 'Position', 'First Name', 'Last Name']  # Add new headers at the beginning
        # Assuming the code will later fetch the actual headers from the table.
        headers_written = False
    else:
        headers_written = True

    # Initialize a counter for progress tracking
    total_rbs = len(rbs)
    rb_counter = 0

    # Iterate over each running back and scrape data
    for index, rb in rbs.iterrows():
        rb_counter += 1
        url = rb['url']
        print(f"Processing RB {rb_counter}/{total_rbs}: {rb['first_name']} {rb['last_name']}")
        first_name = rb['first_name']  # Get the player's first name
        last_name = rb['last_name']    # Get the player's last name
        position = 'RB'  # Set the position to 'RB'

        # Check if the player's data is already in the file
        if url in existing_player_urls:
            print(f"Skipping {rb['first_name']} {rb['last_name']}, data already exists.")
            continue

        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            game_logs_table = soup.find('table', {'id': 'rushing_and_receiving'})

            if game_logs_table:  # Check if the table is found
                header_row = game_logs_table.find('thead').find_all('tr')[-1]
                data_rows = game_logs_table.find('tbody').find_all('tr')

                if not headers_written:  # Write headers only once
                    headers.extend(header.text.strip() for header in header_row.find_all('th'))  # Add existing headers
                    writer.writerow(headers)
                    headers_written = True

                for row in data_rows:
                    cells = row.find_all(['th', 'td'])
                    data = [url, position, first_name, last_name]  # Start with additional data
                    data.extend(cell.text.strip() for cell in cells)  # Append scraped data
                    writer.writerow(data)
                    existing_player_urls.add(url)  # Add the URL to the set to prevent duplicates during this run

                print(f"Data written for {first_name} {last_name}")

            else:
                print(f"No game logs table found for URL: {url}")
        else:
            print(f"Failed to retrieve URL: {url} with status code: {response.status_code}")

        time.sleep(2)  # Add a 2-second delay after processing each URL

print('Data saved to game_logs_rb.csv')


Processing RB 1/360: Taiwan Jones
Data written for Taiwan Jones
Processing RB 2/360: Brandon Bolden
Data written for Brandon Bolden
Processing RB 3/360: Kyle Juszczyk
No game logs table found for URL: https://www.pro-football-reference.com/players/J/JuszKy00.htm
Processing RB 4/360: Latavius Murray
Data written for Latavius Murray
Processing RB 5/360: Cordarrelle Patterson
Data written for Cordarrelle Patterson
Processing RB 6/360: Damien Williams
Data written for Damien Williams
Processing RB 7/360: Jerick McKinnon
Data written for Jerick McKinnon
Processing RB 8/360: Michael Burton
Data written for Michael Burton
Processing RB 9/360: Dominique Mostert
Data written for Dominique Mostert
Processing RB 10/360: Ameer Abdullah
Data written for Ameer Abdullah
Processing RB 11/360: Melvin Gordon
Data written for Melvin Gordon
Processing RB 12/360: Dwayne Washington
Data written for Dwayne Washington
Processing RB 13/360: Derrick Henry
Data written for Derrick Henry
Processing RB 14/360: Cor

Data written for Jermar Jefferson
Processing RB 114/360: Jaret Patterson
Data written for Jaret Patterson
Processing RB 115/360: Kene Nwangwu
Data written for Kene Nwangwu
Processing RB 116/360: Chris Evans
Data written for Chris Evans
Processing RB 117/360: Jake Funk
Data written for Jake Funk
Processing RB 118/360: Rhamondre Stevenson
Data written for Rhamondre Stevenson
Processing RB 119/360: Najee Harris
Data written for Najee Harris
Processing RB 120/360: Khalil Herbert
Data written for Khalil Herbert
Processing RB 121/360: Kenneth Gainwell
Data written for Kenneth Gainwell
Processing RB 122/360: Michael Carter
Data written for Michael Carter
Processing RB 123/360: Avery Williams
Data written for Avery Williams
Processing RB 124/360: Travis Etienne
Data written for Travis Etienne
Processing RB 125/360: Trey Sermon
Data written for Trey Sermon
Processing RB 126/360: Javonte Williams
Data written for Javonte Williams
Processing RB 127/360: Tyler Badie
Data written for Tyler Badie
Pr

No game logs table found for URL: https://www.pro-football-reference.com/players/B/BawdNi00.htm
Processing RB 218/360: Chase Edmonds
Skipping Chase Edmonds, data already exists.
Processing RB 219/360: Rashaad Penny
Skipping Rashaad Penny, data already exists.
Processing RB 220/360: Nicholas Chubb
Skipping Nicholas Chubb, data already exists.
Processing RB 221/360: John Kelly
Skipping John Kelly, data already exists.
Processing RB 222/360: Royce Freeman
Skipping Royce Freeman, data already exists.
Processing RB 223/360: Saquon Barkley
Skipping Saquon Barkley, data already exists.
Processing RB 224/360: Alexander Mattison
Skipping Alexander Mattison, data already exists.
Processing RB 225/360: Justice Hill
Skipping Justice Hill, data already exists.
Processing RB 226/360: Miles Sanders
Skipping Miles Sanders, data already exists.
Processing RB 227/360: Devin Singletary
Skipping Devin Singletary, data already exists.
Processing RB 228/360: Tony Pollard
Skipping Tony Pollard, data already 

No game logs table found for URL: https://www.pro-football-reference.com/players/N/NichLe00.htm
Processing RB 315/360: Keaton Mitchell
Skipping Keaton Mitchell, data already exists.
Processing RB 316/360: Owen Wright
No game logs table found for URL: https://www.pro-football-reference.com/players/W/WrigOw00.htm
Processing RB 317/360: Bijan Robinson
Skipping Bijan Robinson, data already exists.
Processing RB 318/360: Kendre Miller
Skipping Kendre Miller, data already exists.
Processing RB 319/360: Cartavious Bigsby
Skipping Cartavious Bigsby, data already exists.
Processing RB 320/360: Chase Brown
Skipping Chase Brown, data already exists.
Processing RB 321/360: Christopher Rodriguez
Skipping Christopher Rodriguez, data already exists.
Processing RB 322/360: Scott Matlock
No game logs table found for URL: https://www.pro-football-reference.com/players/M/MatlSc00.htm
Processing RB 323/360: Christopher Vaughn
Skipping Christopher Vaughn, data already exists.
Processing RB 324/360: Kenny M

In [21]:
# Clean em

file_paths = [
    './data/game_logs_qb.csv',
    './data/game_logs_wr.csv',
    './data/game_logs_rb.csv',
    './data/game_logs_te.csv'
]

# Loop through each file path
for file_path in file_paths:
    # Load data from each CSV file
    data = pd.read_csv(file_path)
    
    # Ensure the 'Year' column is treated as a string
    data['Year'] = data['Year'].astype(str)

    # Identifying Pro Bowl and First-Team AP All-Pro selections
    data['Pro Bowl'] = data['Year'].str.contains('*', regex=False)
    data['First-Team AP All-Pro'] = data['Year'].str.contains('+', regex=False)

    # Cleaning up the 'Year' column
    data['Year'] = data['Year'].str.replace('*', '', regex=False)
    data['Year'] = data['Year'].str.replace('+', '', regex=False)

    # Save the modified DataFrame back to the same CSV file
    data.to_csv(file_path, index=False)

    # Print confirmation that the file has been updated
    print(f"File updated: {file_path}")


AttributeError: Can only use .str accessor with string values!

In [23]:
# Clean em

file_paths = [
#     './data/game_logs_wr.csv',
#     './data/game_logs_rb.csv',
    './data/game_logs_te.csv'
]

# Loop through each file path
for file_path in file_paths:
    # Load data from each CSV file
    data = pd.read_csv(file_path)
    
    # Ensure the 'Year' column is treated as a string
    data['Season'] = data['Season'].astype(str)

    # Identifying Pro Bowl and First-Team AP All-Pro selections
    data['Pro Bowl'] = data['Season'].str.contains('*', regex=False)
    data['First-Team AP All-Pro'] = data['Season'].str.contains('+', regex=False)

    # Cleaning up the 'Year' column
    data['Season'] = data['Season'].str.replace('*', '', regex=False)
    data['Season'] = data['Season'].str.replace('+', '', regex=False)

    # Save the modified DataFrame back to the same CSV file
    data.to_csv(file_path, index=False)

    # Print confirmation that the file has been updated
    print(f"File updated: {file_path}")


ParserError: Error tokenizing data. C error: Expected 36 fields in line 21, saw 39


In [None]:
!open data/game_logs_qb.csv

In [None]:
# # Multi team RBs?

# file_path = 'data/game_logs_rb.csv'
# data = pd.read_csv(file_path)

# # Identify rows with '2TM'
# multi_team_rows = data[data['Tm'] == '2TM']
# multi_team_indices = multi_team_rows.index

# # Loop through each multi-team row
# for idx in multi_team_indices:
#     # Find individual team rows for the same player and year
#     player = data.at[idx, 'First Name']
#     last_name = data.at[idx, 'Last Name']
#     year = data.at[idx, 'Year']
#     age = data.at[idx, 'Age']
    
#     # Find corresponding team rows and update their year and age
#     team_rows = data[(data['First Name'] == player) & (data['Last Name'] == last_name) & (data['Year'].isna()) & (data['Tm'] != '2TM')]
#     data.loc[team_rows.index, 'Year'] = year
#     data.loc[team_rows.index, 'Age'] = age

# # Drop the multi-team rows
# data = data[data['Tm'] != '2TM']

# # Save the updated DataFrame to a new CSV file
# output_file_path = 'data/game_logs_rb.csv'
# data.to_csv(output_file_path, index=False)

# print(f"Updated data saved to {output_file_path}")


In [2]:
def get_with_backoff(url, max_retries=6, backoff_factor=1.5, timeout=15):
    """GET a URL with exponential backoff, jitter, and a session User-Agent. Returns a requests.Response or None."""
    session = get_with_backoff.session
    for attempt in range(1, max_retries + 1):
        try:
            resp = session.get(url, timeout=timeout)
            # Handle rate limiting and server errors with backoff
            if resp.status_code == 429:
                wait = backoff_factor * (2 ** (attempt - 1)) + random.uniform(0, 1)
                print(f"429 from {url}. Sleeping {wait:.1f}s (attempt {attempt}/{max_retries})")
                time.sleep(wait)
                continue
            if resp.status_code >= 500:
                wait = backoff_factor * (2 ** (attempt - 1)) + random.uniform(0, 1)
                print(f"Server error {resp.status_code} for {url}. Sleeping {wait:.1f}s (attempt {attempt}/{max_retries})")
                time.sleep(wait)
                continue
            return resp
        except RequestException as e:
            wait = backoff_factor * (2 ** (attempt - 1)) + random.uniform(0, 1)
            print(f"RequestException for {url}: {e}. Sleeping {wait:.1f}s (attempt {attempt}/{max_retries})")
            time.sleep(wait)
            continue
    print(f"Failed to GET {url} after {max_retries} attempts")
    return None

# create a single session and attach it to the helper to avoid re-creating sockets
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 (nfl-ai-scraper)'
})
get_with_backoff.session = session


In [3]:
# UPDATED Team Game Logs with Rate Limiting
# Replace the existing Team Game Logs cell with this code

# Create directories if they don't exist
data_dir = './data/SR-game-logs'
os.makedirs(data_dir, exist_ok=True)

# Create directories if they don't exist
opponent_data_dir = './data/SR-opponent-game-logs'
os.makedirs(opponent_data_dir, exist_ok=True)

# List of teams
teams = [
    ['crd', 'Arizona Cardinals'],
    ['atl', 'Atlanta Falcons'],
    ['rav', 'Baltimore Ravens'],
    ['buf', 'Buffalo Bills'],
    ['car', 'Carolina Panthers'],
    ['chi', 'Chicago Bears'],
    ['cin', 'Cincinnati Bengals'],
    ['cle', 'Cleveland Browns'],
    ['dal', 'Dallas Cowboys'],
    ['den', 'Denver Broncos'],
    ['det', 'Detroit Lions'],
    ['gnb', 'Green Bay Packers'],
    ['htx', 'Houston Texans'],
    ['clt', 'Indianapolis Colts'],
    ['jax', 'Jacksonville Jaguars'],
    ['kan', 'Kansas City Chiefs'],
    ['sdg', 'Los Angeles Chargers'],
    ['ram', 'Los Angeles Rams'],
    ['rai', 'Las Vegas Raiders'],
    ['mia', 'Miami Dolphins'],
    ['min', 'Minnesota Vikings'],
    ['nwe', 'New England Patriots'],
    ['nor', 'New Orleans Saints'],
    ['nyg', 'New York Giants'],
    ['nyj', 'New York Jets'],
    ['phi', 'Philadelphia Eagles'],
    ['pit', 'Pittsburgh Steelers'],
    ['sea', 'Seattle Seahawks'],
    ['sfo', 'San Francisco 49ers'],
    ['tam', 'Tampa Bay Buccaneers'],
    ['oti', 'Tennessee Titans'],
    ['was', 'Washington Commanders']
]

team_game_logs_headers = [
    'week_num', 'game_day_of_week', 'game_date', 'boxscore_word', 'game_outcome', 'overtime', 
    'game_location', 'opp', 'pts_off', 'pts_def', 'pass_cmp', 'pass_att', 'pass_yds', 'pass_td', 
    'pass_int', 'pass_sacked', 'pass_sacked_yds', 'pass_yds_per_att', 'pass_net_yds_per_att', 
    'pass_cmp_perc', 'pass_rating', 'rush_att', 'rush_yds', 'rush_yds_per_att', 'rush_td', 
    'fgm', 'fga', 'xpm', 'xpa', 'punt', 'punt_yds', 'third_down_success', 'third_down_att', 
    'fourth_down_success', 'fourth_down_att', 'time_of_poss', 'Team_Name'
]

opponent_game_logs_headers = [
    'week_num', 'game_day_of_week', 'game_date', 'boxscore_word', 'game_outcome', 'overtime',
    'game_location', 'opp', 'pts_off', 'pts_def', 'pass_cmp', 'pass_att', 'pass_yds', 'pass_td',
    'pass_int', 'pass_sacked', 'pass_sacked_yds', 'pass_yds_per_att', 'pass_net_yds_per_att',
    'pass_cmp_perc', 'pass_rating', 'rush_att', 'rush_yds', 'rush_yds_per_att', 'rush_td',
    'fgm', 'fga', 'xpm', 'xpa', 'punt', 'punt_yds', 'third_down_success', 'third_down_att',
    'fourth_down_success', 'fourth_down_att', 'time_of_poss', 'Team_Name'
]

# Loop through the years
for year in range(2023, 2025):
    all_team_game_logs = []  # Create empty lists to accumulate team and opponent data for each year
    all_opponent_game_logs = []

    for team in teams:
        abbreviation, name = team
        print(f'Processing {name} for the year {year}')
        url = f'https://www.pro-football-reference.com/teams/{abbreviation}/{year}/gamelog/'
        response = get_with_backoff(url)
        
        if response is None:
            print(f'Failed to retrieve page {url} for {name} in {year}: exhausted retries')
            # back off a bit longer before continuing to the next team
            sleep(5 + random.uniform(0, 2))
            continue
        if response.status_code != 200:
            print(f'Failed to retrieve page {url} for {name} in {year}: {response.status_code}')
            sleep(3 + random.uniform(0, 2))
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        for table_id in [f'gamelog{year}', f'gamelog_opp{year}']:
            table = soup.find('table', {'id': table_id})

            if table is None:
                print(f'Table with id {table_id} not found on page {url} for {name} in {year}')
                continue

            tbody = table.find('tbody')
            game_logs = []
            for tr in tbody.find_all('tr'):
                row_data = []
                for td in tr.find_all(['th', 'td']):
                    row_data.append(td.text)
                if table_id == f'gamelog{year}':
                    row_data.append(name)
                    game_logs.append(row_data)
                elif table_id == f'gamelog_opp{year}':
                    row_data.append(name)
                    all_opponent_game_logs.append(row_data)

            if table_id == f'gamelog{year}':
                all_team_game_logs.extend(game_logs)

            # Check if playoff game logs exist for this team and year
            playoff_table_id = f'playoff_gamelog{year}'
            playoff_table = soup.find('table', {'id': playoff_table_id})

            if playoff_table:
                playoff_tbody = playoff_table.find('tbody')
                playoff_game_logs = []
                for tr in playoff_tbody.find_all('tr'):
                    row_data = []
                    for td in tr.find_all(['th', 'td']):
                        row_data.append(td.text)
                    row_data.append(name)
                    playoff_game_logs.append(row_data)

                all_team_game_logs.extend(playoff_game_logs)

        # Randomized sleep after processing each team to avoid bursts
        sleep(2.5 + random.uniform(0, 1.5))
        
    # Extra sleep between teams
    sleep(1.5 + random.uniform(0, 1.0))

    # Save the accumulated team and opponent data to CSV files, named based on the year
    with open(f'./data/SR-game-logs/all_teams_game_logs_{year}.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(team_game_logs_headers)
        writer.writerows(all_team_game_logs)

    with open(f'./data/SR-opponent-game-logs/all_teams_opponent_game_logs_{year}.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(opponent_game_logs_headers)
        writer.writerows(all_opponent_game_logs)


Processing Arizona Cardinals for the year 2023
Table with id gamelog2023 not found on page https://www.pro-football-reference.com/teams/crd/2023/gamelog/ for Arizona Cardinals in 2023
Table with id gamelog_opp2023 not found on page https://www.pro-football-reference.com/teams/crd/2023/gamelog/ for Arizona Cardinals in 2023
Processing Atlanta Falcons for the year 2023
Table with id gamelog2023 not found on page https://www.pro-football-reference.com/teams/atl/2023/gamelog/ for Atlanta Falcons in 2023
Table with id gamelog_opp2023 not found on page https://www.pro-football-reference.com/teams/atl/2023/gamelog/ for Atlanta Falcons in 2023
Processing Baltimore Ravens for the year 2023
Table with id gamelog2023 not found on page https://www.pro-football-reference.com/teams/rav/2023/gamelog/ for Baltimore Ravens in 2023
Table with id gamelog_opp2023 not found on page https://www.pro-football-reference.com/teams/rav/2023/gamelog/ for Baltimore Ravens in 2023
Processing Buffalo Bills for the y

Processing Pittsburgh Steelers for the year 2023
Table with id gamelog2023 not found on page https://www.pro-football-reference.com/teams/pit/2023/gamelog/ for Pittsburgh Steelers in 2023
Table with id gamelog_opp2023 not found on page https://www.pro-football-reference.com/teams/pit/2023/gamelog/ for Pittsburgh Steelers in 2023
Processing Seattle Seahawks for the year 2023
Table with id gamelog2023 not found on page https://www.pro-football-reference.com/teams/sea/2023/gamelog/ for Seattle Seahawks in 2023
Table with id gamelog_opp2023 not found on page https://www.pro-football-reference.com/teams/sea/2023/gamelog/ for Seattle Seahawks in 2023
Processing San Francisco 49ers for the year 2023
Table with id gamelog2023 not found on page https://www.pro-football-reference.com/teams/sfo/2023/gamelog/ for San Francisco 49ers in 2023
Table with id gamelog_opp2023 not found on page https://www.pro-football-reference.com/teams/sfo/2023/gamelog/ for San Francisco 49ers in 2023
Processing Tampa

Table with id gamelog_opp2024 not found on page https://www.pro-football-reference.com/teams/mia/2024/gamelog/ for Miami Dolphins in 2024
Processing Minnesota Vikings for the year 2024
Table with id gamelog2024 not found on page https://www.pro-football-reference.com/teams/min/2024/gamelog/ for Minnesota Vikings in 2024
Table with id gamelog_opp2024 not found on page https://www.pro-football-reference.com/teams/min/2024/gamelog/ for Minnesota Vikings in 2024
Processing New England Patriots for the year 2024
Table with id gamelog2024 not found on page https://www.pro-football-reference.com/teams/nwe/2024/gamelog/ for New England Patriots in 2024
Table with id gamelog_opp2024 not found on page https://www.pro-football-reference.com/teams/nwe/2024/gamelog/ for New England Patriots in 2024
Processing New Orleans Saints for the year 2024
Table with id gamelog2024 not found on page https://www.pro-football-reference.com/teams/nor/2024/gamelog/ for New Orleans Saints in 2024
Table with id gam

In [None]:
# # Team Game Logs
# # Not in nfl.db currently

# # Create directories if they don't exist
# data_dir = './data/SR-game-logs'
# os.makedirs(data_dir, exist_ok=True)

# # Create directories if they don't exist
# opponent_data_dir = './data/SR-opponent-game-logs'
# os.makedirs(opponent_data_dir, exist_ok=True)

# # List of teams
# teams = [
#     ['crd', 'Arizona Cardinals'],
#     ['atl', 'Atlanta Falcons'],
#     ['rav', 'Baltimore Ravens'],
#     ['buf', 'Buffalo Bills'],
#     ['car', 'Carolina Panthers'],
#     ['chi', 'Chicago Bears'],
#     ['cin', 'Cincinnati Bengals'],
#     ['cle', 'Cleveland Browns'],
#     ['dal', 'Dallas Cowboys'],
#     ['den', 'Denver Broncos'],
#     ['det', 'Detroit Lions'],
#     ['gnb', 'Green Bay Packers'],
#     ['htx', 'Houston Texans'],
#     ['clt', 'Indianapolis Colts'],
#     ['jax', 'Jacksonville Jaguars'],
#     ['kan', 'Kansas City Chiefs'],
#     ['sdg', 'Los Angeles Chargers'],
#     ['ram', 'Los Angeles Rams'],
#     ['rai', 'Las Vegas Raiders'],
#     ['mia', 'Miami Dolphins'],
#     ['min', 'Minnesota Vikings'],
#     ['nwe', 'New England Patriots'],
#     ['nor', 'New Orleans Saints'],
#     ['nyg', 'New York Giants'],
#     ['nyj', 'New York Jets'],
#     ['phi', 'Philadelphia Eagles'],
#     ['pit', 'Pittsburgh Steelers'],
#     ['sea', 'Seattle Seahawks'],
#     ['sfo', 'San Francisco 49ers'],
#     ['tam', 'Tampa Bay Buccaneers'],
#     ['oti', 'Tennessee Titans'],
#     ['was', 'Washington Commanders']
# ]

# # Custom headers for team game logs and opponent game logs
# # team_game_logs_headers = [
# #     'Week', 'Day', 'Date', '', 'OT', '', 'Opp', 'Tm', 'Opp', 'Cmp', 'Att', 'Yds', 'TD', 'Int', 'Sk', 'Yds', 'Y/A', 'NY/A',
# #     'Cmp%', 'Rate', 'Att', 'Yds', 'Y/A', 'TD', 'FGM', 'FGA', 'XPM', 'XPA', 'Pnt', 'Yds', '3DConv', '3DAtt', '4DConv', '4DAtt', 'ToP', 'Team_Name'
# # ]
# # team_game_logs_headers = [
# #     'Week', 'Day', 'Date', '', '', 'OT', '', 'Opp', 'Tm', 'Opp', 'Cmp', 'Att', 'Yds', 'TD', 'Int', 'Sk', 'Yds', 'Y/A', 'NY/A',
# #     'Cmp%', 'Rate', 'Att', 'Yds', 'Y/A', 'TD', 'FGM', 'FGA', 'XPM', 'XPA', 'Pnt', 'Yds', '3DConv', '3DAtt', '4DConv', '4DAtt', 'ToP', 'Team_Name'
# # ]
# team_game_logs_headers = [
#     'week_num', 'game_day_of_week', 'game_date', 'boxscore_word', 'game_outcome', 'overtime', 
#     'game_location', 'opp', 'pts_off', 'pts_def', 'pass_cmp', 'pass_att', 'pass_yds', 'pass_td', 
#     'pass_int', 'pass_sacked', 'pass_sacked_yds', 'pass_yds_per_att', 'pass_net_yds_per_att', 
#     'pass_cmp_perc', 'pass_rating', 'rush_att', 'rush_yds', 'rush_yds_per_att', 'rush_td', 
#     'fgm', 'fga', 'xpm', 'xpa', 'punt', 'punt_yds', 'third_down_success', 'third_down_att', 
#     'fourth_down_success', 'fourth_down_att', 'time_of_poss', 'Team_Name'
# ]
# # Need to change to:
# # week_num
# # game_day_of_week
# # game_date
# # boxscore_word
# # game_outcome
# # overtime
# # game_location
# # opp
# # pts_off
# # pts_def
# # pass_cmp
# # pass_att
# # pass_yds
# # pass_td
# # pass_int
# # pass_sacked
# # pass_sacked_yds
# # pass_yds_per_att
# # pass_net_yds_per_att
# # pass_cmp_perc
# # pass_rating
# # rush_att
# # rush_yds
# # rush_yds_per_att
# # rush_td
# # fgm
# # fga
# # xpm
# # xpa
# # punt
# # punt_yds
# # third_down_success
# # third_down_att
# # fourth_down_success
# # fourth_down_att
# # time_of_poss
# # team_name

# # opponent_game_logs_headers = [
# #     'Week', 'Day', 'Date', '', 'OT', '', 'Opp', 'Tm', 'Opp', 'Cmp', 'Att', 'Yds', 'TD', 'Int', 'Sk', 'Yds', 'Y/A', 'NY/A',
# #     'Cmp%', 'Rate', 'Att', 'Yds', 'Y/A', 'TD', 'FGM', 'FGA', 'XPM', 'XPA', 'Pnt', 'Yds', '3DConv', '3DAtt', '4DConv', '4DAtt', 'ToP', 'Team_Name'
# # ]
# # opponent_game_logs_headers = [
# #     'Week', 'Day', 'Date', '', '', 'OT', '', 'Opp', 'Tm', 'Opp', 'Cmp', 'Att', 'Yds', 'TD', 'Int', 'Sk', 'Yds', 'Y/A', 'NY/A',
# #     'Cmp%', 'Rate', 'Att', 'Yds', 'Y/A', 'TD', 'FGM', 'FGA', 'XPM', 'XPA', 'Pnt', 'Yds', '3DConv', '3DAtt', '4DConv', '4DAtt', 'ToP', 'Team_Name'
# # ]
# opponent_game_logs_headers = [
#     'week_num', 'game_day_of_week', 'game_date', 'boxscore_word', 'game_outcome', 'overtime',
#     'game_location', 'opp', 'pts_off', 'pts_def', 'pass_cmp', 'pass_att', 'pass_yds', 'pass_td',
#     'pass_int', 'pass_sacked', 'pass_sacked_yds', 'pass_yds_per_att', 'pass_net_yds_per_att',
#     'pass_cmp_perc', 'pass_rating', 'rush_att', 'rush_yds', 'rush_yds_per_att', 'rush_td',
#     'fgm', 'fga', 'xpm', 'xpa', 'punt', 'punt_yds', 'third_down_success', 'third_down_att',
#     'fourth_down_success', 'fourth_down_att', 'time_of_poss', 'Team_Name'
# ]

# # Loop through the years
# # for year in range(2015, 2025):
# for year in range(2023, 2025):
#     all_team_game_logs = []  # Create empty lists to accumulate team and opponent data for each year
#     all_opponent_game_logs = []

#     for team in teams:
#         abbreviation, name = team
#         print(f'Processing {name} for the year {year}')  # Include the year in the print statement
#         url = f'https://www.pro-football-reference.com/teams/{abbreviation}/{year}/gamelog/'
#         response = requests.get(url)

#         if response.status_code != 200:
#             print(f'Failed to retrieve page {url} for {name} in {year}: {response.status_code}')
#             sleep(1.3)  # Wait 1.3 seconds before the next request
#             continue

#         soup = BeautifulSoup(response.content, 'html.parser')

#         for table_id in [f'gamelog{year}', f'gamelog_opp{year}']:
#             table = soup.find('table', {'id': table_id})

#             if table is None:
#                 print(f'Table with id {table_id} not found on page {url} for {name} in {year}')
#                 continue

#             tbody = table.find('tbody')
#             game_logs = []
#             for tr in tbody.find_all('tr'):
#                 row_data = []
#                 for td in tr.find_all(['th', 'td']):
#                     row_data.append(td.text)
#                 if table_id == f'gamelog{year}':
#                     row_data.append(name)
#                     game_logs.append(row_data)
#                 elif table_id == f'gamelog_opp{year}':
#                     row_data.append(name)
#                     all_opponent_game_logs.append(row_data)

#             if table_id == f'gamelog{year}':
#                 all_team_game_logs.extend(game_logs)

#             playoff_table_id = f'playoff_gamelog{year}'
#             playoff_table = soup.find('table', {'id': playoff_table_id})

#             if playoff_table:
#                 playoff_tbody = playoff_table.find('tbody')
#                 playoff_game_logs = []
#                 for tr in playoff_tbody.find_all('tr'):
#                     row_data = []
#                     for td in tr.find_all(['th', 'td']):
#                         row_data.append(td.text)
#                     row_data.append(name)
#                     playoff_game_logs.append(row_data)

#                 all_team_game_logs.extend(playoff_game_logs)

#     with open(f'./data/SR-game-logs/all_teams_game_logs_{year}.csv', mode='w', newline='', encoding='utf-8') as file:
#         writer = csv.writer(file)
#         writer.writerow(team_game_logs_headers)
#         writer.writerows(all_team_game_logs)

#     with open(f'./data/SR-opponent-game-logs/all_teams_opponent_game_logs_{year}.csv', mode='w', newline='', encoding='utf-8') as file:
#         writer = csv.writer(file)
#         writer.writerow(opponent_game_logs_headers)
#         writer.writerows(all_opponent_game_logs)


Processing Arizona Cardinals for the year 2023
Table with id gamelog2023 not found on page https://www.pro-football-reference.com/teams/crd/2023/gamelog/ for Arizona Cardinals in 2023
Table with id gamelog_opp2023 not found on page https://www.pro-football-reference.com/teams/crd/2023/gamelog/ for Arizona Cardinals in 2023
Processing Atlanta Falcons for the year 2023
Table with id gamelog2023 not found on page https://www.pro-football-reference.com/teams/atl/2023/gamelog/ for Atlanta Falcons in 2023
Table with id gamelog_opp2023 not found on page https://www.pro-football-reference.com/teams/atl/2023/gamelog/ for Atlanta Falcons in 2023
Processing Baltimore Ravens for the year 2023
Table with id gamelog2023 not found on page https://www.pro-football-reference.com/teams/rav/2023/gamelog/ for Baltimore Ravens in 2023
Table with id gamelog_opp2023 not found on page https://www.pro-football-reference.com/teams/rav/2023/gamelog/ for Baltimore Ravens in 2023
Processing Buffalo Bills for the y

KeyboardInterrupt: 

In [4]:
# Create game_id in game logs

import os
import pandas as pd

directory = 'data/SR-game-logs/'

# List to store DataFrames
df_list = []

# Iterate over all files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv"):  # Ensure we are processing only CSV files
        file_path = os.path.join(directory, filename)
        
        # Extract the season from the filename, assuming the format: all_teams_opponent_game_logs_YYYY.csv
        # season = filename.split('_')[-1].replace('.csv', '')
        season = filename.split('_')[-1].replace('.csv', '')

        # Load the CSV file
        df = pd.read_csv(file_path)
        
        # Add the season column
        # df['season'] = season

        # Add the raw season column
        df['season'] = season
        
        # Add the cleaned season column (if needed separately for game_id logic)
        # df['season'] = raw_season  # You can fu

        # Add the cleaned season column (if needed separately for game_id logic)
        # df['season'] = raw_season  # You can further clean this if necessary
        
        # Append the DataFrame to the list
        df_list.append(df)

# Combine all DataFrames into one
df = pd.concat(df_list, ignore_index=True)

team_abbreviation_map = {
    'Arizona Cardinals': 'ARI',
    'Atlanta Falcons': 'ATL',
    'Baltimore Ravens': 'BAL',
    'Buffalo Bills': 'BUF',
    'Carolina Panthers': 'CAR',
    'Chicago Bears': 'CHI',
    'Cincinnati Bengals': 'CIN',
    'Cleveland Browns': 'CLE',
    'Dallas Cowboys': 'DAL',
    'Denver Broncos': 'DEN',
    'Detroit Lions': 'DET',
    'Green Bay Packers': 'GB',
    'Houston Texans': 'HOU',
    'Indianapolis Colts': 'IND',
    'Jacksonville Jaguars': 'JAX',
    'Kansas City Chiefs': 'KC',
    'Los Angeles Chargers': 'LAC',
    'Los Angeles Rams': 'LAR',
    'Las Vegas Raiders': 'LVR',
    'Oakland Raiders': 'LVR',
    'Miami Dolphins': 'MIA',
    'Minnesota Vikings': 'MIN',
    'New England Patriots': 'NE',
    'New Orleans Saints': 'NO',
    'New York Giants': 'NYG',
    'New York Jets': 'NYJ',
    'Philadelphia Eagles': 'PHI',
    'Pittsburgh Steelers': 'PIT',
    'Seattle Seahawks': 'SEA',
    'San Francisco 49ers': 'SF',
    'Tampa Bay Buccaneers': 'TB',
    'Tennessee Titans': 'TEN',
    'Washington Commanders': 'WAS',
    'Washington Football Team': 'WAS',
    'Washington Redskins': 'WAS',
    'St. Louis Rams': 'STL',
    'San Diego Chargers': 'LAC'
}

# Function to determine home and away teams
def determine_home_away(row):
    if row['game_location'] == '@':
        away_team = team_abbreviation_map[row['Team_Name']]
        home_team = team_abbreviation_map[row['opp']]
    else:
        home_team = team_abbreviation_map[row['Team_Name']]
        away_team = team_abbreviation_map[row['opp']]
        # home_team = team_abbreviation_map.get(row['Team_Name'], 'UNKNOWN')
        # away_team = team_abbreviation_map.get(row['opp'], 'UNKNOWN')
    return pd.Series([home_team, away_team])
    
df[['home_team_id', 'away_team_id']] = df.apply(determine_home_away, axis=1)
# df['home_team'] = df.apply(lambda row: team_abbreviation_map[row['Opp']] if row['Unnamed: 6'] == '@' else team_abbreviation_map[row['Team_Name']], axis=1)
# df['away_team'] = df.apply(lambda row: team_abbreviation_map[row['Team_Name']] if row['Unnamed: 6'] == '@' else team_abbreviation_map[row['Opp']], axis=1)

# Ensure 'week_num' is a string and pad single digits with a leading zero
df['week_num'] = df['week_num'].astype(str).str.zfill(2)

# Create the 'game_id' column by combining 'season', 'week_num', 'away_team', and 'home_team'
df['game_id'] = df['season'] + '_' + df['week_num'] + '_' + df['away_team_id'] + '_' + df['home_team_id']

# Save the updated combined DataFrame to a new CSV file
output_file_path_with_teams = 'data/all_team_game_logs.csv'
df.to_csv(output_file_path_with_teams, index=False)
print(f"Updated file with home and away teams saved to: {output_file_path_with_teams}")


ValueError: Columns must be same length as key

In [5]:
# # Aggregate all_game_logs.csv to single row per game

# df = pd.read_csv('data/all_team_game_logs.csv')

# # Grouping the data by game_id and aggregating the stats separately for home and away teams
# grouped_df = df.groupby('game_id', group_keys=False).apply(lambda x: pd.Series({
#     'home_pts_off': x.loc[x['game_location'] == 'N', 'pts_off'].sum() or x.loc[x['game_location'] == '', 'pts_off'].sum(),
#     'away_pts_off': x.loc[x['game_location'] == '@', 'pts_off'].sum(),
#     'home_pass_cmp': x.loc[x['game_location'] == 'N', 'pass_cmp'].sum() or x.loc[x['game_location'] == '', 'pass_cmp'].sum(),
#     'away_pass_cmp': x.loc[x['game_location'] == '@', 'pass_cmp'].sum(),
#     'home_pass_att': x.loc[x['game_location'] == 'N', 'pass_att'].sum() or x.loc[x['game_location'] == '', 'pass_att'].sum(),
#     'away_pass_att': x.loc[x['game_location'] == '@', 'pass_att'].sum(),
#     'home_pass_yds': x.loc[x['game_location'] == 'N', 'pass_yds'].sum() or x.loc[x['game_location'] == '', 'pass_yds'].sum(),
#     'away_pass_yds': x.loc[x['game_location'] == '@', 'pass_yds'].sum(),
#     'home_pass_td': x.loc[x['game_location'] == 'N', 'pass_td'].sum() or x.loc[x['game_location'] == '', 'pass_td'].sum(),
#     'away_pass_td': x.loc[x['game_location'] == '@', 'pass_td'].sum(),
#     'home_pass_int': x.loc[x['game_location'] == 'N', 'pass_int'].sum() or x.loc[x['game_location'] == '', 'pass_int'].sum(),
#     'away_pass_int': x.loc[x['game_location'] == '@', 'pass_int'].sum(),
#     'home_pass_sacked': x.loc[x['game_location'] == 'N', 'pass_sacked'].sum() or x.loc[x['game_location'] == '', 'pass_sacked'].sum(),
#     'away_pass_sacked': x.loc[x['game_location'] == '@', 'pass_sacked'].sum(),
#     'home_pass_yds_per_att': x.loc[x['game_location'] == 'N', 'pass_yds_per_att'].mean() or x.loc[x['game_location'] == '', 'pass_yds_per_att'].mean(),
#     'away_pass_yds_per_att': x.loc[x['game_location'] == '@', 'pass_yds_per_att'].mean(),
#     'home_pass_net_yds_per_att': x.loc[x['game_location'] == 'N', 'pass_net_yds_per_att'].mean() or x.loc[x['game_location'] == '', 'pass_net_yds_per_att'].mean(),
#     'away_pass_net_yds_per_att': x.loc[x['game_location'] == '@', 'pass_net_yds_per_att'].mean(),
#     'home_pass_cmp_perc': x.loc[x['game_location'] == 'N', 'pass_cmp_perc'].mean() or x.loc[x['game_location'] == '', 'pass_cmp_perc'].mean(),
#     'away_pass_cmp_perc': x.loc[x['game_location'] == '@', 'pass_cmp_perc'].mean(),
#     'home_pass_rating': x.loc[x['game_location'] == 'N', 'pass_rating'].mean() or x.loc[x['game_location'] == '', 'pass_rating'].mean(),
#     'away_pass_rating': x.loc[x['game_location'] == '@', 'pass_rating'].mean(),
#     'home_rush_att': x.loc[x['game_location'] == 'N', 'rush_att'].sum() or x.loc[x['game_location'] == '', 'rush_att'].sum(),
#     'away_rush_att': x.loc[x['game_location'] == '@', 'rush_att'].sum(),
#     'home_rush_yds': x.loc[x['game_location'] == 'N', 'rush_yds'].sum() or x.loc[x['game_location'] == '', 'rush_yds'].sum(),
#     'away_rush_yds': x.loc[x['game_location'] == '@', 'rush_yds'].sum(),
#     'home_rush_yds_per_att': x.loc[x['game_location'] == 'N', 'rush_yds_per_att'].mean() or x.loc[x['game_location'] == '', 'rush_yds_per_att'].mean(),
#     'away_rush_yds_per_att': x.loc[x['game_location'] == '@', 'rush_yds_per_att'].mean(),
#     'home_rush_td': x.loc[x['game_location'] == 'N', 'rush_td'].sum() or x.loc[x['game_location'] == '', 'rush_td'].sum(),
#     'away_rush_td': x.loc[x['game_location'] == '@', 'rush_td'].sum(),
# }))

# # Save the result to a CSV file if needed
# grouped_df.to_csv('data/all_team_game_logs.csv', index=True)

# Aggregate all_game_logs.csv to single row per game

df = pd.read_csv('data/all_team_game_logs.csv')

# Grouping the data by game_id and aggregating the stats separately for home and away teams
grouped_df = df.groupby('game_id', group_keys=False).apply(lambda x: pd.Series({
    'season': x['season'].iloc[0],  # Ensure the season is included from the first entry
    'home_pts_off': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'pts_off'].sum(),
    'away_pts_off': x.loc[x['game_location'] == '@', 'pts_off'].sum(),
    'home_pass_cmp': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'pass_cmp'].sum(),
    'away_pass_cmp': x.loc[x['game_location'] == '@', 'pass_cmp'].sum(),
    'home_pass_att': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'pass_att'].sum(),
    'away_pass_att': x.loc[x['game_location'] == '@', 'pass_att'].sum(),
    'home_pass_yds': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'pass_yds'].sum(),
    'away_pass_yds': x.loc[x['game_location'] == '@', 'pass_yds'].sum(),
    'home_pass_td': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'pass_td'].sum(),
    'away_pass_td': x.loc[x['game_location'] == '@', 'pass_td'].sum(),
    'home_pass_int': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'pass_int'].sum(),
    'away_pass_int': x.loc[x['game_location'] == '@', 'pass_int'].sum(),
    'home_pass_sacked': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'pass_sacked'].sum(),
    'away_pass_sacked': x.loc[x['game_location'] == '@', 'pass_sacked'].sum(),
    'home_pass_yds_per_att': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'pass_yds_per_att'].mean(),
    'away_pass_yds_per_att': x.loc[x['game_location'] == '@', 'pass_yds_per_att'].mean(),
    'home_pass_net_yds_per_att': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'pass_net_yds_per_att'].mean(),
    'away_pass_net_yds_per_att': x.loc[x['game_location'] == '@', 'pass_net_yds_per_att'].mean(),
    'home_pass_cmp_perc': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'pass_cmp_perc'].mean(),
    'away_pass_cmp_perc': x.loc[x['game_location'] == '@', 'pass_cmp_perc'].mean(),
    'home_pass_rating': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'pass_rating'].mean(),
    'away_pass_rating': x.loc[x['game_location'] == '@', 'pass_rating'].mean(),
    'home_rush_att': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'rush_att'].sum(),
    'away_rush_att': x.loc[x['game_location'] == '@', 'rush_att'].sum(),
    'home_rush_yds': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'rush_yds'].sum(),
    'away_rush_yds': x.loc[x['game_location'] == '@', 'rush_yds'].sum(),
    'home_rush_yds_per_att': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'rush_yds_per_att'].mean(),
    'away_rush_yds_per_att': x.loc[x['game_location'] == '@', 'rush_yds_per_att'].mean(),
    'home_rush_td': x.loc[x['game_location'].isnull() | (x['game_location'] == ''), 'rush_td'].sum(),
    'away_rush_td': x.loc[x['game_location'] == '@', 'rush_td'].sum(),
}))

# Save the result to a CSV file if needed
grouped_df.to_csv('data/all_team_game_logs.csv', index=True)


FileNotFoundError: [Errno 2] No such file or directory: 'data/all_team_game_logs.csv'

In [None]:
!open data/all_team_game_logs.csv
# !open data/all_opponent_team_game_logs.csv

In [None]:
# Team Stats and Rankings
# Not in nfl.db currently

import os
import requests
from bs4 import BeautifulSoup
import csv
from time import sleep

# Create directories if they don't exist
data_dir = './data/SR-team-stats'
os.makedirs(data_dir, exist_ok=True)

# List of teams and abbreviations
teams = [
    ['crd', 'Arizona Cardinals'],
    ['atl', 'Atlanta Falcons'],
    ['rav', 'Baltimore Ravens'],
    ['buf', 'Buffalo Bills'],
    ['car', 'Carolina Panthers'],
    ['chi', 'Chicago Bears'],
    ['cin', 'Cincinnati Bengals'],
    ['cle', 'Cleveland Browns'],
    ['dal', 'Dallas Cowboys'],
    ['den', 'Denver Broncos'],
    ['det', 'Detroit Lions'],
    ['gnb', 'Green Bay Packers'],
    ['htx', 'Houston Texans'],
    ['clt', 'Indianapolis Colts'],
    ['jax', 'Jacksonville Jaguars'],
    ['kan', 'Kansas City Chiefs'],
    ['sdg', 'Los Angeles Chargers'],
    ['ram', 'Los Angeles Rams'],
    ['rai', 'Las Vegas Raiders'],
    ['mia', 'Miami Dolphins'],
    ['min', 'Minnesota Vikings'],
    ['nwe', 'New England Patriots'],
    ['nor', 'New Orleans Saints'],
    ['nyg', 'New York Giants'],
    ['nyj', 'New York Jets'],
    ['phi', 'Philadelphia Eagles'],
    ['pit', 'Pittsburgh Steelers'],
    ['sea', 'Seattle Seahawks'],
    ['sfo', 'San Francisco 49ers'],
    ['tam', 'Tampa Bay Buccaneers'],
    ['oti', 'Tennessee Titans'],
    ['was', 'Washington Commanders']
]

# Define headers for team stats CSV
team_stats_headers = [
    'Player', 'PF', 'Yds', 'Ply', 'Y/P', 'TO', 'FL', '1stD', 'Cmp', 'Att', 'Yds', 'TD', 'Int', 'NY/A',
    '1stD', 'Att', 'Yds', 'TD', 'Y/A', '1stD', 'Pen', 'Yds', '1stPy', '#Dr', 'Sc%', 'TO%', 'Start', 'Time', 'Plays', 'Yds', 'Pts', 'Team'
]

# Loop through the years
# for year in range(2015, 2025):
for year in range(2023, 2025):
    all_team_stats = []  # Create empty list to accumulate team stats data for each year

    for team in teams:
        abbreviation, name = team
        print(f'Processing {name} for the year {year}')  # Include the year in the print statement
        url = f'https://www.pro-football-reference.com/teams/{abbreviation}/{year}.htm'
        response = requests.get(url)

        if response.status_code != 200:
            print(f'Failed to retrieve page {url} for {name} in {year}: {response.status_code}')
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the main table for team stats (e.g., "team_stats" table)
        table = soup.find('table', {'id': 'team_stats'})
        
        if table is None:
            print(f'Team stats table not found on page {url} for {name} in {year}')
            continue

        tbody = table.find('tbody')
        for tr in tbody.find_all('tr'):
            row_data = [tr.find('th').text.strip()]  # Start with the 'Player' column data
            row_data.extend([td.text.strip() for td in tr.find_all('td')])  # Add the rest of the row data
            row_data.append(abbreviation)  # Append team abbreviation as the last column
            all_team_stats.append(row_data)

        sleep(2.5)  # Sleep for 2.5 seconds after processing each team

    # Save the accumulated team stats data to a CSV file, named based on the year
    with open(f'{data_dir}/all_teams_stats_{year}.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(team_stats_headers)
        writer.writerows(all_team_stats)

    print(f'Saved data for all teams for the year {year}')


In [None]:
# Schedule & Game Results 
# Not in nfl.db currently

# Create directories if they don't exist
data_dir = './data/SR-schedule-and-game-results'
os.makedirs(data_dir, exist_ok=True)

# List of teams and abbreviations
teams = [
    ['crd', 'Arizona Cardinals'],
    ['atl', 'Atlanta Falcons'],
    ['rav', 'Baltimore Ravens'],
    ['buf', 'Buffalo Bills'],
    ['car', 'Carolina Panthers'],
    ['chi', 'Chicago Bears'],
    ['cin', 'Cincinnati Bengals'],
    ['cle', 'Cleveland Browns'],
    ['dal', 'Dallas Cowboys'],
    ['den', 'Denver Broncos'],
    ['det', 'Detroit Lions'],
    ['gnb', 'Green Bay Packers'],
    ['htx', 'Houston Texans'],
    ['clt', 'Indianapolis Colts'],
    ['jax', 'Jacksonville Jaguars'],
    ['kan', 'Kansas City Chiefs'],
    ['sdg', 'Los Angeles Chargers'],
    ['ram', 'Los Angeles Rams'],
    ['rai', 'Las Vegas Raiders'],
    ['mia', 'Miami Dolphins'],
    ['min', 'Minnesota Vikings'],
    ['nwe', 'New England Patriots'],
    ['nor', 'New Orleans Saints'],
    ['nyg', 'New York Giants'],
    ['nyj', 'New York Jets'],
    ['phi', 'Philadelphia Eagles'],
    ['pit', 'Pittsburgh Steelers'],
    ['sea', 'Seattle Seahawks'],
    ['sfo', 'San Francisco 49ers'],
    ['tam', 'Tampa Bay Buccaneers'],
    ['oti', 'Tennessee Titans'],
    ['was', 'Washington Commanders']
]

# Updated headers for the schedule and game results CSV
schedule_headers = [
    'Week', 'Day', 'Date', 'Time', 'Boxscore', 'Outcome', 'OT', 'Rec', 'Home/Away', 'Opp', 
    'Tm', 'OppPts', '1stD', 'TotYd', 'PassY', 'RushY', 'TO_lost', 
    'Opp1stD', 'OppTotYd', 'OppPassY', 'OppRushY', 'TO_won',
    'Offense', 'Defense', 'Sp. Tms'
]

# Loop through the years
# for year in range(2015, 2025):
for year in range(2023, 2025):
    all_games = []  # Create an empty list to accumulate game data for each year

    for team in teams:
        abbreviation, name = team
        print(f'Processing {name} for the year {year}')  # Include the year in the print statement
        url = f'https://www.pro-football-reference.com/teams/{abbreviation}/{year}.htm'
        response = requests.get(url)

        if response.status_code != 200:
            print(f'Failed to retrieve page {url} for {name} in {year}: {response.status_code}')
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the "Schedule & Game Results" table
        table = soup.find('table', {'id': 'games'})
        if table is None:
            print(f'Schedule & Game Results table not found on page {url} for {name} in {year}')
            continue

        tbody = table.find('tbody')
        team_games = []  # Store game data for this team

        for tr in tbody.find_all('tr'):
            # Initialize the row with the Week number
            row_data = []
            week_th = tr.find('th', {'data-stat': 'week_num'})
            week_num = week_th.text.strip() if week_th else ''
            row_data.append(week_num)

            # Add the rest of the data from 'td' elements
            for td in tr.find_all('td'):
                row_data.append(td.text.strip())

            # Ensure row_data matches the number of headers
            if len(row_data) != len(schedule_headers):
                row_data += [''] * (len(schedule_headers) - len(row_data))

            team_games.append(row_data)
            all_games.append(row_data)  # Also add to the all_games list

        # Save the team's data to its own CSV file
        team_file_path = f'{data_dir}/{abbreviation}_{year}_schedule_and_game_results.csv'
        with open(team_file_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(schedule_headers)
            writer.writerows(team_games)

        print(f'Saved schedule data for {name} for the year {year}')
        sleep(2.5)  # Sleep for 2.5 seconds after processing each team

# Merge all years and teams into all_teams_schedule_and_game_results_merged.csv
data_dir = './data/SR-schedule-and-game-results'

# List to hold all data from the files
all_games = []

# Iterate over all files in the directory
for filename in os.listdir(data_dir):
    if filename.endswith("_schedule_and_game_results.csv"):
        # Extract the team abbreviation and year from the filename
        team_abbr = filename.split('_')[0]
        season_year = filename.split('_')[1]
        
        # Construct the full path to the file
        file_path = os.path.join(data_dir, filename)
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        # Add new columns for the team abbreviation and season year
        df['Team'] = team_abbr
        df['Season'] = season_year
        
        # Append the DataFrame to the list of all games
        all_games.append(df)

# Concatenate all the DataFrames into a single DataFrame
merged_df = pd.concat(all_games, ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_output_path = os.path.join(data_dir, 'all_teams_schedule_and_game_results_merged.csv')
merged_df.to_csv(merged_output_path, index=False)

print(f"Successfully merged all team files into {merged_output_path}")

# # Standardize the Team column abbreviations
# team_abbreviation_mapping = {
#     'gnb': 'gb',
#     'htx': 'hou',
#     'clt': 'ind',
#     'kan': 'kc',
#     'sdg': 'lac',
#     'ram': 'lar',
#     'rai': 'lvr',
#     'nwe': 'ne',
#     'nor': 'no',
#     'sfo': 'sf',
#     'tam': 'tb',
#     'oti': 'ten',
#     'rav': 'bal',
#     'crd': 'ari'
# }

In [None]:
# Team Conversions 
# Not in nfl.db currently

# Create directories if they don't exist
data_dir = './data/SR-team-conversions'
os.makedirs(data_dir, exist_ok=True)

# List of teams and abbreviations
teams = [
    ['crd', 'Arizona Cardinals'],
    ['atl', 'Atlanta Falcons'],
    ['rav', 'Baltimore Ravens'],
    ['buf', 'Buffalo Bills'],
    ['car', 'Carolina Panthers'],
    ['chi', 'Chicago Bears'],
    ['cin', 'Cincinnati Bengals'],
    ['cle', 'Cleveland Browns'],
    ['dal', 'Dallas Cowboys'],
    ['den', 'Denver Broncos'],
    ['det', 'Detroit Lions'],
    ['gnb', 'Green Bay Packers'],
    ['htx', 'Houston Texans'],
    ['clt', 'Indianapolis Colts'],
    ['jax', 'Jacksonville Jaguars'],
    ['kan', 'Kansas City Chiefs'],
    ['sdg', 'Los Angeles Chargers'],
    ['ram', 'Los Angeles Rams'],
    ['rai', 'Las Vegas Raiders'],
    ['mia', 'Miami Dolphins'],
    ['min', 'Minnesota Vikings'],
    ['nwe', 'New England Patriots'],
    ['nor', 'New Orleans Saints'],
    ['nyg', 'New York Giants'],
    ['nyj', 'New York Jets'],
    ['phi', 'Philadelphia Eagles'],
    ['pit', 'Pittsburgh Steelers'],
    ['sea', 'Seattle Seahawks'],
    ['sfo', 'San Francisco 49ers'],
    ['tam', 'Tampa Bay Buccaneers'],
    ['oti', 'Tennessee Titans'],
    ['was', 'Washington Commanders']
]

# Define headers for the team conversions CSV
team_conversions_headers = [
    'Player', '3DAtt', '3DConv', '3D%', '4DAtt', '4DConv', '4D%', 'RZAtt', 'RZTD', 'RZPct', 'Team'
]

# Loop through the years
# for year in range(2015, 2025):
for year in range(2023, 2025):
    for team in teams:
        abbreviation, name = team
        print(f'Processing {name} for the year {year}')  # Include the year in the print statement
        url = f'https://www.pro-football-reference.com/teams/{abbreviation}/{year}.htm'
        response = requests.get(url)

        if response.status_code != 200:
            print(f'Failed to retrieve page {url} for {name} in {year}: {response.status_code}')
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the "Team Conversions" table (e.g., "team_conversions" table)
        table = soup.find('table', {'id': 'team_conversions'})

        if table is None:
            print(f'Team Conversions table not found on page {url} for {name} in {year}')
            continue

        all_conversions = []
        tbody = table.find('tbody')
        for tr in tbody.find_all('tr'):
            row_data = [td.text.strip() for td in tr.find_all(['th', 'td'])]  # Extract row data including headers
            row_data.append(abbreviation)  # Append team abbreviation at the end
            all_conversions.append(row_data)

        # Save the conversion data for this team to a separate CSV file
        team_file = f'{data_dir}/{abbreviation}_{year}_team_conversions.csv'
        with open(team_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(team_conversions_headers)
            writer.writerows(all_conversions)

        print(f'Saved team conversions data for {name} for the year {year} to {team_file}')
        
        sleep(2.5)  # Sleep for 2.5 seconds after processing each team


In [None]:
# --- Creating home_spread and away_spread columns in nfl.db --- #
# Adjusting to also make team_favorite

# Function to calculate home_spread, away_spread, team_favorite, and team_covered
def calculate_spreads_and_favorite(spread_line, home_team, away_team, home_score, away_score):
    if spread_line is None or home_score is None or away_score is None:
        # Handle cases where spread_line or scores are missing
        return "N/A", "N/A", "N/A", "N/A"

    # Ensure spread_line is a float for arithmetic operations
    spread_line = float(spread_line)
    abs_spread = abs(spread_line)  # Use absolute value of the spread for comparisons

    if spread_line > 0:
        # Home team is favored
        home_spread = f"-{spread_line}"  # Home team is the favorite
        away_spread = f"+{spread_line}"  # Away team is the underdog
        team_favorite = home_team
        # Determine which team covered the spread
        if home_score > away_score + abs_spread:
            team_covered = home_team
        elif away_score > home_score - abs_spread:
            team_covered = away_team
        else:
            team_covered = "Push"
    else:
        # Away team is favored
        home_spread = f"+{-spread_line}"  # Home team is the underdog
        away_spread = f"-{-spread_line}"  # Away team is the favorite
        team_favorite = away_team
        # Determine which team covered the spread
        if away_score > home_score + abs_spread:
            team_covered = away_team
        elif home_score > away_score - abs_spread:
            team_covered = home_team
        else:
            team_covered = "Push"

    return home_spread, away_spread, team_favorite, team_covered

# Connect to the SQLite database
db_path = 'nfl.db'  # Update the path if needed
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Add new columns to the 'Games' table (if they don't already exist)
try:
    cursor.execute("ALTER TABLE Games ADD COLUMN home_spread TEXT;")
except sqlite3.OperationalError:
    pass  # Column already exists

try:
    cursor.execute("ALTER TABLE Games ADD COLUMN away_spread TEXT;")
except sqlite3.OperationalError:
    pass  # Column already exists

try:
    cursor.execute("ALTER TABLE Games ADD COLUMN team_favorite TEXT;")
except sqlite3.OperationalError:
    pass  # Column already exists

try:
    cursor.execute("ALTER TABLE Games ADD COLUMN team_covered TEXT;")
except sqlite3.OperationalError:
    pass  # Column already exists

# Update each row in the 'Games' table with home_spread, away_spread, team_favorite, and team_covered
cursor.execute("SELECT game_id, spread_line, home_team, away_team, home_score, away_score FROM Games;")
games = cursor.fetchall()

for game in games:
    game_id, spread_line, home_team, away_team, home_score, away_score = game
    home_spread, away_spread, team_favorite, team_covered = calculate_spreads_and_favorite(spread_line, home_team, away_team, home_score, away_score)
    update_query = "UPDATE Games SET home_spread = ?, away_spread = ?, team_favorite = ?, team_covered = ? WHERE game_id = ?;"
    cursor.execute(update_query, (home_spread, away_spread, team_favorite, team_covered, game_id))

# Commit the changes and close the connection
conn.commit()
conn.close()

print("Columns 'home_spread', 'away_spread', 'team_favorite', and 'team_covered' have been added and updated for all rows in the 'Games' table.")


In [None]:
# Passing/Rushing/Receiving (from boxscore pages)
# For longest reception
# Not in nfl.db currently

# import requests
# from bs4 import BeautifulSoup
# import csv
# import os
# import time

# # !mkdir data/passing-rushing-receiving-game-logs/

# # Passing, Rushing, and Receiving Game Logs
# # for year_to_scrape in range(2015, 2025):
# for year_to_scrape in range(2024, 2025):
#     # Initialize output CSV file with the year in its name
#     output_filename = f'./data/passing-rushing-receiving-game-logs/all_passing_rushing_receiving_{year_to_scrape}.csv'
#     with open(output_filename, 'w', newline='') as output_csvfile:
#         csvwriter = csv.writer(output_csvfile)
#         csvwriter.writerow([
#             'player', 'team', 'pass_cmp', 'pass_att', 'pass_yds', 'pass_td', 'pass_int', 'pass_sacked', 
#             'pass_sacked_yds', 'pass_long', 'pass_rating', 'rush_att', 'rush_yds', 'rush_td', 'rush_long', 
#             'targets', 'rec', 'rec_yds', 'rec_td', 'rec_long', 'fumbles', 'fumbles_lost', 'game_id'
#         ])  # Added 'game_id' to the header row

#         # Read the CSV file containing the game data
#         with open('./data/games.csv', 'r') as csvfile:
#             reader = csv.DictReader(csvfile)
#             rows = [row for row in reader if int(row['game_id'].split('_')[0]) == year_to_scrape]  # Filter rows for the year

#             for row in rows:
#                 pfr_value = row['pfr']
#                 game_id = row['game_id']

#                 # Form the URL using the 'pfr' value
#                 url = f"https://www.pro-football-reference.com/boxscores/{pfr_value}.htm"

#                 try:
#                     # Fetch the webpage
#                     response = requests.get(url)

#                     # Check if we are being rate limited (status code 429)
#                     if response.status_code == 429:
#                         print(f"Rate limit exceeded for URL {url}. Please try again later.")
#                         time.sleep(60)  # Sleep for 60 seconds before retrying
#                         continue

#                     soup = BeautifulSoup(response.text, 'html.parser')

#                     # Find the table containing player stats
#                     table = soup.find('div', id='div_player_offense')
#                     if table:
#                         # Loop through the rows to get the player stats
#                         for i, tr in enumerate(table.find_all('tr')):
#                             if i == 0:  # Skip the first header row
#                                 continue
#                             player_name = tr.find('th').get_text() if tr.find('th') else ''
#                             stats = [td.get_text() for td in tr.find_all('td')]
#                             row_data = [player_name] + stats + [game_id]  # Append game_id to the end of the row
#                             csvwriter.writerow(row_data)

#                     print(f"Successfully scraped data for game ID: {game_id}, PFR: {pfr_value}")

#                 except Exception as e:
#                     print(f"An error occurred while scraping {url}. Error: {e}")

#                 # Sleep for 2 seconds to avoid overloading the server
#                 time.sleep(2)

#     print(f"Scraping completed for {year_to_scrape}. Data saved to {output_filename}.")

import requests
from bs4 import BeautifulSoup
import csv
import os
import time

# Directory for saving game logs (ensure it exists)
os.makedirs('./data/passing-rushing-receiving-game-logs/', exist_ok=True)

# For years in range (2024 in this case)
for year_to_scrape in range(2023, 2025):
    output_filename = f'./data/passing-rushing-receiving-game-logs/all_passing_rushing_receiving_{year_to_scrape}.csv'
    
    # Initialize CSV file with a header, including 'player_id'
    with open(output_filename, 'w', newline='') as output_csvfile:
        csvwriter = csv.writer(output_csvfile)
        csvwriter.writerow([
            'player', 'player_id', 'team', 'pass_cmp', 'pass_att', 'pass_yds', 'pass_td', 'pass_int', 
            'pass_sacked', 'pass_sacked_yds', 'pass_long', 'pass_rating', 'rush_att', 'rush_yds', 'rush_td', 
            'rush_long', 'targets', 'rec', 'rec_yds', 'rec_td', 'rec_long', 'fumbles', 'fumbles_lost', 'game_id'
        ])  # Added 'player_id' to the header row

        # Open the game data file
        with open('./data/games.csv', 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            rows = [row for row in reader if int(row['game_id'].split('_')[0]) == year_to_scrape]

            for row in rows:
                pfr_value = row['pfr']
                game_id = row['game_id']

                url = f"https://www.pro-football-reference.com/boxscores/{pfr_value}.htm"
                
                try:
                    response = requests.get(url)

                    # Rate limit handling
                    if response.status_code == 429:
                        print(f"Rate limit exceeded for URL {url}. Please try again later.")
                        time.sleep(60)
                        continue

                    soup = BeautifulSoup(response.text, 'html.parser')

                    # Find the player offense table
                    table = soup.find('div', id='div_player_offense')
                    if table:
                        for i, tr in enumerate(table.find_all('tr')):
                            if i == 0:  # Skip header row
                                continue
                            player_cell = tr.find('th')
                            if player_cell:
                                player_name = player_cell.get_text()

                                # Extract href from <a> if it exists
                                player_link = player_cell.find('a')
                                player_id = player_link['href'].split('/')[-1] if player_link else None  # Extract only the 'player_id' part

                                # Collect stats
                                stats = [td.get_text() for td in tr.find_all('td')]
                                row_data = [player_name, player_id] + stats + [game_id]  # Append game_id and player_id
                                csvwriter.writerow(row_data)

                    print(f"Successfully scraped data for game ID: {game_id}, PFR: {pfr_value}")

                except Exception as e:
                    print(f"An error occurred while scraping {url}. Error: {e}")

                # Sleep between requests
                time.sleep(2)

    print(f"Scraping completed for {year_to_scrape}. Data saved to {output_filename}.")


In [None]:
# Clean weird rows ^

import pandas as pd
import os

# Directory path
directory = 'data/passing-rushing-receiving-game-logs/'

# Iterate through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        file_path = os.path.join(directory, filename)

        # Load the CSV file into a DataFrame
        df = pd.read_csv(file_path)

        # Remove rows where the 'player' column is "Player" or NaN (missing)
        df_cleaned = df[(df['player'] != 'Player') & (df['player'].notna())]

        # Save the cleaned DataFrame back to the same CSV file
        df_cleaned.to_csv(file_path, index=False)

        print(f"Processed {filename}")


In [None]:
# Merge all ^

import pandas as pd
import os

# Directory path
directory = 'data/passing-rushing-receiving-game-logs/'
merged_file_path = 'data/all_passing_rushing_receiving.csv'  # Path where the merged file will be saved

# List to hold all DataFrames
dataframes = []

# Iterate through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        file_path = os.path.join(directory, filename)
        
        # Load the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        # Append the DataFrame to the list
        dataframes.append(df)
        
        print(f"Added {filename} to the merge list")

# Concatenate all DataFrames in the list into one large DataFrame
merged_df = pd.concat(dataframes, ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv(merged_file_path, index=False)

print(f"All files have been merged into {merged_file_path}")

In [None]:
# Add opponent_team column ^

file_path = 'data/all_passing_rushing_receiving.csv'  # Replace with your actual file path
df = pd.read_csv(file_path)

# Dictionary to map incorrect team codes to the correct ones
team_corrections = {
    'NWE': 'NE',
    'GNB': 'GB',
    'KAN': 'KC',
    'STL': 'LAR',
    'NOR': 'NO',
    'SDG': 'LAC',
    'OAK': 'LVR',
    'TAM': 'TB',
    'SFO': 'SF'
}

# Apply the corrections to the 'team' column
df['team'] = df['team'].replace(team_corrections)

# Function to extract the opponent team from the game_id column
def get_opponent_team(row):
    game_id = row['game_id']
    team = row['team']
    
    # Split the game_id to extract the away and home teams
    _, _, away_team, home_team = game_id.split('_')
    
    # Determine the opponent based on whether the player's team is the home or away team
    if team == home_team:
        return away_team
    elif team == away_team:
        return home_team
    else:
        return None  # In case the team does not match either home or away (shouldn't happen)

# Function to determine if the player was at home or away
def is_player_home(row):
    game_id = row['game_id']
    team = row['team']
    
    # Split the game_id to extract the away and home teams
    _, _, away_team, home_team = game_id.split('_')
    
    # Check if the player's team is the home team
    return 'y' if team == home_team else 'n'

# Apply the functions to create new columns 'opponent_team' and 'home'
df['opponent_team'] = df.apply(get_opponent_team, axis=1)
df['home'] = df.apply(is_player_home, axis=1)

# Save the updated dataframe to the same CSV file
df.to_csv('data/all_passing_rushing_receiving.csv', index=False)  # Save the result

# Optionally display the first few rows to verify the changes
print(df.head())
# !open data/all_passing_rushing_receiving.csv

In [None]:
# Add position column ^

# Load the CSV files
all_passing_file = 'data/all_passing_rushing_receiving.csv'
# rosters_file = 'data/Rosters.csv'
rosters_file = 'data/rosters.csv'

all_passing_df = pd.read_csv(all_passing_file)
rosters_df = pd.read_csv(rosters_file)

# Merge the two dataframes on player names
merged_df = pd.merge(all_passing_df, rosters_df[['full_name', 'position']], 
                     left_on='player', right_on='full_name', how='left')

# Ensure the 'position' column exists even if no matches are found
if 'position' not in merged_df.columns:
    merged_df['position'] = None

# Filter only relevant positions (QB, WR, TE, RB)
relevant_positions = ['QB', 'WR', 'TE', 'RB']
merged_df['position'] = merged_df['position'].where(merged_df['position'].isin(relevant_positions), None)

# Drop the full_name column that was added during the merge
merged_df.drop(columns=['full_name'], inplace=True)

# Ensure all rows for a player have the same position
merged_df['position'] = merged_df.groupby('player')['position'].transform(lambda x: x.ffill().bfill())

# Save the updated dataframe to a new CSV file
merged_df.to_csv('data/all_passing_rushing_receiving.csv', index=False)

# Optionally display the updated dataframe
print(merged_df[['player', 'position']].head())

# Optional command to open the CSV file (depending on your environment)
# !open data/all_passing_rushing_receiving.csv

In [None]:
# # MERGE PLAYER ID'S

# # Load the CSV files
# roster_df = pd.read_csv('data/rosters.csv')
# new_stats_df = pd.read_csv('all_passing_rushing_receiving.csv')

# # Merging the 'gsis_id' from the roster dataframe into the 'new_stats_df' (all_passing_rushing_receiving_2024)
# # where 'player_id' matches 'pfr_id'
# merged_new_stats = new_stats_df.merge(roster_df[['gsis_id', 'pfr_id']], 
#                                       left_on='player_id', 
#                                       right_on='pfr_id', 
#                                       how='left')

# # Optional: Save the merged dataframe to a new CSV file if needed
# merged_new_stats.to_csv('data/all_passing_rushing_receiving.csv', index=False)

# # Display the first few rows of the merged dataframe
# print(merged_new_stats.head())


In [None]:
# # # # Defense (from boxscore pages)
# # # # For sacks/defensive INT
# # # # Not in nfl.db currently

import requests
from bs4 import BeautifulSoup, Comment
import csv
import os
import time

# Create the directory for defense game logs
os.makedirs('data/defense-game-logs', exist_ok=True)

# Headers for the defense stats, including 'game_id'
headers = [
    'player', 'team', 'def_int', 'def_int_yds', 'def_int_td', 'def_int_long', 'pass_defended', 'sacks',
    'tackles_combined', 'tackles_solo', 'tackles_assists', 'tackles_loss', 'qb_hits', 'fumbles_rec',
    'fumbles_rec_yds', 'fumbles_rec_td', 'fumbles_forced', 'game_id'
]

for year_to_scrape in range(2023, 2025):
    output_filename = f'./data/defense-game-logs/all_defense_{year_to_scrape}.csv'
    with open(output_filename, 'w', newline='') as output_csvfile:
        csvwriter = csv.writer(output_csvfile)
        csvwriter.writerow(headers)

        with open('./data/games.csv', 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            rows = [row for row in reader if int(row['game_id'].split('_')[0]) == year_to_scrape]

            for row in rows:
                # Check if 'away_score' and 'home_score' are both present
                if not row['away_score'] or not row['home_score']:
                    print(f"Skipping game {row['game_id']} due to missing scores.")
                    continue  # Skip this game if scores are missing
                
                pfr_value = row['pfr']
                game_id = row['game_id']
                url = f"https://www.pro-football-reference.com/boxscores/{pfr_value}.htm"

                try:
                    response = requests.get(url)

                    if response.status_code == 429:
                        print(f"Rate limit exceeded for URL {url}. Please try again later.")
                        time.sleep(3)
                        continue

                    soup = BeautifulSoup(response.text, 'html.parser')

                    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

                    for comment in comments:
                        soup_comment = BeautifulSoup(comment, 'html.parser')
                        table = soup_comment.find('table', id='player_defense')
                        if table:
                            for i, tr in enumerate(table.find_all('tr')):
                                if i == 0:
                                    continue
                                player_name = tr.find('th').get_text() if tr.find('th') else ''
                                stats = [td.get_text() for td in tr.find_all('td')]
                                row_data = [player_name] + stats + [game_id]
                                csvwriter.writerow(row_data)
                            print(f"Successfully scraped data for game ID: {game_id}, PFR: {pfr_value}")
                            break

                except Exception as e:
                    print(f"An error occurred while scraping {url}. Error: {e}")

                time.sleep(2)

    print(f"Scraping completed for {year_to_scrape}. Data saved to {output_filename}.")


In [None]:
# Clean bad rows ^

df = pd.read_csv('./data/defense-game-logs/all_defense_2024.csv')

# Drop rows with any missing data
df.dropna(inplace=True)

# Write the cleaned DataFrame back to the CSV file
df.to_csv('./data/defense-game-logs/all_defense_2024.csv', index=False)

# Loop version
# directory = './data/defense-game-logs/'

# # Loop through all files in the directory
# for filename in os.listdir(directory):
#     if filename.endswith('.csv'):  # Process only CSV files
#         file_path = os.path.join(directory, filename)
        
#         # Load the CSV file into a DataFrame
#         df = pd.read_csv(file_path)
        
#         # Drop rows with any missing data
#         df.dropna(inplace=True)
        
#         # Write the cleaned DataFrame back to the CSV file
#         df.to_csv(file_path, index=False)
#         print(f"Processed file: {filename}")


In [None]:
# Save all nfl.db to csv's

import sqlite3
import pandas as pd
from IPython.display import display

db_path = 'nfl.db'  # Update this path if needed

conn = sqlite3.connect(db_path)

# Get all table names
tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = conn.execute(tables_query).fetchall()

# Download each table to a CSV file
for table in tables:
    table_name = table[0]
    # Read the table into a pandas DataFrame
    df = pd.read_sql_query(f"SELECT * FROM {table_name}", conn)
    # Save the DataFrame to a CSV file
    csv_file_name = f"{table_name}.csv"
    df.to_csv(csv_file_name, index=False)
    print(f"Downloaded {table_name} to {csv_file_name}")

# Close the connection
conn.close()

In [None]:
!open PlayerStats.csv

In [None]:
# Save all nfl.db to other files

import json
import sqlite3
import os

# Connect to the database
db_path = 'nfl.db'
conn = sqlite3.connect(db_path)

# Get the directory where the database is located
base_dir = os.path.dirname(db_path)

# Function to export a table to JSON format
def export_table_to_json(table_name):
    output_path = os.path.join(base_dir, f"{table_name}.json")
    query = f"SELECT * FROM {table_name}"
    
    # Fetch all data from the table
    cursor = conn.execute(query)
    rows = cursor.fetchall()
    columns = [description[0] for description in cursor.description]
    
    # Convert to list of dicts
    data = [dict(zip(columns, row)) for row in rows]
    
    # Write to a JSON file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

# Export all tables to JSON
export_table_to_json('Teams')
export_table_to_json('Games')
export_table_to_json('PlayerStats')
export_table_to_json('Rosters')

import pandas as pd
import sqlite3
import os

# Connect to the database
db_path = 'nfl.db'
conn = sqlite3.connect(db_path)

# Get the directory where the database is located
base_dir = os.path.dirname(db_path)

# Function to export a table to XLSX format
def export_table_to_xlsx(table_name):
    output_path = os.path.join(base_dir, f"{table_name}.xlsx")
    query = f"SELECT * FROM {table_name}"
    
    # Read data into a pandas DataFrame
    df = pd.read_sql(query, conn)
    
    # Write to Excel file
    df.to_excel(output_path, index=False)

# Export all tables to XLSX
export_table_to_xlsx('Teams')
export_table_to_xlsx('Games')
export_table_to_xlsx('PlayerStats')
export_table_to_xlsx('Rosters')

import xml.etree.ElementTree as ET
import sqlite3
import os

# Connect to the database
db_path = 'nfl.db'
conn = sqlite3.connect(db_path)

# Get the directory where the database is located
base_dir = os.path.dirname(db_path)

# Function to export a table to XML format
def export_table_to_xml(table_name):
    output_path = os.path.join(base_dir, f"{table_name}.xml")
    query = f"SELECT * FROM {table_name}"
    
    # Fetch all data from the table
    cursor = conn.execute(query)
    rows = cursor.fetchall()
    columns = [description[0] for description in cursor.description]
    
    # Create the root element
    root = ET.Element(table_name)
    
    # Iterate over rows and create XML tree structure
    for row in rows:
        entry = ET.SubElement(root, "entry")
        for col_name, value in zip(columns, row):
            col_element = ET.SubElement(entry, col_name)
            col_element.text = str(value)
    
    # Write to an XML file
    tree = ET.ElementTree(root)
    tree.write(output_path, encoding='utf-8', xml_declaration=True)

# Export all tables to XML
export_table_to_xml('Teams')
export_table_to_xml('Games')
export_table_to_xml('PlayerStats')
export_table_to_xml('Rosters')


In [None]:
# JSON pieces

import json
import sqlite3
import os
import math

# Connect to the database
db_path = 'nfl.db'
conn = sqlite3.connect(db_path)

# Get the directory where the database is located
base_dir = os.path.dirname(db_path)

# Function to split data into chunks of approx 7MB
def split_data_into_chunks(data, max_size_mb=7):
    # Convert max_size to bytes (7MB = 7 * 1024 * 1024 bytes)
    max_size_bytes = max_size_mb * 1024 * 1024
    
    chunks = []
    current_chunk = []
    current_size = 0
    
    for record in data:
        record_size = len(json.dumps(record).encode('utf-8'))  # Estimate size of the record
        if current_size + record_size > max_size_bytes:
            chunks.append(current_chunk)
            current_chunk = []
            current_size = 0
        current_chunk.append(record)
        current_size += record_size
    
    # Append the final chunk
    if current_chunk:
        chunks.append(current_chunk)
    
    return chunks

# Function to export a table to JSON format in chunks
def export_table_to_json_in_chunks(table_name, max_size_mb=4):
    output_path_base = os.path.join(base_dir, table_name)
    query = f"SELECT * FROM {table_name}"
    
    # Fetch all data from the table
    cursor = conn.execute(query)
    rows = cursor.fetchall()
    columns = [description[0] for description in cursor.description]
    
    # Convert to list of dicts
    data = [dict(zip(columns, row)) for row in rows]
    
    # Split data into chunks
    data_chunks = split_data_into_chunks(data, max_size_mb=max_size_mb)
    
    # Write each chunk to a separate JSON file
    for i, chunk in enumerate(data_chunks, start=1):
        output_path = f"{output_path_base}{i}.json"
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(chunk, f, ensure_ascii=False, indent=4)

# Export tables with chunking for large datasets
export_table_to_json_in_chunks('Teams')  # Assuming Teams and Games are smaller
export_table_to_json_in_chunks('Games')
export_table_to_json_in_chunks('PlayerStats')  # Split into PlayerStats1, PlayerStats2, ...
export_table_to_json_in_chunks('Rosters')  # Split into Rosters1, Rosters2, ...


In [None]:
# # Print all tables and columns

# # Step 1: Connect to the SQLite database
# conn = sqlite3.connect('nfl.db')
# cursor = conn.cursor()

# # Step 2: Query all table names
# cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
# tables = cursor.fetchall()

# # Step 3: Print all table and column names
# for table_name in tables:
#     table_name = table_name[0]
#     print(f"Table: {table_name}")
    
#     cursor.execute(f"PRAGMA table_info({table_name});")
#     columns = cursor.fetchall()
    
#     for column in columns:
#         print(f"  Column: {column[1]}")
    
#     print()  # Add a newline for better readability

# # Close the connection
# conn.close()


In [None]:
# # Merge box scores

# import pandas as pd

# # Read the CSV files
# df1 = pd.read_csv('./data/box_scores.csv')
# df2 = pd.read_csv('./data/box_scores_2010_2019.csv')

# # Concatenate the dataframes
# merged_df = pd.concat([df1, df2])

# # Reset index if you want a clean index
# merged_df.reset_index(drop=True, inplace=True)

# # Save the merged dataframe to a new CSV file
# merged_df.to_csv('./data/box_scores.csv', index=False)

# print('Merged CSV saved as ./data/box_scores.csv')


In [None]:
# # Merge scoring tables (touchdown logs)

# # Directory containing the CSV files
# directory = './data/scoring-tables/'

# # Initialize an empty list to hold the dataframes
# dataframes = []

# # Iterate through each file in the directory
# for filename in os.listdir(directory):
#     if filename.endswith('.csv'):
#         file_path = os.path.join(directory, filename)
#         # Load the CSV file into a DataFrame
#         df = pd.read_csv(file_path)
#         # Append the DataFrame to the list
#         dataframes.append(df)

# # Concatenate all the DataFrames into one
# merged_df = pd.concat(dataframes, ignore_index=True)

# # Save the merged DataFrame to a CSV file
# merged_df.to_csv('./data/touchdown_logs.csv', index=False)

# print("Merging completed. The merged data is saved as 'touchdown_logs.csv'.")


In [None]:
# # Delete table 

# import sqlite3

# # Path to your SQLite database
# database_path = 'nfl.db'

# # Connect to the database
# conn = sqlite3.connect(database_path)
# cursor = conn.cursor()

# # SQL command to drop the Rosters table
# drop_table_query = "DROP TABLE IF EXISTS PlayerStats;"

# # Execute the query
# cursor.execute(drop_table_query)

# # Commit the changes
# conn.commit()

# # Close the connection
# conn.close()

# print("Rosters table deleted successfully.")

In [None]:
# Fix game log column names

import os
import pandas as pd

# Path to the folder containing the CSV files
folder_path = 'data/SR-game-logs/'

# List of new headers
new_headers = [
    'week_num', 'game_day_of_week', 'game_date', 'boxscore_word', 'game_outcome', 'overtime',
    'game_location', 'opp', 'pts_off', 'pts_def', 'pass_cmp', 'pass_att', 'pass_yds', 'pass_td',
    'pass_int', 'pass_sacked', 'pass_sacked_yds', 'pass_yds_per_att', 'pass_net_yds_per_att',
    'pass_cmp_perc', 'pass_rating', 'rush_att', 'rush_yds', 'rush_yds_per_att', 'rush_td', 'fgm',
    'fga', 'xpm', 'xpa', 'punt', 'punt_yds', 'third_down_success', 'third_down_att', 
    'fourth_down_success', 'fourth_down_att', 'time_of_poss', 'team_name'
]

# Loop over all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        # Full path to the CSV file
        file_path = os.path.join(folder_path, filename)
        
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Change the column headers to the new headers
        df.columns = new_headers
        
        # Save the CSV file back
        df.to_csv(file_path, index=False)
        
        print(f"Updated headers for {filename}")

print("All CSV files have been updated with new headers.")

In [None]:
# Fix opponent game log column names

# Fix game log column names

import os
import pandas as pd

# Path to the folder containing the CSV files
folder_path = 'data/SR-opponent-game-logs/'

# List of new headers
new_headers = [
    'week_num', 'game_day_of_week', 'game_date', 'boxscore_word', 'game_outcome', 'overtime',
    'game_location', 'opp', 'pts_off', 'pts_def', 'pass_cmp', 'pass_att', 'pass_yds', 'pass_td',
    'pass_int', 'pass_sacked', 'pass_sacked_yds', 'pass_yds_per_att', 'pass_net_yds_per_att',
    'pass_cmp_perc', 'pass_rating', 'rush_att', 'rush_yds', 'rush_yds_per_att', 'rush_td', 'fgm',
    'fga', 'xpm', 'xpa', 'punt', 'punt_yds', 'third_down_success', 'third_down_att', 
    'fourth_down_success', 'fourth_down_att', 'time_of_poss', 'team_name'
]

# Loop over all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        # Full path to the CSV file
        file_path = os.path.join(folder_path, filename)
        
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Change the column headers to the new headers
        df.columns = new_headers
        
        # Save the CSV file back
        df.to_csv(file_path, index=False)
        
        print(f"Updated headers for {filename}")

print("All CSV files have been updated with new headers.")