<a href="https://colab.research.google.com/github/bradymiller2310/FantasyFootballDashboard/blob/main/ESPN_FF_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [61]:
!pip install espn-api
!pip install pandas
!pip install openpyxl

#pip uninstall -y numpy catboost
!pip install numpy==1.24.4  # Safe version compatible with catboost
!pip install catboost --no-cache-dir



In [62]:
from espn_api.football import League
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool

In [63]:
league = League(league_id=1292514, year=2024, espn_s2="AECPNWyLvfnlcFfE0Xb73O%2BP5ZU9n1G%2FSnP3ixxvuqSwQRcQe7bEQfBuz3YBNeRluTYofdhX2aTajj0jZSF%2B8i9ewdk8Dwf6p9G215F2FjmI06wPKyhjleL4WzaT3PCiP15FDXe55YNMkQeJXgbsCuK1VHbdQ1Nxidq0JxAHezRn2zAt6WwNtSqrF0k2nNHV33VpJqPmMk4msFmaoCaz0UAp95xxy1YmXyEUnZWgA60HJkbsgYkZ8AaESwFENsPYdUvKYqCwnZLjvgI5varhOTDYekjvokdtaCoxJfzOdGLJ3g%3D%3D", swid="{D68FE5F2-46CE-421C-B248-DDA01360BB34}")
print(league.teams)  #Teams in league

[Team(Fightin' Furries), Team(Quon Solo), Team(Jen-eral ⚔️), Team(Captain Sweatpants), Team(Salmon LipBalm !!), Team(Graham’s Groupie), Team(pop-pop's bible study), Team(Bucktown Bandits), Team(bird gang), Team(bungalicious  💅)]


## **Testing out api functionalities**
*This section contains code chunks that load in various data - used to see what we had access to.*

In [64]:
for week in range(1, 15):
    print(f"Week {week} Schedule:")
    for matchup in league.scoreboard(week=week):
        print(f"{matchup.home_team} vs {matchup.away_team}")

Week 1 Schedule:
Team(Bucktown Bandits) vs Team(Graham’s Groupie)
Team(bungalicious  💅) vs Team(Fightin' Furries)
Team(bird gang) vs Team(Salmon LipBalm !!)
Team(pop-pop's bible study) vs Team(Quon Solo)
Team(Jen-eral ⚔️) vs Team(Captain Sweatpants)
Week 2 Schedule:
Team(Fightin' Furries) vs Team(Salmon LipBalm !!)
Team(Quon Solo) vs Team(Bucktown Bandits)
Team(Captain Sweatpants) vs Team(Graham’s Groupie)
Team(pop-pop's bible study) vs Team(bungalicious  💅)
Team(Jen-eral ⚔️) vs Team(bird gang)
Week 3 Schedule:
Team(Quon Solo) vs Team(Captain Sweatpants)
Team(Fightin' Furries) vs Team(pop-pop's bible study)
Team(Salmon LipBalm !!) vs Team(Jen-eral ⚔️)
Team(Bucktown Bandits) vs Team(bungalicious  💅)
Team(Graham’s Groupie) vs Team(bird gang)
Week 4 Schedule:
Team(pop-pop's bible study) vs Team(Jen-eral ⚔️)
Team(bungalicious  💅) vs Team(Quon Solo)
Team(bird gang) vs Team(Captain Sweatpants)
Team(Bucktown Bandits) vs Team(Fightin' Furries)
Team(Graham’s Groupie) vs Team(Salmon LipBalm !!)


In [65]:
free_agents = league.free_agents(size=20, position='QB')  # Get top 20 available QBs

print("Available Free Agents (QB):")
for player in free_agents:
    print(f"  - {player.name} ({player.proTeam})")

Available Free Agents (QB):
  - C.J. Stroud (HOU)
  - Tua Tagovailoa (MIA)
  - Anthony Richardson (IND)
  - Aaron Rodgers (None)
  - Kirk Cousins (ATL)
  - Bryce Young (CAR)
  - Michael Penix Jr. (ATL)
  - Trevor Lawrence (JAX)
  - Jameis Winston (CLE)
  - Derek Carr (NO)
  - Justin Fields (NYJ)
  - Joe Flacco (IND)
  - Deshaun Watson (CLE)
  - Cooper Rush (BAL)
  - Daniel Jones (IND)
  - Tom Brady (TB)
  - Mason Rudolph (PIT)
  - Mac Jones (SF)
  - Tommy DeVito (NYG)
  - Aidan O'Connell (LV)


In [66]:
for team in league.teams:
    print(f"\nTeam: {team.team_name}")
    print("Roster:")
    for player in team.roster:
        print(f"  - {player.name} ({player.position})")


Team: Fightin' Furries
Roster:
  - Tyreek Hill (WR)
  - Isiah Pacheco (RB)
  - Joe Mixon (RB)
  - Patrick Mahomes (QB)
  - Dalton Kincaid (TE)
  - Zay Flowers (WR)
  - Jaxon Smith-Njigba (WR)
  - Gus Edwards (RB)
  - Matthew Stafford (QB)
  - Kareem Hunt (RB)
  - Broncos D/ST (D/ST)
  - David Njoku (TE)
  - Caleb Williams (QB)
  - DeAndre Hopkins (WR)
  - Bills D/ST (D/ST)
  - Matthew Wright (K)

Team: Quon Solo
Roster:
  - Justin Jefferson (WR)
  - Kyren Williams (RB)
  - Marvin Harrison Jr. (WR)
  - Mike Evans (WR)
  - D'Andre Swift (RB)
  - Brian Robinson Jr. (RB)
  - Kyler Murray (QB)
  - J.K. Dobbins (RB)
  - Geno Smith (QB)
  - Chris Boswell (K)
  - Romeo Doubs (WR)
  - Mark Andrews (TE)
  - Bears D/ST (D/ST)
  - Courtland Sutton (WR)
  - Josh Downs (WR)
  - Evan Engram (TE)

Team: Jen-eral ⚔️
Roster:
  - Derrick Henry (RB)
  - A.J. Brown (WR)
  - Alvin Kamara (RB)
  - Jalen Hurts (QB)
  - Calvin Ridley (WR)
  - Raheem Mostert (RB)
  - Jets D/ST (D/ST)
  - Pat Freiermuth (TE)
  

In [67]:
week_number = 1
box_scores = league.box_scores(week=week_number)

for matchup in box_scores:
    print(f"\nMatchup: {matchup.home_team} vs {matchup.away_team}")
    print(f"  Home Score: {matchup.home_score}")
    print(f"  Away Score: {matchup.away_score}")


Matchup: Team(Bucktown Bandits) vs Team(Graham’s Groupie)
  Home Score: 73.0
  Away Score: 96.0

Matchup: Team(bungalicious  💅) vs Team(Fightin' Furries)
  Home Score: 101.0
  Away Score: 94.0

Matchup: Team(bird gang) vs Team(Salmon LipBalm !!)
  Home Score: 59.0
  Away Score: 101.0

Matchup: Team(pop-pop's bible study) vs Team(Quon Solo)
  Home Score: 79.0
  Away Score: 97.0

Matchup: Team(Jen-eral ⚔️) vs Team(Captain Sweatpants)
  Home Score: 82.0
  Away Score: 110.0


In [68]:
# getting injury status of free agents (not on rosters)

free_agents = league.free_agents()
i=0
for player in free_agents:
  player_obj = free_agents[i]
  print(f"Player: {player_obj.name}, Injury Status: {player_obj.injuryStatus}, Injured: {player_obj.injured}")
  i+=1

Player: Ladd McConkey, Injury Status: ACTIVE, Injured: False
Player: Keenan Allen, Injury Status: ACTIVE, Injured: False
Player: Michael Pittman Jr., Injury Status: QUESTIONABLE, Injured: False
Player: Steelers D/ST, Injury Status: [], Injured: False
Player: C.J. Stroud, Injury Status: ACTIVE, Injured: False
Player: Kyle Pitts, Injury Status: ACTIVE, Injured: False
Player: Adam Thielen, Injury Status: ACTIVE, Injured: False
Player: Chris Olave, Injury Status: QUESTIONABLE, Injured: False
Player: Tucker Kraft, Injury Status: ACTIVE, Injured: False
Player: 49ers D/ST, Injury Status: [], Injured: False
Player: Jerome Ford, Injury Status: QUESTIONABLE, Injured: False
Player: Texans D/ST, Injury Status: [], Injured: False
Player: Quentin Johnston, Injury Status: ACTIVE, Injured: False
Player: Tua Tagovailoa, Injury Status: QUESTIONABLE, Injured: False
Player: Nick Chubb, Injury Status: QUESTIONABLE, Injured: False
Player: Anthony Richardson, Injury Status: QUESTIONABLE, Injured: False
Playe

In [69]:
# getting injury status of players on rosters
for team in league.teams:
    print(f"\nTeam: {team.team_name}")
    for player in team.roster:
        print(f"Player: {player.name}, Injury Status: {player.injuryStatus}, Injured: {player.injured}")


Team: Fightin' Furries
Player: Tyreek Hill, Injury Status: QUESTIONABLE, Injured: False
Player: Isiah Pacheco, Injury Status: ACTIVE, Injured: False
Player: Joe Mixon, Injury Status: ACTIVE, Injured: False
Player: Patrick Mahomes, Injury Status: ACTIVE, Injured: False
Player: Dalton Kincaid, Injury Status: ACTIVE, Injured: False
Player: Zay Flowers, Injury Status: QUESTIONABLE, Injured: False
Player: Jaxon Smith-Njigba, Injury Status: ACTIVE, Injured: False
Player: Gus Edwards, Injury Status: ACTIVE, Injured: False
Player: Matthew Stafford, Injury Status: ACTIVE, Injured: False
Player: Kareem Hunt, Injury Status: ACTIVE, Injured: False
Player: Broncos D/ST, Injury Status: NORMAL, Injured: False
Player: David Njoku, Injury Status: QUESTIONABLE, Injured: False
Player: Caleb Williams, Injury Status: QUESTIONABLE, Injured: False
Player: DeAndre Hopkins, Injury Status: ACTIVE, Injured: False
Player: Bills D/ST, Injury Status: NORMAL, Injured: False
Player: Matthew Wright, Injury Status: AC

In [70]:
# getting starting roster for each week

for team in league.teams:
    print(f"\nTeam: {team.team_name} (Week {week})")

    # Get the team's roster for the given week
    for player in team.roster:
        #if player.lineupSlot != "BE" and player.lineupSlot != "IR":  # Exclude Bench and IR players
            print(f"  - {player.name} ({player.lineupSlot})")



Team: Fightin' Furries (Week 14)
  - Tyreek Hill (WR)
  - Isiah Pacheco (RB)
  - Joe Mixon (BE)
  - Patrick Mahomes (QB)
  - Dalton Kincaid (BE)
  - Zay Flowers (BE)
  - Jaxon Smith-Njigba (RB/WR/TE)
  - Gus Edwards (BE)
  - Matthew Stafford (BE)
  - Kareem Hunt (RB)
  - Broncos D/ST (BE)
  - David Njoku (TE)
  - Caleb Williams (BE)
  - DeAndre Hopkins (WR)
  - Bills D/ST (D/ST)
  - Matthew Wright (K)

Team: Quon Solo (Week 14)
  - Justin Jefferson (WR)
  - Kyren Williams (RB)
  - Marvin Harrison Jr. (BE)
  - Mike Evans (BE)
  - D'Andre Swift (RB)
  - Brian Robinson Jr. (RB/WR/TE)
  - Kyler Murray (QB)
  - J.K. Dobbins (BE)
  - Geno Smith (BE)
  - Chris Boswell (K)
  - Romeo Doubs (BE)
  - Mark Andrews (TE)
  - Bears D/ST (D/ST)
  - Courtland Sutton (WR)
  - Josh Downs (BE)
  - Evan Engram (BE)

Team: Jen-eral ⚔️ (Week 14)
  - Derrick Henry (RB)
  - A.J. Brown (WR)
  - Alvin Kamara (RB)
  - Jalen Hurts (QB)
  - Calvin Ridley (WR)
  - Raheem Mostert (BE)
  - Jets D/ST (D/ST)
  - Pat Fr

## **Getting the fantasy team rosters**
*Will use to join to player data to maintain team assigments*

In [71]:
# Define the range of weeks
weeks = range(1, 19)  # Weeks 1 to 18

# Create an empty list to store data
all_weeks_roster = []

# Loop through each week and get the roster data
for week in weeks:
    week_data = []  # Temporary storage for current week's data

    for team in league.teams:
        team_name = team.team_name  # Fantasy Team Name

        for player in team.roster:
            week_data.append({
                "cur_Fteam": team_name,
                "week": week,
                "espn_id": player.playerId,  # ESPN Unique Player ID
                "cur_roster_slot": player.position if player.lineupSlot != "BE" else "Bench"
            })

    # If no data is returned for this week, stop iterating
    if not week_data:
        print(f"No data found for Week {week}. Stopping iteration.")
        break  # Stop looping when a week has no data

    # Add this week's data to the master list
    all_weeks_roster.extend(week_data)

# Convert to DataFrame
weekly_rosters = pd.DataFrame(all_weeks_roster)

cur_roster = weekly_rosters[weekly_rosters['week'] == weekly_rosters['week'].max()].drop('week', axis=1)
print(cur_roster)

             cur_Fteam  espn_id cur_roster_slot
2720  Fightin' Furries  3116406              WR
2721  Fightin' Furries  4361529              RB
2722  Fightin' Furries  3116385           Bench
2723  Fightin' Furries  3139477              QB
2724  Fightin' Furries  4385690           Bench
...                ...      ...             ...
2875   bungalicious  💅  4243537           Bench
2876   bungalicious  💅  4689936               K
2877   bungalicious  💅  3912547              QB
2878   bungalicious  💅   -16011            D/ST
2879   bungalicious  💅  4426385              RB

[160 rows x 3 columns]


### **Getting statistical data from xlsx files**

In [72]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [73]:
file_path = "/content/drive/My Drive/ff_data.xlsx"

# Load specific sheets
season_stats_all = pd.read_excel(file_path, sheet_name="Season", engine="openpyxl")
weekly_stats_all = pd.read_excel(file_path, sheet_name="Weekly", engine="openpyxl")

# Display first few rows
print(season_stats_all.head())
print(weekly_stats_all.head())

   season   player_id player_display_name team position  games_played  \
0    2021  00-0019596           Tom Brady   TB       QB          20.0   
1    2021  00-0022924  Ben Roethlisberger  PIT       QB          18.0   
2    2021  00-0023459       Aaron Rodgers   GB       QB          17.0   
3    2021  00-0023682    Ryan Fitzpatrick  WAS       QB           1.0   
4    2021  00-0024243      Marcedes Lewis   GB       TE          14.0   

   completions  attempts  comp%  passing_yards  ...  total_st_tds  st_tds_pg  \
0        573.0     847.0  67.65         6187.0  ...           NaN        NaN   
1        448.0     693.0  64.65         4170.0  ...           NaN        NaN   
2        386.0     560.0  68.93         4340.0  ...           NaN        NaN   
3          3.0       6.0  50.00           13.0  ...           NaN        NaN   
4          0.0       0.0    NaN            0.0  ...           NaN        NaN   

   total_fp  fp_pg  espn_id                name   age  height  weight  \
0      

## **Joining current fantasy team roster data with weekly and season data**

### *Season data*

In [74]:
season_stats_all = pd.merge(season_stats_all, cur_roster, on="espn_id", how="left")

season_stats_all.loc[
    (season_stats_all['season'] == 2024) &
    (season_stats_all['cur_Fteam'] != 'FA') &
    (season_stats_all['cur_roster_slot'] != 'FA'),
    ['cur_Fteam', 'cur_roster_slot']
] = 'FA'

In [75]:
#print(season_stats_all.head(10))

### *Weekly data*

In [76]:
weekly_stats_all = pd.merge(weekly_stats_all, weekly_rosters, on=["espn_id", "week"], how="left")

weekly_stats_all.loc[
    (weekly_stats_all['season'] == 2024) &
    (weekly_stats_all['cur_Fteam'] != 'FA') &
    (weekly_stats_all['cur_roster_slot'] != 'FA'),
    ['cur_Fteam', 'cur_roster_slot']
] = 'FA'

In [77]:
#print(weekly_stats_all.head(10))

# **Getting matchup data**

In [78]:
total_weeks = league.settings.reg_season_count  # Regular season length

# Initialize records dict
team_records = {team.team_name: {'W': 0, 'L': 0} for team in league.teams}

# Store all weekly matchups and records
matchups_all_weeks = []

# Loop through each week
for week in range(1, total_weeks + 1):
    scoreboard = league.scoreboard(week=week)

    for matchup in scoreboard:
        home = matchup.home_team.team_name
        away = matchup.away_team.team_name
        home_score = matchup.home_score
        away_score = matchup.away_score

        # Determine winner and update records
        if home_score > away_score:
            team_records[home]['W'] += 1
            team_records[away]['L'] += 1
        elif home_score < away_score:
            team_records[away]['W'] += 1
            team_records[home]['L'] += 1
        else:
            # Tie case (optional)
            pass

        # Record after this week
        home_record = f"{team_records[home]['W']}-{team_records[home]['L']}"
        away_record = f"{team_records[away]['W']}-{team_records[away]['L']}"

        matchups_all_weeks.append({
            'week': week,
            'home_team': home,
            'away_team': away,
            'home_score': home_score,
            'away_score': away_score,
            'home_record': home_record,
            'away_record': away_record
        })

# Convert to DataFrame
weekly_matchup_data = pd.DataFrame(matchups_all_weeks)

# Display the result
print(weekly_matchup_data)

    week              home_team              away_team  home_score  \
0      1       Bucktown Bandits       Graham’s Groupie        73.0   
1      1        bungalicious  💅       Fightin' Furries       101.0   
2      1              bird gang      Salmon LipBalm !!        59.0   
3      1  pop-pop's bible study              Quon Solo        79.0   
4      1            Jen-eral ⚔️     Captain Sweatpants        82.0   
..   ...                    ...                    ...         ...   
65    14       Graham’s Groupie       Bucktown Bandits       143.0   
66    14      Salmon LipBalm !!       Fightin' Furries       115.0   
67    14     Captain Sweatpants              Quon Solo        88.0   
68    14            Jen-eral ⚔️  pop-pop's bible study        68.0   
69    14              bird gang        bungalicious  💅        81.0   

    away_score home_record away_record  
0         96.0         0-1         1-0  
1         94.0         1-0         0-1  
2        101.0         0-1         1

# **Getting injury status**
PROBABLY CAN'T USE B/C ONLY HAVE 2024 DATA

In [79]:
all_players = []
current_week = league.current_week

# 1. Rostered players from each team
for team in league.teams:
    for player in team.roster:
        all_players.append({
            "week": current_week,
            "team_owner": team.team_name,
            "name": player.name,
            "position": player.position,
            "espn_id": player.playerId,
            "injured": player.injured,
            "injury_status": player.injuryStatus,
            "rostered": True
        })

# 2. Free agents (unrostered players)
for player in league.free_agents(size=1000):  # you can adjust the size to pull more players
    all_players.append({
        "week": current_week,
        "team_owner": "Free Agent",
        "name": player.name,
        "position": player.position,
        "espn_id": player.playerId,
        "injured": player.injured,
        "injury_status": player.injuryStatus,
        "rostered": False
    })

# Optional: Convert to pandas DataFrame
import pandas as pd
injury_status = pd.DataFrame(all_players)

# Display or export
print(injury_status.head(10))

print(injury_status[injury_status["team_owner"] == "Free Agent"].head(10))

   week        team_owner                name position  espn_id  injured  \
0    17  Fightin' Furries         Tyreek Hill       WR  3116406    False   
1    17  Fightin' Furries       Isiah Pacheco       RB  4361529    False   
2    17  Fightin' Furries           Joe Mixon       RB  3116385    False   
3    17  Fightin' Furries     Patrick Mahomes       QB  3139477    False   
4    17  Fightin' Furries      Dalton Kincaid       TE  4385690    False   
5    17  Fightin' Furries         Zay Flowers       WR  4429615    False   
6    17  Fightin' Furries  Jaxon Smith-Njigba       WR  4430878    False   
7    17  Fightin' Furries         Gus Edwards       RB  3051926    False   
8    17  Fightin' Furries    Matthew Stafford       QB    12483    False   
9    17  Fightin' Furries         Kareem Hunt       RB  3059915    False   

  injury_status  rostered  
0  QUESTIONABLE      True  
1        ACTIVE      True  
2        ACTIVE      True  
3        ACTIVE      True  
4        ACTIVE      Tr

# **QB CatBoost Model**

## **Feature Engineering**
*Need to create features that can be used in both train and test sets since we can't use rush yards, pass yards (any game specific stats) because they have not occurred and need to use like rolling features or averages*

In [80]:
df_qb = weekly_stats_all[weekly_stats_all['position'] == 'QB'].copy()

# Sorting data for time-series purposes
df_qb = df_qb.sort_values(by=['player_id', 'season', 'week'])

# Creating lag/rolling features for model
# Takes into account past 3 games (or less, depending on the data available that week)
rolling_cols = [
    'passing_yards', 'passing_tds', 'interceptions', 'passing_epa',
    'rushing_yards', 'rushing_tds', 'completions', 'attempts',
    'fantasy_points_ppr', 'passing_air_yards', 'passing_yards_after_catch',
    'rushing_fumbles_lost', 'rushing_epa', 'rushing_2pt_conversions', 'passing_2pt_conversions'
]

for col in rolling_cols:
    df_qb[f'{col}_last3'] = df_qb.groupby('player_id')[col].shift(1).rolling(3, min_periods = 1).mean()

# Season averages
expanding_cols = ['passing_yards', 'passing_tds', 'interceptions', 'rushing_yards', 'rushing_tds', 'fantasy_points_ppr'
]

for col in expanding_cols:
    df_qb[col] = pd.to_numeric(df_qb[col], errors='coerce')

for col in expanding_cols:
    df_qb[f'{col}_season_avg'] = (
        df_qb.groupby(['player_id', 'season'])[col]
        .transform(lambda x: x.shift(1).expanding().mean())
    )

df_qb['completions'] = pd.to_numeric(df_qb['completions'], errors='coerce')
df_qb['attempts'] = pd.to_numeric(df_qb['attempts'], errors='coerce')

# creating yards per attempt feature
df_qb['yards_per_attempt'] = df_qb['passing_yards'] / df_qb['attempts'].replace(0, pd.NA)

# Creating completion percentage feature
df_qb['comp%'] = (
    (df_qb['completions'] / df_qb['attempts']) * 100
).replace([pd.NA, np.inf, -np.inf], np.nan)

df_qb['comp%'] = pd.to_numeric(df_qb['comp%'], errors='coerce')

df_qb['yards_per_attempt'] = pd.to_numeric(df_qb['yards_per_attempt'], errors='coerce')

print(df_qb.head(10))

    player_id player_name player_display_name position position_group  \
0  00-0019596     T.Brady           Tom Brady       QB             QB   
1  00-0019596     T.Brady           Tom Brady       QB             QB   
2  00-0019596     T.Brady           Tom Brady       QB             QB   
3  00-0019596     T.Brady           Tom Brady       QB             QB   
4  00-0019596     T.Brady           Tom Brady       QB             QB   
5  00-0019596     T.Brady           Tom Brady       QB             QB   
6  00-0019596     T.Brady           Tom Brady       QB             QB   
7  00-0019596     T.Brady           Tom Brady       QB             QB   
8  00-0019596     T.Brady           Tom Brady       QB             QB   
9  00-0019596     T.Brady           Tom Brady       QB             QB   

                                        headshot_url recent_team  season  \
0  https://static.www.nfl.com/image/private/f_aut...          TB    2021   
1  https://static.www.nfl.com/image/private/

In [81]:
#Creating some efficiency metrics
df_qb['pass_attempts_shifted'] = df_qb.groupby('player_id')['attempts'].shift(1)

df_qb['comp_pct_last3'] = df_qb.groupby('player_id')['comp%'].shift(1).rolling(3,min_periods = 1).mean()
df_qb['yards_per_attempt_last3'] = df_qb.groupby('player_id')['yards_per_attempt'].shift(1).rolling(3,min_periods = 1).mean()
df_qb['td_rate_last3'] = df_qb.groupby('player_id')['passing_tds'].shift(1).rolling(3,min_periods = 1).sum() / df_qb['pass_attempts_shifted']
df_qb['int_rate_last3'] = df_qb.groupby('player_id')['interceptions'].shift(1).rolling(3,min_periods = 1).sum() / df_qb['pass_attempts_shifted']

# Creating rolling/lag features for advanced QB metrics
advanced = ['pacr', 'dakota']
for col in advanced:
    df_qb[f'{col}_trend'] = df_qb.groupby('player_id')[col].shift(1).rolling(3,min_periods = 1).mean()

# Creating defensive lag features
opponent_cols = ['opp_avg_ypg_allowed', 'opp_avg_ppg_allowed', 'opp_int_pg', 'opp_sacks_pg', 'opp_fumbles_pg']
for col in opponent_cols:
    df_qb[f'{col}_trend'] = df_qb.groupby('player_id')[col].shift(1).rolling(3,min_periods = 1).mean()


In [82]:
#print(df_qb.head(10))

***Getting list of columns to select from***

In [83]:
#print(df_qb.columns.tolist())

In [84]:
TARGET_COL = 'fantasy_points_ppr'

# Creating test and train sets
train_df = df_qb[
    (df_qb[TARGET_COL].notna()) &
    (
        (df_qb['season'] < 2024) |
        ((df_qb['season'] == 2024) & (df_qb['week'] < 14))
    )
]

test_df = df_qb[(df_qb['season'] == 2024) & (df_qb['week'] == 14)]


feature_cols = [
    col for col in df_qb.columns
    if col.endswith('_last3') or col.endswith('_trend') or col.endswith('_season_avg')
] + ['player_display_name','season', 'week', 'opponent_team']


X_train = train_df[feature_cols]
y_train = train_df[TARGET_COL]

X_test = test_df[feature_cols]

# for RMSE testing purposes on the test set
y_test = test_df[TARGET_COL]

In [85]:
categorical_cols = ['opponent_team', 'season', 'week', 'player_display_name']

# Create Pool objects
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_cols)
test_pool = Pool(data=X_test, cat_features=categorical_cols)

# Initialize the CatBoost model
model = CatBoostRegressor(
    iterations = 1000,
    learning_rate = 0.01,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    verbose=50  # or 0 to suppress output
)

# Train
model.fit(train_pool)

0:	learn: 8.9456128	total: 5.29ms	remaining: 5.29s
50:	learn: 8.1178320	total: 480ms	remaining: 8.94s
100:	learn: 7.6410595	total: 945ms	remaining: 8.41s
150:	learn: 7.3802963	total: 1.42s	remaining: 7.97s
200:	learn: 7.2176080	total: 1.9s	remaining: 7.55s
250:	learn: 7.0782452	total: 2.38s	remaining: 7.1s
300:	learn: 6.9539157	total: 2.85s	remaining: 6.63s
350:	learn: 6.8611232	total: 3.35s	remaining: 6.19s
400:	learn: 6.7691747	total: 3.83s	remaining: 5.72s
450:	learn: 6.6989539	total: 4.29s	remaining: 5.22s
500:	learn: 6.6265265	total: 4.76s	remaining: 4.75s
550:	learn: 6.5734129	total: 5.23s	remaining: 4.26s
600:	learn: 6.5214721	total: 5.7s	remaining: 3.78s
650:	learn: 6.4766232	total: 6.17s	remaining: 3.31s
700:	learn: 6.4189386	total: 6.66s	remaining: 2.84s
750:	learn: 6.3705137	total: 7.11s	remaining: 2.36s
800:	learn: 6.3211150	total: 7.57s	remaining: 1.88s
850:	learn: 6.2796464	total: 8.03s	remaining: 1.41s
900:	learn: 6.2402344	total: 8.53s	remaining: 937ms
950:	learn: 6.195

<catboost.core.CatBoostRegressor at 0x78564e26ef90>

In [86]:
y_pred = model.predict(test_pool)

# Attach predictions to test set for review
QB_test_pred = X_test.copy()
QB_test_pred['predicted_fantasy_points'] = y_pred
#QB_test_pred['espn_id'] = test_df['espn_id']
QB_test_pred['espn_id'] = test_df['espn_id'].astype(int)
QB_test_pred['position'] = "QB"
#print(QB_test_pred.head(10))
#print(QB_test_pred.columns.tolist())

print(QB_test_pred[['season', 'week', 'opponent_team', 'predicted_fantasy_points', 'player_display_name', 'espn_id', 'position']])

       season  week opponent_team  predicted_fantasy_points  \
16904    2024    14           MIA                 15.398341   
16933    2024    14           BUF                 16.767097   
16953    2024    14           CLE                 16.336262   
16971    2024    14           MIN                 14.764804   
17095    2024    14           ARI                 17.328012   
17141    2024    14           NYG                 16.465678   
17192    2024    14           PIT                 13.248461   
17272    2024    14            KC                  9.573386   
17368    2024    14           CHI                  9.949140   
17510    2024    14            GB                 17.270025   
17624    2024    14           ATL                  8.258414   
17781    2024    14           CIN                 13.767175   
17869    2024    14           LAC                 20.055098   
18439    2024    14            LV                 17.792246   
18457    2024    14            LA                 21.19

### Adding boom/bust prediction interval

In [87]:
# Finding standard deviation residuals from trian data
train_preds = model.predict(train_pool)
residuals = y_train - train_preds
std_resid = residuals.std()

# Creating boom/bust interval
ci_upper = y_pred + 1.645 * std_resid
ci_lower = y_pred - 1.645 * std_resid

# setting minimum to 0
ci_lower = np.maximum(ci_lower, 0)


QB_test_pred['upper_bound'] = ci_upper
QB_test_pred['lower_bound'] = ci_lower

print(QB_test_pred[['season', 'week', 'opponent_team', 'lower_bound', 'predicted_fantasy_points', 'upper_bound', 'player_display_name', 'espn_id']])

       season  week opponent_team  lower_bound  predicted_fantasy_points  \
16904    2024    14           MIA     5.165464                 15.398341   
16933    2024    14           BUF     6.534220                 16.767097   
16953    2024    14           CLE     6.103385                 16.336262   
16971    2024    14           MIN     4.531927                 14.764804   
17095    2024    14           ARI     7.095135                 17.328012   
17141    2024    14           NYG     6.232800                 16.465678   
17192    2024    14           PIT     3.015584                 13.248461   
17272    2024    14            KC     0.000000                  9.573386   
17368    2024    14           CHI     0.000000                  9.949140   
17510    2024    14            GB     7.037148                 17.270025   
17624    2024    14           ATL     0.000000                  8.258414   
17781    2024    14           CIN     3.534297                 13.767175   
17869    202

### Evaluating QB CatBoost model predictions

In [88]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Actuals and predictions
y_true = y_test
y_pred = QB_test_pred['predicted_fantasy_points']

# RMSE and MAE
rmse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"✅ RMSE: {rmse:.2f}")
print(f"✅ MAE: {mae:.2f}")

✅ RMSE: 66.43
✅ MAE: 5.82


# **RB CatBoost Model**

In [89]:
df_rb = weekly_stats_all[weekly_stats_all['position'] == 'RB'].copy()

# Sorting for time-series purposes
df_rb = df_rb.sort_values(by=['player_id', 'season', 'week'])

# Creating lag/rolling features for model
# Takes into account past 3 games (or less, depending on the data available that week)
rolling_cols = [
    'rushing_yards', 'rushing_tds',
    'fantasy_points_ppr', 'carries',
    'rushing_fumbles_lost', 'rushing_epa', 'rushing_2pt_conversions', 'passing_2pt_conversions', 'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards', 'receiving_yards_after_catch', 'receiving_epa', 'receiving_2pt_conversions', 'target_share', 'offense_snaps', 'offense_pct', 'air_yards_share'
    ]

for col in rolling_cols:
    df_rb[f'{col}_last3'] = df_rb.groupby('player_id')[col].shift(1).rolling(3, min_periods = 1).mean()

# Creating season averages for certain stats
expanding_cols = ['rushing_yards', 'rushing_tds', 'fantasy_points_ppr'
]

for col in expanding_cols:
    df_rb[col] = pd.to_numeric(df_rb[col], errors='coerce')

for col in expanding_cols:
    df_rb[f'{col}_season_avg'] = (
        df_rb.groupby(['player_id', 'season'])[col]
        .transform(lambda x: x.shift(1).expanding().mean())
    )


df_rb['carries'] = pd.to_numeric(df_rb['carries'], errors='coerce')

# Creating yards per carry feature
df_rb['yards_per_attempt'] = df_rb['rushing_yards'] / df_rb['carries'].replace(0, pd.NA)
df_rb['yards_per_attempt'] = pd.to_numeric(df_rb['yards_per_attempt'], errors='coerce')

print(df_rb.head(10))

      player_id player_name player_display_name position position_group  \
68   00-0025394  A.Peterson     Adrian Peterson       RB             RB   
69   00-0025394  A.Peterson     Adrian Peterson       RB             RB   
70   00-0025394  A.Peterson     Adrian Peterson       RB             RB   
71   00-0025394  A.Peterson     Adrian Peterson       RB             RB   
266  00-0027966    M.Ingram         Mark Ingram       RB             RB   
267  00-0027966    M.Ingram         Mark Ingram       RB             RB   
268  00-0027966    M.Ingram         Mark Ingram       RB             RB   
269  00-0027966    M.Ingram         Mark Ingram       RB             RB   
270  00-0027966    M.Ingram         Mark Ingram       RB             RB   
271  00-0027966    M.Ingram         Mark Ingram       RB             RB   

                                          headshot_url recent_team  season  \
68   https://static.www.nfl.com/image/private/f_aut...         TEN    2021   
69   https://stati

In [90]:
# Creating efficiency metrics for RBs
df_rb['carries_shifted'] = df_rb.groupby('player_id')['carries'].shift(1)
df_rb['receptions_shifted'] = df_rb.groupby('player_id')['receptions'].shift(1)


df_rb['yards_per_attempt_last3'] = df_rb.groupby('player_id')['yards_per_attempt'].shift(1).rolling(3,min_periods = 1).mean()
df_rb['rush_td_rate_last3'] = df_rb.groupby('player_id')['rushing_tds'].shift(1).rolling(3,min_periods = 1).sum() / df_rb['carries_shifted']
df_rb['rec_td_rate_last3'] = df_rb.groupby('player_id')['receiving_tds'].shift(1).rolling(3,min_periods = 1).sum() / df_rb['receptions_shifted']

# Crewating lag/rolling features for RB advanced stats
advanced = ['wopr', 'racr']
for col in advanced:
    df_rb[f'{col}_trend'] = df_rb.groupby('player_id')[col].shift(1).rolling(3,min_periods = 1).mean()

# Create lag features for opposing defensive stats
opponent_cols = ['opp_avg_ypg_allowed', 'opp_avg_ppg_allowed', 'opp_int_pg', 'opp_sacks_pg', 'opp_fumbles_pg']
for col in opponent_cols:
    df_rb[f'{col}_trend'] = df_rb.groupby('player_id')[col].shift(1).rolling(3,min_periods = 1).mean()


In [91]:
TARGET_COL = 'fantasy_points_ppr'

# Creating train and test sets
train_df = df_rb[
    (df_rb[TARGET_COL].notna()) &
    (
        (df_rb['season'] < 2024) |
        ((df_rb['season'] == 2024) & (df_rb['week'] < 14))
    )
]

test_df = df_rb[(df_rb['season'] == 2024) & (df_rb['week'] == 14)]


feature_cols = [
    col for col in df_rb.columns
    if col.endswith('_last3') or col.endswith('_trend') or col.endswith('_season_avg')
] + ['player_display_name','season', 'week', 'opponent_team']


X_train = train_df[feature_cols]
y_train = train_df[TARGET_COL]

X_test = test_df[feature_cols]

# for RMSE testing purposes on the test set
y_test = test_df[TARGET_COL]

In [92]:
categorical_cols = ['opponent_team', 'season', 'week', 'player_display_name']

# Create Pool objects
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_cols)
test_pool = Pool(data=X_test, cat_features=categorical_cols)

# Initialize the CatBoost model
model = CatBoostRegressor(
    #iterations=200,
    iterations = 1000,
    learning_rate = 0.01,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    verbose=50  # or 0 to suppress output
)

# Train
model.fit(train_pool)

0:	learn: 7.8844507	total: 35.3ms	remaining: 35.3s
50:	learn: 7.1091038	total: 1.31s	remaining: 24.4s
100:	learn: 6.7186174	total: 2.17s	remaining: 19.3s
150:	learn: 6.5112589	total: 2.76s	remaining: 15.5s
200:	learn: 6.3966107	total: 3.32s	remaining: 13.2s
250:	learn: 6.3243741	total: 3.9s	remaining: 11.7s
300:	learn: 6.2673514	total: 4.48s	remaining: 10.4s
350:	learn: 6.2252575	total: 5.07s	remaining: 9.38s
400:	learn: 6.1873968	total: 5.65s	remaining: 8.44s
450:	learn: 6.1521589	total: 6.26s	remaining: 7.62s
500:	learn: 6.1211165	total: 6.88s	remaining: 6.85s
550:	learn: 6.0919244	total: 7.46s	remaining: 6.07s
600:	learn: 6.0590259	total: 8.05s	remaining: 5.34s
650:	learn: 6.0330075	total: 8.63s	remaining: 4.63s
700:	learn: 6.0075611	total: 9.23s	remaining: 3.94s
750:	learn: 5.9851419	total: 9.8s	remaining: 3.25s
800:	learn: 5.9593712	total: 10.4s	remaining: 2.58s
850:	learn: 5.9339092	total: 11s	remaining: 1.92s
900:	learn: 5.9055668	total: 11.6s	remaining: 1.27s
950:	learn: 5.8808

<catboost.core.CatBoostRegressor at 0x78564e28e150>

In [93]:
y_pred = model.predict(test_pool)

# Attach predictions to test set for review
RB_test_pred = X_test.copy()
RB_test_pred['predicted_fantasy_points'] = y_pred
RB_test_pred['espn_id'] = test_df['espn_id'].apply(lambda x: str(int(x)) if pd.notna(x) else np.nan)
RB_test_pred['position'] = "RB"

print(RB_test_pred[['season', 'week', 'opponent_team', 'predicted_fantasy_points', 'player_display_name', 'espn_id', 'position']])

       season  week opponent_team  predicted_fantasy_points  \
17108    2024    14           CLE                  5.243407   
17290    2024    14            TB                  9.328712   
17470    2024    14           CIN                  5.558331   
17603    2024    14           ATL                 14.405033   
17675    2024    14           LAC                  5.799112   
...       ...   ...           ...                       ...   
22051    2024    14           BUF                  4.237266   
22104    2024    14           MIA                  5.381880   
22117    2024    14           MIA                  4.171702   
22222    2024    14           NYJ                  3.950106   
22444    2024    14           SEA                  4.711971   

         player_display_name  espn_id position  
17108  Cordarrelle Patterson    15807       RB  
17290         Ameer Abdullah  2576336       RB  
17470        Ezekiel Elliott  3051392       RB  
17603            Aaron Jones  3042519       RB 

### Adding boom/bust prediction interval

In [94]:
train_preds = model.predict(train_pool)
residuals = y_train - train_preds
std_resid = residuals.std()

# Creating boom/bust intervals
ci_upper = y_pred + 1.645 * std_resid
ci_lower = y_pred - 1.645 * std_resid

# setting minimum to 0
ci_lower = np.maximum(ci_lower, 0)


RB_test_pred['upper_bound'] = ci_upper
RB_test_pred['lower_bound'] = ci_lower

print(RB_test_pred[['season', 'week', 'opponent_team', 'lower_bound', 'predicted_fantasy_points', 'upper_bound', 'player_display_name', 'espn_id', 'position']])

       season  week opponent_team  lower_bound  predicted_fantasy_points  \
17108    2024    14           CLE     0.000000                  5.243407   
17290    2024    14            TB     0.000000                  9.328712   
17470    2024    14           CIN     0.000000                  5.558331   
17603    2024    14           ATL     4.700067                 14.405033   
17675    2024    14           LAC     0.000000                  5.799112   
...       ...   ...           ...          ...                       ...   
22051    2024    14           BUF     0.000000                  4.237266   
22104    2024    14           MIA     0.000000                  5.381880   
22117    2024    14           MIA     0.000000                  4.171702   
22222    2024    14           NYJ     0.000000                  3.950106   
22444    2024    14           SEA     0.000000                  4.711971   

       upper_bound    player_display_name  espn_id position  
17108    14.948374  Corda

### Evaluating RB CatBoost model predictions

In [95]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Actuals and predictions
y_true = y_test
y_pred = RB_test_pred['predicted_fantasy_points']

# RMSE and MAE
rmse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"✅ RMSE: {rmse:.2f}")
print(f"✅ MAE: {mae:.2f}")

✅ RMSE: 51.45
✅ MAE: 5.08


# **WR CatBoost Model**

In [96]:
df_wr = weekly_stats_all[weekly_stats_all['position'] == 'WR'].copy()

# Sorting for time series purposes
df_wr = df_wr.sort_values(by=['player_id', 'season', 'week'])

# Creating lag/rolling features to be used as input for the model
rolling_cols = [
    'rushing_yards', 'rushing_tds',
    'fantasy_points_ppr', 'carries',
    'rushing_epa', 'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards', 'receiving_yards_after_catch', 'receiving_epa', 'receiving_2pt_conversions', 'target_share', 'offense_snaps', 'offense_pct', 'air_yards_share'
    ]

for col in rolling_cols:
    df_wr[f'{col}_last3'] = df_wr.groupby('player_id')[col].shift(1).rolling(3, min_periods = 1).mean()

# Creating season averages
expanding_cols = ['receiving_yards', 'receiving_tds', 'fantasy_points_ppr'
]

for col in expanding_cols:
    df_wr[col] = pd.to_numeric(df_wr[col], errors='coerce')

for col in expanding_cols:
    df_wr[f'{col}_season_avg'] = (
        df_wr.groupby(['player_id', 'season'])[col]
        .transform(lambda x: x.shift(1).expanding().mean())
    )


df_wr['receiving_yards'] = pd.to_numeric(df_wr['receiving_yards'], errors='coerce')
df_wr['receptions'] = pd.to_numeric(df_wr['receptions'], errors='coerce')

# Creating yards per catch data
df_wr['yards_per_catch'] = df_wr['receiving_yards'] / df_wr['receptions'].replace(0, pd.NA)
df_wr['yards_per_catch'] = pd.to_numeric(df_wr['yards_per_catch'], errors='coerce')



In [97]:
# Creating WR efficiency metrics
df_wr['carries_shifted'] = df_wr.groupby('player_id')['carries'].shift(1)
df_wr['receptions_shifted'] = df_wr.groupby('player_id')['receptions'].shift(1)


df_wr['yards_per_reception_last3'] = df_wr.groupby('player_id')['yards_per_catch'].shift(1).rolling(3,min_periods = 1).mean()
df_wr['rec_td_rate_last3'] = df_wr.groupby('player_id')['receiving_tds'].shift(1).rolling(3,min_periods = 1).sum() / df_rb['receptions_shifted']

# Creating rolling/lag features for advanced WR stats
advanced = ['wopr', 'racr']
for col in advanced:
    df_wr[f'{col}_trend'] = df_wr.groupby('player_id')[col].shift(1).rolling(3,min_periods = 1).mean()

# Creating lag/rolling features for opposing defense stats
opponent_cols = ['opp_avg_ypg_allowed', 'opp_avg_ppg_allowed', 'opp_int_pg', 'opp_sacks_pg', 'opp_fumbles_pg']
for col in opponent_cols:
    df_wr[f'{col}_trend'] = df_wr.groupby('player_id')[col].shift(1).rolling(3,min_periods = 1).mean()

In [98]:
TARGET_COL = 'fantasy_points_ppr'

# Creating test & trian sets
train_df = df_wr[
    (df_wr[TARGET_COL].notna()) &
    (
        (df_wr['season'] < 2024) |
        ((df_wr['season'] == 2024) & (df_wr['week'] < 14))
    )
]

test_df = df_wr[(df_wr['season'] == 2024) & (df_wr['week'] == 14)]


feature_cols = [
    col for col in df_wr.columns
    if col.endswith('_last3') or col.endswith('_trend') or col.endswith('_season_avg')
] + ['player_display_name','season', 'week', 'opponent_team']


X_train = train_df[feature_cols]
y_train = train_df[TARGET_COL]

X_test = test_df[feature_cols]

# for RMSE testing purposes on the test set
y_test = test_df[TARGET_COL]

In [99]:
categorical_cols = ['opponent_team', 'season', 'week', 'player_display_name']

# Create Pool objects
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_cols)
test_pool = Pool(data=X_test, cat_features=categorical_cols)

# Initialize the CatBoost model
model = CatBoostRegressor(
    iterations = 1000,
    learning_rate = 0.01,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    verbose=50  # or 0 to suppress output
)

# Train
model.fit(train_pool)

0:	learn: 7.7354804	total: 28.6ms	remaining: 28.5s
50:	learn: 7.0373176	total: 995ms	remaining: 18.5s
100:	learn: 6.7128962	total: 1.7s	remaining: 15.1s
150:	learn: 6.5449715	total: 2.39s	remaining: 13.4s
200:	learn: 6.4472930	total: 3.11s	remaining: 12.3s
250:	learn: 6.3861382	total: 3.82s	remaining: 11.4s
300:	learn: 6.3393508	total: 4.54s	remaining: 10.6s
350:	learn: 6.3074649	total: 5.27s	remaining: 9.74s
400:	learn: 6.2821407	total: 6.02s	remaining: 8.99s
450:	learn: 6.2567554	total: 6.73s	remaining: 8.19s
500:	learn: 6.2340853	total: 7.43s	remaining: 7.4s
550:	learn: 6.2107628	total: 8.14s	remaining: 6.63s
600:	learn: 6.1924229	total: 8.86s	remaining: 5.88s
650:	learn: 6.1720275	total: 9.56s	remaining: 5.13s
700:	learn: 6.1550782	total: 10.3s	remaining: 4.38s
750:	learn: 6.1342741	total: 11.6s	remaining: 3.84s
800:	learn: 6.1121091	total: 13.2s	remaining: 3.29s
850:	learn: 6.0932859	total: 14.6s	remaining: 2.55s
900:	learn: 6.0732707	total: 15.3s	remaining: 1.69s
950:	learn: 6.05

<catboost.core.CatBoostRegressor at 0x78564e4114d0>

In [100]:
y_pred = model.predict(test_pool)

# Attach predictions to test set for review
WR_test_pred = X_test.copy()
WR_test_pred['predicted_fantasy_points'] = y_pred
WR_test_pred['espn_id'] = test_df['espn_id'].apply(lambda x: str(int(x)) if pd.notna(x) else np.nan)
WR_test_pred['position'] = "WR"

print(WR_test_pred[['season', 'week', 'opponent_team', 'predicted_fantasy_points', 'player_display_name', 'espn_id', 'position']])

       season  week opponent_team  predicted_fantasy_points  \
16992    2024    14           PHI                 12.697022   
17027    2024    14            SF                 13.145877   
17076    2024    14           LAC                 10.829618   
17120    2024    14           NYJ                  4.354308   
17126    2024    14           CIN                  9.168697   
...       ...   ...           ...                       ...   
22251    2024    14           NYJ                  3.544078   
22285    2024    14           TEN                 14.310319   
22302    2024    14           LAC                  9.110364   
22385    2024    14           CHI                  4.902839   
22424    2024    14            SF                  8.677350   

      player_display_name  espn_id position  
16992        Adam Thielen    16460       WR  
17027        Keenan Allen    15818       WR  
17076     DeAndre Hopkins    15795       WR  
17120       Odell Beckham    16733       WR  
17126       B

### Adding boom/bust prediction interval

In [101]:
train_preds = model.predict(train_pool)
residuals = y_train - train_preds
std_resid = residuals.std()

# Creating boom/bust interval
ci_upper = y_pred + 1.645 * std_resid
ci_lower = y_pred - 1.645 * std_resid

# setting minimum to 0
ci_lower = np.maximum(ci_lower, 0)


WR_test_pred['upper_bound'] = ci_upper
WR_test_pred['lower_bound'] = ci_lower

print(WR_test_pred[['season', 'week', 'opponent_team', 'lower_bound', 'predicted_fantasy_points', 'upper_bound', 'player_display_name', 'espn_id', 'position']])

       season  week opponent_team  lower_bound  predicted_fantasy_points  \
16992    2024    14           PHI     2.831348                 12.697022   
17027    2024    14            SF     3.280202                 13.145877   
17076    2024    14           LAC     0.963944                 10.829618   
17120    2024    14           NYJ     0.000000                  4.354308   
17126    2024    14           CIN     0.000000                  9.168697   
...       ...   ...           ...          ...                       ...   
22251    2024    14           NYJ     0.000000                  3.544078   
22285    2024    14           TEN     4.444645                 14.310319   
22302    2024    14           LAC     0.000000                  9.110364   
22385    2024    14           CHI     0.000000                  4.902839   
22424    2024    14            SF     0.000000                  8.677350   

       upper_bound player_display_name  espn_id position  
16992    22.562697        Ad

### Evaluating the WR CatBoost model preidctions

In [102]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Actuals and predictions
y_true = y_test
y_pred = WR_test_pred['predicted_fantasy_points']

# RMSE and MAE
rmse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"✅ RMSE: {rmse:.2f}")
print(f"✅ MAE: {mae:.2f}")

✅ RMSE: 57.90
✅ MAE: 5.54


# **TE CatBoost Model**

In [103]:
df_te= weekly_stats_all[weekly_stats_all['position'] == 'TE'].copy()

# Sorting for time series purposes
df_te = df_te.sort_values(by=['player_id', 'season', 'week'])

# Creating lag/rolling features to be used as model input
rolling_cols = [
    'fantasy_points_ppr', 'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards', 'receiving_yards_after_catch', 'receiving_epa', 'receiving_2pt_conversions', 'target_share', 'offense_snaps', 'offense_pct', 'air_yards_share'
    ]

for col in rolling_cols:
    df_te[f'{col}_last3'] = df_te.groupby('player_id')[col].shift(1).rolling(3, min_periods = 1).mean()

# Creating season averages
expanding_cols = ['receiving_yards', 'receiving_tds', 'fantasy_points_ppr'
]

for col in expanding_cols:
    df_te[col] = pd.to_numeric(df_te[col], errors='coerce')

for col in expanding_cols:
    df_te[f'{col}_season_avg'] = (
        df_te.groupby(['player_id', 'season'])[col]
        .transform(lambda x: x.shift(1).expanding().mean())
    )


df_te['receiving_yards'] = pd.to_numeric(df_te['receiving_yards'], errors='coerce')
df_te['receptions'] = pd.to_numeric(df_te['receptions'], errors='coerce')

# Creating yards per catch feature
df_te['yards_per_catch'] = df_te['receiving_yards'] / df_te['receptions'].replace(0, pd.NA)
df_te['yards_per_catch'] = pd.to_numeric(df_te['yards_per_catch'], errors='coerce')



print(df_te.head(10))

     player_id player_name player_display_name position position_group  \
54  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   
55  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   
56  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   
57  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   
58  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   
59  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   
60  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   
61  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   
62  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   
63  00-0024243     M.Lewis      Marcedes Lewis       TE             TE   

                                         headshot_url recent_team  season  \
54  https://static.www.nfl.com/image/private/f_aut...          GB    2021   
55  https://static.www.nfl.com/

In [104]:
# Creating efficiency metrics (rolling)
df_te['carries_shifted'] = df_te.groupby('player_id')['carries'].shift(1)
df_te['receptions_shifted'] = df_te.groupby('player_id')['receptions'].shift(1)


df_te['yards_per_reception_last3'] = df_te.groupby('player_id')['yards_per_catch'].shift(1).rolling(3,min_periods = 1).mean()
df_te['rec_td_rate_last3'] = df_te.groupby('player_id')['receiving_tds'].shift(1).rolling(3,min_periods = 1).sum() / df_rb['receptions_shifted']

# Creating rolling features for advanced features
advanced = ['wopr', 'racr']
for col in advanced:
    df_te[f'{col}_trend'] = df_te.groupby('player_id')[col].shift(1).rolling(3,min_periods = 1).mean()

# -------------------------------
# STEP 7: Opponent Defense (already lagged by design)
# -------------------------------
opponent_cols = ['opp_avg_ypg_allowed', 'opp_avg_ppg_allowed', 'opp_int_pg', 'opp_sacks_pg', 'opp_fumbles_pg']
for col in opponent_cols:
    df_te[f'{col}_trend'] = df_te.groupby('player_id')[col].shift(1).rolling(3,min_periods = 1).mean()

In [105]:
TARGET_COL = 'fantasy_points_ppr'

# test and train splits
train_df = df_te[
    (df_te[TARGET_COL].notna()) &
    (
        (df_te['season'] < 2024) |
        ((df_te['season'] == 2024) & (df_te['week'] < 14))
    )
]

test_df = df_te[(df_te['season'] == 2024) & (df_te['week'] == 14)]


feature_cols = [
    col for col in df_te.columns
    if col.endswith('_last3') or col.endswith('_trend') or col.endswith('_season_avg')
] + ['player_display_name','season', 'week', 'opponent_team']


X_train = train_df[feature_cols]
y_train = train_df[TARGET_COL]

X_test = test_df[feature_cols]

# for RMSE testing purposes on the test set
y_test = test_df[TARGET_COL]

In [106]:
categorical_cols = ['opponent_team', 'season', 'week', 'player_display_name']

# Create Pool objects
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_cols)
test_pool = Pool(data=X_test, cat_features=categorical_cols)

# Initialize the CatBoost model
model = CatBoostRegressor(
    iterations = 1000,
    learning_rate = 0.01,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    verbose=50  # or 0 to suppress output
)

# Train
model.fit(train_pool)

0:	learn: 6.0043360	total: 12.1ms	remaining: 12.1s
50:	learn: 5.5139502	total: 524ms	remaining: 9.75s
100:	learn: 5.2612506	total: 1s	remaining: 8.94s
150:	learn: 5.1284839	total: 1.45s	remaining: 8.14s
200:	learn: 5.0491599	total: 1.91s	remaining: 7.61s
250:	learn: 4.9901773	total: 2.39s	remaining: 7.13s
300:	learn: 4.9456479	total: 2.9s	remaining: 6.74s
350:	learn: 4.9109847	total: 3.53s	remaining: 6.54s
400:	learn: 4.8800247	total: 4.22s	remaining: 6.3s
450:	learn: 4.8487963	total: 4.71s	remaining: 5.73s
500:	learn: 4.8237270	total: 5.22s	remaining: 5.2s
550:	learn: 4.7986172	total: 5.73s	remaining: 4.67s
600:	learn: 4.7725064	total: 6.22s	remaining: 4.13s
650:	learn: 4.7482421	total: 6.7s	remaining: 3.59s
700:	learn: 4.7267429	total: 7.63s	remaining: 3.25s
750:	learn: 4.7011546	total: 8.64s	remaining: 2.87s
800:	learn: 4.6761008	total: 9.75s	remaining: 2.42s
850:	learn: 4.6526628	total: 10.8s	remaining: 1.9s
900:	learn: 4.6258754	total: 11.4s	remaining: 1.25s
950:	learn: 4.5970454	

<catboost.core.CatBoostRegressor at 0x7856175d7e10>

In [107]:
y_pred = model.predict(test_pool)

# Attach predictions to test set for review
TE_test_pred = X_test.copy()
TE_test_pred['predicted_fantasy_points'] = y_pred
TE_test_pred['espn_id'] = test_df['espn_id'].apply(lambda x: str(int(x)) if pd.notna(x) else np.nan)
TE_test_pred['position'] = "TE"


print(TE_test_pred[['season', 'week', 'opponent_team', 'predicted_fantasy_points', 'player_display_name', 'espn_id', 'position']])

       season  week opponent_team  predicted_fantasy_points  \
17057    2024    14           LAC                 14.593815   
17229    2024    14           CLE                  2.987043   
17362    2024    14           JAX                  3.234373   
17585    2024    14           CHI                 11.279746   
17659    2024    14           ARI                  2.971119   
17744    2024    14           CHI                  3.839349   
17843    2024    14           NYJ                 11.708264   
17884    2024    14           TEN                 10.905046   
17894    2024    14           PIT                 12.753433   
18053    2024    14            KC                  6.312394   
18089    2024    14           MIA                  7.436870   
18151    2024    14           PIT                  4.685981   
18254    2024    14           DAL                  5.483110   
18340    2024    14           NYJ                  3.332110   
18372    2024    14           DAL                  8.05

### Adding boom/bust interval using the standard deviation

In [108]:
train_preds = model.predict(train_pool)
residuals = y_train - train_preds
std_resid = residuals.std()

# Creating boom/bust interval
ci_upper = y_pred + 1.645 * std_resid
ci_lower = y_pred - 1.645 * std_resid

# setting minimum to 0
ci_lower = np.maximum(ci_lower, 0)


TE_test_pred['upper_bound'] = ci_upper
TE_test_pred['lower_bound'] = ci_lower

print(TE_test_pred[['season', 'week', 'opponent_team', 'lower_bound', 'predicted_fantasy_points', 'upper_bound', 'player_display_name', 'espn_id', 'position']])

       season  week opponent_team  lower_bound  predicted_fantasy_points  \
17057    2024    14           LAC     7.016120                 14.593815   
17229    2024    14           CLE     0.000000                  2.987043   
17362    2024    14           JAX     0.000000                  3.234373   
17585    2024    14           CHI     3.702051                 11.279746   
17659    2024    14           ARI     0.000000                  2.971119   
17744    2024    14           CHI     0.000000                  3.839349   
17843    2024    14           NYJ     4.130568                 11.708264   
17884    2024    14           TEN     3.327350                 10.905046   
17894    2024    14           PIT     5.175737                 12.753433   
18053    2024    14            KC     0.000000                  6.312394   
18089    2024    14           MIA     0.000000                  7.436870   
18151    2024    14           PIT     0.000000                  4.685981   
18254    202

### Evaluating predictions

In [109]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Actuals and predictions
y_true = y_test
y_pred = TE_test_pred['predicted_fantasy_points']

# RMSE and MAE
rmse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"✅ RMSE: {rmse:.2f}")
print(f"✅ MAE: {mae:.2f}")

✅ RMSE: 13.65
✅ MAE: 2.93


# **K CatBoost Model**

In [110]:
df_k = weekly_stats_all[weekly_stats_all['position'] == 'K'].copy()

# Sorting data for time-series purposes
df_k = df_k.sort_values(by=['player_id', 'season', 'week'])

# Creating lag/rolling features for model
# Takes into account past 3 games (or less, depending on the data available that week)
rolling_cols = [
    'fg_made','fg_att','fg_pct','fg_blocked','fg_long','fg_made_0_19','fg_made_20_29','fg_made_30_39','fg_made_40_49','fg_made_50_59','fg_made_60_',
    'fg_missed_0_19','fg_missed_20_29','fg_missed_30_39','fg_missed_40_49','fg_missed_50_59','fg_missed_60_','pat_made','pat_att','pat_blocked','pat_pct'
]

for col in rolling_cols:
    df_k[f'{col}_last3'] = df_k.groupby('player_id')[col].shift(1).rolling(3, min_periods = 1).mean()

# Season averages
expanding_cols = ['fg_made', 'fg_pct', 'pat_made', 'fantasy_points_ppr'
]

for col in expanding_cols:
    df_k[col] = pd.to_numeric(df_k[col], errors='coerce')

for col in expanding_cols:
    df_k[f'{col}_season_avg'] = (
        df_k.groupby(['player_id', 'season'])[col]
        .transform(lambda x: x.shift(1).expanding().mean())
    )

print(df_k.head(10))

        player_id player_name player_display_name position position_group  \
22446  00-0023252     R.Gould        Robbie Gould        K           SPEC   
22447  00-0023252     R.Gould        Robbie Gould        K           SPEC   
22448  00-0023252     R.Gould        Robbie Gould        K           SPEC   
22449  00-0023252     R.Gould        Robbie Gould        K           SPEC   
22450  00-0023252     R.Gould        Robbie Gould        K           SPEC   
22451  00-0023252     R.Gould        Robbie Gould        K           SPEC   
22452  00-0023252     R.Gould        Robbie Gould        K           SPEC   
22453  00-0023252     R.Gould        Robbie Gould        K           SPEC   
22454  00-0023252     R.Gould        Robbie Gould        K           SPEC   
22455  00-0023252     R.Gould        Robbie Gould        K           SPEC   

                                            headshot_url recent_team  season  \
22446  https://static.www.nfl.com/image/private/f_aut...         NaN    

In [111]:
TARGET_COL = 'fantasy_points_ppr'

# Creating test and train sets
train_df = df_k[
    (df_k[TARGET_COL].notna()) &
    (
        (df_k['season'] < 2024) |
        ((df_k['season'] == 2024) & (df_k['week'] < 14))
    )
]

test_df = df_k[(df_k['season'] == 2024) & (df_k['week'] == 14)]


feature_cols = [
    col for col in df_k.columns
    if col.endswith('_last3') or col.endswith('_trend') or col.endswith('_season_avg')
] + ['player_display_name','season', 'week', 'opponent_team']


X_train = train_df[feature_cols]
y_train = train_df[TARGET_COL]

X_test = test_df[feature_cols]

# for RMSE testing purposes on the test set
y_test = test_df[TARGET_COL]

In [112]:
categorical_cols = ['opponent_team', 'season', 'week', 'player_display_name']

for col in categorical_cols:
    X_train[col] = X_train[col].fillna('Unknown').astype(str)  # Fill NaN with 'Unknown' and convert to string
    X_test[col] = X_test[col].fillna('Unknown').astype(str)  # Fill NaN with 'Unknown' and convert to string

# Create Pool objects
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_cols)
test_pool = Pool(data=X_test, cat_features=categorical_cols)

# Initialize the CatBoost model
model = CatBoostRegressor(
    iterations = 1000,
    learning_rate = 0.01,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    verbose=50  # or 0 to suppress output
)

# Train
model.fit(train_pool)

0:	learn: 4.3116635	total: 4.98ms	remaining: 4.97s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].fillna('Unknown').astype(str)  # Fill NaN with 'Unknown' and convert to string
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[col] = X_test[col].fillna('Unknown').astype(str)  # Fill NaN with 'Unknown' and convert to string
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

50:	learn: 4.2747010	total: 191ms	remaining: 3.56s
100:	learn: 4.2467617	total: 390ms	remaining: 3.48s
150:	learn: 4.2172608	total: 570ms	remaining: 3.21s
200:	learn: 4.1921491	total: 746ms	remaining: 2.96s
250:	learn: 4.1682211	total: 919ms	remaining: 2.74s
300:	learn: 4.1392459	total: 1.09s	remaining: 2.54s
350:	learn: 4.1148018	total: 1.3s	remaining: 2.41s
400:	learn: 4.0889904	total: 1.49s	remaining: 2.22s
450:	learn: 4.0670395	total: 1.66s	remaining: 2.02s
500:	learn: 4.0416744	total: 1.84s	remaining: 1.84s
550:	learn: 4.0176939	total: 2.02s	remaining: 1.65s
600:	learn: 3.9958751	total: 2.2s	remaining: 1.46s
650:	learn: 3.9701747	total: 2.41s	remaining: 1.29s
700:	learn: 3.9464132	total: 2.61s	remaining: 1.11s
750:	learn: 3.9199825	total: 2.8s	remaining: 928ms
800:	learn: 3.8937295	total: 2.99s	remaining: 743ms
850:	learn: 3.8637280	total: 3.17s	remaining: 556ms
900:	learn: 3.8400007	total: 3.38s	remaining: 372ms
950:	learn: 3.8147554	total: 3.57s	remaining: 184ms
999:	learn: 3.78

<catboost.core.CatBoostRegressor at 0x78561753e590>

In [113]:
y_pred = model.predict(test_pool)

# Attach predictions to test set for review
K_test_pred = X_test.copy()
K_test_pred['predicted_fantasy_points'] = y_pred
K_test_pred['espn_id'] = test_df['espn_id'].apply(lambda x: str(int(x)) if pd.notna(x) else np.nan)

K_test_pred['position'] = "K"

print(K_test_pred[['season', 'week', 'opponent_team', 'predicted_fantasy_points', 'player_display_name', 'espn_id', 'position']])

      season week opponent_team  predicted_fantasy_points player_display_name  \
24151   2024   14       Unknown                  8.484485           Nick Folk   
24157   2024   14       Unknown                  8.266133         Graham Gano   
24195   2024   14       Unknown                  8.917875     Brandon McManus   
24213   2024   14       Unknown                  8.336803      Dustin Hopkins   
24229   2024   14       Unknown                  8.539275       Chris Boswell   
24247   2024   14       Unknown                  8.258203        Cairo Santos   
24264   2024   14       Unknown                  8.876205         Jason Myers   
24332   2024   14       Unknown                  8.088940        Younghoe Koo   
24346   2024   14       Unknown                  8.788159        Jake Elliott   
24376   2024   14       Unknown                  8.678935      Daniel Carlson   
24393   2024   14       Unknown                  8.119678        Eddy Pineiro   
24418   2024   14       Unkn

In [114]:
# Finding standard deviation residuals from trian data
train_preds = model.predict(train_pool)
residuals = y_train - train_preds
std_resid = residuals.std()

# Creating boom/bust interval
ci_upper = y_pred + 1.645 * std_resid
ci_lower = y_pred - 1.645 * std_resid

# setting minimum to 0
ci_lower = np.maximum(ci_lower, 0)


K_test_pred['upper_bound'] = ci_upper
K_test_pred['lower_bound'] = ci_lower

print(K_test_pred[['season', 'week', 'opponent_team', 'lower_bound', 'predicted_fantasy_points', 'upper_bound', 'player_display_name', 'espn_id']])

      season week opponent_team  lower_bound  predicted_fantasy_points  \
24151   2024   14       Unknown     2.006322                  8.484485   
24157   2024   14       Unknown     1.787970                  8.266133   
24195   2024   14       Unknown     2.439712                  8.917875   
24213   2024   14       Unknown     1.858640                  8.336803   
24229   2024   14       Unknown     2.061112                  8.539275   
24247   2024   14       Unknown     1.780041                  8.258203   
24264   2024   14       Unknown     2.398042                  8.876205   
24332   2024   14       Unknown     1.610777                  8.088940   
24346   2024   14       Unknown     2.309996                  8.788159   
24376   2024   14       Unknown     2.200772                  8.678935   
24393   2024   14       Unknown     1.641515                  8.119678   
24418   2024   14       Unknown     1.914198                  8.392361   
24436   2024   14       Unknown     1.

In [115]:
# Actuals and predictions
y_true = y_test.dropna()  # Drop NaN values from y_true
y_pred = K_test_pred['predicted_fantasy_points'][y_true.index]

# RMSE and MAE
rmse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"✅ RMSE: {rmse:.2f}")
print(f"✅ MAE: {mae:.2f}")

✅ RMSE: 23.45
✅ MAE: 3.99


# **DEF CatBoost Model**

In [116]:
df_def = weekly_stats_all[weekly_stats_all['player_id'] == 'defense'].copy()

# Sorting data for time-series purposes
df_def = df_def.sort_values(by=['team', 'season', 'week'])

# Creating lag/rolling features for model
# Takes into account past 3 games (or less, depending on the data available that week)
rolling_cols = [
    'interceptions', 'sacks', 'df_tds', 'total_points_allowed', 'blocked_kicks'
]

for col in rolling_cols:
    df_def[f'{col}_last3'] = df_def.groupby('team')[col].shift(1).rolling(3, min_periods = 1).mean()

# Season averages
expanding_cols = [
    'interceptions', 'sacks', 'total_points_allowed', 'fantasy_points'
]

for col in expanding_cols:
    df_def[col] = pd.to_numeric(df_def[col], errors='coerce')

for col in expanding_cols:
    df_def[f'{col}_season_avg'] = (
        df_def.groupby(['team', 'season'])[col]
        .transform(lambda x: x.shift(1).expanding().mean())
    )

print(df_def.head(10))

      player_id player_name player_display_name position position_group  \
24732   defense         NaN                 NaN      NaN            NaN   
24754   defense         NaN                 NaN      NaN            NaN   
24780   defense         NaN                 NaN      NaN            NaN   
24814   defense         NaN                 NaN      NaN            NaN   
24859   defense         NaN                 NaN      NaN            NaN   
24869   defense         NaN                 NaN      NaN            NaN   
24901   defense         NaN                 NaN      NaN            NaN   
24927   defense         NaN                 NaN      NaN            NaN   
24973   defense         NaN                 NaN      NaN            NaN   
24979   defense         NaN                 NaN      NaN            NaN   

      headshot_url recent_team  season  week season_type  ... cur_roster_slot  \
24732          NaN         NaN    2021     1         NaN  ...             NaN   
24754       

In [117]:
TARGET_COL = 'fantasy_points'

# Creating test and train sets
train_df = df_def[
    (df_def[TARGET_COL].notna()) &
    (
        (df_def['season'] < 2024) |
        ((df_def['season'] == 2024) & (df_def['week'] < 14))
    )
]

test_df = df_def[(df_def['season'] == 2024) & (df_def['week'] == 14)]


feature_cols = [
    col for col in df_def.columns
    if col.endswith('_last3') or col.endswith('_trend') or col.endswith('_season_avg')
] + ['team','season', 'week', 'opponent_team']


X_train = train_df[feature_cols]
y_train = train_df[TARGET_COL]

X_test = test_df[feature_cols]

# for RMSE testing purposes on the test set
y_test = test_df[TARGET_COL]

In [118]:
categorical_cols = ['season', 'week', 'team']

for col in categorical_cols:
    X_train[col] = X_train[col].fillna('Unknown').astype(str)
    X_test[col] = X_test[col].fillna('Unknown').astype(str)

# Create Pool objects
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_cols)
test_pool = Pool(data=X_test, cat_features=categorical_cols)

# Initialize the CatBoost model
model = CatBoostRegressor(
    iterations = 1000,
    learning_rate = 0.01,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    verbose=50  # or 0 to suppress output
)

# Train
model.fit(train_pool)

0:	learn: 5.7263801	total: 4.01ms	remaining: 4s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].fillna('Unknown').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[col] = X_test[col].fillna('Unknown').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].fillna('Unknown').astype(str)
A value is trying to be set on a 

50:	learn: 5.6594540	total: 169ms	remaining: 3.15s
100:	learn: 5.6037054	total: 359ms	remaining: 3.2s
150:	learn: 5.5576553	total: 536ms	remaining: 3.01s
200:	learn: 5.5193042	total: 705ms	remaining: 2.8s
250:	learn: 5.4935801	total: 851ms	remaining: 2.54s
300:	learn: 5.4562785	total: 1.01s	remaining: 2.34s
350:	learn: 5.4272227	total: 1.2s	remaining: 2.21s
400:	learn: 5.4059136	total: 1.35s	remaining: 2.02s
450:	learn: 5.3835135	total: 1.5s	remaining: 1.82s
500:	learn: 5.3648388	total: 1.65s	remaining: 1.64s
550:	learn: 5.3383442	total: 1.81s	remaining: 1.47s
600:	learn: 5.3131021	total: 1.97s	remaining: 1.3s
650:	learn: 5.2860897	total: 2.13s	remaining: 1.14s
700:	learn: 5.2634305	total: 2.32s	remaining: 989ms
750:	learn: 5.2369504	total: 2.49s	remaining: 826ms
800:	learn: 5.2149027	total: 2.65s	remaining: 659ms
850:	learn: 5.1916358	total: 2.83s	remaining: 495ms
900:	learn: 5.1677579	total: 2.99s	remaining: 329ms
950:	learn: 5.1477558	total: 3.17s	remaining: 163ms
999:	learn: 5.1257

<catboost.core.CatBoostRegressor at 0x78561a4ca850>

In [119]:
y_pred = model.predict(test_pool)

# Attach predictions to test set for review
DEF_test_pred = X_test.copy()
DEF_test_pred['predicted_fantasy_points'] = y_pred
DEF_test_pred['espn_id'] = test_df['espn_id'].apply(lambda x: str(int(x)) if pd.notna(x) else np.nan)
DEF_test_pred['position'] = "DEF"

print(DEF_test_pred[['season', 'week', 'predicted_fantasy_points', 'team', 'espn_id', 'position']])

      season week  predicted_fantasy_points team espn_id position
26830   2024   14                  4.491918  ARI  -16022      DEF
26824   2024   14                  4.646357  ATL  -16001      DEF
26820   2024   14                  6.261044  BUF  -16002      DEF
26828   2024   14                  3.982936  CAR  -16029      DEF
26831   2024   14                  5.516650  CHI  -16003      DEF
26815   2024   14                  4.107146  CIN  -16004      DEF
26829   2024   14                  4.599554  CLE  -16005      DEF
26813   2024   14                  5.626609  DAL  -16015      DEF
26817   2024   14                  4.760775  DET  -16008      DEF
26816   2024   14                  5.585654   GB  -16009      DEF
26833   2024   14                  3.765487  JAX  -16030      DEF
26821   2024   14                  4.565247   KC  -16012      DEF
26810   2024   14                  4.524818   LA  -16014      DEF
26819   2024   14                  5.078927  LAC  -16024      DEF
26832   20

In [120]:
# Finding standard deviation residuals from train data
train_preds = model.predict(train_pool)
residuals = y_train - train_preds
std_resid = residuals.std()

# Creating boom/bust interval
ci_upper = y_pred + 1.645 * std_resid
ci_lower = y_pred - 1.645 * std_resid

# setting minimum to 0
#ci_lower = np.maximum(ci_lower, 0)


DEF_test_pred['upper_bound'] = ci_upper
DEF_test_pred['lower_bound'] = ci_lower

print(DEF_test_pred[['season', 'week', 'lower_bound', 'predicted_fantasy_points', 'upper_bound', 'team', 'espn_id']])
print(DEF_test_pred.columns.tolist())

      season week  lower_bound  predicted_fantasy_points  upper_bound team  \
26830   2024   14    -4.216260                  4.491918    13.200095  ARI   
26824   2024   14    -4.061820                  4.646357    13.354535  ATL   
26820   2024   14    -2.447133                  6.261044    14.969222  BUF   
26828   2024   14    -4.725242                  3.982936    12.691113  CAR   
26831   2024   14    -3.191528                  5.516650    14.224827  CHI   
26815   2024   14    -4.601032                  4.107146    12.815324  CIN   
26829   2024   14    -4.108624                  4.599554    13.307731  CLE   
26813   2024   14    -3.081568                  5.626609    14.334787  DAL   
26817   2024   14    -3.947403                  4.760775    13.468952  DET   
26816   2024   14    -3.122524                  5.585654    14.293831   GB   
26833   2024   14    -4.942690                  3.765487    12.473665  JAX   
26821   2024   14    -4.142930                  4.565247    13.2

In [121]:
# Actuals and predictions
y_true = y_test.dropna()  # Drop NaN values from y_true
y_pred = DEF_test_pred['predicted_fantasy_points'][y_true.index]

# RMSE and MAE
rmse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

print(f"✅ RMSE: {rmse:.2f}")
print(f"✅ MAE: {mae:.2f}")

✅ RMSE: 32.36
✅ MAE: 4.49


In [122]:
DEF_test_pred['player_display_name'] = DEF_test_pred['team']
print(DEF_test_pred.head())

print(DEF_test_pred.columns.tolist())

       interceptions_last3  sacks_last3  df_tds_last3  \
26830             0.333333     4.333333      0.000000   
26824             0.000000     2.000000      0.000000   
26820             1.666667     2.333333      0.333333   
26828             1.333333     3.666667      0.000000   
26831             0.333333     2.000000      0.000000   

       total_points_allowed_last3  blocked_kicks_last3  \
26830                   15.666667             0.000000   
26824                   20.000000             0.333333   
26820                   19.666667             0.000000   
26828                   24.666667             0.000000   
26831                   21.333333             0.666667   

       interceptions_season_avg  sacks_season_avg  \
26830                  0.500000          2.833333   
26824                  0.583333          1.250000   
26820                  1.083333          2.333333   
26828                  0.583333          1.750000   
26831                  0.750000          2.

# **Putting together all the prediction data**

In [123]:
QB_test_pred = QB_test_pred[['season', 'week', 'opponent_team', 'lower_bound', 'predicted_fantasy_points', 'upper_bound', 'player_display_name', 'espn_id', 'position']]
RB_test_pred = RB_test_pred[['season', 'week', 'opponent_team', 'lower_bound', 'predicted_fantasy_points', 'upper_bound', 'player_display_name', 'espn_id', 'position']]
WR_test_pred = WR_test_pred[['season', 'week', 'opponent_team', 'lower_bound', 'predicted_fantasy_points', 'upper_bound', 'player_display_name', 'espn_id', 'position']]
TE_test_pred = TE_test_pred[['season', 'week', 'opponent_team', 'lower_bound', 'predicted_fantasy_points', 'upper_bound', 'player_display_name', 'espn_id', 'position']]
K_test_pred = K_test_pred[['season', 'week', 'opponent_team', 'lower_bound', 'predicted_fantasy_points', 'upper_bound', 'player_display_name', 'espn_id', 'position']]
DEF_test_pred = DEF_test_pred[['season', 'week', 'team', 'lower_bound', 'predicted_fantasy_points', 'upper_bound', 'player_display_name', 'espn_id', 'position']]

all_predictions = pd.concat([QB_test_pred, RB_test_pred, WR_test_pred, TE_test_pred, K_test_pred, DEF_test_pred], axis= 0, ignore_index = True)

#all_predictions['espn_id'] = all_predictions['espn_id'].apply(lambda x: int(x) if pd.notna(x) else np.nan)
#all_predictions['espn_id'] = all_predictions['espn_id'].astype('Int64')

all_predictions['espn_id'] = pd.to_numeric(all_predictions['espn_id'], errors='coerce').astype('Int64')

In [124]:
print(all_predictions)

    season week opponent_team  lower_bound  predicted_fantasy_points  \
0     2024   14           MIA     5.165464                 15.398341   
1     2024   14           BUF     6.534220                 16.767097   
2     2024   14           CLE     6.103385                 16.336262   
3     2024   14           MIN     4.531927                 14.764804   
4     2024   14           ARI     7.095135                 17.328012   
..     ...  ...           ...          ...                       ...   
293   2024   14           NaN    -3.282867                  5.425311   
294   2024   14           NaN    -3.731129                  4.977049   
295   2024   14           NaN    -3.539009                  5.169169   
296   2024   14           NaN    -4.667551                  4.040627   
297   2024   14           NaN    -4.562051                  4.146126   

     upper_bound player_display_name  espn_id position team  
0      25.631218       Aaron Rodgers     8439       QB  NaN  
1      26.9

# Putting dataframes back in excel/google drive
*Saving the season & weekly stats data frames, the data frame containing all the player fantasy point predictions, and the data frame containing the schedule & team record*

In [125]:
season_stats_all.loc[season_stats_all['player_id'] == 'defense', 'position'] = 'DEF'
weekly_stats_all.loc[weekly_stats_all['player_id'] == 'defense', 'position'] = 'DEF'

season_stats_all.loc[season_stats_all['player_id'] == 'defense', 'player_display_name'] = season_stats_all.loc[season_stats_all['player_id'] == 'defense', 'team']
weekly_stats_all.loc[weekly_stats_all['player_id'] == 'defense', 'player_display_name'] = weekly_stats_all.loc[weekly_stats_all['player_id'] == 'defense', 'team']

In [126]:
home = weekly_matchup_data[['week', 'home_team', 'home_score', 'away_score']].copy()
home.columns = ['week', 'fantasy_team', 'points_scored', 'points_allowed']

# Create away team rows
away = weekly_matchup_data[['week', 'away_team', 'away_score', 'home_score']].copy()
away.columns = ['week', 'fantasy_team', 'points_scored', 'points_allowed']

# Combine
team_scoring = pd.concat([home, away], ignore_index=True)

# Optional: sort it
team_scoring = team_scoring.sort_values(by=['fantasy_team', 'week']).reset_index(drop=True)

team_scoring['weekly_avg'] = team_scoring.groupby('week')['points_scored'].transform('mean')

# Show the result
print(team_scoring.head())

   week      fantasy_team  points_scored  points_allowed  weekly_avg
0     1  Bucktown Bandits           73.0            96.0        89.2
1     2  Bucktown Bandits           86.0           109.0        95.3
2     3  Bucktown Bandits           86.0           102.0        83.3
3     4  Bucktown Bandits          106.0            47.0        80.5
4     5  Bucktown Bandits           65.0            99.0        86.5


In [128]:
from itertools import product

# Assume team_scoring is already loaded and looks like:
# week | fantasy_team | points_scored | points_allowed

teams = sorted(team_scoring['fantasy_team'].unique())
results = []

for team_a, team_b in product(teams, repeat=2):
    if team_a == team_b:
        continue

    # Team A's points scored each week
    a_data = team_scoring[team_scoring['fantasy_team'] == team_a].sort_values('week').reset_index(drop=True)
    a_scores = a_data['points_scored']

    # Team B's opponents' points allowed each week
    b_data = team_scoring[team_scoring['fantasy_team'] == team_b].sort_values('week').reset_index(drop=True)
    b_opponent_allowed = b_data['points_allowed']

    # Simulate week-by-week
    wins = 0
    losses = 0

    for week in range(len(a_scores)):
        a_pts = a_scores[week]
        b_allwd = b_opponent_allowed[week]

        if a_pts > b_allwd:
            wins += 1
        else:
            losses += 1

        results.append({
            'team': team_a,
            'schedule_team': team_b,
            'week': week + 1,
            'points_scored': a_pts,
            'hypothetical_opponent_points': b_allwd,
            'wins': wins,
            'losses': losses,
            'record': f"{wins}-{losses}"
        })

# Create the DataFrame
week_by_week_records = pd.DataFrame(results)

In [129]:
print(week_by_week_records)

                       team       schedule_team  week  points_scored  \
0          Bucktown Bandits  Captain Sweatpants     1           73.0   
1          Bucktown Bandits  Captain Sweatpants     2           86.0   
2          Bucktown Bandits  Captain Sweatpants     3           86.0   
3          Bucktown Bandits  Captain Sweatpants     4          106.0   
4          Bucktown Bandits  Captain Sweatpants     5           65.0   
...                     ...                 ...   ...            ...   
1255  pop-pop's bible study     bungalicious  💅    10           72.0   
1256  pop-pop's bible study     bungalicious  💅    11           80.0   
1257  pop-pop's bible study     bungalicious  💅    12           93.0   
1258  pop-pop's bible study     bungalicious  💅    13           94.0   
1259  pop-pop's bible study     bungalicious  💅    14          111.0   

      hypothetical_opponent_points  wins  losses record  
0                             82.0     0       1    0-1  
1                  

In [130]:
import os

output_path = '/content/drive/MyDrive/all_fantasy_data.xlsx'

if os.path.exists(output_path):
    os.remove(output_path)

with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    weekly_stats_all.to_excel(writer, sheet_name='Weekly_Stats', index=False)
    season_stats_all.to_excel(writer, sheet_name='Season_Stats', index=False)
    all_predictions.to_excel(writer, sheet_name='Player_Predictions', index=False)
    weekly_matchup_data.to_excel(writer, sheet_name='Team_Schedules', index=False)
    team_scoring.to_excel(writer, sheet_name='Team_Scoring', index=False)
    week_by_week_records.to_excel(writer, sheet_name='Schedule_Swap_Records', index=False)

