In [228]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor

In [229]:
#Import data
shrine_players = pd.read_parquet('data/shrine_bowl_players.parquet', engine='fastparquet')
shrine_players.rename(columns={'gsis_player_id': 'college_gsis_id'}, inplace=True)
shrine_players['college_gsis_id'] = shrine_players['college_gsis_id'].astype(int)

college_stats = pd.read_csv('data/shrine_bowl_players_college_stats.csv')
nfl_stats = pd.read_csv('data/shrine_bowl_players_nfl_rookie_stats.csv')
sos_stats = pd.read_csv('data/sos_data.csv')
team_offense = pd.read_csv('data/CFB_TeamOff_2017-2024_via_sportsReference.csv')

print(shrine_players.shape, college_stats.shape, nfl_stats.shape, sos_stats.shape)

(338, 33) (1732, 38) (245, 43) (1181, 7)


In [230]:
shrine_players.head()

Unnamed: 0,college_gsis_id,first_name,last_name,football_name,weight,hand_size,last_twenty_of_forty_yd_dash,first_ten_of_forty_yd_dash,standing_broad_jump,three_cone,...,draft_round,draft_pick,draft_overall_selection,hometown,hometown_state,hometown_country,recruiting_stars,team_code,team_name,conference
0,350698,Dwight,McGlothern,Dwight,189.0,8.5,1.8699999,1.55,111.0,7.23,...,,,,Houston,TX,US,4.0,ARUN,Arkansas Razorbacks,Southeastern Conference
1,336849,Hunter,Nourzad,Hunter,319.0,10.75,,,,,...,5.0,24.0,160.0,Marietta,GA,US,,PAST,Penn State Nittany Lions,Big Ten Conference
2,362959,Jaden,Shirden,Jaden,189.0,9.25,1.8799999,1.56,117.0,,...,,,,West Haven,CT,US,,NJMO,Monmouth Hawks,Colonial Athletic Association
3,324852,David,Ugwoegbu,David,243.0,10.25,1.97,1.75,116.0,7.56,...,,,,Katy,TX,US,3.0,TXHO,Houston Cougars,Big Twelve Conference
4,304792,Ernest,Perry,E.J.,208.0,9.0,1.99,1.58,123.0,6.85,...,,,,Andover,MA,US,3.0,RIBR,Brown Bears,Ivy League


In [231]:
college_stats.head()

Unnamed: 0,college_gsis_id,position,player_name,season,team,school_code,passing_attempts,passing_completions,passing_yards,passing_completion_percentage,...,kickreturns_no,kickreturns_touchdowns,kickreturns_yards,puntreturns_avg,puntreturns_long,puntreturns_no,puntreturns_td,puntreturns_yds,total_fumbles,total_fumbles_lost
0,158332,DT,Garrett Marino,2017,UAB,ALBI,,,,,...,,,,,,,,,0.0,0.0
1,158332,DT,Garrett Marino,2018,UAB,ALBI,,,,,...,,,,,,,,,0.0,0.0
2,158332,DT,Garrett Marino,2019,UAB,ALBI,,,,,...,,,,,,,,,0.0,0.0
3,172387,DS,Austin Lee,2017,BYU,UTBY,,,,,...,,,,,,,,,0.0,0.0
4,172387,DS,Austin Lee,2018,BYU,UTBY,1.0,0.0,0.0,0.0,...,,,,,,,,,,


In [232]:
nfl_stats.head()

Unnamed: 0,college_gsis_id,rookie_season,player_name,position,draft_season,draft_round,draft_overall_selection,scrambles,qb_pressures,qb_pressure_to_sack_rate,...,rushing_yards,rushing_touchdowns,receiving_targets,receiving_receptions,receiving_yards,receiving_yards_per_route_run,receiving_touchdowns,receiving_yards_after_catch,defense_interceptions,defense_pass_breakups
0,196601,2020,Khalil Davis,DT,2020,6.0,194.0,,,,...,,,,,,,,,,
1,305707,2024,Dallas Gant,IB,2024,,,,,,...,,,,,,,,,,
2,316846,2023,Moro Ojomo,DT,2023,7.0,249.0,,,,...,,,,,,,,,,
3,196251,2020,Alex Highsmith,DE,2020,3.0,102.0,,,,...,,,,,,,,,1.0,1.0
4,282907,2022,Samori Toure,WR,2022,7.0,258.0,,,,...,,,10.0,5.0,82.0,1.10811,1.0,11.0,,


In [233]:
sos_stats.head()

Unnamed: 0,Rank,Team,Rating,Hi,Lo,Last,Year
0,1,Indiana,17.3,1,68,2,2025
1,2,Ohio St,16.0,2,18,14,2025
2,3,Alabama,15.5,1,21,1,2025
3,4,Oregon,15.4,1,71,3,2025
4,5,Texas,13.8,1,20,8,2025


In [234]:
# Get standerdized SOS values
sos_mean = sos_stats['Rating'].mean()
sos_std = sos_stats['Rating'].std()
sos_stats['Rating'] = (sos_stats['Rating'] - sos_mean) / sos_std
sos_stats.head()

Unnamed: 0,Rank,Team,Rating,Hi,Lo,Last,Year
0,1,Indiana,2.24494,1,68,2,2025
1,2,Ohio St,2.08347,2,18,14,2025
2,3,Alabama,2.02136,1,21,1,2025
3,4,Oregon,2.00894,1,71,3,2025
4,5,Texas,1.81021,1,20,8,2025


In [235]:
print(college_stats.columns)
print(college_stats['position'].unique())

Index(['college_gsis_id', 'position', 'player_name', 'season', 'team',
       'school_code', 'passing_attempts', 'passing_completions',
       'passing_yards', 'passing_completion_percentage', 'passing_touchdowns',
       'passing_interceptions', 'rushing_attempts', 'rushing_yards',
       'rushing_touchdowns', 'receiving_receptions', 'receiving_touchdowns',
       'receiving_yards', 'defense_pass_breakups', 'defense_qb_hurries',
       'defense_sacks', 'defense_solo_tackles', 'defense_total_tackles',
       'defense_tackles_for_loss', 'defense_interceptions',
       'defense_touchdowns', 'kickreturns_avg', 'kickreturns_long',
       'kickreturns_no', 'kickreturns_touchdowns', 'kickreturns_yards',
       'puntreturns_avg', 'puntreturns_long', 'puntreturns_no',
       'puntreturns_td', 'puntreturns_yds', 'total_fumbles',
       'total_fumbles_lost'],
      dtype='object')
['DT' 'DS' 'WR' 'DC' 'QB' 'DE' 'RB' 'OB' 'IB' 'OG' 'TE' 'OT' 'OC' 'FB']


In [236]:
#Get stat shares
college_stats['rec_yds_share'] = np.nan
college_stats['rec_td_share'] = np.nan
college_stats['rec_cmp_share'] = np.nan
college_stats['rush_yds_share'] = np.nan
college_stats['rush_td_share'] = np.nan
college_stats['rush_atmp_share'] = np.nan
college_stats['opp_sos_rating'] = np.nan

for index, row in college_stats.iterrows():
    team = row['team']
    year = row['season']
    team_totals = team_offense[((team_offense['School'] == player_team) & (team_offense['Year'] == year))][['tot_pass_yds','tot_pass_td','tot_pass_cmp','tot_rush_yds','tot_rush_td','tot_rush_atmp']]
    team_sos = sos_stats[(sos_stats['Team'] == team) & (sos_stats['Year'] == year)]['Rating']

    if not team_totals.empty:
        team_totals = team_totals.iloc[0]
        college_stats.loc[index, 'rec_yds_share'] = row['receiving_yards'] / team_totals['tot_pass_yds']
        college_stats.loc[index, 'rec_td_share'] = row['receiving_touchdowns'] / team_totals['tot_pass_td']
        college_stats.loc[index, 'rec_cmp_share'] = row['receiving_receptions'] / team_totals['tot_pass_cmp']
        college_stats.loc[index, 'rush_yds_share'] = row['rushing_yards'] / team_totals['tot_rush_yds']
        college_stats.loc[index, 'rush_td_share'] = row['rushing_touchdowns'] / team_totals['tot_rush_td']
        college_stats.loc[index, 'rush_atmp_share'] = row['rushing_attempts'] / team_totals['tot_rush_atmp']
        college_stats.loc[index, 'opp_sos_rating'] = team_sos.values[0] if not team_sos.empty else np.nan

In [237]:
college_stats.head()

Unnamed: 0,college_gsis_id,position,player_name,season,team,school_code,passing_attempts,passing_completions,passing_yards,passing_completion_percentage,...,puntreturns_yds,total_fumbles,total_fumbles_lost,rec_yds_share,rec_td_share,rec_cmp_share,rush_yds_share,rush_td_share,rush_atmp_share,opp_sos_rating
0,158332,DT,Garrett Marino,2017,UAB,ALBI,,,,,...,,0.0,0.0,,,,,,,-1.71732
1,158332,DT,Garrett Marino,2018,UAB,ALBI,,,,,...,,0.0,0.0,-0.00241,0.0,0.00332,0.00231,0.0,0.00605,-0.8727
2,158332,DT,Garrett Marino,2019,UAB,ALBI,,,,,...,,0.0,0.0,,,,,,,-1.34469
3,172387,DS,Austin Lee,2017,BYU,UTBY,,,,,...,,0.0,0.0,,,,,,,-0.61186
4,172387,DS,Austin Lee,2018,BYU,UTBY,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.12097


In [238]:
college_totals = (
    college_stats
    .groupby("college_gsis_id", as_index=False)
    .agg(
        team=('team', 'first'),
        max_season=('season', 'max'),
        position=('position', 'first'),
        receiving_receptions=('receiving_receptions', 'sum'),
        receiving_yards=('receiving_yards', 'sum'),
        receiving_touchdowns=('receiving_touchdowns', 'sum'),
        rushing_attempts=('rushing_attempts', 'sum'),
        rushing_yards=('rushing_yards', 'sum'),
        rushing_touchdowns=('rushing_touchdowns', 'sum'),
        passing_completions=('passing_completions', 'sum'),
        passing_yards=('passing_yards', 'sum'),
        passing_touchdowns=('passing_touchdowns', 'sum'),
        passing_interceptions=('passing_interceptions', 'sum'),
        seasons_played=('position', 'size'),
        avg_rec_yds_share=('rec_yds_share', 'mean'),
        avg_rec_td_share=('rec_td_share', 'mean'),
        avg_rec_cmp_share=('rec_cmp_share', 'mean'),
        avg_rush_yds_share=('rush_yds_share', 'mean'),
        avg_rush_td_share=('rush_td_share', 'mean'),
        avg_rush_atmp_share=('rush_atmp_share', 'mean'),
        avg_opp_sos_rating=('opp_sos_rating', 'mean')
    )
)
college_totals.head()

Unnamed: 0,college_gsis_id,team,max_season,position,receiving_receptions,receiving_yards,receiving_touchdowns,rushing_attempts,rushing_yards,rushing_touchdowns,...,passing_touchdowns,passing_interceptions,seasons_played,avg_rec_yds_share,avg_rec_td_share,avg_rec_cmp_share,avg_rush_yds_share,avg_rush_td_share,avg_rush_atmp_share,avg_opp_sos_rating
0,158332,UAB,2019,DT,1.0,-9.0,0.0,3.0,5.0,0.0,...,0.0,0.0,3,-0.00241,0.0,0.00332,0.00231,0.0,0.00605,-1.31157
1,172387,BYU,2019,DS,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3,,,,,,,-0.21025
2,172698,Indiana,2019,WR,84.0,1162.0,9.0,0.0,0.0,0.0,...,0.0,0.0,2,0.18405,0.22113,0.1569,,,,0.66128
3,172890,Minnesota,2019,DC,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2,,,,,,,0.7358
4,173519,Clemson,2019,QB,0.0,0.0,0.0,328.0,1037.0,14.0,...,30.0,15.0,3,,,,0.16358,0.20008,0.23576,1.22643


In [239]:
college_wr = college_totals[college_totals['position'] == 'WR']
college_qb = college_totals[college_totals['position'] == 'QB']
college_te = college_totals[college_totals['position'] == 'TE']
college_rb = college_totals[college_totals['position'] == 'RB']

In [240]:
nfl_wr = nfl_stats[nfl_stats['position'] == 'WR']
nfl_qb = nfl_stats[nfl_stats['position'] == 'QB']
nfl_te = nfl_stats[nfl_stats['position'] == 'TE']
nfl_rb = nfl_stats[nfl_stats['position'] == 'RB']

## WR

In [241]:
print(college_wr.shape, nfl_wr.shape)
wr_merged = pd.merge(college_wr, nfl_wr, on='college_gsis_id', suffixes=('_college', '_nfl'), how='left')
print(wr_merged.shape)
print(wr_merged.columns)

(74, 22) (30, 43)
(74, 64)
Index(['college_gsis_id', 'team', 'max_season', 'position_college',
       'receiving_receptions_college', 'receiving_yards_college',
       'receiving_touchdowns_college', 'rushing_attempts_college',
       'rushing_yards_college', 'rushing_touchdowns_college',
       'passing_completions_college', 'passing_yards_college',
       'passing_touchdowns_college', 'passing_interceptions_college',
       'seasons_played', 'avg_rec_yds_share', 'avg_rec_td_share',
       'avg_rec_cmp_share', 'avg_rush_yds_share', 'avg_rush_td_share',
       'avg_rush_atmp_share', 'avg_opp_sos_rating', 'rookie_season',
       'player_name', 'position_nfl', 'draft_season', 'draft_round',
       'draft_overall_selection', 'scrambles', 'qb_pressures',
       'qb_pressure_to_sack_rate', 'inside_runs', 'outside_runs',
       'change_of_direction_runs', 'designed_qb_runs', 'man_coverage_targets',
       'zone_coverage_targets', 'pressures_allowed', 'pressures',
       'pressure_rate', 'run

In [242]:
wr_merged.head()

Unnamed: 0,college_gsis_id,team,max_season,position_college,receiving_receptions_college,receiving_yards_college,receiving_touchdowns_college,rushing_attempts_college,rushing_yards_college,rushing_touchdowns_college,...,rushing_yards_nfl,rushing_touchdowns_nfl,receiving_targets,receiving_receptions_nfl,receiving_yards_nfl,receiving_yards_per_route_run,receiving_touchdowns_nfl,receiving_yards_after_catch,defense_interceptions,defense_pass_breakups
0,172698,Indiana,2019,WR,84.0,1162.0,9.0,0.0,0.0,0.0,...,,,8.0,3.0,33.0,,,5.0,,
1,191527,Louisiana,2019,WR,139.0,1989.0,22.0,3.0,30.0,0.0,...,,,11.0,5.0,60.0,,,9.0,,
2,194682,Missouri,2019,WR,129.0,1755.0,11.0,3.0,21.0,0.0,...,,,,,,,,,,
3,195293,Florida,2019,WR,73.0,961.0,6.0,8.0,133.0,2.0,...,,,,,,,,,,
4,195298,Florida,2019,WR,61.0,884.0,13.0,4.0,27.0,0.0,...,,,21.0,13.0,159.0,,2.0,102.0,,


In [245]:
def wr_metric(mkt_share_comp, efficiency_comp, seasons, sos):
    #Account for market share(+), efficiency(+), and number of college seasons(-)
    #Adjust for strength of schedule

    # Weights
    w1 = 0.45
    w2 = 0.4
    w3 = 0.05
    w4 = 0.1

    return (w1*mkt_share_comp + w2*efficiency_comp - w3*seasons + w4*sos)


In [248]:
for index, row in wr_merged.iterrows():
    #Market Share Component
    mkt_share_comp = (row['avg_rec_yds_share'] + row['avg_rec_td_share'] + row['avg_rec_cmp_share']) / 3

    #Efficiency Component
    if row['receiving_receptions_college'] > 0:
        yds_per_rec = row['receiving_yards_college'] / row['receiving_receptions_college']
        td_per_rec = row['receiving_touchdowns_college'] / row['receiving_receptions_college']
    else:
        yds_per_rec = 0
        td_per_rec = 0

    efficiency_comp = (yds_per_rec / 15) + (td_per_rec / 0.1)  # Normalizing factors

    seasons = row['seasons_played']
    sos = row['avg_opp_sos_rating']

    wr_merged.loc[index, 'wr_metric'] = wr_metric(mkt_share_comp, efficiency_comp, seasons, sos)

In [250]:
# Add a percentile column for wr_metric
wr_merged['wr_metric_percentile'] = wr_merged['wr_metric'].rank(pct=True)

In [251]:
wr_merged.head()

Unnamed: 0,college_gsis_id,team,max_season,position_college,receiving_receptions_college,receiving_yards_college,receiving_touchdowns_college,rushing_attempts_college,rushing_yards_college,rushing_touchdowns_college,...,receiving_targets,receiving_receptions_nfl,receiving_yards_nfl,receiving_yards_per_route_run,receiving_touchdowns_nfl,receiving_yards_after_catch,defense_interceptions,defense_pass_breakups,wr_metric,wr_metric_percentile
0,172698,Indiana,2019,WR,84.0,1162.0,9.0,0.0,0.0,0.0,...,8.0,3.0,33.0,,,5.0,,,0.8479,0.75472
1,191527,Louisiana,2019,WR,139.0,1989.0,22.0,3.0,30.0,0.0,...,11.0,5.0,60.0,,,9.0,,,0.87788,0.79245
2,194682,Missouri,2019,WR,129.0,1755.0,11.0,3.0,21.0,0.0,...,,,,,,,,,0.68581,0.5283
3,195293,Florida,2019,WR,73.0,961.0,6.0,8.0,133.0,2.0,...,,,,,,,,,0.67697,0.49057
4,195298,Florida,2019,WR,61.0,884.0,13.0,4.0,27.0,0.0,...,21.0,13.0,159.0,,2.0,102.0,,,1.25295,1.0


### TODO NEXT
Find correlation between wr_metric and nfl performance metrics like receptions, yards, touchdowns.
- Model for both playing time and production separately
- Does this metric seperate opportunty from abillity?
- THIS SEPERATION is how we find undervalued players

## TODO LONG TERM
Use ML models to adjust metric weights to better correlate with NFL success
   - Try Linear regression model
   - Try XGBoost regression model
   - Use SHAP values to interpret feature importance and adjust weights accordingly

Repeat this process for other positions (QB, RB, TE)