# Project 4: Clustering the Future - Leveraging Collegiate and Athletic Data to Find NFL Comparisons for College Players

In [1]:
import pandas as pd
import numpy as np
import nfl_data_py as nfl
import cfbd
import requests
from bs4 import BeautifulSoup
from time import sleep
from fuzzywuzzy import process
import os
from dotenv import load_dotenv

import requests
from time import sleep
from great_tables import GT, system_fonts, style, loc, md
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# I. Getting our data & pre-processing

In [2]:
# Load our environment variables
load_dotenv()

True

In [3]:
# Set up our college football API
configuration = cfbd.Configuration(
    host = "https://apinext.collegefootballdata.com"
)

# Load environment variables from the .env file
load_dotenv()

configuration = cfbd.Configuration(
    access_token = os.getenv('CFBD')
)

In [4]:
all_RAS: pd.DataFrame = pd.read_csv('./datasets/RAS Scores.csv')
RAS_2025: pd.DataFrame = pd.read_csv('./datasets/RAS 2025 Draft Class.csv').drop_duplicates(subset=['RAS', 'Name'])
all_RAS

Unnamed: 0,Link,Name,Pos,Year,College,RAS,Alltime
0,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Justin Fargas,RB,2003,Southern California,10.0,10.0
1,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Calvin Johnson,WR,2007,Georgia Tech,10.0,10.0
2,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",David Buehler,PK,2009,Southern California,10.0,10.0
3,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Anthony Richardson,QB,2023,Florida,10.0,10.0
4,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Lorenzo Carter,LB,2018,Georgia,10.0,10.0
...,...,...,...,...,...,...,...
10101,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Sataoa Laumea,OG,2024,Utah,,
10102,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Will Reichard,PK,2024,Alabama,,
10103,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Nick Samac,OC,2024,Michigan State,,
10104,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",LaDarius Henderson,OG,2024,Michigan,,


In [5]:
RAINBOW_COLORS: list = ["#ffadad","#ffd6a5","#fdffb6","#caffbf","#9bf6ff","#a0c4ff","#bdb2ff","#ffc6ff"]
POS_RAINBOW_COLOR_SEQ: list = [RAINBOW_COLORS[i % len(RAINBOW_COLORS)] for i in range(len(np.unique(RAS_2025['Pos'])))]

RAS_2025_by_pos = px.bar(
    data_frame=pd.DataFrame(
        {
            'Position': [val[0] for val in RAS_2025.value_counts(subset = ['Pos']).index.to_flat_index()],
            'Counts': RAS_2025.value_counts(subset = ['Pos']).values
        }
    ),
    x = 'Position',
    y = 'Counts',
    template = 'plotly_dark'
)

RAS_2025_by_pos.update_layout(
    font_family = "Raleway, sans-serif",
    title = dict(text = '<b>2025 RAS Scores by Position</b> <br><em>As of 4/15/25</em>', x = .5, xanchor='center', font = {'size': 20})
)

for bar in RAS_2025_by_pos.data:
    bar.marker.color = POS_RAINBOW_COLOR_SEQ

# RAS_2025_by_pos.write_image('../img/RAS_scores_by_position.png', engine = 'kaleido', scale = 6)

![RAS Scores by Position](../img/RAS_scores_by_position.png?123)

In [6]:
RAS_parsed: pd.DataFrame = all_RAS.loc[(all_RAS['Year'] >= 2021) & (all_RAS['Pos'] == 'WR')].dropna(axis = 0, subset = ['RAS', 'Name'])
RAS_parsed

Unnamed: 0,Link,Name,Pos,Year,College,RAS,Alltime
33,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Adonai Mitchell,WR,2024,Texas,9.99,9.99
90,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Christian Watson,WR,2022,North Dakota State,9.96,9.96
91,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Andrei Iosivas,WR,2023,Princeton,9.96,9.96
129,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Dareke Young,WR,2022,Lenoir-Rhyne,9.93,9.94
164,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Rome Odunze,WR,2024,Washington,9.92,9.92
...,...,...,...,...,...,...,...
5769,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Montrell Washington,WR,2022,Samford,4.32,4.51
5833,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Derius Davis,WR,2023,Texas Christian,4.29,4.40
5962,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",David Bell,WR,2022,Purdue,3.98,4.18
6898,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Dazz Newsome,WR,2021,North Carolina,2.13,2.41


In [7]:
RAS_2025_receivers: pd.DataFrame = RAS_2025.loc[RAS_2025['Pos'] == 'WR'].dropna(axis = 0, subset = ['RAS', 'Name'])
RAS_2025_receivers

Unnamed: 0,Link,Name,Pos,Year,College,RAS,Alltime RAS
5,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Landon Parker,WR,2025,Troy,9.97,9.97
7,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Isaac TeSlaa,WR,2025,Arkansas,9.93,9.93
13,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Isaiah Neyor,WR,2025,Nebraska,9.90,9.90
14,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Sam Brown,WR,2025,Miami,9.90,9.90
18,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Dont'e Thornton,WR,2025,Tennessee,9.85,9.85
...,...,...,...,...,...,...,...
2519,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Steven Alaniz,WR,2025,Capital,0.12,0.12
2522,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Ty Wiley,WR,2025,Northern State,0.10,0.10
2523,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Terez Traynor,WR,2025,Charlotte,0.10,0.10
2536,"<a class=""nt_btn "" style=""color: rgba(0, 0, ...",Dejuan Bell,WR,2025,Fort Valley State,0.07,0.07


In [8]:
YEARS = [num for num in range(2021, 2025)]
STAT_YEARS = [num for num in range(2019, 2025)]

In [9]:
from typing import Callable, Any, Type
from cfbd.api_response import ApiResponse  

import cfbd.api_response

def get_cfbd_data(config: cfbd.Configuration, years: list[int], api_instance: str, api_call: str, filepath: str, display_usage: bool = False, load: bool = True, **kwargs) -> pd.DataFrame:
    """call the CFBD api multiple times, conglomerate the data, save it to a CSV and dataframe, and then return the dataframe

    Args:
        config (cfbd.Configuration): configuration object to authenticate the CFBD api
        years (list[int]): years to collect the data for 
        api_instance (str): the specific unauthenticated API type from CFBD where the specific API call is housed
        api_call (str): specific API call that you'd like to make for each year
        filepath (str): filepath to either write the file to (if load = True) or read from (if false)
        display_usage (bool, optional): display the number of API calls remaining for a given API key. Defaults to False.
        load (bool, optional): whether to load a dataset from an existing filepath. Defaults to True.

    Returns:
        pd.DataFrame: dataframe of requested statistics, either loaded or collected from the API
    """
    data: list = []
    if not load:
        with cfbd.ApiClient(config) as api:
            # Get the desired class from the cfbd module
            api_class: Type[Any] = getattr(cfbd, api_instance)
            authenticated_api_instance: Any = api_class(api)
            for year in years:
                # Get the method from the class
                retrieved_api_call: Callable[...,Any] = getattr(authenticated_api_instance, api_call)

                # Make the API call (finally...)
                response: ApiResponse = retrieved_api_call(year = year, **kwargs)

                # Add this data to our list
                data.extend([dict(player) for player in response.data])

                if display_usage:
                    print(f"The amount of API calls left is: {response.headers['X-Calllimit-Remaining']}")
        
        df: pd.DataFrame = pd.DataFrame(data)
        
        # To avoid having to run these API calls every time, we save the file
        df.to_csv(filepath)
    else:
        # In case we already have the file saved, just read it
        df: pd.DataFrame = pd.read_csv(filepath)
    
    return df

In [10]:
player_season_stats: pd.DataFrame = get_cfbd_data(
    config = configuration,
    years = STAT_YEARS,
    api_instance = "StatsApi",
    api_call = 'get_player_season_stats_with_http_info',
    filepath = './datasets/cfbd_player_season_stats_2021_2025.csv',
    display_usage = True,
    load = True,
)

# There's some weird extra whitespace that the following line will remove
player_season_stats = player_season_stats.applymap(lambda x: x.strip() if isinstance(x, str) else x)

player_season_stats

Unnamed: 0.1,Unnamed: 0,season,player_id,player,team,conference,category,stat_type,stat
0,0,2019,-109170,Team,Kennesaw State,Big South,rushing,CAR,1.0
1,1,2019,-109170,Team,Kennesaw State,Big South,rushing,LONG,0.0
2,2,2019,-109170,Team,Kennesaw State,Big South,rushing,TD,0.0
3,3,2019,-109170,Team,Kennesaw State,Big South,rushing,YDS,-2.0
4,4,2019,-109170,Team,Kennesaw State,Big South,rushing,YPC,-2.0
...,...,...,...,...,...,...,...,...,...
733345,733345,2024,5257802,Fred Davis Ii,Jacksonville State,Conference USA,defensive,SACKS,0.0
733346,733346,2024,5257802,Fred Davis Ii,Jacksonville State,Conference USA,defensive,SOLO,1.0
733347,733347,2024,5257802,Fred Davis Ii,Jacksonville State,Conference USA,defensive,TD,0.0
733348,733348,2024,5257802,Fred Davis Ii,Jacksonville State,Conference USA,defensive,TFL,1.0


In [11]:
player_predicted_points_added: pd.DataFrame = get_cfbd_data(
    config = configuration,
    years = STAT_YEARS,
    api_instance = "MetricsApi",
    api_call = 'get_predicted_points_added_by_player_season_with_http_info',
    filepath = './datasets/cfbd_player_predicted_points_added_2021_2025.csv',
    display_usage = True,
    load = True,
)

player_predicted_points_added

Unnamed: 0.1,Unnamed: 0,season,id,name,position,team,conference,average_ppa,total_ppa
0,0,2019,4030745,Maurice Lewis Jr.,WR,Bowling Green,Mid-American,passing_downs=0 standard_downs=-4.924 third_do...,passing_downs=0 standard_downs=-4.924 third_do...
1,1,2019,4258190,Ben Mason,TE,Michigan,Big Ten,passing_downs=0 standard_downs=-4.393 third_do...,passing_downs=0 standard_downs=-8.786 third_do...
2,2,2019,4372368,T.K. Wilkerson,RB,San José State,Mountain West,passing_downs=0 standard_downs=-3.958 third_do...,passing_downs=0 standard_downs=-3.958 third_do...
3,3,2019,4036529,Jonathan Ifedi,WR,Arkansas State,Sun Belt,passing_downs=0 standard_downs=-3.811 third_do...,passing_downs=0 standard_downs=-3.811 third_do...
4,4,2019,4240546,Michael Marchese,TE,UConn,American Athletic,passing_downs=0 standard_downs=-3.811 third_do...,passing_downs=0 standard_downs=-3.811 third_do...
...,...,...,...,...,...,...,...,...,...
15324,15324,2024,5159888,Chris Elko,WR,Georgia Tech,ACC,passing_downs=0 standard_downs=4.865 third_dow...,passing_downs=0 standard_downs=4.865 third_dow...
15325,15325,2024,5193217,Jonathan Bibbs,WR,UL Monroe,Sun Belt,passing_downs=0 standard_downs=5.243 third_dow...,passing_downs=0 standard_downs=10.485 third_do...
15326,15326,2024,5147367,Joseph McVay,WR,Vanderbilt,SEC,passing_downs=0 standard_downs=5.243 third_dow...,passing_downs=0 standard_downs=10.485 third_do...
15327,15327,2024,5079382,Jonathan Echols,TE,South Florida,American Athletic,passing_downs=0 standard_downs=6.007 third_dow...,passing_downs=0 standard_downs=12.015 third_do...


In [12]:
receiving_stats: pd.DataFrame = player_season_stats.loc[(player_season_stats['player'] != 'Team') & (player_season_stats['category'] == 'receiving')]

# Instead of each player have multiple rows with every row being a different stat, just have one row with each stat being its own column 
receiving_stats = receiving_stats.pivot(
    index = ['player_id','player', 'season'],
    columns = 'stat_type',
    values = 'stat',
).reset_index()

In [13]:
receiving_stats

stat_type,player_id,player,season,LONG,REC,TD,YDS,YPR
0,102597,Will Rogers,2022,13.0,1.0,0.0,13.0,13.0
1,107494,Trey Sanders,2020,0.0,1.0,0.0,-2.0,-2.0
2,107494,Trey Sanders,2021,36.0,6.0,0.0,55.0,9.2
3,107494,Trey Sanders,2022,10.0,2.0,0.0,13.0,6.5
4,107494,Trey Sanders,2023,8.0,5.0,0.0,20.0,4.0
...,...,...,...,...,...,...,...,...
20637,5243750,Tj Holmes,2024,8.0,1.0,0.0,8.0,8.0
20638,5243771,Carlos Mann,2024,21.0,2.0,0.0,30.0,15.0
20639,5245626,Brennen Benedict,2024,6.0,1.0,0.0,6.0,6.0
20640,5245627,Cj Broy,2024,61.0,9.0,2.0,143.0,15.9


In [14]:
def extract_ppa_from_table(val: str, prefix: str) -> dict:
    vals: list[str] = val.split(' ')
    extracted_dict: dict[str, float] = {}
    for val in vals:
        extract = val.split('=')
        extracted_dict[f'{prefix}_{extract[0]}_ppa'] = float(extract[1])
    
    return extracted_dict


In [15]:
# We have to unpack our player_predicted_points_added dataframe a bit
player_predicted_points_added_expanded: pd.DataFrame = pd.json_normalize(player_predicted_points_added['average_ppa'].map(lambda x: extract_ppa_from_table(x, 'average')))

# Let's also extract the total PPA values 
player_predicted_points_added_total: pd.DataFrame = pd.json_normalize(player_predicted_points_added['total_ppa'].map(lambda x: extract_ppa_from_table(x, 'total')))

# Now join these expanded datasets with the original player data
player_predicted_points_combined = pd.concat([
    player_predicted_points_added.drop(['average_ppa', 'total_ppa'], axis=1).reset_index(drop=True),
    player_predicted_points_added_expanded.reset_index(drop=True),
    player_predicted_points_added_total.reset_index(drop=True)
], axis=1)


player_predicted_points_combined

Unnamed: 0.1,Unnamed: 0,season,id,name,position,team,conference,average_passing_downs_ppa,average_standard_downs_ppa,average_third_down_ppa,...,average_var_pass_ppa,average_all_ppa,total_passing_downs_ppa,total_standard_downs_ppa,total_third_down_ppa,total_second_down_ppa,total_first_down_ppa,total_rush_ppa,total_var_pass_ppa,total_all_ppa
0,0,2019,4030745,Maurice Lewis Jr.,WR,Bowling Green,Mid-American,0.0,-4.924,0.000,...,-4.924,-4.924,0.0,-4.924,0.000,-4.924,0.000,0.000,-4.924,-4.924
1,1,2019,4258190,Ben Mason,TE,Michigan,Big Ten,0.0,-4.393,0.000,...,0.000,-4.393,0.0,-8.786,0.000,-8.786,0.000,-8.786,0.000,-8.786
2,2,2019,4372368,T.K. Wilkerson,RB,San José State,Mountain West,0.0,-3.958,-3.958,...,0.000,-3.958,0.0,-3.958,-3.958,0.000,0.000,-3.958,0.000,-3.958
3,3,2019,4036529,Jonathan Ifedi,WR,Arkansas State,Sun Belt,0.0,-3.811,0.000,...,-3.811,-3.811,0.0,-3.811,0.000,0.000,-3.811,0.000,-3.811,-3.811
4,4,2019,4240546,Michael Marchese,TE,UConn,American Athletic,0.0,-3.811,0.000,...,-3.811,-3.811,0.0,-3.811,0.000,0.000,-3.811,0.000,-3.811,-3.811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15324,15324,2024,5159888,Chris Elko,WR,Georgia Tech,ACC,0.0,4.865,0.000,...,4.865,4.865,0.0,4.865,0.000,0.000,0.000,0.000,4.865,4.865
15325,15325,2024,5193217,Jonathan Bibbs,WR,UL Monroe,Sun Belt,0.0,5.243,0.000,...,5.243,5.243,0.0,10.485,0.000,0.000,10.485,0.000,10.485,10.485
15326,15326,2024,5147367,Joseph McVay,WR,Vanderbilt,SEC,0.0,5.243,0.000,...,5.243,5.243,0.0,10.485,0.000,0.000,10.485,0.000,10.485,10.485
15327,15327,2024,5079382,Jonathan Echols,TE,South Florida,American Athletic,0.0,6.007,6.007,...,6.007,6.007,0.0,12.015,12.015,0.000,0.000,0.000,12.015,12.015


In [16]:
def get_combine_data(years: list[int], positions: list[str], filepath: str, load: bool = True) -> pd.DataFrame:
    """scrape nflcombineresults.com to get combine data 

    Args:
        years (list[int]): list of integers for combine data to get
        positions (list[str]): list of positions to get combine data for
        filepath (str): filepath to either write the file to (if load = True) or read from (if false)
        load (bool, optional): whether to load a dataset from an existing filepath. Defaults to True.
    Returns:
        pd.DataFrame: data frame of all combine data
    """
    if not load:
        all_data = []
        
        for year in years:
            for position in positions:
                url = f'https://nflcombineresults.com/nflcombinedata.php?year={year}&pos={position}&college='
                
                # Add a small delay to avoid overloading the server
                sleep(1)
                
                try:
                    response = requests.get(url) # even though this throws a 404 error, the website still loads?? bizarre
                    
                    # Parse the HTML table with BeautifulSoup
                    soup = BeautifulSoup(response.text, 'html.parser')
                    
                    # Find the table containing combine data
                    table = soup.find('table')
                    
                    if table:
                        # Extract data rows
                        rows = table.find_all('tr')[1:]
                        for row in rows: 
                            cells = row.find_all('td')
                            if len(cells) == 13:
                                row_data = {
                                    'year': cells[0].text,
                                    'name': cells[1].text,
                                    'college': cells[2].text,
                                    'pos': cells[3].text,
                                    'height': cells[4].text,
                                    'weight': cells[5].text,
                                    'forty': cells[7].text,
                                    'bench': cells[8].text,
                                    'vertical': cells[9].text,
                                    'broad jump': cells[10].text,
                                    'shuttle': cells[11].text,
                                    '3-cone': cells[12].text
                                }
                                
                                
                                all_data.append(row_data)
                        
                        print(f"Successfully scraped data for {year} {position}")
                    else:
                        print(f"No data table found for {year} {position}")
                        
                except requests.exceptions.RequestException as e:
                    print(f"Error fetching data for {year} {position}: {e}")
        
        
            df = pd.DataFrame(all_data)
            df.to_csv(filepath)
    else:
        df = pd.read_csv(filepath)

    
    return df
        

In [17]:
nfl_data: pd.DataFrame = nfl.import_seasonal_rosters(years = YEARS)

In [19]:
combine_data: pd.DataFrame = get_combine_data(years = YEARS, positions=['WR'], filepath = './datasets/combine_data_2021_2025.csv', load = True)

In [20]:
combine_data

Unnamed: 0.1,Unnamed: 0,year,name,college,pos,height,weight,forty,bench,vertical,broad jump,shuttle,3-cone
0,0,2021,Jonathan Adams,Arkansas State,WR,74.00,210,4.59,,39.0,132.0,4.38,7.04
1,1,2021,Tutu Atwell,Louisville,WR,68.88,155,4.39,,33.0,117.0,4.09,6.87
2,2,2021,Jhamon Ausbon,Texas A&M,WR,74.38,217,4.72,,33.0,118.0,4.28,6.89
3,3,2021,Kawaan Baker,South Alabama,WR,72.50,210,4.45,21.0,39.5,129.0,4.41,7.42
4,4,2021,Rashod Bateman,Minnesota,WR,72.38,190,4.43,,36.0,123.0,4.35,6.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,196,2024,Casey Washington,Illinois,WR,72.88,201,4.46,13.0,39.5,127.0,4.43,7.39
196,197,2021,Dax Milne,Brigham Young,WR,72.63,193,4.56,14.0,31.0,114.0,4.22,6.85
197,198,2021,Demetric Felton,UCLA,WR,68.62,189,4.59,10.0,31.5,114.0,4.50,7.31
198,199,2022,Samori Toure,Nebraska,WR,72.75,191,4.48,7.0,34.5,124.0,4.22,6.77


In [21]:
def convert_fiie_to_in(fiie: int) -> float:
    """convert a fiie height to inches

    Args:
        fiie (int): height in fiie

    Returns:
        float: height in inches, rounded to closest hundredth of an inch
    """
    fiie_str: str = str(fiie)
    feet: str = fiie_str[0]
    inches: str = fiie_str[1:3]
    eighth_of_inch: str = fiie_str[3]

    height: float = (int(feet) * 12) + int(inches) + (int(eighth_of_inch) * (1/8))

    return round(height, 2)

In [22]:
combine_data_2025 = pd.read_csv('./datasets/2025 Combine for WRs.csv')

# Convert FIIE heights to inches to match the rest of the data 
combine_data_2025['height'] = combine_data_2025['height'].apply(func = convert_fiie_to_in)

combine_data_2025

Unnamed: 0,name,School,year,Position,Drafted By,Age,height,weight,Arm Length (inches),Hand Span (inches),Bench Press (reps),forty,10-yard Split (seconds),20-yard Split (seconds),20-yard Shuttle (seconds),Three-cone Drill (seconds),Vertical Jump (inches),Broad Jump (FFII),RAS,Production
0,Andrew Armstrong,Arkansas,2025,WR,,,75.75,202,32.125,9.5,11.0,4.51,1.53,,4.18,6.97,37.5,1004.0,9.06,7.0
1,Elic Ayomanor,Stanford,2025,WR,,,73.75,206,32.375,10.0,,4.44,1.58,,,,38.5,1007.0,9.67,7.3
2,Elijhah Badger,Florida,2025,WR,,,73.38,200,32.375,9.5,,4.43,1.53,,,,35.5,,9.44,6.8
3,Jack Bech,TCU,2025,WR,,,73.25,214,31.5,9.0,,,,,4.21,6.84,34.5,1005.0,9.46,6.8
4,Isaiah Bond,Texas,2025,WR,,,70.62,180,30.5,8.5,,4.39,1.51,,,,,,,6.5
5,Ja'Corey Brooks,Louisville,2025,WR,,,74.0,184,31.625,9.0,,,,,,,,,,6.8
6,Sam Brown Jr.,Miami,2025,WR,,,74.25,200,31.5,9.38,,4.44,1.52,,,,41.5,,9.87,5.8
7,Pat Bryant,Illinois,2025,WR,,,74.25,204,31.125,9.5,,4.61,1.56,,,,37.5,1004.0,8.19,7.1
8,Luther Burden III,Missouri,2025,WR,,,72.0,206,31.25,8.5,,4.41,1.54,,,,,,,7.8
9,Beaux Collins,Notre Dame,2025,WR,,,75.0,201,31.5,9.5,,,,,,,,,,5.8


In [None]:
def get_val(player_name: str, col_name: str, dataframe: pd.DataFrame, year: int, year_name_col: str ='year', player_name_col: str = 'name', verbose: bool = False) -> float:
    """Filter a dataframe to find the specific statistic for a player in a particular year
    Args:
        player_name (str): name of the player
        col_name (str): name of the column that holds the desired statistic
        dataframe (pd.DataFrame): dataframe to filter with
        year (int): year where statistic was captured
        year_name_col (str, optional): name of the year column for the dataframe. Defaults to 'year'.
        player_name_col (str, optional): name of the player_name column for the dataframe. Defaults to 'name'.
        verbose (bool, optional): display instances where no data was found for a particular player. Defaults to False.

    Returns:
        float: _description_
    """
    player_stats = dataframe.loc[(dataframe[player_name_col] == player_name) & (dataframe[year_name_col] == year)]

    # the pandemic messed up some of the players due to COVID opt-outs or some players had injuries, if that is the case, we can try looking at a year before the provided date
    if player_stats.empty:
        player_stats = dataframe.loc[(dataframe[player_name_col] == player_name) & (dataframe[year_name_col] == year - 1)]

    
    # Filter by additional parameters if desired
    # for col, val in args:
    #     player_stats = player_stats.loc[player_stats[col] == val]
    
    # Convert to array and grab the value if possible
    if len(player_stats) == 1:
        return player_stats[col_name].to_numpy()[0]
    else:
        if verbose:
            print(f'No data for {player_name} for {col_name} in year {year}')
        return None



In [24]:
receiving_stats

stat_type,player_id,player,season,LONG,REC,TD,YDS,YPR
0,102597,Will Rogers,2022,13.0,1.0,0.0,13.0,13.0
1,107494,Trey Sanders,2020,0.0,1.0,0.0,-2.0,-2.0
2,107494,Trey Sanders,2021,36.0,6.0,0.0,55.0,9.2
3,107494,Trey Sanders,2022,10.0,2.0,0.0,13.0,6.5
4,107494,Trey Sanders,2023,8.0,5.0,0.0,20.0,4.0
...,...,...,...,...,...,...,...,...
20637,5243750,Tj Holmes,2024,8.0,1.0,0.0,8.0,8.0
20638,5243771,Carlos Mann,2024,21.0,2.0,0.0,30.0,15.0
20639,5245626,Brennen Benedict,2024,6.0,1.0,0.0,6.0,6.0
20640,5245627,Cj Broy,2024,61.0,9.0,2.0,143.0,15.9


In [25]:
rows = []

# In order to make the most high-quality predictions possible, I am only going to focus on players that I have RAS scores as that is my smallest dataset for and then get college data from their previous data from along with their more advanced metrics

df: pd.DataFrame

for player, year, RAS in RAS_parsed[['Name', 'Year', 'RAS']].to_numpy():
    # print(player, year)
    row = {
        'player_name': player,
        'headshot_url': get_val(player, 'headshot_url', nfl_data, year, 'season', 'player_name'),
        'receptions': get_val(player, 'REC', receiving_stats, year - 1, 'season', 'player'),
        'yards': get_val(player, 'YDS', receiving_stats, year - 1, 'season', 'player'),
        'touchdowns': get_val(player, 'TD', receiving_stats, year - 1, 'season', 'player'),
        'yards_per_reception': get_val(player, 'YPR', receiving_stats, year - 1, 'season', 'player'),
        'average_passing_downs_ppa': get_val(player, 'average_passing_downs_ppa', player_predicted_points_combined, year - 1, 'season', 'name'),
        'average_standard_downs_ppa': get_val(player, 'average_standard_downs_ppa', player_predicted_points_combined, year - 1, 'season', 'name'),
        'average_third_down_ppa': get_val(player, 'average_third_down_ppa', player_predicted_points_combined, year - 1, 'season', 'name'),
        'height': get_val(player, 'height', combine_data, year, 'year', 'name'),
        'weight': get_val(player, 'weight', combine_data, year, 'year', 'name'),
        'forty': get_val(player, 'forty', combine_data, year, 'year', 'name'),
        'RAS': RAS,
    }
    rows.append(row)

df: pd.DataFrame = pd.DataFrame(rows)

In [26]:
df

Unnamed: 0,player_name,headshot_url,receptions,yards,touchdowns,yards_per_reception,average_passing_downs_ppa,average_standard_downs_ppa,average_third_down_ppa,height,weight,forty,RAS
0,Adonai Mitchell,https://static.www.nfl.com/image/upload/f_auto...,55.0,845.0,11.0,15.4,1.689,0.673,2.063,74.25,205.0,4.34,9.99
1,Christian Watson,https://static.www.nfl.com/image/private/f_aut...,,,,,,,,76.13,208.0,4.36,9.96
2,Andrei Iosivas,https://static.www.nfl.com/image/private/f_aut...,66.0,943.0,7.0,14.3,,,,75.13,205.0,4.43,9.96
3,Dareke Young,https://static.www.nfl.com/image/private/f_aut...,,,,,,,,,,,9.93
4,Rome Odunze,https://static.www.nfl.com/image/upload/f_auto...,92.0,1640.0,13.0,17.8,1.407,1.068,1.862,74.88,212.0,4.45,9.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,Montrell Washington,https://static.www.nfl.com/image/private/f_aut...,10.0,124.0,1.0,12.4,,,,68.88,176.0,4.48,4.32
118,Derius Davis,https://static.www.nfl.com/image/private/f_aut...,42.0,531.0,5.0,12.6,0.887,0.442,0.910,68.38,165.0,4.36,4.29
119,David Bell,https://static.www.nfl.com/image/private/f_aut...,93.0,1286.0,6.0,13.8,1.306,0.741,1.518,72.88,212.0,4.68,3.98
120,Dazz Newsome,https://static.www.nfl.com/image/private/f_aut...,54.0,684.0,6.0,12.7,1.448,0.854,1.425,70.13,190.0,4.59,2.13


In [27]:
# Drop columns where more than 70% of values are null
df = df[df.isnull().mean(axis = 1) < .3]

df

Unnamed: 0,player_name,headshot_url,receptions,yards,touchdowns,yards_per_reception,average_passing_downs_ppa,average_standard_downs_ppa,average_third_down_ppa,height,weight,forty,RAS
0,Adonai Mitchell,https://static.www.nfl.com/image/upload/f_auto...,55.0,845.0,11.0,15.4,1.689,0.673,2.063,74.25,205.0,4.34,9.99
2,Andrei Iosivas,https://static.www.nfl.com/image/private/f_aut...,66.0,943.0,7.0,14.3,,,,75.13,205.0,4.43,9.96
4,Rome Odunze,https://static.www.nfl.com/image/upload/f_auto...,92.0,1640.0,13.0,17.8,1.407,1.068,1.862,74.88,212.0,4.45,9.92
5,Ricky Pearsall,https://static.www.nfl.com/image/upload/f_auto...,64.0,970.0,4.0,15.2,1.072,1.133,1.248,73.00,189.0,4.41,9.91
6,Xavier Legette,https://static.www.nfl.com/image/upload/f_auto...,71.0,1255.0,7.0,17.7,1.249,1.206,1.853,73.00,221.0,4.39,9.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,Montrell Washington,https://static.www.nfl.com/image/private/f_aut...,10.0,124.0,1.0,12.4,,,,68.88,176.0,4.48,4.32
118,Derius Davis,https://static.www.nfl.com/image/private/f_aut...,42.0,531.0,5.0,12.6,0.887,0.442,0.910,68.38,165.0,4.36,4.29
119,David Bell,https://static.www.nfl.com/image/private/f_aut...,93.0,1286.0,6.0,13.8,1.306,0.741,1.518,72.88,212.0,4.68,3.98
120,Dazz Newsome,https://static.www.nfl.com/image/private/f_aut...,54.0,684.0,6.0,12.7,1.448,0.854,1.425,70.13,190.0,4.59,2.13


In [28]:
df.isnull().sum()

player_name                   0
headshot_url                  7
receptions                    0
yards                         0
touchdowns                    0
yards_per_reception           0
average_passing_downs_ppa     3
average_standard_downs_ppa    3
average_third_down_ppa        3
height                        0
weight                        0
forty                         0
RAS                           0
dtype: int64

In [29]:
# Very few values are missing, so we can just impute them with the median for the column
df['average_passing_downs_ppa'].fillna(df['average_passing_downs_ppa'].median(), inplace=True)
df['average_standard_downs_ppa'].fillna(df['average_standard_downs_ppa'].median(), inplace=True)
df['average_third_down_ppa'].fillna(df['average_third_down_ppa'].median(), inplace=True)

df



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,player_name,headshot_url,receptions,yards,touchdowns,yards_per_reception,average_passing_downs_ppa,average_standard_downs_ppa,average_third_down_ppa,height,weight,forty,RAS
0,Adonai Mitchell,https://static.www.nfl.com/image/upload/f_auto...,55.0,845.0,11.0,15.4,1.689,0.673,2.063,74.25,205.0,4.34,9.99
2,Andrei Iosivas,https://static.www.nfl.com/image/private/f_aut...,66.0,943.0,7.0,14.3,1.236,0.841,1.425,75.13,205.0,4.43,9.96
4,Rome Odunze,https://static.www.nfl.com/image/upload/f_auto...,92.0,1640.0,13.0,17.8,1.407,1.068,1.862,74.88,212.0,4.45,9.92
5,Ricky Pearsall,https://static.www.nfl.com/image/upload/f_auto...,64.0,970.0,4.0,15.2,1.072,1.133,1.248,73.00,189.0,4.41,9.91
6,Xavier Legette,https://static.www.nfl.com/image/upload/f_auto...,71.0,1255.0,7.0,17.7,1.249,1.206,1.853,73.00,221.0,4.39,9.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,Montrell Washington,https://static.www.nfl.com/image/private/f_aut...,10.0,124.0,1.0,12.4,1.236,0.841,1.425,68.88,176.0,4.48,4.32
118,Derius Davis,https://static.www.nfl.com/image/private/f_aut...,42.0,531.0,5.0,12.6,0.887,0.442,0.910,68.38,165.0,4.36,4.29
119,David Bell,https://static.www.nfl.com/image/private/f_aut...,93.0,1286.0,6.0,13.8,1.306,0.741,1.518,72.88,212.0,4.68,3.98
120,Dazz Newsome,https://static.www.nfl.com/image/private/f_aut...,54.0,684.0,6.0,12.7,1.448,0.854,1.425,70.13,190.0,4.59,2.13


In [30]:
# Time to repeat the same process for the players that are about to get drafted

rows = []

for player, year, RAS in RAS_2025_receivers[['Name', 'Year', 'RAS']].to_numpy():
    # print(player, year)
    row = {
        'player_name': player,
        'receptions': get_val(player, 'REC', receiving_stats, year - 1, 'season', 'player'),
        'yards': get_val(player, 'YDS', receiving_stats, year - 1, 'season', 'player'),
        'touchdowns': get_val(player, 'TD', receiving_stats, year - 1, 'season', 'player'),
        'yards_per_reception': get_val(player, 'YPR', receiving_stats, year - 1, 'season', 'player'),
        'average_passing_downs_ppa': get_val(player, 'average_passing_downs_ppa', player_predicted_points_combined, year - 1, 'season', 'name'),
        'average_standard_downs_ppa': get_val(player, 'average_standard_downs_ppa', player_predicted_points_combined, year - 1, 'season', 'name'),
        'average_third_down_ppa': get_val(player, 'average_third_down_ppa', player_predicted_points_combined, year - 1, 'season', 'name'),
        'height': get_val(player, 'height', combine_data_2025, year, 'year', 'name'),
        'weight': get_val(player, 'weight', combine_data_2025, year, 'year', 'name'),
        'forty': get_val(player, 'forty', combine_data_2025, year, 'year', 'name'),
        'RAS': RAS,
    }
    rows.append(row)

df_25_prospects: pd.DataFrame = pd.DataFrame(rows)

In [31]:
df_25_prospects

Unnamed: 0,player_name,receptions,yards,touchdowns,yards_per_reception,average_passing_downs_ppa,average_standard_downs_ppa,average_third_down_ppa,height,weight,forty,RAS
0,Landon Parker,27.0,350.0,0.0,13.0,1.608,0.777,2.194,,,,9.97
1,Isaac TeSlaa,,,,,,,,75.62,214.0,4.43,9.93
2,Isaiah Neyor,34.0,455.0,5.0,13.4,1.866,1.155,1.982,76.25,218.0,4.40,9.90
3,Sam Brown,36.0,509.0,2.0,14.1,0.803,1.436,0.938,,,,9.90
4,Dont'e Thornton,,,,,,,,,,,9.85
...,...,...,...,...,...,...,...,...,...,...,...,...
409,Steven Alaniz,,,,,,,,,,,0.12
410,Ty Wiley,,,,,,,,,,,0.10
411,Terez Traynor,2.0,18.0,0.0,9.0,1.521,0.358,1.521,,,,0.10
412,Dejuan Bell,,,,,,,,,,,0.07


In [32]:
df_25_prospects = df_25_prospects[df_25_prospects.isnull().mean(axis = 1) < .2]

df_25_prospects

Unnamed: 0,player_name,receptions,yards,touchdowns,yards_per_reception,average_passing_downs_ppa,average_standard_downs_ppa,average_third_down_ppa,height,weight,forty,RAS
2,Isaiah Neyor,34.0,455.0,5.0,13.4,1.866,1.155,1.982,76.25,218.0,4.4,9.9
5,Tory Horton,25.0,331.0,1.0,13.2,1.473,0.449,1.13,74.5,196.0,4.41,9.83
6,Jaylin Noel,81.0,1194.0,8.0,14.7,1.187,1.034,1.133,70.25,194.0,4.39,9.75
8,Emeka Egbuka,81.0,1011.0,10.0,12.5,1.136,0.974,0.898,72.88,202.0,,9.72
9,Chimere Dike,42.0,783.0,2.0,18.6,1.074,0.909,1.488,72.62,196.0,4.34,9.72
10,Elic Ayomanor,63.0,831.0,6.0,13.2,1.388,0.951,1.732,73.75,206.0,4.44,9.71
11,Jayden Higgins,87.0,1183.0,9.0,13.6,1.575,0.966,1.934,76.12,214.0,4.47,9.63
14,Jaylin Lane,38.0,466.0,2.0,12.3,1.29,0.767,1.024,69.75,191.0,4.34,9.56
18,Jack Bech,62.0,1034.0,9.0,16.7,1.521,0.992,1.555,73.25,214.0,,9.51
22,Elijhah Badger,39.0,806.0,4.0,20.7,1.396,0.817,1.354,73.38,200.0,4.43,9.32


In [33]:
df_25_prospects['forty'].fillna(df['forty'].median(), inplace=True)

df_25_prospects



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,player_name,receptions,yards,touchdowns,yards_per_reception,average_passing_downs_ppa,average_standard_downs_ppa,average_third_down_ppa,height,weight,forty,RAS
2,Isaiah Neyor,34.0,455.0,5.0,13.4,1.866,1.155,1.982,76.25,218.0,4.4,9.9
5,Tory Horton,25.0,331.0,1.0,13.2,1.473,0.449,1.13,74.5,196.0,4.41,9.83
6,Jaylin Noel,81.0,1194.0,8.0,14.7,1.187,1.034,1.133,70.25,194.0,4.39,9.75
8,Emeka Egbuka,81.0,1011.0,10.0,12.5,1.136,0.974,0.898,72.88,202.0,4.46,9.72
9,Chimere Dike,42.0,783.0,2.0,18.6,1.074,0.909,1.488,72.62,196.0,4.34,9.72
10,Elic Ayomanor,63.0,831.0,6.0,13.2,1.388,0.951,1.732,73.75,206.0,4.44,9.71
11,Jayden Higgins,87.0,1183.0,9.0,13.6,1.575,0.966,1.934,76.12,214.0,4.47,9.63
14,Jaylin Lane,38.0,466.0,2.0,12.3,1.29,0.767,1.024,69.75,191.0,4.34,9.56
18,Jack Bech,62.0,1034.0,9.0,16.7,1.521,0.992,1.555,73.25,214.0,4.46,9.51
22,Elijhah Badger,39.0,806.0,4.0,20.7,1.396,0.817,1.354,73.38,200.0,4.43,9.32


# II. Data Understanding / Visualization

In [34]:
def download_headshots(names: pd.Series, headshots: pd.Series, filepath: str, download: bool = False) -> list[str]:
    """Download headshots of players and save to filepath

    Args:
        names (pd.Series): names of the players; will serve as filepath (i.e. "Tom Brady" -> "tom-brady.png")
        headshots (pd.Series): headshot urls
        filepath (str): filepath to save the headshots to
        download (bool): to just get an array of file paths if the headshots have already been downloaded. Defaults to False
    
    Returns: 
        list[str]: list of all filepaths
    """
    img_filepaths: list[str] = []
    for name, headshot in zip(names, headshots):
        path = f"{filepath}{name.lower().replace(' ', '-')}.png"
        img_filepaths.append(path)
        if download:
            img = requests.get(headshot).content
            with open(path, 'wb') as handler:
                handler.write(img)
            print(f'Saved the headshot of {name} to {path}')
    
    return img_filepaths


In [35]:
puka_vs_mingo_combine: pd.DataFrame = combine_data.loc[combine_data['name'].isin(['Jonathan Mingo', 'Puka Nacua'])].drop(columns = ['Unnamed: 0','year', 'pos']).reset_index()
names_with_headshots: pd.DataFrame = nfl.import_seasonal_rosters(years = [2023])[['player_name', 'headshot_url', 'draft_number']]
puka_vs_mingo_headshots_df: pd.DataFrame = names_with_headshots.loc[names_with_headshots['player_name'].isin(['Jonathan Mingo', 'Puka Nacua'])]
puka_vs_mingo_combine['img_filepath'] = download_headshots(puka_vs_mingo_headshots_df['player_name'], puka_vs_mingo_headshots_df['headshot_url'], '../img/nfl_headshots/')
puka_vs_mingo_combine['RAS'] = RAS_parsed.loc[RAS_parsed['Name'].isin(['Jonathan Mingo', 'Puka Nacua']), 'RAS'].to_numpy()
puka_vs_mingo_combine['Draft Position'] = puka_vs_mingo_headshots_df['draft_number'].to_numpy()
puka_vs_mingo_combine

Unnamed: 0,index,name,college,height,weight,forty,bench,vertical,broad jump,shuttle,3-cone,img_filepath,RAS,Draft Position
0,133,Jonathan Mingo,Mississippi,73.75,220,4.46,22.0,39.5,129.0,4.25,7.04,../img/nfl_headshots/jonathan-mingo.png,9.87,39.0
1,135,Puka Nacua,Brigham Young,73.63,210,4.57,15.0,33.0,121.0,4.36,7.32,../img/nfl_headshots/puka-nacua.png,5.18,177.0


In [36]:
puka_vs_mingo_combine_table: GT = (
    GT(puka_vs_mingo_combine)
    .tab_header(
        title = md('**Jonathan Mingo vs Puka Nacua: Physical and Combine Comparison**'),
    )
    .fmt_image(
        columns = 'img_filepath',
        height = 75,
    )
    .cols_move_to_start(
        columns = 'img_filepath'
    )
    .cols_label(
        img_filepath = ''
    )
    .tab_options(
        table_font_names=system_fonts("industrial")
    )
    .data_color(
        rows = [0,1],
        columns = ['height', 'weight', 'bench', 'vertical', 'broad jump', 'RAS'],
        palette = 'BuPu',
    )
    .data_color(
        rows = [0,1],
        columns = ['forty', 'shuttle', '3-cone', 'Draft Position'],
        palette = 'BuPu',
        reverse = True
    )
    .cols_hide (
        columns = ['index']
    )

)

# puka_vs_mingo_combine_table.save(file = '../img/puka_vs_mingo_combine_table.png', scale = 6, web_driver = 'firefox')

![Jonathan Mingo vs Puka Nacua in Combine and Physical Stats](../img/puka_vs_mingo_combine_table.png)

In [37]:
puka_vs_mingo_stats: pd.DataFrame = player_predicted_points_combined.loc[(player_predicted_points_combined['name'].isin(['Jonathan Mingo', 'Puka Nacua'])) & (player_predicted_points_combined['season'] == 2022)][['name','average_passing_downs_ppa', 'average_standard_downs_ppa', 'average_third_down_ppa', 'average_first_down_ppa', 'average_second_down_ppa', 'average_all_ppa']].reset_index()

puka_vs_mingo_stats['img_filepath'] = download_headshots(puka_vs_mingo_headshots_df['player_name'], puka_vs_mingo_headshots_df['headshot_url'], '../img/nfl_headshots/')

puka_vs_mingo_stats

Unnamed: 0,index,name,average_passing_downs_ppa,average_standard_downs_ppa,average_third_down_ppa,average_first_down_ppa,average_second_down_ppa,average_all_ppa,img_filepath
0,9308,Jonathan Mingo,0.182,1.173,0.368,1.318,0.218,0.955,../img/nfl_headshots/jonathan-mingo.png
1,9386,Puka Nacua,0.958,1.032,1.256,0.976,0.831,1.012,../img/nfl_headshots/puka-nacua.png


In [38]:
puka_vs_mingo_stats_table: GT = (
    GT(puka_vs_mingo_stats)
    .tab_header(
        title = md('**Jonathan Mingo vs Puka Nacua: Final Season Statistical Comparison**'),
        subtitle = md('*All Metrics are Average PPA in that scenario*')
    )
    .fmt_image(
        columns = 'img_filepath',
        height = 75,
    )
    .cols_move_to_start(
        columns = 'img_filepath'
    )
    .cols_label(
        img_filepath = '',
        average_passing_downs_ppa = 'Passing Downs',
        average_standard_downs_ppa = 'Standard Downs',
        average_third_down_ppa = '3rd Down',
        average_first_down_ppa = '1st Down',
        average_second_down_ppa = '2nd Down',
        average_all_ppa = 'Overall'
    )
    .data_color(
        columns = ['average_passing_downs_ppa', 'average_standard_downs_ppa', 'average_third_down_ppa', 'average_first_down_ppa', 'average_second_down_ppa', 'average_all_ppa'],
        palette = 'BuPu'
    )
    .tab_options(
        table_font_names=system_fonts("industrial")
    )
    .cols_hide(
        columns = ['index']
    )
)

# puka_vs_mingo_stats_table.save(file = '../img/puka_vs_mingo_stats_table.png', scale = 6, web_driver = 'firefox')

![Jonathan Mingo vs Puka Nacua in Final College Season Stats](../img/puka_vs_mingo_stats_table.png)

# III. Clustering and Modeling our Data

In [39]:
# First, we likely need to scale our data 
# Keep player_name and headshot_url separate, scale all numeric features
scaler: StandardScaler =  StandardScaler()
features_scaled: np.ndarray[float, float] = scaler.fit_transform(df.iloc[:, 2:])

# Create a DataFrame with the scaled features
df_scaled = pd.DataFrame(features_scaled, columns=df.columns[2:])

# Add player names back as the first column
df_scaled.insert(0, 'headshot_url', df['headshot_url'].reset_index(drop = True))
df_scaled.insert(0, 'player_name', df['player_name'].reset_index(drop = True))

df_scaled

Unnamed: 0,player_name,headshot_url,receptions,yards,touchdowns,yards_per_reception,average_passing_downs_ppa,average_standard_downs_ppa,average_third_down_ppa,height,weight,forty,RAS
0,Adonai Mitchell,https://static.www.nfl.com/image/upload/f_auto...,-0.030432,0.053605,0.982966,0.172620,0.908736,-0.541970,1.167761,0.887910,0.623008,-1.396796,1.202541
1,Andrei Iosivas,https://static.www.nfl.com/image/private/f_aut...,0.452626,0.336226,0.023153,-0.196775,0.066130,-0.075922,0.085598,1.269029,0.623008,-0.351491,1.185513
2,Rome Odunze,https://static.www.nfl.com/image/upload/f_auto...,1.594399,2.346296,1.462873,0.978574,0.384200,0.553796,0.826828,1.160757,1.092470,-0.119201,1.162810
3,Ricky Pearsall,https://static.www.nfl.com/image/upload/f_auto...,0.364797,0.414091,-0.696706,0.105457,-0.238919,0.734112,-0.214626,0.346548,-0.450048,-0.583781,1.157134
4,Xavier Legette,https://static.www.nfl.com/image/upload/f_auto...,0.672198,1.235999,0.023153,0.944993,0.090311,0.936620,0.811563,0.346548,1.696064,-0.816071,1.151458
...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,Montrell Washington,https://static.www.nfl.com/image/private/f_aut...,-2.006577,-2.025679,-1.416566,-0.834822,0.066130,-0.075922,0.085598,-1.437783,-1.321906,0.229233,-2.015686
110,Derius Davis,https://static.www.nfl.com/image/private/f_aut...,-0.601318,-0.851936,-0.456753,-0.767659,-0.583030,-1.182784,-0.787935,-1.654328,-2.059632,-1.164506,-2.032714
111,David Bell,https://static.www.nfl.com/image/private/f_aut...,1.638313,1.325400,-0.216800,-0.364682,0.196334,-0.353331,0.243342,0.294577,1.092470,2.552133,-2.208666
112,Dazz Newsome,https://static.www.nfl.com/image/private/f_aut...,-0.074346,-0.410701,-0.216800,-0.734078,0.460462,-0.039859,0.085598,-0.896420,-0.382982,1.506828,-3.258705


In [40]:
kmeans = KMeans(n_clusters = 4).fit(df_scaled.iloc[:, 2:])
df['cluster'] = kmeans.labels_

df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,player_name,headshot_url,receptions,yards,touchdowns,yards_per_reception,average_passing_downs_ppa,average_standard_downs_ppa,average_third_down_ppa,height,weight,forty,RAS,cluster
0,Adonai Mitchell,https://static.www.nfl.com/image/upload/f_auto...,55.0,845.0,11.0,15.4,1.689,0.673,2.063,74.25,205.0,4.34,9.99,0
2,Andrei Iosivas,https://static.www.nfl.com/image/private/f_aut...,66.0,943.0,7.0,14.3,1.236,0.841,1.425,75.13,205.0,4.43,9.96,0
4,Rome Odunze,https://static.www.nfl.com/image/upload/f_auto...,92.0,1640.0,13.0,17.8,1.407,1.068,1.862,74.88,212.0,4.45,9.92,0
5,Ricky Pearsall,https://static.www.nfl.com/image/upload/f_auto...,64.0,970.0,4.0,15.2,1.072,1.133,1.248,73.00,189.0,4.41,9.91,0
6,Xavier Legette,https://static.www.nfl.com/image/upload/f_auto...,71.0,1255.0,7.0,17.7,1.249,1.206,1.853,73.00,221.0,4.39,9.90,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,Montrell Washington,https://static.www.nfl.com/image/private/f_aut...,10.0,124.0,1.0,12.4,1.236,0.841,1.425,68.88,176.0,4.48,4.32,1
118,Derius Davis,https://static.www.nfl.com/image/private/f_aut...,42.0,531.0,5.0,12.6,0.887,0.442,0.910,68.38,165.0,4.36,4.29,1
119,David Bell,https://static.www.nfl.com/image/private/f_aut...,93.0,1286.0,6.0,13.8,1.306,0.741,1.518,72.88,212.0,4.68,3.98,1
120,Dazz Newsome,https://static.www.nfl.com/image/private/f_aut...,54.0,684.0,6.0,12.7,1.448,0.854,1.425,70.13,190.0,4.59,2.13,1


In [173]:
pca: PCA = PCA(n_components = 3).fit(df_scaled.iloc[:, 2:])

pca_curr: np.ndarray = pca.transform(df_scaled.iloc[:, 2:])
df['pca1'] = pca_curr[:, 0]
df['pca2'] = pca_curr[:, 1]
df['pca3'] = pca_curr[:, 2]

clusters: px.scatter_3d = px.scatter_3d(
    df, 
    x='pca1', 
    y='pca2', 
    z = 'pca3', 
    color='cluster', 
    hover_name = 'player_name', 
)

# Write scatterplot to HTML so we can display on the website
clusters.write_html(file = '../plotly/NFL_WRs_clusters.html')

# Write clusters to image so we can display in the notebook
# clusters.write_image(file = '../img/NFL_WRs_clusters.png')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



![NFL Wide Recivers, clustered](../img/NFL_WRs_clusters.png?12)

In [42]:
new_picks_scaled: np.ndarray = scaler.transform(df_25_prospects.iloc[:, 1:])

new_clusters: np.ndarray = kmeans.predict(new_picks_scaled)

df_25_prospects['cluster'] = new_clusters


X does not have valid feature names, but KMeans was fitted with feature names



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [175]:
new_pca = pca.transform(new_picks_scaled)
df_25_prospects['pca1'] = new_pca[:, 0]
df_25_prospects['pca2'] = new_pca[:, 1]
df_25_prospects['pca3'] = new_pca[:, 2]

clusters_new: px.scatter_3d = px.scatter_3d(
    df_25_prospects, 
    x='pca1', 
    y='pca2', 
    z = 'pca3', 
    color='cluster', 
    hover_name = 'player_name', 
)

# Write scatterplot to HTML so we can display on the website
clusters_new.write_html(file = '../plotly/NFL_2025_WRs_clusters.html')

# Write clusters to image so we can display in the notebook
# clusters_new.write_image(file = '../img/NFL_2025_WRs_clusters.png')


X does not have valid feature names, but PCA was fitted with feature names



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



!["2025 NFL Wide Receivers, clustered"](../img/NFL_2025_WRs_clusters.png?12)

In [123]:
df_with_stats = pd.merge(left = df, right = nfl_data, how = 'left', on = 'player_name').drop_duplicates(subset = ['player_name'], keep = 'first').sort_values(by = 'draft_number').groupby('cluster').head(7)

df_with_stats

Unnamed: 0,player_name,headshot_url_x,receptions,yards,touchdowns,yards_per_reception,average_passing_downs_ppa,average_standard_downs_ppa,average_third_down_ppa,height_x,...,status_description_abbr,football_name,esb_id,gsis_it_id,smart_id,entry_year,rookie_year,draft_club,draft_number,age
17,Ja'Marr Chase,https://static.www.nfl.com/image/private/f_aut...,84.0,1780.0,20.0,21.2,1.684,1.279,1.357,72.38,...,A01,Ja'Marr,CHA694469,53434,32004348-4169-4469-b021-d2c20a7d7cf5,2021.0,2021.0,CIN,5.0,21.0
28,Malik Nabers,https://static.www.nfl.com/image/upload/f_auto...,89.0,1569.0,14.0,17.6,1.314,1.103,1.278,71.75,...,A01,Malik,NAB400459,57127,32004e41-4240-0459-a77d-033b6932cbbb,2024.0,2024.0,NYG,6.0,21.0
3,Rome Odunze,https://static.www.nfl.com/image/upload/f_auto...,92.0,1640.0,13.0,17.8,1.407,1.068,1.862,74.88,...,A01,Rome,ODU426541,57130,32004f44-5542-6541-431c-1a51e8792659,2024.0,2024.0,CHI,9.0,22.0
146,Garrett Wilson,https://static.www.nfl.com/image/private/f_aut...,70.0,1058.0,12.0,15.1,1.207,1.097,1.151,71.75,...,A01,Garrett,WIL691962,54475,32005749-4c69-1962-4085-d9c826d8709c,2022.0,2022.0,NYJ,10.0,22.0
104,Chris Olave,https://static.www.nfl.com/image/private/f_aut...,65.0,936.0,13.0,14.4,1.635,0.666,1.85,72.38,...,A01,Chris,OLA659325,54476,32004f4c-4165-9325-c259-01a521202e2f,2022.0,2022.0,NO,11.0,22.0
207,Jahan Dotson,https://static.www.nfl.com/image/private/f_aut...,91.0,1182.0,12.0,13.0,0.816,0.886,1.047,70.63,...,A01,Jahan,DOT282798,54481,3200444f-5428-2798-d9f2-9e8608edcb8a,2022.0,2022.0,WAS,16.0,22.0
221,Treylon Burks,https://static.www.nfl.com/image/private/f_aut...,66.0,1104.0,11.0,16.7,1.236,1.296,1.905,73.88,...,A01,Treylon,BUR321327,54483,32004255-5232-1327-f36e-987913d19348,2022.0,2022.0,TEN,18.0,22.0
124,Jaxon Smith-Njigba,https://static.www.nfl.com/image/private/f_aut...,5.0,43.0,0.0,8.6,-0.274,0.019,-0.608,72.63,...,A01,Jaxon,SMI829636,55884,3200534d-4982-9636-a18c-5da6fbdaa80c,2023.0,2023.0,SEA,20.0,21.0
65,Kadarius Toney,https://static.www.nfl.com/image/private/f_aut...,70.0,984.0,10.0,14.1,1.628,0.675,1.952,71.63,...,A01,Kadarius,TON593115,53449,3200544f-4e59-3115-b817-44100e7d8fff,2021.0,2021.0,NYG,20.0,22.0
89,Quentin Johnston,https://static.www.nfl.com/image/private/f_aut...,60.0,1069.0,6.0,17.8,1.208,0.696,1.073,74.75,...,A01,Quentin,JOH823261,55885,32004a4f-4882-3261-dcd8-a43c8f7c5608,2023.0,2023.0,LAC,21.0,21.0


In [124]:
df_players: pd.DataFrame = df_with_stats[['player_name', 'headshot_url_x', 'cluster', 'draft_number']]

df_players.sort_values(by = ['cluster', 'draft_number'], inplace = True)

df_players



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,player_name,headshot_url_x,cluster,draft_number
17,Ja'Marr Chase,https://static.www.nfl.com/image/private/f_aut...,0,5.0
28,Malik Nabers,https://static.www.nfl.com/image/upload/f_auto...,0,6.0
3,Rome Odunze,https://static.www.nfl.com/image/upload/f_auto...,0,9.0
221,Treylon Burks,https://static.www.nfl.com/image/private/f_aut...,0,18.0
89,Quentin Johnston,https://static.www.nfl.com/image/private/f_aut...,0,21.0
16,Brian Thomas Jr.,https://static.www.nfl.com/image/upload/f_auto...,0,23.0
4,Ricky Pearsall,https://static.www.nfl.com/image/upload/f_auto...,0,31.0
204,Jayden Reed,https://static.www.nfl.com/image/private/f_aut...,1,50.0
227,Amari Rodgers,https://static.www.nfl.com/image/private/f_aut...,1,85.0
252,David Bell,https://static.www.nfl.com/image/private/f_aut...,1,99.0


In [102]:
df_final: pd.DataFrame = (
    pd.merge(
    left = df, 
    right = nfl_data, 
    how = 'left', 
    on = 'player_name'
)
    .drop_duplicates(subset = ['player_name'], keep = 'first')
    .sort_values(by = 'draft_number')
    .groupby('cluster').mean().round(2)
)

df_final = df_final[['height_x', 'weight_x', 'forty', 'RAS', 'draft_number']].reset_index()

df_final


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0,cluster,height_x,weight_x,forty,RAS,draft_number
0,0,73.28,199.26,4.44,8.85,100.42
1,1,70.94,192.36,4.52,5.5,164.77
2,2,73.75,206.41,4.47,8.65,138.62
3,3,70.16,182.74,4.43,7.65,101.96


In [166]:
clusters_overview: GT = (
    GT(df_final)
    .tab_header(
        title = md('Wide Receiver Cluster Breakdown: Physical Characteristics'),
        subtitle = md('*All Metrics are the average among all the players in that cluster*')
    )
    .tab_options(
        table_font_names=system_fonts("industrial")
    )
    .cols_label(
        cluster = 'Cluster',
        height_x = 'Height',
        weight_x = 'Weight',
        forty = 'Forty Time',
        draft_number = 'Draft Position'
    )
    .data_color(
        columns = ['height_x', 'weight_x', 'RAS'],
        palette = 'BuPu',
    )
    .data_color(
        columns = ['draft_number', 'forty'],
        reverse = True,
        palette = 'BuPu'
    )
)

# clusters_overview.save(file = '../img/wide_receiver_cluster_overview.png', scale = 6, web_driver = 'firefox')


![Wide Receiver Cluster Overview](../img/wide_receiver_cluster_overview.png)

In [163]:
df_25_final: pd.DataFrame = df_25_prospects.sort_values(by = 'RAS', ascending = False).groupby('cluster').head(7).sort_values(by = 'cluster').reset_index()

df_25_final = df_25_final[['player_name', 'RAS', 'cluster']]

df_25_final

Unnamed: 0,player_name,RAS,cluster
0,Isaiah Neyor,9.9,0
1,Emeka Egbuka,9.72,0
2,Chimere Dike,9.72,0
3,Elic Ayomanor,9.71,0
4,Jayden Higgins,9.63,0
5,Jack Bech,9.51,0
6,Elijhah Badger,9.32,0
7,Josh Kelly,3.98,1
8,Isaiah Bond,4.23,1
9,Traeshon Holden,5.42,1


In [170]:
clusters_25_overview: GT = (
    GT(df_25_final)
    .tab_header(
        title = md('2025 Wide Receiver Draft Prospect Cluster Breakdown'),
        subtitle = md('*RAS Scores and the Classified Cluster for a Subset of the Incoming Draft Class*')
    )
    .tab_options(
        table_font_names=system_fonts("industrial")
    )
    .cols_label(
        cluster = 'Cluster',
        player_name = 'Name',
    )
    .data_color(
        columns = ['RAS'],
        palette = 'BuPu',
    )
)

# clusters_25_overview.save('../img/wide_receiver_25_clusters.png', scale = 2, web_driver = 'firefox')

![2025 Draft Class Clusters](../img/wide_receiver_25_clusters.png?12)

In [154]:
from sklearn.metrics.pairwise import euclidean_distances

amon = df_scaled.loc[df_scaled['player_name'] == 'Amon-Ra St. Brown'].iloc[:, 2:].to_numpy()
ja = df_scaled.loc[df_scaled['player_name'] == 'Jordan Addison'].iloc[:, 2:].to_numpy()
euclidean_distances(df_scaled.loc[df_scaled['player_name'] == 'Rondale Moore'].iloc[:, 2:].to_numpy(), df_scaled.loc[df_scaled['player_name'] == 'Jaxon Smith-Njigba'].iloc[:, 2:].to_numpy())

array([[4.84692861]])

In [148]:
euclidean_distances(df_25_prospects.loc[df_25_prospects['player_name'] == 'Emeka Egbuka'].iloc[:, 2:13].to_numpy(), df_scaled.loc[df_scaled['player_name'] == 'Jaxon Smith-Njigba'].iloc[:, 2:].to_numpy())

array([[1036.17366042]])