Using Euclidean Distance and Cosine Similarity to Compare NFL Draft Prospects with Pros

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

Running Back Comparison

In [68]:
df = pd.read_csv("NFL Draft Data - RB.csv")
# Define function to calculate aggregated stats for a player based on years played
def aggregate_stats(df, player_name):
    player_stats = df[df['Name'] == player_name]
    years_played = player_stats['YP'].iloc[0]
    agg_stats = player_stats[['Att', 'RushYds', 'RushTD', 'Rec', 'RecYds', 'RecTD']].sum()
    agg_stats /= years_played
    return agg_stats

# Create separate DataFrames for prospects and professionals
prospects = df[df['Prospect'] == 1]
professionals = df[df['Prospect'] == 0]

# Iterate over each prospect and find the most similar professional
for _, prospect in prospects.iterrows():
    prospect_stats = aggregate_stats(df, prospect['Name']).values.reshape(1, -1)
    professional_stats = np.vstack(professionals.apply(lambda x: aggregate_stats(df, x['Name']), axis=1).values)
    similarity_scores = cosine_similarity(prospect_stats, professional_stats)
    most_similar_idx = np.argmax(similarity_scores)
    most_similar = professionals.iloc[most_similar_idx]
    similarity_score = similarity_scores[0][most_similar_idx] * 100
    print(f"Most similar professional for {prospect['Name']} is {most_similar['Name']} with a similarity score of {similarity_score:.2f}")

Most similar professional for Israel Abanikanda is Zack Moss with a similarity score of 100.00
Most similar professional for Devon Achane is D'Andre Swift with a similarity score of 100.00
Most similar professional for Tank Bigsby is Zonovan Knight with a similarity score of 100.00
Most similar professional for Chase Brown is Brian Robinson with a similarity score of 99.99
Most similar professional for Zach Charbonnet is Zack Moss with a similarity score of 99.99
Most similar professional for Travis Dye is Chris Evans with a similarity score of 99.99
Most similar professional for Zach Evans is Trey Sermon with a similarity score of 100.00
Most similar professional for Jahmyr Gibbs is Alvin Kamara with a similarity score of 99.94
Most similar professional for Eric Gray is Chris Evans with a similarity score of 100.00
Most similar professional for Evan Hull is Dare Ogunbowale with a similarity score of 99.98
Most similar professional for Mohamed Ibrahim is Kenneth Walker with a similarit

In [69]:
# Define function to calculate Euclidean distance between two players
def euclidean_distance(x, y):
    return np.sqrt(np.sum((x - y) ** 2))

    # Compute min and max values of distance metric
min_distance_range = np.inf
max_distance_range = -np.inf
for _, player1 in df.iterrows():
    for _, player2 in df.iterrows():
        if player1['Name'] != player2['Name']:
            distance = euclidean_distance(aggregate_stats(df, player1['Name']), aggregate_stats(df, player2['Name']))
            if distance < min_distance_range:
                min_distance_range = distance
            if distance > max_distance_range:
                max_distance_range = distance

# Define function to calculate aggregated stats for a player based on years played
def aggregate_stats(df, player_name):
    player_stats = df[df['Name'] == player_name]
    years_played = player_stats['YP'].iloc[0]
    agg_stats = player_stats[['Att', 'RushYds', 'RushTD', 'Rec', 'RecYds', 'RecTD']].sum()
    agg_stats /= years_played
    return agg_stats

# Create separate DataFrames for prospects and professionals
prospects = df[df['Prospect'] == 1]
professionals = df[df['Prospect'] == 0]

# Iterate over each prospect and find the most similar professional
for _, prospect in prospects.iterrows():
    prospect_stats = aggregate_stats(df, prospect['Name'])
    min_distance = np.inf
    most_similar = None
    for _, professional in professionals.iterrows():
        professional_stats = aggregate_stats(df, professional['Name'])
        distance = euclidean_distance(prospect_stats, professional_stats)
        if distance < min_distance:
            min_distance = distance
            most_similar = professional
    similarity_score = 100 * (1 - (min_distance - min_distance_range) / (max_distance_range - min_distance_range))
    print(f"Most similar professional for {prospect['Name']} is {most_similar['Name']} with a similarity score of {similarity_score:.2f}")

Most similar professional for Israel Abanikanda is Rakeem Boyd with a similarity score of 99.63
Most similar professional for Devon Achane is Elijah Mitchell with a similarity score of 97.61
Most similar professional for Tank Bigsby is Cam Akers with a similarity score of 98.82
Most similar professional for Chase Brown is Salvon Ahmed with a similarity score of 98.96
Most similar professional for Zach Charbonnet is Elijah Mitchell with a similarity score of 98.78
Most similar professional for Travis Dye is Dare Ogunbowale with a similarity score of 96.68
Most similar professional for Zach Evans is Salvon Ahmed with a similarity score of 98.79
Most similar professional for Jahmyr Gibbs is Alvin Kamara with a similarity score of 95.12
Most similar professional for Eric Gray is Jerrion Ealy with a similarity score of 97.97
Most similar professional for Evan Hull is La'Mical Perine with a similarity score of 97.47
Most similar professional for Mohamed Ibrahim is Larry Rountree with a simil

QB Comparison

In [78]:
df = pd.read_csv("NFL Draft Data - QB.csv")
# Define function to calculate aggregated stats for a player based on years played
def aggregate_stats(df, player_name):
    player_stats = df[df['Name'] == player_name]
    years_played = player_stats['YP'].iloc[0]
    agg_stats = player_stats[['Cmp','PassAtt','Pct','PassTD','Int', 'RushYds', 'RushTD']].sum()
    agg_stats /= years_played
    return agg_stats

# Create separate DataFrames for prospects and professionals
prospects = df[df['Prospect'] == 1]
professionals = df[df['Prospect'] == 0]

# Iterate over each prospect and find the most similar professional
for _, prospect in prospects.iterrows():
    prospect_stats = aggregate_stats(df, prospect['Name']).values.reshape(1, -1)
    professional_stats = np.vstack(professionals.apply(lambda x: aggregate_stats(df, x['Name']), axis=1).values)
    similarity_scores = cosine_similarity(prospect_stats, professional_stats)
    most_similar_idx = np.argmax(similarity_scores)
    most_similar = professionals.iloc[most_similar_idx]
    similarity_score = similarity_scores[0][most_similar_idx] * 100
    print(f"Most similar professional for {prospect['Name']} is {most_similar['Name']} with a similarity score of {similarity_score:.2f}")


Most similar professional for Stetson Bennett is Pat Mahomes with a similarity score of 99.91
Most similar professional for Malik Cunningham is Kyler Murray with a similarity score of 99.95
Most similar professional for Max Duggan is Zac Thomas with a similarity score of 99.92
Most similar professional for Jake Haener is Jack Coan with a similarity score of 99.94
Most similar professional for Jaren Hall is Skylar Thompson with a similarity score of 99.81
Most similar professional for Hendon Hooker is Dustin Crum with a similarity score of 99.93
Most similar professional for Will Levis is Russell Wilson with a similarity score of 99.91
Most similar professional for Tanner McKee is Jack Coan with a similarity score of 99.94
Most similar professional for Aidan O'Connell is Carson Strong with a similarity score of 99.98
Most similar professional for Anthony Richardson is Kyler Murray with a similarity score of 99.91
Most similar professional for C.J. Stroud is Teddy Bridgewater with a simi

In [79]:
df = pd.read_csv("NFL Draft Data - QB.csv")
# Define function to calculate Euclidean distance between two players
def euclidean_distance(x, y):
    return np.sqrt(np.sum((x - y) ** 2))

    # Compute min and max values of distance metric
min_distance_range = np.inf
max_distance_range = -np.inf
for _, player1 in df.iterrows():
    for _, player2 in df.iterrows():
        if player1['Name'] != player2['Name']:
            distance = euclidean_distance(aggregate_stats(df, player1['Name']), aggregate_stats(df, player2['Name']))
            if distance < min_distance_range:
                min_distance_range = distance
            if distance > max_distance_range:
                max_distance_range = distance

# Define function to calculate aggregated stats for a player based on years played
def aggregate_stats(df, player_name):
    player_stats = df[df['Name'] == player_name]
    years_played = player_stats['YP'].iloc[0]
    agg_stats = player_stats[['Cmp','PassAtt','Pct','PassTD','Int', 'RushYds', 'RushTD']].sum()
    agg_stats /= years_played
    return agg_stats

# Create separate DataFrames for prospects and professionals
prospects = df[df['Prospect'] == 1]
professionals = df[df['Prospect'] == 0]

# Iterate over each prospect and find the most similar professional
for _, prospect in prospects.iterrows():
    prospect_stats = aggregate_stats(df, prospect['Name'])
    min_distance = np.inf
    most_similar = None
    for _, professional in professionals.iterrows():
        professional_stats = aggregate_stats(df, professional['Name'])
        distance = euclidean_distance(prospect_stats, professional_stats)
        if distance < min_distance:
            min_distance = distance
            most_similar = professional
    similarity_score = 100 * (1 - (min_distance - min_distance_range) / (max_distance_range - min_distance_range))
    print(f"Most similar professional for {prospect['Name']} is {most_similar['Name']} with a similarity score of {similarity_score:.2f}")

Most similar professional for Stetson Bennett is Tua Tagovailoa with a similarity score of 99.08
Most similar professional for Malik Cunningham is Malik Willis with a similarity score of 92.32
Most similar professional for Max Duggan is Sam Ehlinger with a similarity score of 95.60
Most similar professional for Jake Haener is Jacob Eason with a similarity score of 98.48
Most similar professional for Jaren Hall is Skylar Thompson with a similarity score of 99.43
Most similar professional for Hendon Hooker is Dustin Crum with a similarity score of 99.41
Most similar professional for Will Levis is Skylar Thompson with a similarity score of 98.48
Most similar professional for Tanner McKee is Jacob Eason with a similarity score of 99.55
Most similar professional for Aidan O'Connell is Carson Strong with a similarity score of 100.00
Most similar professional for Anthony Richardson is D'Eriq King with a similarity score of 96.44
Most similar professional for C.J. Stroud is Kyle Trask with a s

WR Comparison

In [4]:
df = pd.read_csv("NFL Draft Data - WR.csv")

# Replace NaN values with column means
df.fillna(df.mean(), inplace=True)

# Define function to calculate aggregated stats for a player based on years played
def aggregate_stats(df, player_name):
    player_stats = df[df['Name'] == player_name]
    years_played = player_stats['YP'].iloc[0]
    agg_stats = player_stats[['Rec','Yds','TD']].sum()
    agg_stats /= years_played
    return agg_stats

# Create separate DataFrames for prospects and professionals
prospects = df[df['Prospect'] == 1]
professionals = df[df['Prospect'] == 0]

# Iterate over each prospect and find the most similar professional
for _, prospect in prospects.iterrows():
    prospect_stats = aggregate_stats(df, prospect['Name']).values.reshape(1, -1)
    professional_stats = np.vstack(professionals.apply(lambda x: aggregate_stats(df, x['Name']), axis=1).values)
    similarity_scores = cosine_similarity(prospect_stats, professional_stats)
    most_similar_idx = np.argmax(similarity_scores)
    most_similar = professionals.iloc[most_similar_idx]
    similarity_score = similarity_scores[0][most_similar_idx] * 100
    print(f"Most similar professional for {prospect['Name']} is {most_similar['Name']} with a similarity score of {similarity_score:.2f}")


  df.fillna(df.mean(), inplace=True)


Most similar professional for Jordan Addison is Chase Claypool with a similarity score of 100.00
Most similar professional for Ronnie Bell is AJ Brown with a similarity score of 100.00
Most similar professional for Jake Bobo is Anthony Schwartz with a similarity score of 100.00
Most similar professional for Kayshon Boutte is DJ Moore with a similarity score of 100.00
Most similar professional for Jalen Brooks is Austin Mack with a similarity score of 100.00
Most similar professional for Jason Brownlee is Tyrie Cleveland with a similarity score of 100.00
Most similar professional for Jacob Copeland is Erik Ezukanma with a similarity score of 100.00
Most similar professional for Derius Davis is John Metchie with a similarity score of 100.00
Most similar professional for Nathaniel Dell is DJ Moore with a similarity score of 100.00
Most similar professional for Dontay Demus Jr. is Erik Ezukanma with a similarity score of 100.00
Most similar professional for Demario Douglas is Amon-Ra St. B

In [6]:
df = pd.read_csv("NFL Draft Data - WR.csv")
# Define function to calculate Euclidean distance between two players
def euclidean_distance(x, y):
    return np.sqrt(np.sum((x - y) ** 2))

    # Compute min and max values of distance metric
min_distance_range = np.inf
max_distance_range = -np.inf
for _, player1 in df.iterrows():
    for _, player2 in df.iterrows():
        if player1['Name'] != player2['Name']:
            distance = euclidean_distance(aggregate_stats(df, player1['Name']), aggregate_stats(df, player2['Name']))
            if distance < min_distance_range:
                min_distance_range = distance
            if distance > max_distance_range:
                max_distance_range = distance

# Define function to calculate aggregated stats for a player based on years played
def aggregate_stats(df, player_name):
    player_stats = df[df['Name'] == player_name]
    years_played = player_stats['YP'].iloc[0]
    agg_stats = player_stats[['Rec','Yds','TD']].sum()
    agg_stats /= years_played
    return agg_stats

# Create separate DataFrames for prospects and professionals
prospects = df[df['Prospect'] == 1]
professionals = df[df['Prospect'] == 0]

# Iterate over each prospect and find the most similar professional
for _, prospect in prospects.iterrows():
    prospect_stats = aggregate_stats(df, prospect['Name'])
    min_distance = np.inf
    most_similar = None
    for _, professional in professionals.iterrows():
        professional_stats = aggregate_stats(df, professional['Name'])
        distance = euclidean_distance(prospect_stats, professional_stats)
        if distance < min_distance:
            min_distance = distance
            most_similar = professional
    similarity_score = 100 * (1 - (min_distance - min_distance_range) / (max_distance_range - min_distance_range))
    print(f"Most similar professional for {prospect['Name']} is {most_similar['Name']} with a similarity score of {similarity_score:.2f}")

Most similar professional for Jordan Addison is JaMarr Chase with a similarity score of 98.17
Most similar professional for Ronnie Bell is George Pickens with a similarity score of 99.47
Most similar professional for Jake Bobo is Kyle Philips with a similarity score of 99.45
Most similar professional for Kayshon Boutte is K.J. Hill with a similarity score of 98.81
Most similar professional for Jalen Brooks is Austin Mack with a similarity score of 99.88
Most similar professional for Jason Brownlee is Seth Williams with a similarity score of 99.18
Most similar professional for Jacob Copeland is Jalen Nailor with a similarity score of 98.59
Most similar professional for Derius Davis is Terry McLaurin with a similarity score of 98.96
Most similar professional for Nathaniel Dell is JaMarr Chase with a similarity score of 97.86
Most similar professional for Dontay Demus Jr. is Ihmir Smith-Marsette with a similarity score of 99.73
Most similar professional for Demario Douglas is Van Jefferso

Takeaways

More features and more data could yield better results. My goal for the 2024 NFL Draft is to increase this dataset and perhaps find a more efficient way to do so since most of my work on this project was spent attempting to collect data. 

In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = 'NFL Combine and College Player Stats - QB.csv'
df = pd.read_csv(file_path)

# Filter for the 2023 draft class and other draft classes
draft_class_2023 = df[df['draft_class'] == 2023]
other_draft_classes = df[df['draft_class'] != 2023]

# Selecting relevant numeric features for comparison
numeric_features = draft_class_2023.select_dtypes(include=['float64', 'int64']).columns.to_list()
numeric_features.remove('draft_class')  # Removing 'draft_class' as it's the same for all

# Handling missing values with mean imputation for both datasets
imputer = SimpleImputer(strategy='mean')
draft_class_2023_imputed = imputer.fit_transform(draft_class_2023[numeric_features])
other_draft_classes_imputed = imputer.transform(other_draft_classes[numeric_features])

# Normalizing the data for both datasets
scaler = StandardScaler()
draft_class_2023_scaled = scaler.fit_transform(draft_class_2023_imputed)
other_draft_classes_scaled = scaler.transform(other_draft_classes_imputed)

# Using Nearest Neighbors to find the most similar player from other draft classes
nbrs = NearestNeighbors(n_neighbors=1)  # Only the closest neighbor
nbrs.fit(other_draft_classes_scaled)

# Finding the nearest player for each 2023 draft class player
distances, indices = nbrs.kneighbors(draft_class_2023_scaled)

# Creating a mapping of 2023 draft class players to their most similar counterparts from other draft classes
similar_players_cross_class = {}
for i in range(len(draft_class_2023)):
    player_index = indices[i][0]  # Index 0 as we're now comparing with a different class
    player_name = draft_class_2023.iloc[i]['name']
    similar_player_name = other_draft_classes.iloc[player_index]['name']
    similar_players_cross_class[player_name] = similar_player_name

# Displaying the mapping
for player, similar_player in similar_players_cross_class.items():
    print(f"{player}: Most similar to {similar_player}")


Stetson Bennett: Most similar to Matt Corral
Malik Cunningham: Most similar to Jordan Travis
Max Duggan: Most similar to Michael Pratt
Jake Haener: Most similar to Garrett Grayson
Jaren Hall: Most similar to Zach Wilson
Hendon Hooker: Most similar to Joe Burrow
Will Levis: Most similar to Drake Maye
Tanner McKee: Most similar to Brock Osweiler
Aidan O'Connell: Most similar to Carson Strong
Anthony Richardson: Most similar to Cole Kelley
C.J. Stroud: Most similar to Joe Burrow
Dorian Thompson-Robinson: Most similar to Michael Pratt
Clayton Tune: Most similar to Russell Wilson
Bryce Young: Most similar to Zach Wilson
Sean Clifford: Most similar to Michael Pratt


In [8]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = 'NFL Combine and College Player Stats - QB.csv'
df = pd.read_csv(file_path)

# Convert cumulative stats columns to numeric (if they're not already)
cumulative_stats = ['pass_yards', 'pass_td', 'int', 'rush_yards', 'rush_td']
for stat in cumulative_stats:
    df[stat] = pd.to_numeric(df[stat], errors='coerce')  # Coerce non-numeric values to NaN

# Create per-game statistics for relevant features
for stat in cumulative_stats:
    per_game_stat = f"{stat}_per_game"
    df[per_game_stat] = df[stat] / df['games_played'].replace(0, np.nan)

# Advanced Feature Engineering: Adding Interaction Terms
# Avoiding division by zero by replacing 0 with NaN
df['pass_yards_per_td'] = df['pass_yards'] / df['pass_td'].replace(0, np.nan)
df['rush_yards_per_td'] = df['rush_yards'] / df['rush_td'].replace(0, np.nan)

# Filter for the 2023 draft class and other draft classes
draft_class_2023 = df[df['draft_class'] == 2023]
other_draft_classes = df[df['draft_class'] != 2023]

# Selecting relevant numeric features for comparison
numeric_features = draft_class_2023.select_dtypes(include=['float64', 'int64']).columns.to_list()
numeric_features.remove('draft_class')  # Removing 'draft_class' as it's the same for all
numeric_features.extend([f"{stat}_per_game" for stat in cumulative_stats])  # Adding per-game stats
numeric_features.extend(['pass_yards_per_td', 'rush_yards_per_td'])  # Adding interaction terms

# Handling missing values with mean imputation for both datasets
imputer = SimpleImputer(strategy='mean', add_indicator=True)
draft_class_2023_imputed = imputer.fit_transform(draft_class_2023[numeric_features])
other_draft_classes_imputed = imputer.transform(other_draft_classes[numeric_features])

# Normalizing the data for both datasets
scaler = StandardScaler()
draft_class_2023_scaled = scaler.fit_transform(draft_class_2023_imputed)
other_draft_classes_scaled = scaler.transform(other_draft_classes_imputed)

# Using Nearest Neighbors with a different distance metric
nbrs = NearestNeighbors(n_neighbors=1, metric='manhattan')  # Using Manhattan distance
nbrs.fit(other_draft_classes_scaled)

# Finding the nearest player for each 2023 draft class player
distances, indices = nbrs.kneighbors(draft_class_2023_scaled)

# Creating a mapping of 2023 draft class players to their most similar counterparts from other draft classes
similar_players_cross_class = {}
for i in range(len(draft_class_2023)):
    player_index = indices[i][0]  # Index 0 as we're now comparing with a different class
    player_name = draft_class_2023.iloc[i]['name']
    similar_player_name = other_draft_classes.iloc[player_index]['name']
    similar_players_cross_class[player_name] = similar_player_name

# Displaying the mapping
for player, similar_player in similar_players_cross_class.items():
    print(f"{player}: Most similar to {similar_player}")


Stetson Bennett: Most similar to Ian Book
Malik Cunningham: Most similar to Jordan Travis
Max Duggan: Most similar to Trace McSorley
Jake Haener: Most similar to Carson Strong
Jaren Hall: Most similar to J.J. McCarthy
Hendon Hooker: Most similar to Joe Burrow
Will Levis: Most similar to CJ Beathard
Tanner McKee: Most similar to Blaine Gabbert
Aidan O'Connell: Most similar to Brad Kaaya
Anthony Richardson: Most similar to Josh Allen
C.J. Stroud: Most similar to Tua Tagovailoa
Dorian Thompson-Robinson: Most similar to Desmond Ridder
Clayton Tune: Most similar to Russell Wilson
Bryce Young: Most similar to Trevor Lawrence
Sean Clifford: Most similar to Andy Dalton
