In [27]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the quarterback sheet into a DataFrame
data = pd.read_csv('C:/Users/RaymondCarpenter/Documents/GitHub/14thstreetanalytics/nfl-draft/qb_sheet.csv')

# Stat columns that are divided by 'games_played' further down
fields_to_normalize = [
    'completions', 'pass_attempts', 'completion_percentage', 'pass_yards',
    'yards_per_attempt', 'air_yards_per_attempt', 'pass_td', 'int',
    'pass_rate', 'rush_attempts', 'rush_yards', 'rush_avg', 'rush_td'
]

# Every column that has to be numeric: the stat columns above plus the two
# tenure columns
numeric_fields = fields_to_normalize + ['games_played', 'years_played']

# Coerce all of them in one pass; entries that cannot be parsed become NaN
data[numeric_fields] = data[numeric_fields].apply(pd.to_numeric, errors='coerce')

def normalize_fields(df, fields, norm_by='games_played'):
    """Add a per-game version of each requested column to ``df``.

    For every name in ``fields`` a new column ``<name>_per_game`` is written,
    holding ``df[name] / df[norm_by]``. Rows whose divisor is 0 get NaN
    instead of +/-inf, so they behave like any other missing value in later
    imputation and scaling steps.

    Args:
        df: DataFrame containing the columns named in ``fields`` and
            ``norm_by``. It is modified in place.
        fields: Iterable of column names to divide.
        norm_by: Name of the divisor column (default ``'games_played'``).

    Returns:
        The same ``df`` object, with the ``*_per_game`` columns added.
    """
    # Mask zero divisors to NaN once, up front: x / 0 would otherwise yield
    # inf, which a later fillna() does not touch.
    divisor = df[norm_by].where(df[norm_by] != 0)
    for field in fields:
        df[f'{field}_per_game'] = df[field] / divisor
    return df

# Add the *_per_game columns to the full dataset
data = normalize_fields(data, fields_to_normalize)

# Columns compared between players: each per-game stat, followed by the two
# tenure columns
features = [field + '_per_game' for field in fields_to_normalize]
features.extend(['games_played', 'years_played'])

# Split rows into the 2024 draft class and everyone else
is_2024_class = data['draft_class'] == 2024
data_2024 = data[is_2024_class]
data_not_2024 = data[~is_2024_class]

# Function to clean and prepare dataset (assuming the data is already numeric)
def clean_and_prepare_dataset(df, column_names):
    """Return the selected columns of ``df`` with gaps filled by column means.

    Args:
        df: Source DataFrame; it is not modified.
        column_names: Columns to keep in the result.

    Returns:
        A new DataFrame restricted to ``column_names`` in which each missing
        value is replaced by the mean of its own column, computed over the
        rows of ``df``.
    """
    selected = df[column_names]
    column_means = selected.mean()
    return selected.fillna(column_means)

# Impute missing feature values separately within each group
data_2024_prepared = clean_and_prepare_dataset(data_2024, features)
data_not_2024_prepared = clean_and_prepare_dataset(data_not_2024, features)

# Standardize the features. The scaler is fitted on the 2024 prospects only and
# the resulting parameters are then applied to every other quarterback.
# NOTE(review): fitting on the prospect group alone may be unintentional; confirm.
scaler = StandardScaler()
scaler.fit(data_2024_prepared)
data_2024_scaled = scaler.transform(data_2024_prepared)
data_not_2024_scaled = scaler.transform(data_not_2024_prepared)

# Cosine similarity of every 2024 prospect (rows) against every other
# quarterback (columns)
similarity = cosine_similarity(data_2024_scaled, data_not_2024_scaled)

# Label the matrix with player names so matches can be looked up by name
similarity_df = pd.DataFrame(
    data=similarity,
    index=data_2024['name'],
    columns=data_not_2024['name'],
)

# For each prospect, keep the column label with the highest similarity score
most_similar_qbs = {
    prospect_name: similarity_df.loc[prospect_name].idxmax()
    for prospect_name in similarity_df.index
}

# Report one line per prospect
for prospect, similar_qb in most_similar_qbs.items():
    print(f"{prospect} is most similar to {similar_qb}.")

Devin Leary is most similar to Rusty Smith.
Michael Penix is most similar to Landry Jones.
Spencer Rattler is most similar to Shane Carden.
J.J. McCarthy is most similar to Nathan Peterman.
Joe Milton III is most similar to Carson Wentz.
Caleb Williams is most similar to Trevor Lawrence.
Sam Hartman is most similar to Kenny Pickett.
Michael Pratt is most similar to Trace McSorley.
Drake Maye is most similar to DeShone Kizer.
Bo Nix is most similar to Kenny Pickett.
Jordan Travis is most similar to D'Eriq King.
Jayden Daniels is most similar to Colin Kaepernick.


In [34]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the quarterback sheet into a DataFrame
data = pd.read_csv('C:/Users/RaymondCarpenter/Documents/GitHub/14thstreetanalytics/nfl-draft/qb_sheet.csv')

# Columns that get divided by 'games_played'
fields_to_normalize = [
    'completions', 'pass_attempts', 'pass_yards', 'pass_td', 'int',
    'rush_attempts', 'rush_yards', 'rush_td'
]

# Columns that are used as-is, without the per-game division
fields_not_normalized = [
    'completion_percentage', 'yards_per_attempt', 'air_yards_per_attempt',
    'pass_rate', 'rush_avg', 'height_in', 'weight_lb'
]

# Coerce every column used below to a numeric dtype in one pass; entries that
# cannot be parsed become NaN
numeric_columns = [*fields_to_normalize, *fields_not_normalized, 'games_played', 'years_played']
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

def normalize_fields(df, fields_to_normalize, norm_by='games_played'):
    """Add a per-game version of each requested column to ``df``.

    For every name in ``fields_to_normalize`` a new column
    ``<name>_per_game`` is written, holding ``df[name] / df[norm_by]``. Rows
    whose divisor is 0 get NaN instead of +/-inf, so they behave like any
    other missing value in later imputation and scaling steps.

    Args:
        df: DataFrame containing the columns named in ``fields_to_normalize``
            and ``norm_by``. It is modified in place.
        fields_to_normalize: Iterable of column names to divide.
        norm_by: Name of the divisor column (default ``'games_played'``).

    Returns:
        The same ``df`` object, with the ``*_per_game`` columns added.
    """
    # Mask zero divisors to NaN once, up front: x / 0 would otherwise yield
    # inf, which a later fillna() does not touch.
    divisor = df[norm_by].where(df[norm_by] != 0)
    for field in fields_to_normalize:
        df[field + '_per_game'] = df[field] / divisor
    return df

# Add the *_per_game columns to the full dataset
data = normalize_fields(data, fields_to_normalize)

# Feature list: per-game stats, then the stats used as-is, then the two tenure
# columns
features_normalized = [field + '_per_game' for field in fields_to_normalize]
features = [*features_normalized, *fields_not_normalized, 'games_played', 'years_played']

# Impute gaps with each column's mean, computed over all rows
feature_frame = data[features]
data_filled = feature_frame.fillna(feature_frame.mean())

# Split rows into the 2024 draft class and everyone else
in_2024_class = data['draft_class'] == 2024
data_2024 = data[in_2024_class]
data_not_2024 = data[~in_2024_class]

# Standardize the features. The scaler is fitted on the 2024 rows only and the
# resulting parameters are then applied to the remaining rows.
# NOTE(review): fitting on the 2024 group alone may be unintentional; confirm.
scaler = StandardScaler().fit(data_filled.loc[data_2024.index])
data_2024_scaled = scaler.transform(data_filled.loc[data_2024.index])
data_not_2024_scaled = scaler.transform(data_filled.loc[data_not_2024.index])

# Cosine similarity: rows are 2024 players, columns are all other players
similarity = cosine_similarity(data_2024_scaled, data_not_2024_scaled)

# Ask which 2024 player to look up; surrounding whitespace is ignored
player_name = input("Enter the name of the 2024 player to compare: ").strip()

# Case-insensitive name match within the 2024 class. The mask is positional
# (one entry per row of data_2024, in order), so it lines up with the rows of
# `similarity` whatever the DataFrame's index labels are. Subtracting the first
# index label from the matched label is only a valid row position when the 2024
# rows have contiguous labels.
name_matches = (
    (data_2024['name'].str.lower() == player_name.lower())
    .fillna(False)
    .to_numpy(dtype=bool)
)

if not name_matches.any():
    # Explicit not-found branch, so an unrelated IndexError is never
    # misreported as a missing player
    print("Player not found in the 2024 draft class. Please check the name and try again.")
else:
    # Row position of the first match inside data_2024 / `similarity`
    player_position = int(name_matches.argmax())
    player_index = data_2024.index[player_position]

    similarity_scores = similarity[player_position]
    most_similar_index = similarity_scores.argmax()
    most_similar_player_name = data_not_2024.iloc[most_similar_index]['name']
    # Map cosine similarity from [-1, 1] onto a 0-100 scale
    percentage_similarity = (similarity_scores[most_similar_index] + 1) / 2 * 100

    print(f"\nThe most similar player to {player_name} is {most_similar_player_name} with {percentage_similarity:.2f}% similarity.\n")

    # Gather the imputed feature values of both players for comparison
    player_stats = data_filled.loc[data_2024.index, features].iloc[player_position]
    similar_player_stats = data_filled.loc[data_not_2024.index, features].iloc[most_similar_index]

    comparison_df = pd.DataFrame({
        player_name: player_stats.values,
        most_similar_player_name: similar_player_stats.values
    }, index=features)

    print(comparison_df)



The most similar player to Joe Milton III is Carson Wentz with 90.08% similarity.

                        Joe Milton III  Carson Wentz
completions_per_game          9.302326      9.333333
pass_attempts_per_game       15.116279     14.571429
pass_yards_per_game         220.967379    121.785714
pass_td_per_game              0.860465      1.071429
int_per_game                  0.255814      0.333333
rush_attempts_per_game        4.046512      5.142857
rush_yards_per_game          15.372093     24.476190
rush_td_per_game              0.279070      0.309524
completion_percentage        61.500000     64.100000
yards_per_attempt             8.200000      8.400000
air_yards_per_attempt         8.600000      8.800000
pass_rate                   146.100000    153.900000
rush_avg                      3.800000      4.800000
height_in                    77.000000     77.000000
weight_lb                   235.000000    237.000000
games_played                 43.000000     42.000000
years_played   