## Is a high school recruiting rating a good indicator of a high-performing athlete?


In [9]:
# Uncomment to get the correct versions
# from IPython.display import clear_output
# !pip install -r requirements.txt
# clear_output() 


In [10]:
import numpy as np
import pandas as pd
import altair as alt
import cfbd
import warnings 

# Notebook-wide settings: silence every warning and let pandas show all
# columns of a frame instead of truncating wide output.
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None)


In [11]:
# Load the two input tables. Paths are relative to the notebook's directory.
# `recruits` is the left side of the later name-based merge; `draft` is the right.
recruits = pd.read_csv(filepath_or_buffer='../data/recruits.csv')
draft = pd.read_csv(filepath_or_buffer='../data/draft.csv')

In [12]:
# Recruits are joined to draft records purely on the player's name, so two
# different players who share a name produce extra rows.
# About 2k of the 39k records are duplicated this way.
merged_df = recruits.merge(
    draft,
    how='left',          # keep every recruit, matched to a draft row or not
    left_on='name',
    right_on='Player',
)

# NOTE: this list is not used in this cell; `col` is reassigned (with 'RN'
# added) before it is first used to select columns.
col = [
    'name',
    'rating',
    'stars',
    'committed_to',
    'athlete_id',
    'Rnd',
    'Pick',
    'Player',
    'draft_year',
    'College/Univ',
    'CommonSequence',
]

In [13]:
# Function to check if there are at least 4 sequential characters in common
# If the name of the school they committed to matches the name they were drafted from, then return a 1:
def has_common_sequence(str1, str2, min_seq_length=4):
    """Return 1 if two school names share a run of characters, else 0.

    A match means some substring of ``str1`` that is ``min_seq_length``
    characters long also occurs in ``str2``. The comparison is
    case-sensitive.

    Parameters
    ----------
    str1, str2 : str or missing value
        The two names to compare. If either is missing (``pd.isna``), the
        result is 0.
    min_seq_length : int, default 4
        Length of the shared character run required for a match.

    Returns
    -------
    int
        1 when the names are considered to match, otherwise 0.
    """
    if pd.isna(str1) or pd.isna(str2):
        return 0

    # A name shorter than the window (e.g. "LSU" with the default of 4) has
    # no window-sized substrings, so the scan below could never match it,
    # not even against itself. When both names are that short, fall back to
    # whole-name equality. Empty strings are never treated as a match.
    if min(len(str1), len(str2)) < min_seq_length:
        return int(bool(str1) and str1 == str2)

    # Slide a window of min_seq_length characters across str1 and look for
    # each window anywhere inside str2.
    for start in range(len(str1) - min_seq_length + 1):
        if str1[start:start + min_seq_length] in str2:
            return 1
    return 0

# Flag each merged row: 1 when the school the recruit committed to and the
# school listed on the draft record share a character run, else 0.
school_match = merged_df.apply(
    lambda rec: has_common_sequence(rec['committed_to'], rec['College/Univ']),
    axis=1,
)
merged_df['CommonSequence'] = school_match

# Number the rows within each (name, rating) group, starting at 1, with rows
# whose schools match ordered first. The result aligns back to merged_df by
# index, so the frame itself keeps its original row order.
rows_by_match = merged_df.sort_values(
    by=['name', 'CommonSequence'],
    ascending=[True, False],
)
merged_df['RN'] = rows_by_match.groupby(['name', 'rating']).cumcount() + 1

# Keep only the columns used by the rest of the analysis.
col = [
    'name',
    'rating',
    'stars',
    'committed_to',
    'athlete_id',
    'Rnd',
    'Pick',
    'Player',
    'draft_year',
    'College/Univ',
    'CommonSequence',
    'RN',
]
merged_df = merged_df[col]

In [14]:
# Remove duplicates: keep only the first-ranked row (RN == 1) of each group.
# .copy() makes the filtered result an independent frame, so the column
# assignment below is not a write onto a slice of the unfiltered frame
# (which would raise SettingWithCopyWarning if warnings were not silenced).
merged_df = merged_df[merged_df['RN'] == 1].copy()

# A row with no draft round ('Rnd' is missing) is treated as not drafted.
merged_df['is_drafted'] = np.where(merged_df['Rnd'].isna(), 0.0, 1.0)

In [15]:
# Bucket recruits by their rating rounded to three decimals.
merged_df['rating_round'] = merged_df['rating'].round(decimals=3)

# Presentation names for the aggregated columns.
rename_dict = {
    'stars': 'HS Recruiting Stars',
    'rating_round': 'HS Recruiting Rating',
    'is_drafted': '% of Players Drafted',
    'name': 'count',
}

# For each (stars, rounded rating) bucket: the mean of is_drafted (the share
# of rows flagged as drafted) and the number of non-null names in the bucket.
draft_likelihood = (
    merged_df
    .groupby(['stars', 'rating_round'])
    .agg({'is_drafted': 'mean', 'name': 'count'})
    .reset_index()
    .rename(columns=rename_dict)
)

In [16]:
# Scatter plot with one point per (stars, rounded rating) bucket: x is the
# rounded rating, y is the share of the bucket flagged as drafted, colour is
# the star level. The y values are fractions in [0, 1], so the axis is
# formatted as a percentage to agree with its "%" title.
alt.Chart(draft_likelihood).mark_circle(size=60).encode(
    x=alt.X(
        'HS Recruiting Rating',
        scale=alt.Scale(domain=[.6, 1]),
        title='Recruiting Ranking (rounded to 3 digits)',
    ),
    y=alt.Y(
        '% of Players Drafted',
        title='% of Players Who Get Drafted',
        scale=alt.Scale(domain=[0, 1]),
        axis=alt.Axis(format='%'),
    ),
    color='HS Recruiting Stars:N',
).properties(
    width=600,
    height=400,
    title='High School Recruit Ranking vs Draft Likelihood',
)