## Is the high school recruiting rating a good indicator of a high-performing athlete?


In [1]:
# Uncomment to install the package versions listed in requirements.txt
# from IPython.display import clear_output
# !pip install -r requirements.txt
# clear_output() 


In [86]:
import numpy as np
import pandas as pd
import altair as alt
import cfbd
import warnings 

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None
# Silence all Python warnings for the rest of the notebook session.
warnings.simplefilter('ignore')


In [87]:
# Load the two raw inputs: the draft table and the recruiting table.
draft, recruits = (
    pd.read_csv(csv_path)
    for csv_path in ('../../data/draft.csv', '../../data/recruits.csv')
)

In [88]:
# Left-join every recruit to the draft table on the player's name.
# Names are not unique, so a recruit can match several draft rows (and vice
# versa), which produces duplicate rows. The original author's note estimates
# that about 2k of the 39k records are duplicated this way.
merged_df = recruits.merge(
    draft,
    how='left',
    left_on='name',
    right_on='Player',
)
# col = ['name', 'rating', 'stars', 'committed_to', 'athlete_id', 'Rnd', 'Pick', 'Player', 'draft_year', 'College/Univ', 'CommonSequence']

In [89]:
# Columns kept from the merged frame: recruit attributes first, then the
# draft fields brought in by the join.
col = [
    'name', 'rating', 'ranking', 'recruit_type', 'year', 'position',
    'height', 'weight', 'latitude', 'longitude', 'state_province',
    'stars', 'committed_to', 'athlete_id',
    'Rnd', 'Pick', 'Player', 'draft_year', 'College/Univ',
]
merged_df = merged_df[col]

# Preview the first rows of the trimmed frame.
merged_df.head(5)

Unnamed: 0,name,rating,ranking,recruit_type,year,position,height,weight,latitude,longitude,state_province,stars,committed_to,athlete_id,Rnd,Pick,Player,draft_year,College/Univ
0,Trenton Thompson,0.9992,1.0,HighSchool,2015,DT,74.0,313.0,31.578206,-84.155681,GA,5,Georgia,3915192.0,,,,,
1,Trent Thompson,0.9991,1.0,HighSchool,2015,DT,74.5,313.0,31.578206,-84.155681,GA,5,Georgia,,,,,,
2,Martez Ivey,0.999,2.0,HighSchool,2015,OT,77.5,275.0,28.677968,-81.511521,FL,5,Florida,-1009710.0,,,,,
3,Byron Cowart,0.9987,3.0,HighSchool,2015,SDE,76.0,250.0,27.998541,-82.274884,FL,5,Auburn,3916922.0,5.0,159.0,Byron Cowart,2019.0,Maryland
4,Iman Marshall,0.9985,4.0,HighSchool,2015,CB,73.0,190.0,33.769016,-118.191605,CA,5,USC,3912545.0,4.0,127.0,Iman Marshall,2019.0,USC


In [90]:
# (rows, columns) of the merged recruit/draft frame after the column selection above the de-duplication step.
merged_df.shape

(31886, 19)

In [91]:
# Function to check if there are at least 4 sequential characters in common
# If the name of the school they committed to matches the name they were drafted from, then return a 1:
def has_common_sequence(str1, str2, min_seq_length=4):
    """
    Return 1 if ``str1`` and ``str2`` share a run of consecutive characters, else 0.

    A match means that some substring of ``str1`` that is exactly
    ``min_seq_length`` characters long also occurs in ``str2``. When ``str1``
    is shorter than ``min_seq_length`` (e.g. "USC", "LSU"), no such substring
    exists, so the function falls back to an exact comparison: it returns 1
    only when ``str1`` is non-empty and equal to ``str2``.

    The comparison is case-sensitive.

    Parameters
    ----------
    str1 : str or missing value
        First string (in this notebook: the school the recruit committed to).
    str2 : str or missing value
        Second string (in this notebook: the college the player was drafted from).
    min_seq_length : int, default 4
        Length of the shared character run required for a match.

    Returns
    -------
    int
        1 when a match is found, 0 otherwise. Also 0 when either argument
        is missing according to ``pd.isna``.

    Notes
    -----
    Why this exists: joining recruit data to draft data on the player's name
    creates a many-to-many relationship, because a few recruits appear twice
    in the dataset (ex - Ron Smith) and different football players sometimes
    share a name (ex - David Long). The flag returned here is used by the
    notebook to decide which of the duplicated rows to keep, on the assumption
    that no two players at one school have the same name.

    More information can be found in the milestone I project on slide 5:
    https://docs.google.com/presentation/d/1_CfHYqeOniPscvbb8VfQqQUgyf4xCSeC1spL_9M0ejw/edit#slide=id.g2b8248144f9_0_127
    """
    # A missing school on either side can never count as a match.
    if pd.isna(str1) or pd.isna(str2):
        return 0

    # Too short to contain a full-length window: require an exact match instead
    # of silently returning 0 for identical short names such as "USC" / "USC".
    if len(str1) < min_seq_length:
        return int(len(str1) > 0 and str1 == str2)

    # Slide a fixed-size window across str1 and look for it inside str2.
    windows = (
        str1[start:start + min_seq_length]
        for start in range(len(str1) - min_seq_length + 1)
    )
    return int(any(window in str2 for window in windows))

# Flag every row: 1 when the committed school and the drafting college share
# a common character sequence, otherwise 0.
merged_df['CommonSequence'] = merged_df.apply(
    lambda rec: has_common_sequence(rec['committed_to'], rec['College/Univ']),
    axis=1,
)

# Number the rows inside each (name, rating) group, starting at 1. Rows are
# ordered first so that, for the same name and committed school, a row with
# CommonSequence == 1 is numbered before a row with CommonSequence == 0.
merged_df['RN'] = (
    merged_df
    .sort_values(
        by=['name', 'committed_to', 'CommonSequence'],
        ascending=[True, True, False],
    )
    .groupby(['name', 'rating'])
    .cumcount()
    .add(1)
)

#col = ['name', 'rating', 'stars', 'committed_to', 'athlete_id', 'Rnd', 'Pick', 'Player', 'draft_year', 'College/Univ', 'CommonSequence', 'RN']
#merged_df = merged_df[col]

In [92]:
# Examples of de-duplication logic in action

# merged_df[merged_df['name'] == 'Ron Smith']
# merged_df[merged_df['name'] == 'David Long']

In [93]:
# Remove duplicates: keep only the first-ranked row of each duplicate group.
# The explicit .copy() makes the result an independent frame, so the column
# assignment below is not a chained assignment on a slice of the old frame.
merged_df = merged_df.loc[merged_df['RN'] == 1].copy()

# Target label: 1.0 when the row carries a draft round, 0.0 when it does not.
merged_df['is_drafted'] = np.where(merged_df['Rnd'].isna(), 0.0, 1.0)

In [94]:
# Load the school table and keep only the join key (school) plus the
# conference and the school's coordinates.
df_teams = pd.read_csv('../../data/teams.csv')[
    ['school', 'conference', 'latitude_school', 'longitude_school']
]
df_teams.head(5)

Unnamed: 0,school,conference,latitude_school,longitude_school
0,Air Force,Mountain West,38.99697,-104.843616
1,Akron,Mid-American,41.072553,-81.508341
2,Alabama,SEC,33.208275,-87.550384
3,Appalachian State,Sun Belt,36.211427,-81.685428
4,Arizona,Big 12,32.228805,-110.948868


In [95]:
# Attach the school features to the recruit-draft rows.
# This is an inner join: rows whose committed_to value has no matching
# school in df_teams are dropped.
merged_df = merged_df.merge(
    df_teams,
    how='inner',
    left_on='committed_to',
    right_on='school',
)

In [98]:
# Mean of the is_drafted flag (i.e. the drafted share) for each recruit type.
merged_df.groupby('recruit_type')[['is_drafted']].mean()

Unnamed: 0_level_0,is_drafted
recruit_type,Unnamed: 1_level_1
HighSchool,0.05223


In [99]:
# Drop unneeded columns:
# Columns removed before export: the player name and id, the raw draft fields
# (already summarised by is_drafted), the de-duplication helpers, and the
# team-side join key.
drop_cols = [
    'name', 'recruit_type', 'athlete_id',
    'Rnd', 'Pick', 'Player', 'draft_year', 'College/Univ',
    'CommonSequence', 'RN',
    'school',
]

merged_df = merged_df.drop(drop_cols, axis=1)

# Spot-check five random rows of the final frame.
merged_df.sample(5)

Unnamed: 0,rating,ranking,year,position,height,weight,latitude,longitude,state_province,stars,committed_to,is_drafted,conference,latitude_school,longitude_school
8667,0.8611,1092.0,2023,S,71.0,170.0,32.929966,-97.227125,TX,3,Washington State,0.0,Pac-12,46.731831,-117.160499
217,0.9478,110.0,2023,OT,79.0,290.0,33.567056,-84.581042,GA,4,Georgia,0.0,SEC,33.94982,-83.373381
10713,0.7894,2167.0,2022,IOL,77.0,275.0,33.563521,-101.879336,TX,2,Tulsa,0.0,American Athletic,36.148918,-95.943785
22562,0.7823,2621.0,2017,TE,78.0,230.0,29.4246,-98.495141,TX,2,New Mexico State,0.0,Conference USA,32.27962,-106.741115
1489,0.854,732.0,2017,WR,74.0,185.0,26.137844,-81.753998,FL,3,Texas,0.0,SEC,30.283681,-97.732534


In [103]:
# Write the final dataset without the index column, then report its shape.
merged_df.to_csv('../../data/M2_final.csv', index=False)
print(f'merged_df: {merged_df.shape}')

merged_df: (22784, 15)


In [101]:
# Display five randomly chosen rows of the exported frame (unseeded, so the
# rows differ from run to run).
merged_df.sample(n=5)

Unnamed: 0,rating,ranking,year,position,height,weight,latitude,longitude,state_province,stars,committed_to,is_drafted,conference,latitude_school,longitude_school
3055,0.968,67.0,2023,DL,78.0,255.0,38.952944,-76.940865,MD,4,Ohio State,0.0,Big Ten,40.001645,-83.019727
22776,0.8539,1302.0,2023,DL,75.0,275.0,33.706205,-86.670079,AL,3,UAB,0.0,American Athletic,33.520682,-86.802433
1248,0.9374,139.0,2016,OG,76.0,347.0,30.438804,-90.441476,LA,4,LSU,0.0,SEC,30.412035,-91.183816
3846,0.8919,321.0,2017,CB,72.0,180.0,33.61011,-84.287978,GA,4,North Carolina,0.0,ACC,35.906929,-79.047889
5017,0.8986,314.0,2019,SDE,76.5,240.0,40.982875,-74.96045,NJ,4,Michigan,1.0,Big Ten,42.265836,-83.748696
