In [61]:
import pandas as pd
import numpy as np
import re
import time


# import math

In [9]:
# Lift the row cap on DataFrame display so whole frames render without truncation.
pd.options.display.max_rows = None

In [58]:
def clean_year_df(df):
    """Tidy one scraped combine table and split out its draft details.

    Rows whose 'Player' value is the literal string 'Player' are removed
    (presumably header rows repeated inside the scraped table). The combined
    'Drafted (tm/rnd/yr)' text is split on ' / ' into four new columns, and
    the 'College' and 'Drafted (tm/rnd/yr)' columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Player', 'College' and 'Drafted (tm/rnd/yr)' columns.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by 'draft_team',
        'draft_round', 'pick_num' and 'draft_year'. Rows with no draft text
        hold NaN in all four columns; a field that is missing from the text
        or contains no digits is NaN as well.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    drafted_col = 'Drafted (tm/rnd/yr)'
    n_fields = 4  # team, round, pick, year

    def first_int(text):
        """Return the first run of digits in `text` as an int, or NaN if none."""
        if pd.isnull(text):
            return np.nan
        # Match the digits alone. Requiring a trailing word character (as in
        # '(\d+)\w+') makes a bare '12' backtrack to 1 and a bare '7' fail to match.
        found = re.search(r'\d+', str(text))
        return int(found.group()) if found else np.nan

    def split_fields(raw):
        """Split one draft string into exactly `n_fields` entries, NaN-padded."""
        if pd.isnull(raw):
            return [np.nan] * n_fields
        parts = str(raw).split(' / ')
        return (parts + [np.nan] * n_fields)[:n_fields]

    df = df.loc[df['Player'] != 'Player'].reset_index(drop = True)

    # Collect one 4-item row per player instead of zip-unpacking, so an empty
    # frame (zero rows) still produces four empty columns rather than an error.
    fields = [split_fields(raw) for raw in df[drafted_col]]

    draft_info = pd.DataFrame({
        'draft_team': [row[0] for row in fields],
        'draft_round': [first_int(row[1]) for row in fields],  # e.g. '5th' -> 5
        'pick_num': [first_int(row[2]) for row in fields],     # e.g. '143rd pick' -> 143
        'draft_year': [first_int(row[3]) for row in fields],
    }, index = df.index)

    trimmed = df.drop(columns = ['College', drafted_col])

    return pd.concat([trimmed, draft_info], axis = 1)
    

In [67]:
def scrape_years(start_year, end_year, delay = 10):
    """Download and clean the combine table for each year in a range.

    For every year from `start_year` through `end_year` (inclusive) the page
    https://www.pro-football-reference.com/draft/{year}-combine.htm is read
    with `pd.read_html`, its first table is passed through `clean_year_df`,
    and the per-year frames are stacked into one frame with a fresh index.

    Parameters
    ----------
    start_year : int
        First year to fetch.
    end_year : int
        Last year to fetch (inclusive).
    delay : int or float, default 10
        Seconds to wait between consecutive requests. No wait happens after
        the final year. A falsy value (0 or None) disables the wait.

    Returns
    -------
    pandas.DataFrame
        All years concatenated row-wise, index reset to 0..n-1.

    Raises
    ------
    ValueError
        If `start_year` is greater than `end_year` (nothing to fetch).
    """
    if start_year > end_year:
        # Fail early with a clear message instead of pandas' generic
        # "No objects to concatenate" error from an empty list.
        raise ValueError(
            f'start_year ({start_year}) must not be greater than end_year ({end_year})'
        )

    df_list = []

    for year in range(start_year, end_year + 1):
        url = f'https://www.pro-football-reference.com/draft/{year}-combine.htm'

        # read_html returns a list of every table found; only the first is used.
        df_list.append(clean_year_df(pd.read_html(url)[0]))

        # Report as soon as the year is done, not after the pause.
        print(f'finished {year}')

        # Pause only between requests; waiting after the last one is wasted time.
        if delay and year < end_year:
            time.sleep(delay)

    return pd.concat(df_list, axis = 0).reset_index(drop = True)




In [68]:
# Fetch every combine class from 2000 through 2023 (both ends inclusive).
# One web request is made per year, with a pause between them, so this cell is slow.
full_combine_dat = scrape_years(start_year = 2000, end_year = 2023)

finished 2000
finished 2001
finished 2002
finished 2003
finished 2004
finished 2005
finished 2006
finished 2007
finished 2008
finished 2009
finished 2010
finished 2011
finished 2012
finished 2013
finished 2014
finished 2015
finished 2016
finished 2017
finished 2018
finished 2019
finished 2020
finished 2021
finished 2022
finished 2023


In [72]:
# Persist the scraped data so later runs can load the CSV instead of re-scraping.
# `index=False` is the documented way to omit the row index (the parameter is a bool).
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
full_combine_dat.to_csv('/Users/aidancook/Documents/bdb2024/data/agg_data/full_combine_dat.csv', index=False)

In [73]:
# full_combine_dat.loc[full_combine_dat['draft_year'] == 2023]

Unnamed: 0,Player,Pos,School,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,draft_team,draft_round,pick_num,draft_year
7680,Israel Abanikanda,RB,Pittsburgh,5-10,216,,,,,,,New York Jets,5.0,143.0,2023.0
7681,Yasir Abdullah,LB,Louisville,6-1,237,4.47,36.5,,129.0,,,Jacksonville Jaguars,5.0,136.0,2023.0
7682,Devon Achane,RB,Texas A&M,5-9,188,4.32,33.0,,,,,Miami Dolphins,3.0,84.0,2023.0
7683,Jordan Addison,WR,USC,5-11,173,4.49,34.0,,122.0,,,Minnesota Vikings,1.0,23.0,2023.0
7684,Adetomiwa Adebawore,DE,Northwestern,6-2,282,4.49,37.5,27.0,125.0,,,Indianapolis Colts,4.0,110.0,2023.0
7686,Davis Allen,TE,Clemson,6-6,245,,38.5,,125.0,,,Los Angeles Rams,5.0,175.0,2023.0
7688,Jake Andrews,C,Troy,6-3,305,5.15,26.0,,102.0,,4.73,New England Patriots,4.0,107.0,2023.0
7689,Felix Anudike-Uzomah,EDGE,Kansas St.,6-3,255,,,,,,,Kansas City Chiefs,1.0,31.0,2023.0
7690,Malaesala Aumavae-Laulu,OT,Oregon,6-6,317,5.23,28.5,,106.0,7.65,4.75,Baltimore Ravens,6.0,199.0,2023.0
7691,Alex Austin,CB,Oregon St.,6-1,195,4.55,33.0,,122.0,,4.33,Buffalo Bills,7.0,252.0,2023.0
