In [None]:
import os
import time
from io import StringIO
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

# Collect the link target of each table cell (None for cells without a link)
def url_extract(tds):
    """Return one ``href`` per cell, in the same order as ``tds``.

    Each cell is inspected through its ``.a`` attribute: when that is truthy
    its ``'href'`` entry is taken, otherwise ``None`` is used as a placeholder
    so the result keeps the same length as the input.
    """
    return [cell.a['href'] if cell.a else None for cell in tds]

# Column names assigned to each parsed draft table, in table column order
draft_header = [
    # pick and player identity
    'round', 'pick', 'team', 'player', 'pos', 'age', 'to',
    # career summary columns
    'ap1', 'pb', 'st', 'carav', 'drav', 'games',
    # passing
    'pass_cmp', 'pass_att', 'pass_yds', 'pass_tds', 'pass_ints',
    # rushing
    'rush_att', 'rush_yds', 'rush_tds',
    # receiving
    'receptions', 'rec_yds', 'rec_tds',
    # defense
    'tackles', 'ints', 'sacks',
    # college columns
    'college', 'stats',
]

# Function to parse tables from Pro-Football-Reference
def parse_pfr_tables(tables, headers=None):
    """Parse every recognised table and stack the results into one DataFrame.

    Parameters
    ----------
    tables : iterable
        Table elements. Each must support ``.get('id')`` and render to its
        HTML through ``str()``.
    headers : mapping, optional
        Maps a table ``id`` to the list of column names for that table.
        Tables whose id is not a key are skipped. When omitted, a
        module-level ``headers`` mapping is used if one has been defined.

    Returns
    -------
    pandas.DataFrame
        The parsed tables concatenated in input order, each with its first
        parsed row dropped and its columns renamed. An empty DataFrame is
        returned when no table matched.

    Raises
    ------
    NameError
        If ``headers`` is not passed and no module-level ``headers`` exists.
    """
    if headers is None:
        # Keep supporting a notebook-level `headers` dict, but fail with an
        # actionable message instead of a bare NameError when it is missing.
        headers = globals().get('headers')
        if headers is None:
            raise NameError(
                "parse_pfr_tables needs a `headers` mapping: pass headers=... "
                "or define a module-level `headers` dict"
            )

    frames = []
    for tbl in tables:
        id_ = tbl.get('id')
        if id_ not in headers:
            continue
        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated in recent pandas releases.
        df = pd.read_html(StringIO(str(tbl)))[0].iloc[1:]
        df.columns = headers[id_]
        frames.append(df)

    # pd.concat raises on an empty list; return an empty frame instead.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

# Create directory for caching HTML files
cache_dir = 'cache'
# exist_ok=True makes this idempotent without a separate exists() check (no
# window between checking and creating), and raises right away if 'cache'
# already exists as a regular file instead of failing later on the first write.
os.makedirs(cache_dir, exist_ok=True)

def read_html_cache(url, year):
    """Return the page at ``url`` parsed with BeautifulSoup, using a disk cache.

    The cache file lives in ``cache_dir`` and is named
    ``"<year>_<last path segment of url>"``. The page is downloaded only when
    that file does not exist yet; the soup is always built from the file.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    year : int or str
        Prefix used to build the cache file name.

    Returns
    -------
    BeautifulSoup
        The cached HTML parsed with the ``'html.parser'`` backend.

    Notes
    -----
    Cache files are written and read as UTF-8. Files cached earlier on a
    platform whose default text encoding is not UTF-8 may need to be deleted
    so they are downloaded again.
    """
    fn = f"{year}_{url.split('/')[-1]}"
    fn_path = os.path.join(cache_dir, fn)
    if not os.path.exists(fn_path):
        with urlopen(url) as response:
            html = response.read().decode('utf-8')
        # Write to a temporary sibling and rename it into place, so an
        # interrupted write cannot leave a truncated file that later runs
        # would treat as a valid cache entry.
        tmp_path = f"{fn_path}.tmp"
        # The page was decoded as UTF-8, so store it as UTF-8 rather than the
        # platform default encoding.
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(html)
        os.replace(tmp_path, fn_path)
    with open(fn_path, 'r', encoding='utf-8') as f:
        return BeautifulSoup(f.read(), 'html.parser')

# Build absolute player-page URLs from table cells (None where not applicable)
def player_url_extract(tds):
    """Return one absolute player URL (or ``None``) per cell, in input order.

    For each cell the result of ``find('a')`` is examined. A cell yields a URL
    only when that link exists and its ``href`` starts with ``/players``; the
    site prefix is then prepended. Every other cell yields ``None`` so the
    result keeps the same length as ``tds``.
    """
    links = []
    for cell in tds:
        anchor = cell.find('a')
        if not anchor:
            # No link in this cell
            links.append(None)
            continue
        href = anchor['href']
        if href.startswith('/players'):
            links.append(f"http://www.pro-football-reference.com{href}")
        else:
            # Link points somewhere other than a player page
            links.append(None)
    return links

# Define years to process
years_to_process = range(2000, 2024)

# Scrape draft data: one DataFrame per year, combined after the loop
draft_data = []
for year in years_to_process:
    url = f'http://www.pro-football-reference.com/years/{year}/draft.htm'

    # Throttle only when the page will actually be downloaded. The path below
    # mirrors the "<year>_<last URL segment>" name used by read_html_cache; if
    # the two ever diverge this simply falls back to sleeping on every year.
    cached_page = os.path.join(cache_dir, f"{year}_{url.split('/')[-1]}")
    if not os.path.exists(cached_page):
        time.sleep(4)

    soup = read_html_cache(url, year)
    tables = soup.find_all('table')
    if not tables:
        raise ValueError(f"No <table> found in the {year} draft page ({url})")
    first_table = tables[0]

    urls = url_extract(first_table.find_all('td', {'data-stat': 'college_link'}))
    player_urls = player_url_extract(first_table.find_all('td', {'data-stat': 'player'}))

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    year_table = pd.read_html(StringIO(str(first_table)))[0]
    year_table.columns = draft_header

    # Drop rows whose 'pos' value is the literal 'Pos' (presumably header rows
    # repeated inside the table body -- TODO confirm). .copy() makes the result
    # an independent frame so the column assignments below do not trigger
    # SettingWithCopyWarning.
    year_table = year_table[year_table['pos'] != 'Pos'].copy()

    # The link lists must have exactly one entry per remaining row; pandas
    # raises ValueError on a length mismatch.
    year_table['url'] = urls
    year_table['player_url'] = player_urls
    # Add a new column 'Year' to the DataFrame (scalar is broadcast to all rows)
    year_table['Year'] = year
    draft_data.append(year_table)

# Concatenate all draft data DataFrames
draft_table = pd.concat(draft_data, ignore_index=True)




In [None]:
# Display the combined draft table built by the scraping cell
draft_table

Unnamed: 0,round,pick,team,player,pos,age,to,ap1,pb,st,...,rec_yds,rec_tds,tackles,ints,sacks,college,stats,url,player_url,Year
0,1,1,CLE,Courtney Brown,DE,22,2005,0,0,4,...,0,0,156,,19.0,Penn St.,College Stats,http://www.sports-reference.com/cfb/players/co...,http://www.pro-football-reference.com/players/...,2000
1,1,2,WAS,LaVar Arrington,LB,22,2006,0,3,5,...,0,0,338,3,23.5,Penn St.,College Stats,http://www.sports-reference.com/cfb/players/la...,http://www.pro-football-reference.com/players/...,2000
2,1,3,WAS,Chris Samuels,T,23,2009,0,6,9,...,0,0,,,,Alabama,College Stats,http://www.sports-reference.com/cfb/players/ch...,http://www.pro-football-reference.com/players/...,2000
3,1,4,CIN,Peter Warrick,WR,23,2005,0,0,4,...,2991,18,3,,,Florida St.,College Stats,http://www.sports-reference.com/cfb/players/pe...,http://www.pro-football-reference.com/players/...,2000
4,1,5,BAL,Jamal Lewis,RB,21,2009,1,1,9,...,1879,4,,,,Tennessee,College Stats,http://www.sports-reference.com/cfb/players/ja...,http://www.pro-football-reference.com/players/...,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6125,7,255,SFO,Jalen Graham,S,22,2023,0,0,0,...,0,0,1,,,Purdue,College Stats,http://www.sports-reference.com/cfb/players/ja...,http://www.pro-football-reference.com/players/...,2023
6126,7,256,GNB,Grant Dubose,WR,22,,0,0,0,...,,,,,,Charlotte,College Stats,http://www.sports-reference.com/cfb/players/gr...,http://www.pro-football-reference.com/players/...,2023
6127,7,257,DEN,Alex Forsyth,OL,24,,0,0,0,...,,,,,,Oregon,College Stats,http://www.sports-reference.com/cfb/players/al...,http://www.pro-football-reference.com/players/...,2023
6128,7,258,CHI,Kendall Williamson,S,23,,0,0,0,...,,,,,,Stanford,College Stats,http://www.sports-reference.com/cfb/players/ke...,http://www.pro-football-reference.com/players/...,2023


In [None]:

# Write the combined draft table to disk, leaving out the DataFrame index
output_csv = 'draft_data.csv'
draft_table.to_csv(output_csv, index=False)
