In [1]:
import os
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time

# Function to extract href attribute values from anchor tags, skipping header rows
def url_extract(tds):
    results = []
    for td in tds:
        # Skip rows with specific attributes or text content indicating header rows
        if td.has_attr('aria-label') or 'header' in td.get('class', []) or 'header' in td.get('class', []):
            continue
        a_tag = td.find('a')  # Find the <a> tag within the <td> or <th>
        if a_tag:
            href_value = a_tag['href']  # Get the value of the href attribute
            results.append(href_value)
        else:
            results.append(None)
    return results

# Define headers for different types of data
combine_header = ['player', 'pos', 'college', 'stats', 'height', 'weight', 'forty', 'vertical', 'bench', 'broad', 'threecone', 'shuttle', 'drafted']

# Create directory for caching HTML files
cache_dir = 'cache'
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

# Function to read HTML content with caching
def read_html_cache(url, year):
    fn = f"{year}_combine.htm"
    fn_path = os.path.join(cache_dir, fn)
    if not os.path.exists(fn_path):
        with urlopen(url) as response:
            html = response.read().decode('utf-8')
            with open(fn_path, 'w') as f:
                f.write(html)
    with open(fn_path, 'r') as f:
        return BeautifulSoup(f.read(), 'html.parser')

# Function to extract player URLs from table cells
def player_url_extract(tds):
    results = []
    for td in tds:
        # Find 'a' tag within 'td' and extract the href attribute
        if td.find('a'):
            player_url = td.find('a')['href']
            if player_url.startswith('/players'):
                # Construct full player URL and append to results
                full_url = f"http://www.pro-football-reference.com{player_url}"
                results.append(full_url)
            else:
                results.append(None)
        else:
            results.append(None)
    return results

# Scrape combine data with player names
combine_data = []
for year in range(2000, 2025):  # Adjust the range as needed
    time.sleep(4)
    url = f'http://www.pro-football-reference.com/draft/{year}-combine.htm'
    soup = read_html_cache(url, year)
    tables = soup.find_all('table')
    college_cells = tables[0].find_all('td', {'data-stat': 'college'})  # Adjusted to target 'college' attribute
    player_cells = tables[0].find_all(['td', 'th'], {'data-stat': 'player'})  # Target both 'td' and 'th' elements
    college_urls = url_extract(college_cells)
    player_names = url_extract(player_cells)  # Extract player names using the modified function
    combine_table = pd.read_html(str(tables[0]))[0]
    combine_table.columns = combine_header
    combine_table = combine_table[combine_table['pos'] != 'Pos']

    # Add new columns for college stats and player names
    combine_table['college_stats_url'] = college_urls
    combine_table['nfl_stats'] = player_names

    # Add a new column 'Year' to the DataFrame
    combine_table['Year'] = year

    combine_data.append(combine_table)

# Concatenate all combine data DataFrames
combine_table = pd.concat(combine_data, ignore_index=True)






  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_table = pd.read_html(str(tables[0]))[0]
  combine_

In [2]:
combine_table.head()

Unnamed: 0,player,pos,college,stats,height,weight,forty,vertical,bench,broad,threecone,shuttle,drafted,college_stats_url,nfl_stats,Year
0,John Abraham,OLB,South Carolina,,6-4,252,4.55,,,,,,New York Jets / 1st / 13th pick / 2000,,/players/A/AbraJo00.htm,2000
1,Shaun Alexander,RB,Alabama,College Stats,6-0,218,4.58,,,,,,Seattle Seahawks / 1st / 19th pick / 2000,https://www.sports-reference.com/cfb/players/s...,/players/A/AlexSh00.htm,2000
2,Darnell Alford,OT,Boston Col.,,6-4,334,5.56,25.0,23.0,94.0,8.48,4.98,Kansas City Chiefs / 6th / 188th pick / 2000,,/players/A/AlfoDa20.htm,2000
3,Kyle Allamon,TE,Texas Tech,,6-2,253,4.97,29.0,,104.0,7.29,4.49,,,,2000
4,Rashard Anderson,CB,Jackson State,,6-2,206,4.55,34.0,,123.0,7.18,4.15,Carolina Panthers / 1st / 23rd pick / 2000,,/players/A/AndeRa21.htm,2000


In [3]:
import pandas as pd


# Apply conditional check and concatenate URL prefix
combine_table['nfl_stats'] = combine_table['nfl_stats'].apply(lambda x: f"https://www.pro-football-reference.com/{x}" if x else None)
combine_table.head()

Unnamed: 0,player,pos,college,stats,height,weight,forty,vertical,bench,broad,threecone,shuttle,drafted,college_stats_url,nfl_stats,Year
0,John Abraham,OLB,South Carolina,,6-4,252,4.55,,,,,,New York Jets / 1st / 13th pick / 2000,,https://www.pro-football-reference.com//player...,2000
1,Shaun Alexander,RB,Alabama,College Stats,6-0,218,4.58,,,,,,Seattle Seahawks / 1st / 19th pick / 2000,https://www.sports-reference.com/cfb/players/s...,https://www.pro-football-reference.com//player...,2000
2,Darnell Alford,OT,Boston Col.,,6-4,334,5.56,25.0,23.0,94.0,8.48,4.98,Kansas City Chiefs / 6th / 188th pick / 2000,,https://www.pro-football-reference.com//player...,2000
3,Kyle Allamon,TE,Texas Tech,,6-2,253,4.97,29.0,,104.0,7.29,4.49,,,,2000
4,Rashard Anderson,CB,Jackson State,,6-2,206,4.55,34.0,,123.0,7.18,4.15,Carolina Panthers / 1st / 23rd pick / 2000,,https://www.pro-football-reference.com//player...,2000


In [4]:

combine_table['height'].fillna('0-0', inplace=True)

# Splitting the 'height' column into 'feet' and 'inches'
combine_table[['feet', 'inches']] = combine_table['height'].str.split('-', expand=True)
combine_table['feet'] = combine_table['feet'].astype(int)
combine_table['inches'] = combine_table['inches'].astype(int)

# Converting feet to inches and adding to inches
combine_table['total_height_inches'] = combine_table['feet'] * 12 + combine_table['inches']

# Drop the 'feet' and 'inches' columns if needed
combine_table.drop(['feet', 'inches', 'height'], axis=1, inplace=True)

combine_table.head()

Unnamed: 0,player,pos,college,stats,weight,forty,vertical,bench,broad,threecone,shuttle,drafted,college_stats_url,nfl_stats,Year,total_height_inches
0,John Abraham,OLB,South Carolina,,252,4.55,,,,,,New York Jets / 1st / 13th pick / 2000,,https://www.pro-football-reference.com//player...,2000,76
1,Shaun Alexander,RB,Alabama,College Stats,218,4.58,,,,,,Seattle Seahawks / 1st / 19th pick / 2000,https://www.sports-reference.com/cfb/players/s...,https://www.pro-football-reference.com//player...,2000,72
2,Darnell Alford,OT,Boston Col.,,334,5.56,25.0,23.0,94.0,8.48,4.98,Kansas City Chiefs / 6th / 188th pick / 2000,,https://www.pro-football-reference.com//player...,2000,76
3,Kyle Allamon,TE,Texas Tech,,253,4.97,29.0,,104.0,7.29,4.49,,,,2000,74
4,Rashard Anderson,CB,Jackson State,,206,4.55,34.0,,123.0,7.18,4.15,Carolina Panthers / 1st / 23rd pick / 2000,,https://www.pro-football-reference.com//player...,2000,74


In [5]:
combine_table.columns = [f"combine_{col}" for col in combine_table.columns]

combine_table.head()

Unnamed: 0,combine_player,combine_pos,combine_college,combine_stats,combine_weight,combine_forty,combine_vertical,combine_bench,combine_broad,combine_threecone,combine_shuttle,combine_drafted,combine_college_stats_url,combine_nfl_stats,combine_Year,combine_total_height_inches
0,John Abraham,OLB,South Carolina,,252,4.55,,,,,,New York Jets / 1st / 13th pick / 2000,,https://www.pro-football-reference.com//player...,2000,76
1,Shaun Alexander,RB,Alabama,College Stats,218,4.58,,,,,,Seattle Seahawks / 1st / 19th pick / 2000,https://www.sports-reference.com/cfb/players/s...,https://www.pro-football-reference.com//player...,2000,72
2,Darnell Alford,OT,Boston Col.,,334,5.56,25.0,23.0,94.0,8.48,4.98,Kansas City Chiefs / 6th / 188th pick / 2000,,https://www.pro-football-reference.com//player...,2000,76
3,Kyle Allamon,TE,Texas Tech,,253,4.97,29.0,,104.0,7.29,4.49,,,,2000,74
4,Rashard Anderson,CB,Jackson State,,206,4.55,34.0,,123.0,7.18,4.15,Carolina Panthers / 1st / 23rd pick / 2000,,https://www.pro-football-reference.com//player...,2000,74


In [7]:
# Drop the 'combine_stats' column
combine_table.drop('combine_stats', axis=1, inplace=True)

combine_table.head()


Unnamed: 0,combine_player,combine_pos,combine_college,combine_weight,combine_forty,combine_vertical,combine_bench,combine_broad,combine_threecone,combine_shuttle,combine_drafted,combine_college_stats_url,combine_nfl_stats,combine_Year,combine_total_height_inches
0,John Abraham,OLB,South Carolina,252,4.55,,,,,,New York Jets / 1st / 13th pick / 2000,,https://www.pro-football-reference.com//player...,2000,76
1,Shaun Alexander,RB,Alabama,218,4.58,,,,,,Seattle Seahawks / 1st / 19th pick / 2000,https://www.sports-reference.com/cfb/players/s...,https://www.pro-football-reference.com//player...,2000,72
2,Darnell Alford,OT,Boston Col.,334,5.56,25.0,23.0,94.0,8.48,4.98,Kansas City Chiefs / 6th / 188th pick / 2000,,https://www.pro-football-reference.com//player...,2000,76
3,Kyle Allamon,TE,Texas Tech,253,4.97,29.0,,104.0,7.29,4.49,,,,2000,74
4,Rashard Anderson,CB,Jackson State,206,4.55,34.0,,123.0,7.18,4.15,Carolina Panthers / 1st / 23rd pick / 2000,,https://www.pro-football-reference.com//player...,2000,74


In [8]:
# Specify the directory where you want to save the CSV file
data_directory = r"C:\Users\benfi\Defensive_Players_NFL\data"

# Define the path to save the CSV file
csv_path = os.path.join(data_directory, "combine_table.csv")

# Save the DataFrame as a CSV file
combine_table.to_csv(csv_path, index=False)

print("CSV file saved successfully!")

CSV file saved successfully!
