In [1]:
import os
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time

# Function to extract href attribute values from anchor tags, skipping header rows
def url_extract(tds):
    results = []
    for td in tds:
        # Skip rows with specific attributes or text content indicating header rows
        if td.has_attr('aria-label') or 'header' in td.get('class', []) or 'header' in td.get('class', []):
            continue
        a_tag = td.find('a')  # Find the <a> tag within the <td> or <th>
        if a_tag:
            href_value = a_tag['href']  # Get the value of the href attribute
            results.append(href_value)
        else:
            results.append(None)
    return results

# Define headers for different types of data
combine_header = ['player', 'pos', 'college', 'stats', 'height', 'weight', 'forty', 'vertical', 'bench', 'broad', 'threecone', 'shuttle', 'drafted']

# Create directory for caching HTML files
cache_dir = 'cache'
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

# Function to read HTML content with caching
def read_html_cache(url, year):
    fn = f"{year}_combine.htm"
    fn_path = os.path.join(cache_dir, fn)
    if not os.path.exists(fn_path):
        with urlopen(url) as response:
            html = response.read().decode('utf-8')
            with open(fn_path, 'w') as f:
                f.write(html)
    with open(fn_path, 'r') as f:
        return BeautifulSoup(f.read(), 'html.parser')

# Function to extract player URLs from table cells
def player_url_extract(tds):
    results = []
    for td in tds:
        # Find 'a' tag within 'td' and extract the href attribute
        if td.find('a'):
            player_url = td.find('a')['href']
            if player_url.startswith('/players'):
                # Construct full player URL and append to results
                full_url = f"http://www.pro-football-reference.com{player_url}"
                results.append(full_url)
            else:
                results.append(None)
        else:
            results.append(None)
    return results

# Scrape combine data with player names
combine_data = []
for year in range(2024, 2025):  # Adjust the range as needed
    time.sleep(4)
    url = f'http://www.pro-football-reference.com/draft/{year}-combine.htm'
    soup = read_html_cache(url, year)
    tables = soup.find_all('table')
    college_cells = tables[0].find_all('td', {'data-stat': 'college'})  # Adjusted to target 'college' attribute
    player_cells = tables[0].find_all(['td', 'th'], {'data-stat': 'player'})  # Target both 'td' and 'th' elements
    college_urls = url_extract(college_cells)
    player_names = url_extract(player_cells)  # Extract player names using the modified function
    combine_table = pd.read_html(str(tables[0]))[0]
    combine_table.columns = combine_header
    combine_table = combine_table[combine_table['pos'] != 'Pos']

    # Add new columns for college stats and player names
    combine_table['college_stats_url'] = college_urls
    combine_table['nfl_stats'] = player_names

    # Add a new column 'Year' to the DataFrame
    combine_table['Year'] = year

    combine_data.append(combine_table)

# Concatenate all combine data DataFrames
combine_table = pd.concat(combine_data, ignore_index=True)






A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combine_table['college_stats_url'] = college_urls
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combine_table['nfl_stats'] = player_names
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combine_table['Year'] = year


In [2]:
import pandas as pd


# Apply conditional check and concatenate URL prefix
combine_table['nfl_stats'] = combine_table['nfl_stats'].apply(lambda x: f"https://www.pro-football-reference.com/{x}" if x else None)


In [3]:
combine_table


Unnamed: 0,player,pos,college,stats,height,weight,forty,vertical,bench,broad,threecone,shuttle,drafted,college_stats_url,nfl_stats,Year
0,Kris Abrams-Draine,CB,Missouri,College Stats,5-11,179,4.44,33.5,,,,,,https://www.sports-reference.com/cfb/players/k...,,2024
1,Isaiah Adams,G,Illinois,College Stats,6-4,315,5.22,24.5,,102,7.77,4.73,,https://www.sports-reference.com/cfb/players/i...,,2024
2,Rasheen Ali,RB,Marshall,College Stats,5-11,206,,,,,,,,https://www.sports-reference.com/cfb/players/r...,,2024
3,Erick All,TE,Iowa,College Stats,6-4,252,,,,,,,,https://www.sports-reference.com/cfb/players/e...,,2024
4,Braelon Allen,RB,Wisconsin,College Stats,6-1,235,,32.0,26,117,,,,https://www.sports-reference.com/cfb/players/b...,,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,Roman Wilson,WR,Michigan,College Stats,5-11,185,4.39,,12,,,,,https://www.sports-reference.com/cfb/players/r...,,2024
317,Mekhi Wingo,DT,LSU,College Stats,6-0,284,4.85,31.5,25,109,,,,https://www.sports-reference.com/cfb/players/m...,,2024
318,Xavier Worthy,WR,Texas,College Stats,5-11,165,4.21,41.0,,131,,,,https://www.sports-reference.com/cfb/players/x...,,2024
319,Jaylen Wright,RB,Tennessee,College Stats,5-11,210,4.38,38.0,,134,,,,https://www.sports-reference.com/cfb/players/j...,,2024


In [7]:
# Assuming you have a DataFrame named combine_table
# Filter the DataFrame to include only players with position 'RB'
combine_table_rb = combine_table[combine_table['pos'] == 'RB']


# Display the resulting DataFrame
combine_table_rb


Unnamed: 0,player,pos,college,stats,height,weight,forty,vertical,bench,broad,threecone,shuttle,drafted,college_stats_url,nfl_stats,Year
2,Rasheen Ali,RB,Marshall,College Stats,5-11,206,,,,,,,,https://www.sports-reference.com/cfb/players/r...,,2024
4,Braelon Allen,RB,Wisconsin,College Stats,6-1,235,,32.0,26.0,117.0,,,,https://www.sports-reference.com/cfb/players/b...,,2024
10,Emani Bailey,RB,TCU,College Stats,5-7,202,4.61,33.5,,116.0,,,,https://www.sports-reference.com/cfb/players/e...,,2024
18,Trey Benson,RB,Florida St.,College Stats,6-0,216,4.39,33.5,,122.0,,,,https://www.sports-reference.com/cfb/players/t...,,2024
29,Jonathon Brooks,RB,Texas,College Stats,6-0,216,,,,,,,,https://www.sports-reference.com/cfb/players/j...,,2024
51,Blake Corum,RB,Michigan,College Stats,5-8,205,4.53,35.5,27.0,,6.82,4.12,,https://www.sports-reference.com/cfb/players/b...,,2024
58,Isaiah Davis,RB,South Dakota St.,,6-0,218,4.57,34.5,23.0,119.0,,,,,,2024
59,Re'Mahn Davis,RB,Kentucky,College Stats,5-8,211,4.52,35.0,,119.0,,4.51,,https://www.sports-reference.com/cfb/players/r...,,2024
71,Sevarian Edwards,RB,Georgia,,5-10,207,,,,114.0,,,,,,2024
75,Audric Estimé,RB,Notre Dame,College Stats,5-11,221,4.71,38.0,23.0,125.0,,,,https://www.sports-reference.com/cfb/players/a...,,2024


In [12]:
# Extract all college_stats_url values into a list
college_stats_urls = combine_table_rb['college_stats_url'].tolist()

# Display the list
print(college_stats_urls)


['https://www.sports-reference.com/cfb/players/rasheen-ali-1.html', 'https://www.sports-reference.com/cfb/players/braelon-allen-1.html', 'https://www.sports-reference.com/cfb/players/emani-bailey-1.html', 'https://www.sports-reference.com/cfb/players/trey-benson-2.html', 'https://www.sports-reference.com/cfb/players/jonathon-brooks-1.html', 'https://www.sports-reference.com/cfb/players/blake-corum-1.html', None, 'https://www.sports-reference.com/cfb/players/remahn-davis-1.html', None, 'https://www.sports-reference.com/cfb/players/audric-estime-1.html', None, 'https://www.sports-reference.com/cfb/players/isaac-guerendo-1.html', 'https://www.sports-reference.com/cfb/players/george-holani-1.html', 'https://www.sports-reference.com/cfb/players/markeise-irving-1.html', 'https://www.sports-reference.com/cfb/players/dillon-johnson-1.html', 'https://www.sports-reference.com/cfb/players/jawhar-jordan-1.html', 'https://www.sports-reference.com/cfb/players/tyrone-tracy-jr-1.html', None, 'https://

In [13]:
pip install fake-useragent




In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from fake_useragent import UserAgent
import time


# Initialize UserAgent object
user_agent = UserAgent()


# List to store DataFrames
dataframes = []

# Iterate through quarterback URLs
for url in college_stats_urls:
    try:
        # Define proxy (replace 'http://proxy:port' with your proxy address)
        proxy = 'http://proxy:port'

        # Set up proxy and user agent for the request
        response = requests.get(url, proxies={'http': proxy}, headers={'User-Agent': user_agent.random})

        # Check if the request was successful
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            passing_table = soup.find('table', {'id': 'rushing'})

            # Check if passing table exists
            if passing_table:
                table_rows = passing_table.find_all('tr')
                data = []

                # Iterate through rows of the table starting from the second row
                for row in table_rows[1:]:
                    row_data = [cell.text.strip() for cell in row.find_all(['th', 'td'])]
                    data.append(row_data)

                # Create a DataFrame for the table data
                df = pd.DataFrame(data)

                # Add URL column and populate it with the current URL
                df['URL'] = url

                # Append the DataFrame to the list
                dataframes.append(df)
            else:
                print(f"No table found for {url}")
        else:
            print(f"Failed to fetch {url}")
    except Exception as e:
        print(f"Error scraping {url}: {e}")

    # Introduce a time delay of 4 seconds per iteration
    time.sleep(4)

# Concatenate all DataFrames into a single DataFrame
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(combined_df)
else:
    print("No data to display.")


Error scraping None: Invalid URL 'None': No scheme supplied. Perhaps you meant https://None?
Error scraping None: Invalid URL 'None': No scheme supplied. Perhaps you meant https://None?
Error scraping None: Invalid URL 'None': No scheme supplied. Perhaps you meant https://None?
Error scraping None: Invalid URL 'None': No scheme supplied. Perhaps you meant https://None?
Error scraping None: Invalid URL 'None': No scheme supplied. Perhaps you meant https://None?
          0          1         2      3    4   5    6     7    8   9   10  \
0      Year     School      Conf  Class  Pos   G  Att   Yds  Avg  TD  Rec   
1     *2020   Marshall      CUSA     FR   RB   6    5    22  4.4   0    0   
2     *2021   Marshall      CUSA     FR   RB  13  250  1401  5.6  23   45   
3     *2022   Marshall  Sun Belt     SO        3   47   273  5.8   1    2   
4     *2023   Marshall  Sun Belt     SR   RB  12  212  1135  5.4  15   28   
..      ...        ...       ...    ...  ...  ..  ...   ...  ...  ..  ...

In [15]:
# Filter out rows with 'Year' and 'Career' in column 0
#combined_df_filtered = combined_df3[~combined_df3[0].isin(['Year', 'Career'])]
combined_df_filtered = combined_df
# Define the new column names
new_column_names = {
    0: 'Year',
    1: 'School',
    2: 'Conf',
    3: 'Class',
    4: 'Pos',
    5: 'G',
    6: 'college_rush_att',
    7: 'college_rush_yds',
    8: 'college_rush_avg',
    9: 'college_rush_TD',
    10: 'college_rec_rec',
    11: 'college_rec_yds',
    12: 'college_rec_avg',
    13: 'college_rec_TD',
    14: 'college_scrim_plays',
    15: 'college_scrim_yds',
    16: 'college_scrim_avg',
    17: 'college_scrim_TD',
}

# Rename the columns
combined_df_filtered = combined_df_filtered.rename(columns=new_column_names)

In [16]:
combined_df_filtered['Year'] = combined_df_filtered['Year'].str.replace('*', '')

# Show the resulting DataFrame
print(combined_df_filtered)

       Year     School      Conf  Class  Pos   G college_rush_att  \
0      Year     School      Conf  Class  Pos   G              Att   
1      2020   Marshall      CUSA     FR   RB   6                5   
2      2021   Marshall      CUSA     FR   RB  13              250   
3      2022   Marshall  Sun Belt     SO        3               47   
4      2023   Marshall  Sun Belt     SR   RB  12              212   
..      ...        ...       ...    ...  ...  ..              ...   
160    Year     School      Conf  Class  Pos   G              Att   
161    2021  Tennessee       SEC     FR   RB   9               85   
162    2022  Tennessee       SEC     SO       13              146   
163    2023  Tennessee       SEC     SR   RB  12              137   
164  Career  Tennessee                                        368   

    college_rush_yds college_rush_avg college_rush_TD college_rec_rec  \
0                Yds              Avg              TD             Rec   
1                 22     

  combined_df_filtered['Year'] = combined_df_filtered['Year'].str.replace('*', '')


In [17]:
columns_to_convert = ["Year",'G', 'college_rush_att', 'college_rush_yds', 'college_rush_avg', 'college_rush_TD',
                      'college_rec_rec', 'college_rec_yds', 'college_rec_avg', 'college_rec_TD',
                      'college_scrim_plays', 'college_scrim_yds', 'college_scrim_avg', 'college_scrim_TD']


# Iterate over each column and convert to numeric
for col in columns_to_convert:
    # Replace any non-numeric values with NaN and coerce to numeric
    combined_df_filtered[col] = pd.to_numeric(combined_df_filtered[col], errors='coerce')

# Verify the conversion
print(combined_df_filtered.dtypes)

Year                   float64
School                  object
Conf                    object
Class                   object
Pos                     object
G                      float64
college_rush_att       float64
college_rush_yds       float64
college_rush_avg       float64
college_rush_TD        float64
college_rec_rec        float64
college_rec_yds        float64
college_rec_avg        float64
college_rec_TD         float64
college_scrim_plays    float64
college_scrim_yds      float64
college_scrim_avg      float64
college_scrim_TD       float64
URL                     object
dtype: object


In [18]:
# Drop rows where 'Year' column contains values 'Year', 'Career', or is blank
filtered_df = combined_df_filtered[~combined_df_filtered['Year'].isin(['Year', 'Career', ''])]

# Show the resulting DataFrame
print(filtered_df)

       Year     School      Conf  Class  Pos     G  college_rush_att  \
0       NaN     School      Conf  Class  Pos   NaN               NaN   
1    2020.0   Marshall      CUSA     FR   RB   6.0               5.0   
2    2021.0   Marshall      CUSA     FR   RB  13.0             250.0   
3    2022.0   Marshall  Sun Belt     SO        3.0              47.0   
4    2023.0   Marshall  Sun Belt     SR   RB  12.0             212.0   
..      ...        ...       ...    ...  ...   ...               ...   
160     NaN     School      Conf  Class  Pos   NaN               NaN   
161  2021.0  Tennessee       SEC     FR   RB   9.0              85.0   
162  2022.0  Tennessee       SEC     SO       13.0             146.0   
163  2023.0  Tennessee       SEC     SR   RB  12.0             137.0   
164     NaN  Tennessee                         NaN             368.0   

     college_rush_yds  college_rush_avg  college_rush_TD  college_rec_rec  \
0                 NaN               NaN              NaN  

In [19]:
# Drop rows where 'Year' column contains NaN
cleaned_df = combined_df_filtered.dropna(subset=['Year'])

# Show the resulting DataFrame
print(cleaned_df)


       Year      School      Conf Class Pos     G  college_rush_att  \
1    2020.0    Marshall      CUSA    FR  RB   6.0               5.0   
2    2021.0    Marshall      CUSA    FR  RB  13.0             250.0   
3    2022.0    Marshall  Sun Belt    SO       3.0              47.0   
4    2023.0    Marshall  Sun Belt    SR  RB  12.0             212.0   
7    2021.0   Wisconsin   Big Ten    FR  RB  12.0             186.0   
..      ...         ...       ...   ...  ..   ...               ...   
157  2022.0  Ohio State   Big Ten    JR      11.0             128.0   
158  2023.0  Ohio State   Big Ten    SR  RB   6.0              49.0   
161  2021.0   Tennessee       SEC    FR  RB   9.0              85.0   
162  2022.0   Tennessee       SEC    SO      13.0             146.0   
163  2023.0   Tennessee       SEC    SR  RB  12.0             137.0   

     college_rush_yds  college_rush_avg  college_rush_TD  college_rec_rec  \
1                22.0               4.4              0.0              

In [20]:
import numpy as np

# Group by 'URL' and apply aggregation functions
agg_functions = {
    'Year': lambda x: '',
    'School': lambda x: x.mode().iloc[0],
    'Conf': lambda x: x.mode().iloc[0],
    'Class': lambda x: '',
    'Pos': lambda x: '',
    'G': 'sum',
    'college_rush_att': 'sum',
    'college_rush_yds': 'sum',
    'college_rush_avg': 'mean',
    'college_rush_TD': 'sum',
    'college_rec_rec': 'sum',
    'college_rec_yds': 'sum',
    'college_rec_avg': 'mean',
    'college_rec_TD': 'sum',
    'college_scrim_plays': 'sum',
    'college_scrim_yds': 'sum',
    'college_scrim_avg': 'mean',
    'college_scrim_TD': 'sum'
}

# Apply aggregation
grouped_df = cleaned_df.groupby('URL').agg(agg_functions).reset_index()

In [21]:
grouped_df

Unnamed: 0,URL,Year,School,Conf,Class,Pos,G,college_rush_att,college_rush_yds,college_rush_avg,college_rush_TD,college_rec_rec,college_rec_yds,college_rec_avg,college_rec_TD,college_scrim_plays,college_scrim_yds,college_scrim_avg,college_scrim_TD
0,https://www.sports-reference.com/cfb/players/a...,,Notre Dame,Ind,,,37.0,373.0,2321.0,6.966667,29.0,26.0,277.0,11.7,1.0,399.0,2598.0,7.166667,30.0
1,https://www.sports-reference.com/cfb/players/b...,,Michigan,Big Ten,,,45.0,675.0,3737.0,5.075,58.0,56.0,411.0,8.775,3.0,731.0,4148.0,5.575,61.0
2,https://www.sports-reference.com/cfb/players/b...,,Wisconsin,Big Ten,,,35.0,597.0,3494.0,5.866667,35.0,49.0,275.0,5.866667,0.0,646.0,3769.0,5.833333,35.0
3,https://www.sports-reference.com/cfb/players/c...,,Missouri,SEC,,,26.0,446.0,2372.0,5.15,23.0,41.0,328.0,7.95,0.0,487.0,2700.0,5.4,23.0
4,https://www.sports-reference.com/cfb/players/d...,,Mississippi State,SEC,,,49.0,462.0,2393.0,5.1,27.0,173.0,1054.0,6.175,1.0,635.0,3447.0,5.325,28.0
5,https://www.sports-reference.com/cfb/players/e...,,Louisiana,Big 12,,,42.0,366.0,2161.0,6.45,18.0,47.0,380.0,7.9,1.0,413.0,2541.0,6.525,19.0
6,https://www.sports-reference.com/cfb/players/g...,,Boise State,MWC,,,48.0,685.0,3596.0,5.32,26.0,88.0,777.0,9.08,8.0,773.0,4373.0,5.78,34.0
7,https://www.sports-reference.com/cfb/players/i...,,Wisconsin,Big Ten,,,37.0,231.0,1392.0,4.68,17.0,42.0,358.0,5.85,1.0,273.0,1750.0,4.98,18.0
8,https://www.sports-reference.com/cfb/players/j...,,Alabama,SEC,,,43.0,355.0,1981.0,6.55,18.0,40.0,409.0,8.05,6.0,395.0,2390.0,7.0,24.0
9,https://www.sports-reference.com/cfb/players/j...,,Louisville,ACC,,,38.0,381.0,2214.0,5.62,18.0,39.0,476.0,18.9,1.0,420.0,2690.0,7.0,19.0


In [24]:
# Remove rows where either 'forty' or 'college_stats_url' is null
combine_table_rb = combine_table_rb.dropna(subset=['forty', 'college_stats_url'])

# Display the updated DataFrame
print(combine_table_rb)


               player pos      college          stats height weight forty  \
10       Emani Bailey  RB          TCU  College Stats    5-7    202  4.61   
18        Trey Benson  RB  Florida St.  College Stats    6-0    216  4.39   
51        Blake Corum  RB     Michigan  College Stats    5-8    205  4.53   
59      Re'Mahn Davis  RB     Kentucky  College Stats    5-8    211  4.52   
75      Audric Estimé  RB   Notre Dame  College Stats   5-11    221  4.71   
100    Isaac Guerendo  RB   Louisville  College Stats    6-0    221  4.33   
116     George Holani  RB    Boise St.  College Stats   5-10    208  4.52   
122      Bucky Irving  RB       Oregon  College Stats    5-9    192  4.55   
137    Dillon Johnson  RB   Washington  College Stats    6-0    217  4.68   
145     Jawhar Jordan  RB   Louisville  College Stats   5-10    193  4.56   
150  Tyrone Tracy Jr.  RB       Purdue  College Stats   5-11    209  4.48   
175    MarShawn Lloyd  RB          USC  College Stats    5-9    220  4.46   

In [27]:
## Perform inner join on the 'college_stats_url' column in combine_table_rb and the 'URL' column in grouped_df
combined_df = pd.merge(combine_table_rb, grouped_df, left_on='college_stats_url', right_on='URL', how='inner')

# Display the combined DataFrame
combined_df


Unnamed: 0,player,pos,college,stats,height,weight,forty,vertical,bench,broad,...,college_rush_avg,college_rush_TD,college_rec_rec,college_rec_yds,college_rec_avg,college_rec_TD,college_scrim_plays,college_scrim_yds,college_scrim_avg,college_scrim_TD
0,Emani Bailey,RB,TCU,College Stats,5-7,202,4.61,33.5,,116.0,...,6.45,18.0,47.0,380.0,7.9,1.0,413.0,2541.0,6.525,19.0
1,Trey Benson,RB,Florida St.,College Stats,6-0,216,4.39,33.5,,122.0,...,5.3,24.0,33.0,371.0,11.25,1.0,349.0,2289.0,5.633333,25.0
2,Blake Corum,RB,Michigan,College Stats,5-8,205,4.53,35.5,27.0,,...,5.075,58.0,56.0,411.0,8.775,3.0,731.0,4148.0,5.575,61.0
3,Re'Mahn Davis,RB,Kentucky,College Stats,5-8,211,4.52,35.0,,119.0,...,4.74,29.0,94.0,762.0,7.66,12.0,840.0,4388.0,5.06,41.0
4,Audric Estimé,RB,Notre Dame,College Stats,5-11,221,4.71,38.0,23.0,125.0,...,6.966667,29.0,26.0,277.0,11.7,1.0,399.0,2598.0,7.166667,30.0
5,Isaac Guerendo,RB,Louisville,College Stats,6-0,221,4.33,41.5,,129.0,...,4.68,17.0,42.0,358.0,5.85,1.0,273.0,1750.0,4.98,18.0
6,George Holani,RB,Boise St.,College Stats,5-10,208,4.52,39.0,24.0,127.0,...,5.32,26.0,88.0,777.0,9.08,8.0,773.0,4373.0,5.78,34.0
7,Bucky Irving,RB,Oregon,College Stats,5-9,192,4.55,29.5,,115.0,...,6.133333,20.0,95.0,785.0,8.7,5.0,570.0,3722.0,6.466667,25.0
8,Dillon Johnson,RB,Washington,College Stats,6-0,217,4.68,31.5,,117.0,...,5.1,27.0,173.0,1054.0,6.175,1.0,635.0,3447.0,5.325,28.0
9,Jawhar Jordan,RB,Louisville,College Stats,5-10,193,4.56,,,,...,5.62,18.0,39.0,476.0,18.9,1.0,420.0,2690.0,7.0,19.0


In [30]:
# Select only the desired columns from the combined DataFrame
selected_columns = ['player', 'forty', 'Conf', 'G', 'college_rush_att', 'college_rush_yds',
                    'college_rush_avg', 'college_rush_TD', 'college_rec_rec', 'college_rec_yds',
                    'college_rec_avg', 'college_rec_TD', 'college_scrim_plays', 'college_scrim_yds',
                    'college_scrim_avg', 'college_scrim_TD']

combined_df_selected = combined_df[selected_columns]

# Display the resulting DataFrame
combined_df_selected


Unnamed: 0,player,forty,Conf,G,college_rush_att,college_rush_yds,college_rush_avg,college_rush_TD,college_rec_rec,college_rec_yds,college_rec_avg,college_rec_TD,college_scrim_plays,college_scrim_yds,college_scrim_avg,college_scrim_TD
0,Emani Bailey,4.61,Big 12,42.0,366.0,2161.0,6.45,18.0,47.0,380.0,7.9,1.0,413.0,2541.0,6.525,19.0
1,Trey Benson,4.39,ACC,36.0,316.0,1918.0,5.3,24.0,33.0,371.0,11.25,1.0,349.0,2289.0,5.633333,25.0
2,Blake Corum,4.53,Big Ten,45.0,675.0,3737.0,5.075,58.0,56.0,411.0,8.775,3.0,731.0,4148.0,5.575,61.0
3,Re'Mahn Davis,4.52,SEC,44.0,746.0,3626.0,4.74,29.0,94.0,762.0,7.66,12.0,840.0,4388.0,5.06,41.0
4,Audric Estimé,4.71,Ind,37.0,373.0,2321.0,6.966667,29.0,26.0,277.0,11.7,1.0,399.0,2598.0,7.166667,30.0
5,Isaac Guerendo,4.33,Big Ten,37.0,231.0,1392.0,4.68,17.0,42.0,358.0,5.85,1.0,273.0,1750.0,4.98,18.0
6,George Holani,4.52,MWC,48.0,685.0,3596.0,5.32,26.0,88.0,777.0,9.08,8.0,773.0,4373.0,5.78,34.0
7,Bucky Irving,4.55,Pac-12,39.0,475.0,2937.0,6.133333,20.0,95.0,785.0,8.7,5.0,570.0,3722.0,6.466667,25.0
8,Dillon Johnson,4.68,SEC,49.0,462.0,2393.0,5.1,27.0,173.0,1054.0,6.175,1.0,635.0,3447.0,5.325,28.0
9,Jawhar Jordan,4.56,ACC,38.0,381.0,2214.0,5.62,18.0,39.0,476.0,18.9,1.0,420.0,2690.0,7.0,19.0


In [31]:
# Define the list of Power 5 conferences
power_5_conferences = {"Big Ten", "ACC", "SEC", "Pac-12", "Big 12"}

# Create a new column 'Power_5_Num' based on the condition
combined_df_selected['Power_5_Num'] = combined_df_selected['Conf'].apply(lambda x: 1 if x in power_5_conferences else 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df_selected['Power_5_Num'] = combined_df_selected['Conf'].apply(lambda x: 1 if x in power_5_conferences else 0)


In [32]:
combined_df_selected

Unnamed: 0,player,forty,Conf,G,college_rush_att,college_rush_yds,college_rush_avg,college_rush_TD,college_rec_rec,college_rec_yds,college_rec_avg,college_rec_TD,college_scrim_plays,college_scrim_yds,college_scrim_avg,college_scrim_TD,Power_5_Num
0,Emani Bailey,4.61,Big 12,42.0,366.0,2161.0,6.45,18.0,47.0,380.0,7.9,1.0,413.0,2541.0,6.525,19.0,1
1,Trey Benson,4.39,ACC,36.0,316.0,1918.0,5.3,24.0,33.0,371.0,11.25,1.0,349.0,2289.0,5.633333,25.0,1
2,Blake Corum,4.53,Big Ten,45.0,675.0,3737.0,5.075,58.0,56.0,411.0,8.775,3.0,731.0,4148.0,5.575,61.0,1
3,Re'Mahn Davis,4.52,SEC,44.0,746.0,3626.0,4.74,29.0,94.0,762.0,7.66,12.0,840.0,4388.0,5.06,41.0,1
4,Audric Estimé,4.71,Ind,37.0,373.0,2321.0,6.966667,29.0,26.0,277.0,11.7,1.0,399.0,2598.0,7.166667,30.0,0
5,Isaac Guerendo,4.33,Big Ten,37.0,231.0,1392.0,4.68,17.0,42.0,358.0,5.85,1.0,273.0,1750.0,4.98,18.0,1
6,George Holani,4.52,MWC,48.0,685.0,3596.0,5.32,26.0,88.0,777.0,9.08,8.0,773.0,4373.0,5.78,34.0,0
7,Bucky Irving,4.55,Pac-12,39.0,475.0,2937.0,6.133333,20.0,95.0,785.0,8.7,5.0,570.0,3722.0,6.466667,25.0,1
8,Dillon Johnson,4.68,SEC,49.0,462.0,2393.0,5.1,27.0,173.0,1054.0,6.175,1.0,635.0,3447.0,5.325,28.0,1
9,Jawhar Jordan,4.56,ACC,38.0,381.0,2214.0,5.62,18.0,39.0,476.0,18.9,1.0,420.0,2690.0,7.0,19.0,1


In [33]:
# Save draft_table to a CSV file
combined_df_selected.to_csv('2024_combine_rb.csv', index=False)