In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

def get_max_page(start_date, end_date):
    base_url = f"https://www.prosportstransactions.com/basketball/Search/SearchResults.php?Player=&Team=&BeginDate={start_date}&EndDate={end_date}&ILChkBx=yes&InjuriesChkBx=yes&PersonalChkBx=yes&Submit=Search&start=0"
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    links = soup.find_all('a')
    max_page = 0
    for link in links:
        href = link.get('href')
        if href and 'start=' in href:
            page_number = int(href.split('start=')[1])
            if page_number > max_page:
                max_page = page_number
                
    return max_page

def scrape_data(start_date, end_date, max_page):
    data = []
    base_url = "https://www.prosportstransactions.com/basketball/Search/SearchResults.php"
    params = {
        "Player": "",
        "Team": "",
        "BeginDate": start_date,
        "EndDate": end_date,
        "ILChkBx": "yes",
        "InjuriesChkBx": "yes",
        "PersonalChkBx": "yes",
        "Submit": "Search",
        "start": 0
    }

    for start in tqdm(range(0, max_page + 1, 25)):
        params["start"] = start
        response = requests.get(base_url, params=params)
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.select_one('.datatable')
        
        if table:
            df = pd.read_html(str(table))[0]
            data.append(df)
    
    return pd.concat(data, ignore_index=True)

def main():
    start_date = "2000-01-01"
    end_date = "2024-05-22"
    
    print("Finding the maximum page number...")
    max_page = get_max_page(start_date, end_date)
    print(f"Maximum page number found: {max_page}")
    
    print("Scraping data...")
    data = scrape_data(start_date, end_date, max_page)
    
    # Adjust the column names and remove the header row from the data
    data.columns = data.iloc[0]
    data = data.drop(0).reset_index(drop=True)
    
    # Save the data to a CSV file
    data.to_csv("basketball_data.csv", index=False)
    print("Data scraping complete. Saved to basketball_data.csv")

if __name__ == "__main__":
    main()


Finding the maximum page number...
Maximum page number found: 63825
Scraping data...


100%|██████████| 2554/2554 [3:11:20<00:00,  4.50s/it]  


Data scraping complete. Saved to basketball_data.csv


In [3]:
import pandas as pd

# Load the existing data
data = pd.read_csv("basketball_data.csv")

# Remove the '• ' prefix from player names in the Acquired and Relinquished columns
data['Acquired'] = data['Acquired'].str.replace('• ', '', regex=False)
data['Relinquished'] = data['Relinquished'].str.replace('• ', '', regex=False)

# Save the cleaned data to a new CSV file
data.to_csv("basketball_data_no_dot.csv", index=False)

print("Dots removed. Saved to basketball_data_no_dot.csv")


Dots removed. Saved to basketball_data_no_dot.csv


In [4]:
# Load the existing data
data = pd.read_csv("basketball_data_no_dot.csv")

# Create a new column "Status"
data['Status'] = 'Unknown'  # Initialize with "Unknown" status

# Update "Status" based on the presence of player names in "Acquired" and "Relinquished" columns
data.loc[data['Relinquished'].notna(), 'Status'] = 'Relinquished'
data.loc[data['Acquired'].notna(), 'Status'] = 'Acquired'

# Save the updated data to a new CSV file
data.to_csv("basketball_data_with_status.csv", index=False)

print("Status column added. Saved to basketball_data_with_status.csv")


Status column added. Saved to basketball_data_with_status.csv


In [5]:
# Load the existing data
data = pd.read_csv("basketball_data_with_status.csv")

# Consolidate "Acquired" and "Relinquished" columns into a single "Player" column
data['Player'] = data['Acquired'].fillna(data['Relinquished'])

# Drop the now redundant "Acquired" and "Relinquished" columns
data.drop(columns=['Acquired', 'Relinquished'], inplace=True)

# Save the updated data to a new CSV file
data.to_csv("basketball_data_consolidated.csv", index=False)

print("Acquired and Relinquished columns consolidated. Saved to basketball_data_consolidated.csv")


Acquired and Relinquished columns consolidated. Saved to basketball_data_consolidated.csv


In [6]:
# Load the existing data
data = pd.read_csv("basketball_data_consolidated.csv")

# Filter the data to include only rows with phrases related to ACL injuries in the "Notes" column
acl_subset = data[data['Notes'].str.contains('ACL|anterior cruciate ligament|Anterior Cruciate Ligament|Anterior cruciate ligament', case=False)]

# Save the ACL injury subset data to a new CSV file
acl_subset.to_csv("basketball_data_acl_subset.csv", index=False)

print("ACL injury subset created. Saved to basketball_data_acl_subset.csv")


ACL injury subset created. Saved to basketball_data_acl_subset.csv


In [7]:
# Load the ACL injury subset data
acl_subset = pd.read_csv("basketball_data_acl_subset.csv")

# Create a subset with unique players
unique_players_subset = acl_subset.drop_duplicates(subset='Player')

# Save the unique players subset data to a new CSV file
unique_players_subset.to_csv("unique_players_subset.csv", index=False)

print("Unique players subset created. Saved to unique_players_subset.csv")


Unique players subset created. Saved to unique_players_subset.csv


In [9]:
import pandas as pd
# Load the basketball_data_with_status DataFrame
data_with_status = pd.read_csv("basketball_data_consolidated.csv")

# Load the ACL injury subset data
acl_subset = pd.read_csv("basketball_data_acl_subset.csv")

# Get unique player names from the ACL injury subset
unique_players = acl_subset['Player'].unique()

# Filter the DataFrame to include only rows for the unique players
acl_players_full = data_with_status[data_with_status['Player'].isin(unique_players)]

# Save the unique players subset data to a new CSV file
acl_players_full.to_csv("acl_players_full.csv", index=False)

print("Unique players subset created. Saved to acl_players_full.csv")


Unique players subset created. Saved to acl_players_full.csv


In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from tqdm import tqdm
import logging
import time

# Load the dataset
dataset_path = 'test_acl.csv'  # Update this to the path of your dataset
players_df = pd.read_csv(dataset_path)

# Define the player IDs manually for Basketball-Reference
player_ids = {
    "Tom Gugliotta": "guglitom01",
    "Bonzi Wells": "wellsbo01",
    "Jamal Crawford": "crawfja01",
    "Chris Crawford": "crawfch01",
    "Al Harrington": "harrial01",
    "Vitaly Potapenko": "potapvi01",
    "Steven Hunter": "huntest01",
    "Felipe Lopez": "lopezfe01",
    "Jarron Collins": "collija01",
    "Jared Jeffries": "jeffrja01",
    "Marcus Fizer": "fizerma01",
    "Obinna Ekezie": "ekeziob01",
    "Pat Garrity": "garripa01",
    "Ben Handlogten": "handlbe01",
    "Alex Garcia": "garcial01",
    "Willie Green": "greenwi01",
    "Nenê": "nenen01",
    "Robert Swift": "swiftro01",
    "Stromile Swift": "swiftst01",
    "Nenad Krstic": "krstine01",
    "Tony Allen": "allento01",
    "D.J. Mbenga": "mbengdj01",
    "Shaun Livingston": "livinsh01",
    "Adam Morrison": "morriad01",
    "Paul Davis": "davispa01",
    "Mickael Gelabale": "gelabmi01",
    "Jason Smith": "smithja01",
    "Jason Richards": "richa00",
    "Mike Wilks": "wilksmi01",
    "Corey Brewer": "breweco01",
    "Michael Redd": "reddmi01",
    "Al Jefferson": "jeffeal01",
    "Leon Powe": "powele01",
    "Kareem Rush": "rushka01",
    "Josh Howard": "howarjo01",
    "Kendrick Perkins": "perkike01",
    "Jeff Ayres": "pendeje02",
    "Gani Lawal": "lawalga01",
    "David West": "westda01",
    "Othyus Jeffers": "jeffeot01",
    "Eric Maynor": "maynoer01",
    "Ricky Rubio": "rubiori01",
    "Derrick Rose": "rosede01",
    "Iman Shumpert": "shumpim01",
    "Baron Davis": "davisba01",
    "Brandon Rush": "rushbr01",
    "Lou Williams": "willilo02",
    "Rajon Rondo": "rondora01",
    "Leandro Barbosa": "barbole01",
    "Danilo Gallinari": "gallida01",
    "Nate Robinson": "robinna01",
    "J.J. Hickson": "hicksjj01",
    "Jabari Parker": "parkeja01",
    "Kendall Marshall": "marshke01",
    "Tony Wroten Jr.": "wroteto01",
    "Dante Exum": "exumda01",
    "Jarrett Jack": "jackja01",
    "Chris Andersen": "anderch01",
    "Zach LaVine": "lavinza01",
    "Brandon Knight": "knighbr03",
    "O.G. Anunoby": "anunoog01",
    "Kristaps Porzingis": "porzikr01",
    "Pau Gasol": "gasolpa01",
    "Dejounte Murray": "murrade01",
    "Klay Thompson": "thompkl01",
    "DeMarcus Cousins": "couside01",
    "Max Strus": "strusma01",
    "Jeremy Lamb": "lambje01",
    "Jonathan Isaac": "isaacjo01",
    "Spencer Dinwiddie": "dinwisp01",
    "Markelle Fultz": "fultzma01",
    "Thomas Bryant": "bryanth01",
    "Jamal Murray": "murraja01",
    "Dario Saric": "saricda01",
    "P.J. Dozier": "doziepj01",
    "Kira Lewis Jr.": "lewiske01",
    "Joe Ingles": "inglejo01",
    "Chris Smith": "smithch04",
    "E.J. Liddell": "liddel01",
    "Montrezl Harrell": "harremo01",
    "Vlatko Cancar": "cancavl01",
    "Jay Scrubb": "scrubja01",
    "Charles Bassey": "bassech01",
    "Saddiq Bey": "beysa01"
}

# Function to scrape game logs and find the return date
def get_return_date(player, injury_date, injury_season):
    injury_date = datetime.strptime(injury_date, "%m/%d/%y")
    player_id = player_ids.get(player)
    return_season = injury_season + 1
    if not player_id:
        logging.warning(f"Player ID not found for {player}")
        return "Player ID not found"
    
    # Construct URL for the game log page
    url = f"https://www.basketball-reference.com/players/{player_id[0]}/{player_id}/gamelog/{return_season}"
    logging.info(f"Fetching URL: {url}")
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx and 5xx)
    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching URL for {player} ({url}): {e}")
        return "URL not found"
    
    soup = BeautifulSoup(response.text, 'html.parser')
    game_log_table = soup.find('table', {'id': 'pgl_basic'})
    if not game_log_table:
        logging.error(f"Game log table not found for {player} ({url})")
        return "Game log table not found"
    
    rows = game_log_table.find_all('tr', class_=lambda x: x != 'thead')
    for row in rows:
        game_date_str = row.find('td', {'data-stat': 'date_game'})
        if not game_date_str:
            continue
        game_date = datetime.strptime(game_date_str.text, "%Y-%m-%d")
        if game_date > injury_date:
            return game_date.strftime("%m/%d/%y")
    return "Return date not found"

# Apply the function to each player with a progress bar
results = []
for _, row in tqdm(players_df.iterrows(), total=players_df.shape[0], desc="Processing players"):
    player = row['Player']
    injury_date = row['Injury_Date']
    injury_season = row['Injury_Season']
    return_date = get_return_date(player, injury_date, injury_season)
    results.append({"Player": player, "Injury_Date": injury_date, "Return_Date": return_date})
    time.sleep(1)  # Delay to avoid overwhelming the server

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('players_return_dates.csv', index=False)

print("Script completed. Check 'players_return_dates.csv' for the results.")


Processing players:   0%|          | 0/84 [00:00<?, ?it/s]ERROR:root:Game log table not found for Tom Gugliotta (https://www.basketball-reference.com/players/g/guglitom01/gamelog/2001)
Processing players:   8%|▊         | 7/84 [00:13<02:29,  1.95s/it]ERROR:root:Game log table not found for Felipe Lopez (https://www.basketball-reference.com/players/l/lopezfe01/gamelog/2004)
Processing players:  10%|▉         | 8/84 [00:14<02:26,  1.92s/it]ERROR:root:Game log table not found for Jarron Collins (https://www.basketball-reference.com/players/c/collija01/gamelog/2004)
Processing players:  17%|█▋        | 14/84 [00:25<02:06,  1.81s/it]ERROR:root:Game log table not found for Alex Garcia (https://www.basketball-reference.com/players/g/garcial01/gamelog/2006)
Processing players:  19%|█▉        | 16/84 [00:29<02:05,  1.85s/it]ERROR:root:Game log table not found for Nenê (https://www.basketball-reference.com/players/n/nenen01/gamelog/2007)
Processing players:  30%|██▉       | 25/84 [00:47<01:55,  

Script completed. Check 'players_return_dates.csv' for the results.



