In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup, Comment
import time
import sys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from io import StringIO
import io

### Data scrape index: (Current Season - From 1950) 

### All data was scraped from basketball-reference.com
https://www.basketball-reference.com/ 

##### Player: (6 csv's)
- RS | Per Game | Player
- RS | Advanced | Player
- RS | Totals | Player
- PO | Per Game | Player
- PO | Advanced | Player
- PO | Totals | Player

- Time: 38 minutes

##### All Awards Voting/Teams (Player/Coaches): (2 csv's all_awards/all_coach_awards)
- MVP voting | Player
- ROY voting | Player
- DPOY voting | Player
- SMOY voting | Player
- MIP voting | Player
- CPOY voting | Player
- All-NBA | Player
- All-Defense | Player
- All-Rookie | Player
- COY voting | Coach

- Time: 366 minutes

##### Team: (6 csv's)
- RS | Per Game | Team
- RS | Opp Per Game | Team
- RS | Advanced | Team
- PO | Advanced | Team
- RS | Schedule/Results | Team (play-in games NOT counted)
- PO | Schedule/Results | Team (play-in games NOT counted)

- Time: 38 minutes

##### Other (Team and Coaches): (4 csv's)
- Other | Pre-Season Odds | Team
- RS | Expanded Standings | Team
- Season | Coaches
- Team | Rosters | Season

- Time: 280 minutes

##### Team/Player Indexes | Data relation (keys) | Additional df's | not web scraped (3 csv's)
- custom_team_season_index (contains team ids, origins, and conference for each team, each season since 1950).
- custom_team_franchise_index (contains team ids, origins, and uniques for each franchise since 1947).
- custom_player_index (created strictly by returning the unique 'Player' and 'Season' values since 1950, returning a unique value for the id).

##### Total RawData files created from this:
- > 6 csv's from player
- > 2 csv's from awards
- > 6 csv's from team
- > 4 csv's from other
- > 3 csv's from index

- > Total csv's altogether: 18 files scraped | 3 custom files | 21 files total
- > Total scrape time: 13.5 hours

In [5]:
# Season range to scrape. `end` is exclusive: the last season included is
# end - 1 (so end = 2025 stops at 2024).
start = 1950
end = 2025

# Seasons as strings, in ascending order.
seasons_list = list(map(str, range(start, end)))

### RS | Per Game | Player

In [52]:

def scrape_season(season):
    """Fetch one season's regular-season per-game player table ('per_game_stats').

    Args:
        season: Season identifier interpolated into the page URL.

    Returns:
        DataFrame of the parsed table with a 'Season' column added, or None
        when the request fails, the status is not 200, or the table is absent.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{season}_per_game.html'

    try:
        try:
            # Without a timeout a stalled connection would block the run forever.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for {season}")
            print(exc)
            return None

        if response.status_code != 200:
            print(f"Failed to retrieve data for {season}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'per_game_stats'})
        if not table:
            print(f"No table found for {season}")
            return None

        df = pd.read_html(StringIO(str(table)))[0]
        df['Season'] = season
        return df
    finally:
        # Pause after every attempt, failed ones included, so consecutive
        # requests stay spaced out even when pages error.
        time.sleep(4)

def scrape_all_seasons(seasons):
    """Scrape every season (latest first) and combine the player tables.

    Args:
        seasons: Sequence of season identifiers passed to scrape_season().

    Returns:
        DataFrame with 'Season' as the first column and repeated in-table
        header rows (rows whose 'Player' value is a column name) removed.

    Raises:
        ValueError: If no season produced any data.
    """
    frames = []
    total_seasons = len(seasons)

    for i, season in enumerate(reversed(seasons), 1):
        data = scrape_season(season)
        if data is not None:
            frames.append(data)

        completion_percentage = (i / total_seasons) * 100
        filled = int(completion_percentage // 2)
        sys.stdout.write(f"\rScraping: [{'#' * filled}{' ' * (50 - filled)}] {completion_percentage:.2f}%")
        sys.stdout.flush()

    if not frames:
        # Fail with a clear message instead of an opaque KeyError on 'Season'.
        raise ValueError("No season data could be scraped")

    # Concatenate once rather than re-copying the growing frame every season.
    all_data = pd.concat(frames, ignore_index=True)

    all_data = all_data[['Season'] + [col for col in all_data.columns if col != 'Season']]
    all_data = all_data[~all_data['Player'].isin(all_data.columns)]

    return all_data

seasons_list = [str(year) for year in range(start, end)]

# .copy() detaches the filtered frame returned by the scraper, so the column
# assignments below act on an independent frame rather than a slice.
result = scrape_all_seasons(seasons_list).copy()
# Ids number the distinct 'Player' values of this table only.
result['player_id'] = result.groupby('Player').ngroup()
result['player_id_season'] = result['player_id'].astype(str) + '_' + result['Season'].astype(str)
# regex=False: '*' is a literal marker here; the default for `regex` differs
# between pandas versions, so state it explicitly.
result['Player'] = result['Player'].str.replace('*', '', regex=False)

RS_Per_Game_Player = result
RS_Per_Game_Player.to_csv('RS_Per_Game_Player.csv', index=False)

Scraping: [##################################################] 100.00%

### RS | Advanced | Player

In [5]:

def scrape_season(season):
    """Fetch one season's regular-season advanced player table ('advanced_stats').

    Args:
        season: Season identifier interpolated into the page URL.

    Returns:
        DataFrame of the parsed table with a 'Season' column added, or None
        when the request fails, the status is not 200, or the table is absent.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{season}_advanced.html'

    try:
        try:
            # Without a timeout a stalled connection would block the run forever.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for {season}")
            print(exc)
            return None

        if response.status_code != 200:
            print(f"Failed to retrieve data for {season}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'advanced_stats'})
        if not table:
            print(f"No table found for {season}")
            return None

        df = pd.read_html(StringIO(str(table)))[0]
        df['Season'] = season
        return df
    finally:
        # Pause after every attempt, failed ones included, so consecutive
        # requests stay spaced out even when pages error.
        time.sleep(4)

def scrape_all_seasons(seasons):
    """Scrape every season (latest first) and combine the player tables.

    Args:
        seasons: Sequence of season identifiers passed to scrape_season().

    Returns:
        DataFrame with 'Season' as the first column and repeated in-table
        header rows (rows whose 'Player' value is a column name) removed.

    Raises:
        ValueError: If no season produced any data.
    """
    frames = []
    total_seasons = len(seasons)

    for i, season in enumerate(reversed(seasons), 1):
        data = scrape_season(season)
        if data is not None:
            frames.append(data)

        completion_percentage = (i / total_seasons) * 100
        filled = int(completion_percentage // 2)
        sys.stdout.write(f"\rScraping: [{'#' * filled}{' ' * (50 - filled)}] {completion_percentage:.2f}%")
        sys.stdout.flush()

    if not frames:
        # Fail with a clear message instead of an opaque KeyError on 'Season'.
        raise ValueError("No season data could be scraped")

    # Concatenate once rather than re-copying the growing frame every season.
    all_data = pd.concat(frames, ignore_index=True)

    # The original repeated this reorder twice; once is sufficient.
    all_data = all_data[['Season'] + [col for col in all_data.columns if col != 'Season']]
    all_data = all_data[~all_data['Player'].isin(all_data.columns)]

    return all_data

seasons_list = [str(year) for year in range(start, end)]

result = scrape_all_seasons(seasons_list)

# Drop pandas' placeholder 'Unnamed...' columns; .copy() makes the selection an
# independent frame so the assignments below do not target a slice.
result = result.loc[:, ~result.columns.str.contains('^Unnamed')].copy()
# Ids number the distinct 'Player' values of this table only.
result['player_id'] = result.groupby('Player').ngroup()
result['player_id_season'] = result['player_id'].astype(str) + '_' + result['Season'].astype(str)
# regex=False: '*' is a literal marker here; the default for `regex` differs
# between pandas versions, so state it explicitly.
result['Player'] = result['Player'].str.replace('*', '', regex=False)

RS_Advanced_Player = result
RS_Advanced_Player.to_csv('RS_Advanced_Player.csv', index=False)

Scraping: [##################################################] 100.00%

### RS | Totals | Player

In [9]:


def scrape_season(season):
    """Fetch one season's regular-season totals player table ('totals_stats').

    Args:
        season: Season identifier interpolated into the page URL.

    Returns:
        DataFrame of the parsed table with a 'Season' column added, or None
        when the request fails, the status is not 200, or the table is absent.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{season}_totals.html'

    try:
        try:
            # Without a timeout a stalled connection would block the run forever.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for {season}")
            print(exc)
            return None

        if response.status_code != 200:
            print(f"Failed to retrieve data for {season}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'totals_stats'})
        if not table:
            print(f"No table found for {season}")
            return None

        df = pd.read_html(StringIO(str(table)))[0]
        df['Season'] = season
        return df
    finally:
        # Pause after every attempt, failed ones included, so consecutive
        # requests stay spaced out even when pages error.
        time.sleep(4)

def scrape_all_seasons(seasons):
    """Scrape every season (latest first) and combine the player tables.

    Args:
        seasons: Sequence of season identifiers passed to scrape_season().

    Returns:
        DataFrame with 'Season' as the first column and repeated in-table
        header rows (rows whose 'Player' value is a column name) removed.

    Raises:
        ValueError: If no season produced any data.
    """
    frames = []
    total_seasons = len(seasons)

    for i, season in enumerate(reversed(seasons), 1):
        data = scrape_season(season)
        if data is not None:
            frames.append(data)

        completion_percentage = (i / total_seasons) * 100
        filled = int(completion_percentage // 2)
        sys.stdout.write(f"\rScraping: [{'#' * filled}{' ' * (50 - filled)}] {completion_percentage:.2f}%")
        sys.stdout.flush()

    if not frames:
        # Fail with a clear message instead of an opaque KeyError on 'Season'.
        raise ValueError("No season data could be scraped")

    # Concatenate once rather than re-copying the growing frame every season.
    all_data = pd.concat(frames, ignore_index=True)

    # The original repeated this reorder twice; once is sufficient.
    all_data = all_data[['Season'] + [col for col in all_data.columns if col != 'Season']]
    all_data = all_data[~all_data['Player'].isin(all_data.columns)]

    return all_data

seasons_list = [str(year) for year in range(start, end)]

result = scrape_all_seasons(seasons_list)

# Drop pandas' placeholder 'Unnamed...' columns; .copy() makes the selection an
# independent frame so the assignments below do not target a slice.
result = result.loc[:, ~result.columns.str.contains('^Unnamed')].copy()
# Ids number the distinct 'Player' values of this table only.
result['player_id'] = result.groupby('Player').ngroup()
result['player_id_season'] = result['player_id'].astype(str) + '_' + result['Season'].astype(str)
# regex=False: '*' is a literal marker here; the default for `regex` differs
# between pandas versions, so state it explicitly.
result['Player'] = result['Player'].str.replace('*', '', regex=False)

RS_Totals_Player = result
RS_Totals_Player.to_csv('RS_Totals_Player.csv', index=False)

Scraping: [##################################################] 100.00%

### PO | Per Game | Player

In [10]:


def scrape_season(season):
    """Fetch one season's playoff per-game player table ('per_game_stats').

    Args:
        season: Season identifier interpolated into the page URL.

    Returns:
        DataFrame of the parsed table with a 'Season' column added, or None
        when the request fails, the status is not 200, or the table is absent.
    """
    url = f'https://www.basketball-reference.com/playoffs/NBA_{season}_per_game.html'

    try:
        try:
            # Without a timeout a stalled connection would block the run forever.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for {season}")
            print(exc)
            return None

        if response.status_code != 200:
            print(f"Failed to retrieve data for {season}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'per_game_stats'})
        if not table:
            print(f"No table found for {season}")
            return None

        df = pd.read_html(StringIO(str(table)))[0]
        df['Season'] = season
        return df
    finally:
        # Pause after every attempt, failed ones included, so consecutive
        # requests stay spaced out even when pages error.
        time.sleep(4)

def scrape_all_seasons(seasons):
    """Scrape every season (latest first) and combine the player tables.

    Args:
        seasons: Sequence of season identifiers passed to scrape_season().

    Returns:
        DataFrame with 'Season' as the first column and repeated in-table
        header rows (rows whose 'Player' value is a column name) removed.

    Raises:
        ValueError: If no season produced any data.
    """
    frames = []
    total_seasons = len(seasons)

    for i, season in enumerate(reversed(seasons), 1):
        data = scrape_season(season)
        if data is not None:
            frames.append(data)

        completion_percentage = (i / total_seasons) * 100
        filled = int(completion_percentage // 2)
        sys.stdout.write(f"\rScraping: [{'#' * filled}{' ' * (50 - filled)}] {completion_percentage:.2f}%")
        sys.stdout.flush()

    if not frames:
        # Fail with a clear message instead of an opaque KeyError on 'Season'.
        raise ValueError("No season data could be scraped")

    # Concatenate once rather than re-copying the growing frame every season.
    all_data = pd.concat(frames, ignore_index=True)

    all_data = all_data[['Season'] + [col for col in all_data.columns if col != 'Season']]
    all_data = all_data[~all_data['Player'].isin(all_data.columns)]

    return all_data

seasons_list = [str(year) for year in range(start, end)]

# .copy() detaches the filtered frame returned by the scraper, so the column
# assignments below act on an independent frame rather than a slice.
result = scrape_all_seasons(seasons_list).copy()
# Ids number the distinct 'Player' values of this table only.
result['player_id'] = result.groupby('Player').ngroup()
result['player_id_season'] = result['player_id'].astype(str) + '_' + result['Season'].astype(str)
# regex=False: '*' is a literal marker here; the default for `regex` differs
# between pandas versions, so state it explicitly.
result['Player'] = result['Player'].str.replace('*', '', regex=False)

PO_Per_Game_Player = result
PO_Per_Game_Player.to_csv('PO_Per_Game_Player.csv', index=False)

Scraping: [##################################################] 100.00%

### PO | Advanced | Player

In [11]:
def scrape_season(season):
    """Fetch one season's playoff advanced player table ('advanced_stats').

    Args:
        season: Season identifier interpolated into the page URL.

    Returns:
        DataFrame of the parsed table with a 'Season' column added, or None
        when the request fails, the status is not 200, or the table is absent.
    """
    url = f'https://www.basketball-reference.com/playoffs/NBA_{season}_advanced.html'

    try:
        try:
            # Without a timeout a stalled connection would block the run forever.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for {season}")
            print(exc)
            return None

        if response.status_code != 200:
            print(f"Failed to retrieve data for {season}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'advanced_stats'})
        if not table:
            print(f"No table found for {season}")
            return None

        df = pd.read_html(StringIO(str(table)))[0]
        df['Season'] = season
        return df
    finally:
        # Pause after every attempt, failed ones included, so consecutive
        # requests stay spaced out even when pages error.
        time.sleep(4)

def scrape_all_seasons(seasons):
    """Scrape every season (latest first) and combine the player tables.

    Args:
        seasons: Sequence of season identifiers passed to scrape_season().

    Returns:
        DataFrame with 'Season' as the first column and repeated in-table
        header rows (rows whose 'Player' value is a column name) removed.

    Raises:
        ValueError: If no season produced any data.
    """
    frames = []
    total_seasons = len(seasons)

    for i, season in enumerate(reversed(seasons), 1):
        data = scrape_season(season)
        if data is not None:
            frames.append(data)

        completion_percentage = (i / total_seasons) * 100
        filled = int(completion_percentage // 2)
        sys.stdout.write(f"\rScraping: [{'#' * filled}{' ' * (50 - filled)}] {completion_percentage:.2f}%")
        sys.stdout.flush()

    if not frames:
        # Fail with a clear message instead of an opaque KeyError on 'Season'.
        raise ValueError("No season data could be scraped")

    # Concatenate once rather than re-copying the growing frame every season.
    all_data = pd.concat(frames, ignore_index=True)

    all_data = all_data[['Season'] + [col for col in all_data.columns if col != 'Season']]
    all_data = all_data[~all_data['Player'].isin(all_data.columns)]

    return all_data

seasons_list = [str(year) for year in range(start, end)]

result = scrape_all_seasons(seasons_list)

# Drop pandas' placeholder 'Unnamed...' columns; .copy() makes the selection an
# independent frame so the assignments below do not target a slice.
result = result.loc[:, ~result.columns.str.contains('^Unnamed')].copy()
# Ids number the distinct 'Player' values of this table only.
result['player_id'] = result.groupby('Player').ngroup()
result['player_id_season'] = result['player_id'].astype(str) + '_' + result['Season'].astype(str)
# regex=False: '*' is a literal marker here; the default for `regex` differs
# between pandas versions, so state it explicitly.
result['Player'] = result['Player'].str.replace('*', '', regex=False)

PO_Advanced_Player = result
PO_Advanced_Player.to_csv('PO_Advanced_Player.csv', index=False)

Scraping: [##################################################] 100.00%

### PO | Totals | Player

In [12]:

def scrape_season(season):
    """Fetch one season's playoff totals player table ('totals_stats').

    Args:
        season: Season identifier interpolated into the page URL.

    Returns:
        DataFrame of the parsed table with a 'Season' column added, or None
        when the request fails, the status is not 200, or the table is absent.
    """
    url = f'https://www.basketball-reference.com/playoffs/NBA_{season}_totals.html'

    try:
        try:
            # Without a timeout a stalled connection would block the run forever.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for {season}")
            print(exc)
            return None

        if response.status_code != 200:
            print(f"Failed to retrieve data for {season}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'totals_stats'})
        if not table:
            print(f"No table found for {season}")
            return None

        df = pd.read_html(StringIO(str(table)))[0]
        df['Season'] = season
        return df
    finally:
        # Pause after every attempt, failed ones included, so consecutive
        # requests stay spaced out even when pages error.
        time.sleep(4)

def scrape_all_seasons(seasons):
    """Scrape every season (latest first) and combine the player tables.

    Args:
        seasons: Sequence of season identifiers passed to scrape_season().

    Returns:
        DataFrame with 'Season' as the first column and repeated in-table
        header rows (rows whose 'Player' value is a column name) removed.

    Raises:
        ValueError: If no season produced any data.
    """
    frames = []
    total_seasons = len(seasons)

    for i, season in enumerate(reversed(seasons), 1):
        data = scrape_season(season)
        if data is not None:
            frames.append(data)

        completion_percentage = (i / total_seasons) * 100
        filled = int(completion_percentage // 2)
        sys.stdout.write(f"\rScraping: [{'#' * filled}{' ' * (50 - filled)}] {completion_percentage:.2f}%")
        sys.stdout.flush()

    if not frames:
        # Fail with a clear message instead of an opaque KeyError on 'Season'.
        raise ValueError("No season data could be scraped")

    # Concatenate once rather than re-copying the growing frame every season.
    all_data = pd.concat(frames, ignore_index=True)

    # The original repeated this reorder twice; once is sufficient.
    all_data = all_data[['Season'] + [col for col in all_data.columns if col != 'Season']]
    all_data = all_data[~all_data['Player'].isin(all_data.columns)]

    return all_data

seasons_list = [str(year) for year in range(start, end)]

result = scrape_all_seasons(seasons_list)

# Drop pandas' placeholder 'Unnamed...' columns; .copy() makes the selection an
# independent frame so the assignments below do not target a slice.
result = result.loc[:, ~result.columns.str.contains('^Unnamed')].copy()
# Ids number the distinct 'Player' values of this table only.
result['player_id'] = result.groupby('Player').ngroup()
result['player_id_season'] = result['player_id'].astype(str) + '_' + result['Season'].astype(str)
# regex=False: '*' is a literal marker here; the default for `regex` differs
# between pandas versions, so state it explicitly.
result['Player'] = result['Player'].str.replace('*', '', regex=False)

PO_Totals_Player = result
PO_Totals_Player.to_csv('PO_Totals_Player.csv', index=False)

Scraping: [##################################################] 100.00%

### All Award Voting | Player | Coaches

In [13]:
def scrape_season_table(season, table_id):
    """Load one season's awards page in headless Chrome and parse one table.

    Args:
        season: Season identifier interpolated into the awards page URL.
        table_id: HTML id of the table to extract; also stored in 'award_type'.

    Returns:
        DataFrame of the table (second header row used as column names) with
        'Season' and 'award_type' columns added, or None when the page or the
        table could not be loaded or parsed.
    """
    url = f'https://www.basketball-reference.com/awards/awards_{season}.html'

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(url)
        table = driver.find_element(By.ID, table_id)
        table_html = table.get_attribute('outerHTML')

        df = pd.read_html(StringIO(table_html), header=[1])[0]

        df['Season'] = season
        df['award_type'] = table_id
        return df
    except Exception as e:
        # Best effort: a missing table is expected for some season/award pairs.
        print(f"No table found for {table_id} in {season}")
        print(e)
        return None
    finally:
        # Always close the browser (the original leaked it when driver.get
        # raised) and pause after every page load, failed ones included.
        driver.quit()
        time.sleep(5)

def scrape_all_seasons_tables(seasons, table_ids):
    """Scrape every (season, table id) pair, latest season first, and combine.

    Args:
        seasons: Sequence of season identifiers.
        table_ids: Iterable of award table ids passed to scrape_season_table().

    Returns:
        DataFrame of all tables stacked, with string column names and any
        column whose name starts with 'Unnamed' removed. Empty when nothing
        could be scraped.
    """
    frames = []

    total_tasks = len(seasons) * len(table_ids)
    task_count = 0

    for season in reversed(seasons):
        for table_id in table_ids:
            data = scrape_season_table(season, table_id)

            if data is not None:
                frames.append(data)

            task_count += 1
            completion_percentage = (task_count / total_tasks) * 100
            filled = int(completion_percentage // 2)
            sys.stdout.write(f"\rScraping: [{'#' * filled}{' ' * (50 - filled)}] {completion_percentage:.2f}%")
            sys.stdout.flush()

    # Concatenate once rather than re-copying the growing frame every task.
    all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    print("Columns before removing 'Unnamed':", all_data.columns)

    # Flatten BEFORE stringifying: after columns.map(str) the index is never a
    # MultiIndex, which made this branch unreachable in the original order.
    # 'Unnamed...' placeholder levels are skipped so real names survive.
    if isinstance(all_data.columns, pd.MultiIndex):
        all_data.columns = [
            ' '.join(str(level) for level in col if not str(level).startswith('Unnamed')).strip()
            for col in all_data.columns.values
        ]

    print("Columns after flattening MultiIndex:", all_data.columns)

    all_data.columns = all_data.columns.map(str)
    all_data = all_data.loc[:, ~all_data.columns.str.contains('^Unnamed')]

    print("Columns after removing 'Unnamed':", all_data.columns)

    return all_data

seasons_list = [str(year) for year in range(start, end)]
table_ids = ['mvp', 'roy', 'dpoy', 'smoy', 'mip', 'clutch_poy', 'leading_all_nba', 'leading_all_defense', 'leading_all_rookie', 'coy']

result = scrape_all_seasons_tables(seasons_list, table_ids)

all_award_voting = result
# Also keep an untouched raw copy: the cleaning cell further down reads
# 'all_award_voting_raw.csv', and 'all_award_voting.csv' is later overwritten
# with the cleaned output.
all_award_voting.to_csv('all_award_voting_raw.csv', index=False)
all_award_voting.to_csv('all_award_voting.csv', index=False)

Scraping: [#                                                 ] 3.33%No table found for clutch_poy in 2022
Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="clutch_poy"]"}
  (Session info: chrome-headless-shell=127.0.6533.89); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00488923+23283]
	(No symbol) [0x0044E934]
	(No symbol) [0x00380733]
	(No symbol) [0x003C326F]
	(No symbol) [0x003C34AB]
	(No symbol) [0x003FEE42]
	(No symbol) [0x003E4464]
	(No symbol) [0x003FCB8D]
	(No symbol) [0x003E41B6]
	(No symbol) [0x003B8017]
	(No symbol) [0x003B890D]
	GetHandleVerifier [0x0057A5F3+1013699]
	GetHandleVerifier [0x00583E4C+1052700]
	GetHandleVerifier [0x0057D4B4+1025668]
	GetHandleVerifier [0x004AEA2B+179195]
	(No symbol) [0x00456833]
	(No symbol) [0x00453198]
	(No symbol) [0x00453337]
	(No symbol) [0x0044B4BE]
	BaseThreadI

In [15]:
#OLD - 

# Cleaning all award data
# NOTE: this cell reads 'all_award_voting.csv' and overwrites that same file
# with the cleaned output at the end, so it cannot be re-run on its own output.
all_award_voting = pd.read_csv("all_award_voting.csv")

# .copy() keeps each subset independent so the column assignments below do not
# target a slice of the loaded frame.
coach_data = all_award_voting[all_award_voting['award_type'] == 'coy'].copy()
all_award_voting = all_award_voting[all_award_voting['award_type'] != 'coy']
all_award_voting = all_award_voting[all_award_voting['Player'].notna() & (all_award_voting['Player'] != '')].copy()

# Team placement flags taken from the '# Tm' column.
all_award_voting['1st_team'] = all_award_voting['# Tm'].isin(['1st', '1T']).astype(int)
all_award_voting['2nd_team'] = all_award_voting['# Tm'].isin(['2nd', '2T']).astype(int)
all_award_voting['3rd_team'] = all_award_voting['# Tm'].isin(['3rd', '3T']).astype(int)

all_award_voting = all_award_voting[['Rank', 'Player', 'Tm', 'Share', 'Season', 'award_type', '1st_team', '2nd_team', '3rd_team']].copy()

coach_data = coach_data.rename(columns={'Share': 'coy_share'})
coach_data = coach_data[['Rank', 'Coach', 'Tm', 'coy_share', 'Season']].copy()

# One numeric share column per award; NaN on rows that belong to other awards.
share_sources = [
    ('mvp_share', 'mvp'), ('dpoy_share', 'dpoy'), ('roy_share', 'roy'),
    ('smoy_share', 'smoy'), ('mip_share', 'mip'), ('cpoy_share', 'clutch_poy'),
]
for share_col, award in share_sources:
    all_award_voting[share_col] = pd.to_numeric(
        all_award_voting['Share'].where(all_award_voting['award_type'] == award), errors='coerce'
    )

# 1 when the row comes from the corresponding voting table.
for table_flag in ['leading_all_nba', 'leading_all_defense', 'leading_all_rookie']:
    all_award_voting[table_flag] = (all_award_voting['award_type'] == table_flag).astype(int)

all_award_voting = all_award_voting.drop(columns=['Share', 'award_type'])

# Fix: the original OR-ed the team flags into every one of these columns, so a
# team placement in one table (e.g. All-Defense) also set the flags of the other
# two. Membership now depends only on the table the row came from.
award_groups = [
    ('all_nba', 'leading_all_nba', 'all_nba'),
    ('all_defense', 'leading_all_defense', 'all_def'),
    ('all_rookie', 'leading_all_rookie', 'all_rookie'),
]
for member_col, table_flag, _ in award_groups:
    all_award_voting[member_col] = all_award_voting[table_flag]

# Per-table team placement: both 0/1 flags must be set (product == logical AND).
for member_col, _, prefix in award_groups:
    for rank in ['1st', '2nd', '3rd']:
        all_award_voting[f'{prefix}_{rank}_team'] = all_award_voting[member_col] * all_award_voting[f'{rank}_team']

# Winner = the row(s) holding the season's highest share for that award.
for award in ['mvp', 'roy', 'dpoy', 'smoy', 'mip', 'cpoy']:
    all_award_voting[f'won_{award}'] = all_award_voting.groupby('Season')[f'{award}_share'].transform(lambda x: (x == x.max()).astype(int))

coach_data['coy_share'] = pd.to_numeric(coach_data['coy_share'], errors='coerce')
coach_data['won_coy'] = coach_data.groupby('Season')['coy_share'].transform(lambda x: (x == x.max()).astype(int))

all_award_voting.to_csv('all_award_voting.csv', index=False)
coach_data.to_csv('all_award_coach_voting.csv', index=False)


In [208]:
# Cleaning all award data
# NEW

# NOTE(review): no visible scraping cell writes 'all_award_voting_raw.csv' (the
# scrape saves 'all_award_voting.csv') - confirm the raw file exists before running.
all_award_voting = pd.read_csv("all_award_voting_raw.csv")

# .copy() keeps each subset independent so the column assignments below do not
# target a slice of the loaded frame.
coach_data = all_award_voting[all_award_voting['award_type'] == 'coy'].copy()
all_award_voting = all_award_voting[all_award_voting['award_type'] != 'coy']
all_award_voting = all_award_voting[all_award_voting['Player'].notna() & (all_award_voting['Player'] != '')].copy()

# Team placement flags taken from the '# Tm' column.
all_award_voting['1st_team'] = all_award_voting['# Tm'].isin(['1st', '1T']).astype(int)
all_award_voting['2nd_team'] = all_award_voting['# Tm'].isin(['2nd', '2T']).astype(int)
all_award_voting['3rd_team'] = all_award_voting['# Tm'].isin(['3rd', '3T']).astype(int)

all_award_voting = all_award_voting[['Rank', 'Player', 'Tm', 'Share', 'Season', 'award_type', '1st_team', '2nd_team', '3rd_team']].copy()

coach_data = coach_data.rename(columns={'Share': 'coy_share'})
coach_data = coach_data[['Rank', 'Coach', 'Tm', 'coy_share', 'Season']].copy()

# One numeric share column per award; NaN on rows that belong to other awards.
share_sources = [
    ('mvp_share', 'mvp'), ('dpoy_share', 'dpoy'), ('roy_share', 'roy'),
    ('smoy_share', 'smoy'), ('mip_share', 'mip'), ('cpoy_share', 'clutch_poy'),
]
for share_col, award in share_sources:
    all_award_voting[share_col] = pd.to_numeric(
        all_award_voting['Share'].where(all_award_voting['award_type'] == award), errors='coerce'
    )

# Per-table team placement columns kept in the output. Only these are built:
# the original generated a column set for every award type found in the data and
# then dropped a hard-coded list, which raised KeyError whenever an award type
# was absent from the scraped season range.
kept_team_ranks = {
    'leading_all_nba': ['1st', '2nd', '3rd'],
    'leading_all_defense': ['1st', '2nd'],
    'leading_all_rookie': ['1st', '2nd'],
}
for award_type, ranks in kept_team_ranks.items():
    from_table = all_award_voting['award_type'] == award_type
    for rank in ranks:
        all_award_voting[f'{award_type}_{rank}_team'] = (
            from_table & (all_award_voting[f'{rank}_team'] == 1)
        ).astype(int)

# 1 when the row holds any kept team placement in the corresponding table.
for count_col, award_type in [
    ('count_all_nba', 'leading_all_nba'),
    ('count_all_defense', 'leading_all_defense'),
    ('count_all_rookie', 'leading_all_rookie'),
]:
    placement_cols = [f'{award_type}_{rank}_team' for rank in kept_team_ranks[award_type]]
    all_award_voting[count_col] = all_award_voting[placement_cols].eq(1).any(axis=1).astype(int)

# Drop the 'Share' and 'award_type' columns after creating the new columns
all_award_voting = all_award_voting.drop(columns=['Share', 'award_type'])

# Winner = the row(s) holding the season's highest share for that award.
for award in ['mvp', 'roy', 'dpoy', 'smoy', 'mip', 'cpoy']:
    all_award_voting[f'won_{award}'] = all_award_voting.groupby('Season')[f'{award}_share'].transform(lambda x: (x == x.max()).astype(int))

coach_data['coy_share'] = pd.to_numeric(coach_data['coy_share'], errors='coerce')
coach_data['won_coy'] = coach_data.groupby('Season')['coy_share'].transform(lambda x: (x == x.max()).astype(int))

all_award_voting.to_csv('all_award_voting.csv', index=False)
coach_data.to_csv('all_award_coach_voting.csv', index=False)

### RS | Per Game | Team

In [16]:


def scrape_season(season):
    """Fetch one season's team per-game table ('per_game-team').

    Args:
        season: Season identifier interpolated into the page URL.

    Returns:
        DataFrame of the parsed table with a 'Season' column added, or None
        when the request fails, the status is not 200, or the table is absent.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{season}.html'

    try:
        try:
            # Without a timeout a stalled connection would block the run forever.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for {season}")
            print(exc)
            return None

        if response.status_code != 200:
            print(f"Failed to retrieve data for {season}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'per_game-team'})
        if not table:
            print(f"No table found for {season}")
            return None

        df = pd.read_html(StringIO(str(table)))[0]
        df['Season'] = season
        return df
    finally:
        # Pause after every attempt, failed ones included, so consecutive
        # requests stay spaced out even when pages error.
        time.sleep(4)

def scrape_all_seasons(seasons):
    """Scrape every season (latest first) and combine the team tables.

    Args:
        seasons: Sequence of season identifiers passed to scrape_season().

    Returns:
        DataFrame with 'Season' first, 'League Average' rows and repeated
        header rows removed, '*' stripped from 'Team', and a 0/1
        'make_playoffs' column that is 1 where the team name carried a '*'.

    Raises:
        ValueError: If no season produced any data.
    """
    frames = []
    total_seasons = len(seasons)

    for i, season in enumerate(reversed(seasons), 1):
        data = scrape_season(season)
        if data is not None:
            frames.append(data)

        completion_percentage = (i / total_seasons) * 100
        filled = int(completion_percentage // 2)
        sys.stdout.write(f"\rScraping: [{'#' * filled}{' ' * (50 - filled)}] {completion_percentage:.2f}%")
        sys.stdout.flush()

    if not frames:
        # Fail with a clear message instead of an opaque KeyError on 'Team'.
        raise ValueError("No season data could be scraped")

    # Concatenate once rather than re-copying the growing frame every season.
    all_data = pd.concat(frames, ignore_index=True)

    # .copy() so the column assignments below do not target a filtered slice.
    all_data = all_data[~all_data['Team'].str.contains('League Average', na=False)].copy()

    # Literal match ('\*' in a non-raw string is an invalid escape); na=False
    # keeps a missing team name from breaking the integer cast.
    all_data['make_playoffs'] = all_data['Team'].str.contains('*', regex=False, na=False).astype(int)

    all_data['Team'] = all_data['Team'].str.replace('*', '', regex=False)

    all_data = all_data[['Season'] + [col for col in all_data.columns if col != 'Season']]
    all_data = all_data[~all_data['Team'].isin(all_data.columns)]

    return all_data

seasons_list = list(map(str, range(start, end)))

result = scrape_all_seasons(seasons_list)

# Number the distinct 'Team' values, then build a per-season key from that
# number and the season.
team_numbers = result.groupby('Team').ngroup()
result['team_id'] = team_numbers
result['team_id_season'] = team_numbers.astype(str) + '_' + result['Season'].astype(str)

RS_Per_Game_Team = result
RS_Per_Game_Team.to_csv('RS_Per_Game_Team.csv', index=False)

Scraping: [##################################################] 100.00%

### RS | Opp Per Game | Team

In [17]:
def scrape_season(season):
    """Download one season's opponent per-game team table.

    Parameters
    ----------
    season : str
        Season identifier inserted into the league page URL.

    Returns
    -------
    pandas.DataFrame or None
        The parsed ``per_game-opponent`` table with a ``Season`` column added,
        or ``None`` (after printing a message) when the request fails, the
        status is not 200, or the table is not in the page.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{season}.html'

    try:
        try:
            # A timeout keeps one stalled connection from hanging the whole scrape.
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            print(f"Failed to retrieve data for {season}")
            return None

        if response.status_code != 200:
            print(f"Failed to retrieve data for {season}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'per_game-opponent'})

        if table is None:
            print(f"No table found for {season}")
            return None

        df = pd.read_html(StringIO(str(table)))[0]
        df['Season'] = season
        return df
    finally:
        # Pause after every request, including failed ones, so an error response
        # is never followed immediately by the next hit on the site.
        time.sleep(4)

def scrape_all_seasons(seasons):
    """Scrape every season with ``scrape_season`` and tidy the combined table.

    Seasons are visited in reverse input order while a 50-character progress
    bar is rewritten in place on stdout. Seasons for which ``scrape_season``
    returns ``None`` are skipped.

    Parameters
    ----------
    seasons : sequence of str
        Season identifiers, passed one at a time to ``scrape_season``.

    Returns
    -------
    pandas.DataFrame
        Combined rows with ``Season`` as the first column, 'League Average'
        rows and repeated header rows removed, a 0/1 ``make_playoffs`` flag
        derived from the '*' marker in ``Team``, and that marker stripped
        from ``Team``. An empty frame is returned when nothing was scraped.
    """
    frames = []
    total_seasons = len(seasons)

    for i, season in enumerate(reversed(seasons), 1):
        data = scrape_season(season)

        if data is not None:
            # Collect now, concatenate once below (avoids re-copying a growing frame).
            frames.append(data)

        completion_percentage = (i / total_seasons) * 100
        filled = int(completion_percentage // 2)
        sys.stdout.write(f"\rScraping: [{'#' * filled}{' ' * (50 - filled)}] {completion_percentage:.2f}%")
        sys.stdout.flush()

    if not frames:
        # Nothing scraped: there is no 'Team' column to clean.
        return pd.DataFrame()

    all_data = pd.concat(frames, ignore_index=True)

    # .copy() so the assignments below write to an independent frame, not a filtered slice.
    all_data = all_data[~all_data['Team'].str.contains('League Average', na=False)].copy()

    # Match '*' literally; na=False keeps missing team names from turning into
    # NaN, which astype(int) cannot convert.
    all_data['make_playoffs'] = all_data['Team'].str.contains('*', regex=False, na=False).astype(int)

    all_data['Team'] = all_data['Team'].str.replace('*', '', regex=False)

    all_data = all_data[['Season'] + [col for col in all_data.columns if col != 'Season']]
    # Drop rows whose Team value is itself a column name (repeated header rows).
    all_data = all_data[~all_data['Team'].isin(all_data.columns)].copy()

    return all_data

# Scrape every season in the half-open year range [start, end).
seasons_list = list(map(str, range(start, end)))
result = scrape_all_seasons(seasons_list)

# Number each distinct Team value, then combine that number with the season
# into a "<team_id>_<Season>" key.
result['team_id'] = result.groupby('Team').ngroup()
result['team_id_season'] = (
    result['team_id'].astype(str)
    + '_'
    + result['Season'].astype(str)
)

# Every column except the identifying ones gets an 'opp_' prefix.
columns_to_exclude = ['Season', 'Rk', 'make_playoffs', 'team_id', 'team_id_season', 'Team']
new_columns = dict(
    (col, f'opp_{col}')
    for col in result.columns
    if col not in columns_to_exclude
)
result = result.rename(columns=new_columns)

# Keep the frame under its dataset name and write it without the index.
RS_Opp_Per_Game_Team = result
RS_Opp_Per_Game_Team.to_csv('RS_Opp_Per_Game_Team.csv', index=False)

Scraping: [##################################################] 100.00%

### RS | Advanced | Team

In [22]:


def scrape_season(season):
    """Download one season's regular-season advanced team table.

    Parameters
    ----------
    season : str
        Season identifier inserted into the league page URL.

    Returns
    -------
    pandas.DataFrame or None
        The parsed ``advanced-team`` table with a ``Season`` column added,
        or ``None`` (after printing a message) when the request fails, the
        status is not 200, or the table is not in the page.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{season}.html'

    try:
        try:
            # A timeout keeps one stalled connection from hanging the whole scrape.
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            print(f"Failed to retrieve data for {season}")
            return None

        if response.status_code != 200:
            print(f"Failed to retrieve data for {season}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'advanced-team'})

        if table is None:
            print(f"No table found for {season}")
            return None

        df = pd.read_html(StringIO(str(table)))[0]
        df['Season'] = season
        return df
    finally:
        # Pause after every request, including failed ones, so an error response
        # is never followed immediately by the next hit on the site.
        time.sleep(4)

def scrape_all_seasons(seasons_list):
    """Scrape each season with ``scrape_season`` and stack the results.

    Parameters
    ----------
    seasons_list : iterable of str
        Season identifiers, passed one at a time to ``scrape_season``.

    Returns
    -------
    pandas.DataFrame
        All non-``None`` season frames concatenated with a fresh index, each
        with its ``Season`` column set to the season it was scraped for.
        Empty when no season produced data.
    """
    frames = []

    for season in seasons_list:
        data = scrape_season(season)

        if data is not None:
            data['Season'] = season

            # Collect now, concatenate once below (avoids re-copying a growing frame).
            frames.append(data)

            print(f"Scraping data for season {season}")

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Scrape every season in the half-open year range [start, end).
seasons_list = list(map(str, range(start, end)))

# Keep the raw scrape under both names and write it without the index.
RS_Advanced_Team = result = scrape_all_seasons(seasons_list)
RS_Advanced_Team.to_csv('RS_Advanced_Team.csv', index=False)

Scraping data for season 1950
Scraping data for season 1951
Scraping data for season 1952
Scraping data for season 1953
Scraping data for season 1954
Scraping data for season 1955
Scraping data for season 1956
Scraping data for season 1957
Scraping data for season 1958
Scraping data for season 1959
Scraping data for season 1960
Scraping data for season 1961
Scraping data for season 1962
Scraping data for season 1963
Scraping data for season 1964
Scraping data for season 1965
Scraping data for season 1966
Scraping data for season 1967
Scraping data for season 1968
Scraping data for season 1969
Scraping data for season 1970
Scraping data for season 1971
Scraping data for season 1972
Scraping data for season 1973
Scraping data for season 1974
Scraping data for season 1975
Scraping data for season 1976
Scraping data for season 1977
Scraping data for season 1978
Scraping data for season 1979
Scraping data for season 1980
Scraping data for season 1981
Scraping data for season 1982
Scraping d

In [25]:
#cleaning rs advanced team data

# NOTE(review): this cell reads the raw scrape and then overwrites the same file,
# so a second run would process already-cleaned data.
RS_Advanced_Team = pd.read_csv("RS_Advanced_Team.csv")

# The first data row holds the real column names; promote it to the header.
RS_Advanced_Team.columns = RS_Advanced_Team.iloc[0]
RS_Advanced_Team = RS_Advanced_Team[1:]
RS_Advanced_Team.columns.name = None
RS_Advanced_Team.columns = RS_Advanced_Team.columns.astype(str)

RS_Advanced_Team = RS_Advanced_Team.loc[:, ~RS_Advanced_Team.columns.str.contains('^Unnamed')]

# .copy() so the column assignments below write to an independent frame, not a filtered slice.
RS_Advanced_Team = RS_Advanced_Team[RS_Advanced_Team['Team'] != 'League Average'].copy()

# '*' is matched literally; a missing team name counts as 0 instead of raising.
RS_Advanced_Team['make_playoffs'] = (
    RS_Advanced_Team['Team'].str.contains('*', regex=False, na=False).astype(int)
)

# The season values sit in the column right after 'Attend./G'; copy them into a
# properly named 'season' column and drop the original.
season_position = RS_Advanced_Team.columns.get_loc('Attend./G') + 1
season_label = RS_Advanced_Team.columns[season_position]
RS_Advanced_Team['season'] = RS_Advanced_Team.iloc[:, season_position].astype(str)
RS_Advanced_Team = RS_Advanced_Team.drop(columns=season_label)

RS_Advanced_Team['Rk'] = RS_Advanced_Team['Rk'].astype(float).round().astype(int).astype(str)
RS_Advanced_Team['season'] = RS_Advanced_Team['season'].astype(float).round().astype(int).astype(str)

# regex=False: '*' is a regex quantifier, so the literal intent is stated explicitly.
RS_Advanced_Team['Team'] = RS_Advanced_Team['Team'].str.replace('*', '', regex=False)
RS_Advanced_Team['overall_record'] = RS_Advanced_Team['W'].astype(float) / (RS_Advanced_Team['W'].astype(float) + RS_Advanced_Team['L'].astype(float))

# NOTE(review): the default rank method averages ties (e.g. 1.5), which the int
# cast below truncates; confirm that is the intended tie handling.
RS_Advanced_Team['rk_season'] = RS_Advanced_Team.groupby('season')['overall_record'].rank(ascending=False)

RS_Advanced_Team['rk_season'] = RS_Advanced_Team['rk_season'].fillna(0).astype(int)

# Rename by position through a fresh list of labels. An Index is immutable, and
# writing into `columns.values` bypasses that and can leave lookups stale.
# Positions 17-20 / 21-24 are presumably the offensive / defensive four-factor
# columns -- verify against the scraped layout.
renamed_columns = list(RS_Advanced_Team.columns)
for idx in [17, 18, 19, 20]:
    renamed_columns[idx] = 'offensive_' + renamed_columns[idx]

for idx in [21, 22, 23, 24]:
    renamed_columns[idx] = 'defensive_' + renamed_columns[idx]

RS_Advanced_Team.columns = renamed_columns

RS_Advanced_Team.to_csv('RS_Advanced_Team.csv', index=False)

### PO | Advanced | Team

In [26]:
def scrape_season(season):
    """Download one season's playoff advanced team table.

    Parameters
    ----------
    season : str
        Season identifier inserted into the playoffs page URL.

    Returns
    -------
    pandas.DataFrame or None
        The parsed ``advanced-team`` table with a ``Season`` column added,
        or ``None`` (after printing a message) when the request fails, the
        status is not 200, or the table is not in the page.
    """
    url = f'https://www.basketball-reference.com/playoffs/NBA_{season}.html'

    try:
        try:
            # A timeout keeps one stalled connection from hanging the whole scrape.
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            print(f"Failed to retrieve data for {season}")
            return None

        if response.status_code != 200:
            print(f"Failed to retrieve data for {season}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'advanced-team'})

        if table is None:
            print(f"No table found for {season}")
            return None

        df = pd.read_html(StringIO(str(table)))[0]
        df['Season'] = season
        return df
    finally:
        # Pause after every request, including failed ones, so an error response
        # is never followed immediately by the next hit on the site.
        time.sleep(4)

def scrape_all_seasons(seasons_list):
    """Scrape each season with ``scrape_season`` and stack the results.

    Parameters
    ----------
    seasons_list : iterable of str
        Season identifiers, passed one at a time to ``scrape_season``.

    Returns
    -------
    pandas.DataFrame
        All non-``None`` season frames concatenated with a fresh index, each
        with its ``Season`` column set to the season it was scraped for.
        Empty when no season produced data.
    """
    frames = []

    for season in seasons_list:
        data = scrape_season(season)

        if data is not None:
            data['Season'] = season

            # Collect now, concatenate once below (avoids re-copying a growing frame).
            frames.append(data)

            print(f"Scraping data for season {season}")

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Scrape every season in the half-open year range [start, end).
seasons_list = list(map(str, range(start, end)))

# Keep the raw scrape under both names and write it without the index.
PO_Advanced_Team = result = scrape_all_seasons(seasons_list)
PO_Advanced_Team.to_csv('PO_Advanced_Team.csv', index=False)

Scraping data for season 1950
Scraping data for season 1951
Scraping data for season 1952
Scraping data for season 1953
Scraping data for season 1954
Scraping data for season 1955
Scraping data for season 1956
Scraping data for season 1957
Scraping data for season 1958
Scraping data for season 1959
Scraping data for season 1960
Scraping data for season 1961
Scraping data for season 1962
Scraping data for season 1963
Scraping data for season 1964
Scraping data for season 1965
Scraping data for season 1966
Scraping data for season 1967
Scraping data for season 1968
Scraping data for season 1969
Scraping data for season 1970
Scraping data for season 1971
Scraping data for season 1972
Scraping data for season 1973
Scraping data for season 1974
Scraping data for season 1975
Scraping data for season 1976
Scraping data for season 1977
Scraping data for season 1978
Scraping data for season 1979
Scraping data for season 1980
Scraping data for season 1981
Scraping data for season 1982
Scraping d

In [33]:
#cleaning po advanced team data

# NOTE(review): this cell reads the raw scrape and then overwrites the same file,
# so a second run would process already-cleaned data.
PO_Advanced_Team = pd.read_csv("PO_Advanced_Team.csv")

# The first data row holds the real column names; promote it to the header.
PO_Advanced_Team.columns = PO_Advanced_Team.iloc[0]
PO_Advanced_Team = PO_Advanced_Team[1:]

PO_Advanced_Team.columns.name = None

PO_Advanced_Team.columns = PO_Advanced_Team.columns.astype(str)

PO_Advanced_Team = PO_Advanced_Team[PO_Advanced_Team['Tm'] != 'League Average']

# .copy() so the column assignments below write to an independent frame, not a filtered slice.
PO_Advanced_Team = PO_Advanced_Team.loc[:, ~PO_Advanced_Team.columns.str.contains('^Unnamed')].copy()

# The season column's header cell was empty, so its label became the string 'nan'.
PO_Advanced_Team = PO_Advanced_Team.rename(columns={'nan': 'season'})

PO_Advanced_Team['Rk'] = PO_Advanced_Team['Rk'].fillna(-1).astype(float).round().astype(int).astype(str)

PO_Advanced_Team['season'] = PO_Advanced_Team['season'].astype(float).round().astype(int).astype(str)

PO_Advanced_Team['W'] = PO_Advanced_Team['W'].astype(float)

# Wins as a fraction of the most wins by any team in the same season.
PO_Advanced_Team['champion_share'] = PO_Advanced_Team['W'] / PO_Advanced_Team.groupby('season')['W'].transform('max')

# Rename by position through a fresh list of labels. An Index is immutable, and
# writing into `columns.values` bypasses that and can leave lookups stale.
# Positions 15-18 / 19-22 are presumably the offensive / defensive four-factor
# columns -- verify against the scraped layout.
renamed_columns = list(PO_Advanced_Team.columns)
for idx in [15, 16, 17, 18]:
    renamed_columns[idx] = 'offensive_' + renamed_columns[idx]

for idx in [19, 20, 21, 22]:
    renamed_columns[idx] = 'defensive_' + renamed_columns[idx]

PO_Advanced_Team.columns = renamed_columns

PO_Advanced_Team.to_csv('PO_Advanced_Team.csv', index=False)


### RS | Schedule/Results | Team

In [35]:


def scrape_season(season):
    """Collect the monthly schedule tables of one season into a single frame.

    Each month from october through july is requested in turn, with a 4 second
    pause after every request. Months whose page does not return 200 or has no
    ``schedule`` table are reported on stdout and skipped.

    Returns a DataFrame of all found rows with a ``Season`` column (empty when
    no month produced a table).
    """
    month_names = ['october', 'november', 'december', 'january', 'february', 'march', 'april', 'may', 'june', 'july']
    season_games = pd.DataFrame()

    for month in month_names:
        page = requests.get(f'https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html')

        if page.status_code != 200:
            print(f"Failed to retrieve data for {season} in {month}")
        else:
            schedule = BeautifulSoup(page.content, 'html.parser').find('table', {'id': 'schedule'})

            if not schedule:
                print(f"No table found for {season} in {month}")
            else:
                month_games = pd.read_html(StringIO(str(schedule)))[0]
                month_games['Season'] = season
                season_games = pd.concat([season_games, month_games], ignore_index=True)

        time.sleep(4)

    return season_games

def scrape_all_seasons():
    """Scrape every season in ``range(start, end)`` and stack the results.

    ``start`` and ``end`` are read from the notebook's global scope. Each
    season is passed to ``scrape_season`` as a string; the returned frame has
    its ``Season`` column overwritten with the integer season.

    Returns
    -------
    pandas.DataFrame
        All season frames concatenated with a fresh index (empty when the
        range is empty).
    """
    frames = []

    for season in range(start, end):
        data = scrape_season(str(season))

        if data is not None:
            data['Season'] = season

            # Collect now, concatenate once below (avoids re-copying a growing frame).
            frames.append(data)

            print(f"Scraping data for season {season}")

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Run the scrape, keep the frame under both names, and write it without the index.
RS_Schedule_Team = result = scrape_all_seasons()
RS_Schedule_Team.to_csv('RS_Schedule_Team.csv', index=False)

Failed to retrieve data for 1950 in may
Failed to retrieve data for 1950 in june
Failed to retrieve data for 1950 in july
Scraping data for season 1950
Failed to retrieve data for 1951 in may
Failed to retrieve data for 1951 in june
Failed to retrieve data for 1951 in july
Scraping data for season 1951
Failed to retrieve data for 1952 in october
Failed to retrieve data for 1952 in may
Failed to retrieve data for 1952 in june
Failed to retrieve data for 1952 in july
Scraping data for season 1952
Failed to retrieve data for 1953 in may
Failed to retrieve data for 1953 in june
Failed to retrieve data for 1953 in july
Scraping data for season 1953
Failed to retrieve data for 1954 in may
Failed to retrieve data for 1954 in june
Failed to retrieve data for 1954 in july
Scraping data for season 1954
Failed to retrieve data for 1955 in may
Failed to retrieve data for 1955 in june
Failed to retrieve data for 1955 in july
Scraping data for season 1955
Failed to retrieve data for 1956 in october


### PO | Schedule/Results | Team

In [38]:


def scrape_season(season):
    """Fetch the playoff schedule table of one season.

    A failed request (status other than 200) or a page without a ``schedule``
    table is reported on stdout. The function always pauses 4 seconds before
    returning.

    Returns a DataFrame with the table rows plus a ``Season`` column, or an
    empty DataFrame when nothing was found.
    """
    playoff_games = pd.DataFrame()
    page = requests.get(f'https://www.basketball-reference.com/playoffs/NBA_{season}_games.html')

    if page.status_code != 200:
        print(f"Failed to retrieve data for {season}")
    else:
        schedule = BeautifulSoup(page.content, 'html.parser').find('table', {'id': 'schedule'})

        if not schedule:
            print(f"No table found for {season}")
        else:
            parsed = pd.read_html(StringIO(str(schedule)))[0]
            parsed['Season'] = season
            playoff_games = pd.concat([playoff_games, parsed], ignore_index=True)

    time.sleep(4)

    return playoff_games

def scrape_all_seasons():
    """Scrape every season in ``range(start, end)`` and stack the results.

    ``start`` and ``end`` are read from the notebook's global scope. Each
    season is passed to ``scrape_season`` as a string; the returned frame has
    its ``Season`` column overwritten with the integer season.

    Returns
    -------
    pandas.DataFrame
        All season frames concatenated with a fresh index (empty when the
        range is empty).
    """
    frames = []

    for season in range(start, end):
        data = scrape_season(str(season))

        if data is not None:
            data['Season'] = season

            # Collect now, concatenate once below (avoids re-copying a growing frame).
            frames.append(data)

            print(f"Scraping data for season {season}")

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Run the scrape, keep the frame under both names, and write it without the index.
PO_Schedule_Team = result = scrape_all_seasons()
PO_Schedule_Team.to_csv('PO_Schedule_Team.csv', index=False)

Scraping data for season 1950
Scraping data for season 1951
Scraping data for season 1952
Scraping data for season 1953
Scraping data for season 1954
Scraping data for season 1955
Scraping data for season 1956
Scraping data for season 1957
Scraping data for season 1958
Scraping data for season 1959
Scraping data for season 1960
Scraping data for season 1961
Scraping data for season 1962
Scraping data for season 1963
Scraping data for season 1964
Scraping data for season 1965
Scraping data for season 1966
Scraping data for season 1967
Scraping data for season 1968
Scraping data for season 1969
Scraping data for season 1970
Scraping data for season 1971
Scraping data for season 1972
Scraping data for season 1973
Scraping data for season 1974
Scraping data for season 1975
Scraping data for season 1976
Scraping data for season 1977
Scraping data for season 1978
Scraping data for season 1979
Scraping data for season 1980
Scraping data for season 1981
Scraping data for season 1982
Scraping d

In [39]:

# Column dtypes enforced when re-reading either raw schedule CSV.
# 'Unnamed: 7' is a placeholder label; replace it with the real column name if known.
dtype = {
    **dict.fromkeys(
        ['Date', 'Start (ET)', 'Visitor/Neutral', 'Home/Neutral', 'Unnamed: 7', 'LOG', 'Notes'],
        str,
    ),
    **dict.fromkeys(['PTS', 'PTS.1', 'Attend.'], float),
}

# Columns removed from both schedules (missing ones are ignored).
columns_to_drop = ['Notes', 'Arena', 'LOG', 'Attend.']


def _clean_schedule(csv_path, exclude_play_in):
    """Load a raw schedule CSV and apply the cleaning shared by both schedules.

    Adds 'matchup' (visitor_home) and 'game_id' (date_matchup), removes the
    'Unnamed*' columns, optionally removes rows whose Notes mention
    'Play-In Game', drops `columns_to_drop`, and renames the two points columns.
    """
    games = pd.read_csv(csv_path, dtype=dtype)
    games['matchup'] = games['Visitor/Neutral'] + '_' + games['Home/Neutral']
    games['game_id'] = games['Date'] + '_' + games['matchup']
    games = games.loc[:, ~games.columns.str.contains('^Unnamed')]
    if exclude_play_in:
        games = games[~games['Notes'].str.contains('Play-In Game', na=False)]
    games = games.drop(columns=columns_to_drop, errors='ignore')
    return games.rename(columns={'PTS': 'v/n_pts', 'PTS.1': 'h/n_pts'})


# Play-in games are not counted in either schedule: they are filtered out of
# the regular-season file by their Notes text.
RS_Schedule_Team = _clean_schedule("RS_Schedule_Team.csv", exclude_play_in=True)
PO_Schedule_Team = _clean_schedule("PO_Schedule_Team.csv", exclude_play_in=False)

# Any game that also appears in the playoff schedule is removed from the regular season.
RS_Schedule_Team = RS_Schedule_Team[~RS_Schedule_Team['game_id'].isin(PO_Schedule_Team['game_id'])]

RS_Schedule_Team.to_csv('RS_Schedule_Team.csv', index=False)
PO_Schedule_Team.to_csv('PO_Schedule_Team.csv', index=False)

  RS_Schedule_Team = pd.read_csv("RS_Schedule_Team.csv")


### Other | Pre-Season Odds | Team

In [8]:
def scrape_season(season):
    """Fetch the pre-season odds table of one season.

    A failed request (status other than 200) or a page without a
    ``NBA_preseason_odds`` table is reported on stdout. The function always
    pauses 4 seconds before returning.

    Returns a DataFrame with the table rows plus a ``Season`` column, or an
    empty DataFrame when nothing was found.
    """
    odds = pd.DataFrame()
    page = requests.get(f'https://www.basketball-reference.com/leagues/NBA_{season}_preseason_odds.html')

    if page.status_code != 200:
        print(f"Failed to retrieve data for {season}")
    else:
        odds_table = BeautifulSoup(page.content, 'html.parser').find('table', {'id': 'NBA_preseason_odds'})

        if not odds_table:
            print(f"No table found for {season}")
        else:
            parsed = pd.read_html(StringIO(str(odds_table)))[0]
            parsed['Season'] = season
            odds = pd.concat([odds, parsed], ignore_index=True)

    time.sleep(4)

    return odds

def scrape_all_seasons(seasons_list):
    """Scrape each season with ``scrape_season`` and stack the results.

    Parameters
    ----------
    seasons_list : iterable of str
        Season identifiers, passed one at a time to ``scrape_season``.

    Returns
    -------
    pandas.DataFrame
        All non-``None`` season frames concatenated with a fresh index, each
        with its ``Season`` column set to the season it was scraped for.
        Empty when ``seasons_list`` is empty.
    """
    frames = []

    for season in seasons_list:
        data = scrape_season(season)

        if data is not None:
            data['Season'] = season

            # Collect now, concatenate once below (avoids re-copying a growing frame).
            frames.append(data)

            print(f"Scraping data for season {season}")

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Scrape every season in the half-open year range [start, end).
seasons_list = list(map(str, range(start, end)))

# Keep the raw scrape under both names and write it without the index.
PSO_Team = result = scrape_all_seasons(seasons_list)
PSO_Team.to_csv('PSO_Team.csv', index=False)

Failed to retrieve data for 1950
Scraping data for season 1950
Failed to retrieve data for 1951
Scraping data for season 1951
Failed to retrieve data for 1952
Scraping data for season 1952
Failed to retrieve data for 1953
Scraping data for season 1953
Failed to retrieve data for 1954
Scraping data for season 1954
Failed to retrieve data for 1955
Scraping data for season 1955
Failed to retrieve data for 1956
Scraping data for season 1956
Failed to retrieve data for 1957
Scraping data for season 1957
Failed to retrieve data for 1958
Scraping data for season 1958
Failed to retrieve data for 1959
Scraping data for season 1959
Failed to retrieve data for 1960
Scraping data for season 1960
Failed to retrieve data for 1961
Scraping data for season 1961
Failed to retrieve data for 1962
Scraping data for season 1962
Failed to retrieve data for 1963
Scraping data for season 1963
Failed to retrieve data for 1964
Scraping data for season 1964
Failed to retrieve data for 1965
Scraping data for seas

In [11]:
#cleaning pso team data

PSO_Team = pd.read_csv("PSO_Team.csv")

# Keep only the columns whose label does not start with 'Unnamed'.
named_columns = ~PSO_Team.columns.str.contains('^Unnamed')
PSO_Team = PSO_Team.loc[:, named_columns]

# 1 when the Result text contains "(over)", otherwise 0 (missing Result counts as 0).
went_over = PSO_Team['Result'].str.contains(r'\(over\)', na=False)
PSO_Team['overall_record_o/u'] = went_over.astype(int)

PSO_Team = PSO_Team.rename(columns={'W-L O/U': 'projected_overall_record'})

def calculate_overall_record(result):
    """Turn a result string such as "50-32 (over)" into a win fraction.

    Only the text before the first space is used, and it must look like
    "<wins>-<losses>". Returns ``None`` for missing input or for anything
    that cannot be parsed or divided.
    """
    if pd.isna(result):
        return None

    try:
        win_loss = result.split(' ')[0]  # the "W-L" part
        wins, losses = (int(part) for part in win_loss.split('-'))
        fraction = wins / (wins + losses)
    except Exception:
        return None

    return fraction

# Win fraction parsed from Result, expressed as a percentage rounded to one decimal.
PSO_Team['overall_record'] = (
    PSO_Team['Result'].apply(calculate_overall_record) * 100
).round(1)

# Result has been fully decomposed, so it is dropped before saving.
PSO_Team = PSO_Team.drop(columns=['Result'])
PSO_Team.to_csv('PSO_Team.csv', index=False)

### RS | Expanded Standings | Team

In [19]:

def scrape_season(season):
    """Download one season's expanded standings table.

    The table is looked for inside the page's HTML comments: each comment
    mentioning 'expanded_standings' is parsed on its own until one yields the
    table.

    Parameters
    ----------
    season : str
        Season identifier inserted into the standings page URL.

    Returns
    -------
    pandas.DataFrame or None
        The parsed ``expanded_standings`` table with a ``Season`` column added,
        or ``None`` (after printing a message) when the request fails, the
        status is not 200, or no comment contains the table.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{season}_standings.html#all_expanded_standings'

    try:
        try:
            # A timeout keeps one stalled connection from hanging the whole scrape.
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            print(f"Failed to retrieve data for {season}")
            return None

        if response.status_code != 200:
            print(f"Failed to retrieve data for {season}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        comments = soup.find_all(string=lambda text: isinstance(text, Comment))

        for comment in comments:
            if 'expanded_standings' not in comment:
                continue

            comment_soup = BeautifulSoup(comment, 'html.parser')
            table = comment_soup.find('table', {'id': 'expanded_standings'})

            if table:
                df = pd.read_html(StringIO(str(table)))[0]
                df['Season'] = season
                return df

        print(f"No table found for {season}")
        return None
    finally:
        # Pause after every request, including failed ones, so an error response
        # is never followed immediately by the next hit on the site.
        time.sleep(4)

def scrape_all_seasons():
    """Scrape every season in ``range(start, end)`` and stack the results.

    ``start`` and ``end`` are read from the notebook's global scope. Each
    season is passed to ``scrape_season`` as a string; a returned frame has its
    ``Season`` column overwritten with the integer season. Seasons for which
    ``scrape_season`` returns ``None`` are skipped.

    Returns
    -------
    pandas.DataFrame
        All season frames concatenated with a fresh index (empty when no
        season produced data).
    """
    frames = []

    for season in range(start, end):
        data = scrape_season(str(season))

        if data is not None:
            data['Season'] = season

            # Collect now, concatenate once below (avoids re-copying a growing frame).
            frames.append(data)

            print(f"Scraping data for season {season}")

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Run the scrape, keep the frame under both names, and write it without the index.
RS_exp_stand_Team = result = scrape_all_seasons()
RS_exp_stand_Team.to_csv('RS_exp_stand_Team.csv', index=False)

Scraping data for season 1950
Scraping data for season 1951
Scraping data for season 1952
Scraping data for season 1953
Scraping data for season 1954
Scraping data for season 1955
Scraping data for season 1956
Scraping data for season 1957
Scraping data for season 1958
Scraping data for season 1959
Scraping data for season 1960
Scraping data for season 1961
Scraping data for season 1962
Scraping data for season 1963
Scraping data for season 1964
Scraping data for season 1965
Scraping data for season 1966
Scraping data for season 1967
Scraping data for season 1968
Scraping data for season 1969
Scraping data for season 1970
Scraping data for season 1971
Scraping data for season 1972
Scraping data for season 1973
Scraping data for season 1974
Scraping data for season 1975
Scraping data for season 1976
Scraping data for season 1977
Scraping data for season 1978
Scraping data for season 1979
Scraping data for season 1980
Scraping data for season 1981
Scraping data for season 1982
Scraping d

In [36]:
#clean expanded standings data

RS_exp_stand_Team = pd.read_csv("RS_exp_stand_Team.csv")

# The first data row holds the real column names; promote it to the header
# and renumber the remaining rows from 0.
RS_exp_stand_Team.columns = RS_exp_stand_Team.iloc[0]
RS_exp_stand_Team = RS_exp_stand_Team[1:].reset_index(drop=True)

new_column_headers = [
    'Rk', 'Team', 'overall_record', 'home_rec', 'road_rec', 'neu_rec', 'cen_div_rec', 'e_div_rec',
    'w_div_rec', '3pt_or_less_rec', '10pt_or_more_rec', 'oct_mon_rec', 'nov_mon_rec', 'dec_mon_rec',
    'jan_mon_rec', 'feb_mon_rec', 'mar_mon_rec', 'season', 'pre_all_star_rec', 'post_all_star_rec',
    'e_conf_rec', 'w_conf_rec', 'atl_div_rec', 'midw_div_rec', 'pac_div_rec', 'apr_mon_rec',
    'may_mon_rec', 'se_div_rec', 'nw_div_rec', 'sw_div_rec', 'jul_mon_rec', 'aug_mon_rec'
]

# Assigning a new label list only works when the counts match exactly; with
# more columns than headers the assignment itself would raise.
if len(RS_exp_stand_Team.columns) == len(new_column_headers):
    RS_exp_stand_Team.columns = new_column_headers
elif len(RS_exp_stand_Team.columns) < len(new_column_headers):
    print("The DataFrame does not have enough columns to rename according to the provided headers.")
else:
    print(
        f"The DataFrame has {len(RS_exp_stand_Team.columns)} columns but only "
        f"{len(new_column_headers)} headers were provided; columns were not renamed."
    )

def calculate_winning_percentage(record):
    """Convert a "wins-losses" string into a win fraction.

    Parameters
    ----------
    record : str
        Text of the form "<wins>-<losses>", e.g. "41-41".

    Returns
    -------
    float or None
        wins / (wins + losses), or ``None`` when the value is not a string,
        does not split into exactly two integers, or has zero games.
    """
    try:
        wins, losses = map(int, record.split('-'))
        return wins / (wins + losses)
    except (AttributeError, TypeError, ValueError, ZeroDivisionError):
        # Only parse/arithmetic failures mean "no record"; a bare `except`
        # would also swallow KeyboardInterrupt and SystemExit.
        return None

# Every column other than the identifiers holds "W-L" text; convert each to a win fraction.
identifier_columns = ('Rk', 'Team', 'season')
for column in RS_exp_stand_Team.columns:
    if column in identifier_columns:
        continue
    RS_exp_stand_Team[column] = RS_exp_stand_Team[column].apply(calculate_winning_percentage)

RS_exp_stand_Team.to_csv('RS_exp_stand_Team.csv', index=False)

### Season | Coaches

In [39]:


def scrape_season(season):
    """Fetch the coaches table of one season.

    A failed request (status other than 200) or a page without a
    ``NBA_coaches`` table is reported on stdout. The function always pauses
    4 seconds before returning.

    Returns a DataFrame with the table rows plus a ``Season`` column, or an
    empty DataFrame when nothing was found.
    """
    coaches = pd.DataFrame()
    page = requests.get(f'https://www.basketball-reference.com/leagues/NBA_{season}_coaches.html')

    if page.status_code != 200:
        print(f"Failed to retrieve data for {season}")
    else:
        coaches_table = BeautifulSoup(page.content, 'html.parser').find('table', {'id': 'NBA_coaches'})

        if not coaches_table:
            print(f"No table found for {season}")
        else:
            parsed = pd.read_html(io.StringIO(str(coaches_table)))[0]
            parsed['Season'] = season
            coaches = pd.concat([coaches, parsed], ignore_index=True)

    time.sleep(4)

    return coaches

def scrape_all_seasons():
    """Scrape every season in ``range(start, end)`` and stack the results.

    ``start`` and ``end`` are read from the notebook's global scope. Each
    season is passed to ``scrape_season`` as a string; the returned frame has
    its ``Season`` column overwritten with the integer season.

    Returns
    -------
    pandas.DataFrame
        All season frames concatenated with a fresh index (empty when the
        range is empty).
    """
    frames = []

    for season in range(start, end):
        data = scrape_season(str(season))

        if data is not None:
            data['Season'] = season
            # Collect now, concatenate once below (avoids re-copying a growing frame).
            frames.append(data)

            print(f"Scraping data for season {season}")

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Run the scrape, keep the frame under both names, and write it without the index.
coaches_season = result = scrape_all_seasons()
coaches_season.to_csv('coaches_season.csv', index=False)

Scraping data for season 1950
Scraping data for season 1951
Scraping data for season 1952
Scraping data for season 1953
Scraping data for season 1954
Scraping data for season 1955
Scraping data for season 1956
Scraping data for season 1957
Scraping data for season 1958
Scraping data for season 1959
Scraping data for season 1960
Scraping data for season 1961
Scraping data for season 1962
Scraping data for season 1963
Scraping data for season 1964
Scraping data for season 1965
Scraping data for season 1966
Scraping data for season 1967
Scraping data for season 1968
Scraping data for season 1969
Scraping data for season 1970
Scraping data for season 1971
Scraping data for season 1972
Scraping data for season 1973
Scraping data for season 1974
Scraping data for season 1975
Scraping data for season 1976
Scraping data for season 1977
Scraping data for season 1978
Scraping data for season 1979
Scraping data for season 1980
Scraping data for season 1981
Scraping data for season 1982
Scraping d

In [50]:
#clean coach data

# Re-load the raw scrape, skip its first two rows and renumber the rest from 0.
Coaches = pd.read_csv("coaches_season.csv").iloc[2:].reset_index(drop=True)

# Columns at these positions are removed before the new names are applied.
columns_to_drop = [2, 5, 16]
Coaches = Coaches.drop(Coaches.columns[columns_to_drop], axis=1)

# One name per remaining column, in order.
new_column_headers = [
    'Coach', 'Tm', 'sea_w/franc_count', 'sea_overall_count',
    'rs_curr_sea_G', 'rs_curr_sea_W', 'rs_curr_sea_L',
    'rs_w/franc_G', 'rs_w/franc_W', 'rs_w/franc_L',
    'rs_career_G', 'rs_career_W', 'rs_career_L', 'rs_career_W%',
    'po_curr_sea_G', 'po_curr_sea_W', 'po_curr_sea_L',
    'po_w/franc_G', 'po_w/franc_W', 'po_w/franc_L',
    'po_career_G', 'po_career_W', 'po_career_L',
    'season',
]
Coaches.columns = new_column_headers

# Season is stored as a whole number.
season_as_float = Coaches['season'].astype(float)
Coaches['season'] = season_as_float.round().astype(int)

Coaches.to_csv('coaches_season.csv', index=False)

### Team Rosters | Seasons

In [74]:
# The roster scraper loops over team abbreviations, so list the distinct
# 'Tm' values found in the player per-game data.
RS_Per_Game_Player = pd.read_csv("RS_Per_Game_Player.csv")
unique_teams = RS_Per_Game_Player.loc[:, 'Tm'].unique()
print(unique_teams)

['TOT' 'TOR' 'NYK' 'MIA' 'UTA' 'MEM' 'MIN' 'PHO' 'CLE' 'NOP' 'MIL' 'ORL'
 'WAS' 'POR' 'DET' 'CHO' 'PHI' 'BOS' 'SAS' 'SAC' 'BRK' 'LAC' 'OKC' 'ATL'
 'CHI' 'DEN' 'HOU' 'IND' 'DAL' 'LAL' 'GSW' 'CHA' 'NOH' 'NJN' 'SEA' 'NOK'
 'CHH' 'VAN' 'WSB' 'KCK' 'SDC' 'NOJ' 'BUF' 'NYN' 'KCO' 'CAP' 'BAL' 'CIN'
 'SDR' 'SFW' 'STL' 'CHZ' 'SYR' 'PHW' 'CHP' 'MNL' 'ROC' 'FTW' 'MLH' 'BLB'
 'INO' 'WSC' 'TRI' 'CHS' 'DNN' 'AND' 'WAT' 'SHE' 'STB']


In [75]:

# Team abbreviations whose roster pages are scraped, in scraping order.
teams = (
    "TOR NYK MIA UTA MEM MIN PHO CLE NOP MIL ORL "
    "WAS POR DET CHO PHI BOS SAS SAC BRK LAC OKC ATL "
    "CHI DEN HOU IND DAL LAL GSW CHA NOH NJN SEA NOK "
    "CHH VAN WSB KCK SDC NOJ BUF NYN KCO CAP BAL CIN "
    "SDR SFW STL CHZ SYR PHW CHP MNL ROC FTW MLH BLB "
    "INO WSC TRI CHS DNN AND WAT SHE STB"
).split()

def scrape_season(team, season):
    """Fetch the roster table of one team for one season.

    A failed request (status other than 200) or a page without a ``roster``
    table is reported on stdout. The function always pauses 4 seconds before
    returning.

    Returns a DataFrame with the roster rows plus ``Team`` and ``Season``
    columns, or an empty DataFrame when nothing was found.
    """
    roster = pd.DataFrame()
    page = requests.get(f'https://www.basketball-reference.com/teams/{team}/{season}.html')

    if page.status_code != 200:
        print(f"Failed to retrieve data for {team} in {season}")
    else:
        roster_table = BeautifulSoup(page.content, 'html.parser').find('table', {'id': 'roster'})

        if not roster_table:
            print(f"No table found for {team} in {season}")
        else:
            parsed = pd.read_html(io.StringIO(str(roster_table)))[0]
            parsed['Team'] = team
            parsed['Season'] = season
            roster = pd.concat([roster, parsed], ignore_index=True)

    time.sleep(4)

    return roster

def scrape_all_seasons(start, end):
    """Scrape the roster of every team in ``teams`` for every season.

    ``teams`` is read from the notebook's global scope. Every (team, season)
    combination is requested, whether or not the team existed that season.

    Parameters
    ----------
    start : int
        First season to scrape.
    end : int
        Stop value of the season range; ``end`` itself is NOT scraped.

    Returns
    -------
    pandas.DataFrame
        All roster frames concatenated with a fresh index (empty when there
        was nothing to iterate over).
    """
    frames = []

    for team in teams:
        for season in range(start, end):  # half-open: pass end + 1 to include the end year
            data = scrape_season(team, str(season))

            if data is not None:
                # Collect now, concatenate once below; this loop runs
                # len(teams) * (end - start) times, so re-copying a growing
                # frame on every pass adds up.
                frames.append(data)

                print(f"Scraping data for {team} in season {season}")

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Run the scrape, keep the frame under both names, and write it without the index.
team_rosters_season = result = scrape_all_seasons(start, end)
team_rosters_season.to_csv('team_rosters_season.csv', index=False)

Failed to retrieve data for TOR in 1950
Scraping data for TOR in season 1950
Failed to retrieve data for TOR in 1951
Scraping data for TOR in season 1951
Failed to retrieve data for TOR in 1952
Scraping data for TOR in season 1952
Failed to retrieve data for TOR in 1953
Scraping data for TOR in season 1953
Failed to retrieve data for TOR in 1954
Scraping data for TOR in season 1954
Failed to retrieve data for TOR in 1955
Scraping data for TOR in season 1955
Failed to retrieve data for TOR in 1956
Scraping data for TOR in season 1956
Failed to retrieve data for TOR in 1957
Scraping data for TOR in season 1957
Failed to retrieve data for TOR in 1958
Scraping data for TOR in season 1958
Failed to retrieve data for TOR in 1959
Scraping data for TOR in season 1959
Failed to retrieve data for TOR in 1960
Scraping data for TOR in season 1960
Failed to retrieve data for TOR in 1961
Scraping data for TOR in season 1961
Failed to retrieve data for TOR in 1962
Scraping data for TOR in season 1962

### Custom | Player | Index

In [2]:
RS_Per_Game_Player = pd.read_csv("RS_Per_Game_Player.csv")

# One row per distinct 'Player' value, in order of first appearance.
player_names = RS_Per_Game_Player[['Player']]
unique_players = player_names.drop_duplicates().reset_index(drop=True)

# player_id counts up from 1 in that same order.
unique_players['player_id'] = unique_players.index + 1

unique_players.to_csv('custom_player_index.csv', index=False)

### Final cleaning

In [3]:
#set files (5), no cleaning needed
PO_Schedule_Team = pd.read_csv("PO_Schedule_Team.csv")
RS_Schedule_Team = pd.read_csv("RS_Schedule_Team.csv")
custom_team_season_index = pd.read_csv("custom_team_season_index.csv")
custom_team_franchise_index = pd.read_csv("custom_team_franchise_index.csv")
custom_player_index = pd.read_csv("custom_player_index.csv")

#clean up files
coaches_season = pd.read_csv("coaches_season.csv")
# The standings cell writes 'RS_exp_stand_Team.csv' (capital T). The file name
# must match exactly, or the read fails on case-sensitive filesystems.
RS_exp_stand_team = pd.read_csv("RS_exp_stand_Team.csv") #3
PSO_Team = pd.read_csv("PSO_Team.csv")

PO_Advanced_Team = pd.read_csv("PO_Advanced_Team.csv")
RS_Advanced_Team = pd.read_csv("RS_Advanced_Team.csv")
RS_Opp_Per_Game_Team = pd.read_csv("RS_Opp_Per_Game_Team.csv") #4
RS_Per_Game_Team = pd.read_csv("RS_Per_Game_Team.csv")

all_award_coach_voting = pd.read_csv("all_award_coach_voting.csv") #2
all_award_voting = pd.read_csv("all_award_voting.csv")

PO_Totals_Player = pd.read_csv("PO_Totals_Player.csv")
PO_Advanced_Player = pd.read_csv("PO_Advanced_Player.csv")#3
PO_Per_Game_Player = pd.read_csv("PO_Per_Game_Player.csv")

RS_Totals_Player = pd.read_csv("RS_Totals_Player.csv")
RS_Advanced_Player = pd.read_csv("RS_Advanced_Player.csv") #3
RS_Per_Game_Player = pd.read_csv("RS_Per_Game_Player.csv")

team_rosters_season = pd.read_csv("team_rosters_season.csv") #1

In [4]:
# Attach id columns from the custom index tables. Every join is a left join,
# so rows with no match in the index are kept and get a missing id.

# Two views of the franchise index: one keyed by 'Tm', one keyed by 'Team'.
_team_id_by_tm = custom_team_franchise_index[['Tm', 'team_id']]
_team_id_by_team = custom_team_franchise_index[['Team', 'team_id']]

# Coach / team tables -> team_id
coaches_season_df = coaches_season.merge(_team_id_by_tm, on='Tm', how='left')
RS_exp_stand_team_df = RS_exp_stand_team.merge(_team_id_by_team, on='Team', how='left')
PSO_Team_df = PSO_Team.merge(_team_id_by_team, on='Team', how='left')
all_award_coach_voting_df = all_award_coach_voting.merge(_team_id_by_tm, on='Tm', how='left')

# Rosters -> player_id (matched on the player's name)
team_rosters_season_df = team_rosters_season.merge(
    custom_player_index[['Player', 'player_id']],
    on='Player',
    how='left',
)

# Strip the "*" marker from player names. regex=False makes "*" a literal
# character: read as a regular expression a lone "*" is not a valid pattern,
# and the default value of `regex` is not the same in every pandas version.
all_award_voting = all_award_voting.assign(
    Player=all_award_voting['Player'].str.replace('*', '', regex=False)
)
# Drop repeated header rows, i.e. rows whose 'Player' value is the text "Player".
all_award_voting = all_award_voting[all_award_voting['Player'] != 'Player']
# Attach player_id. The roster table can hold several rows for the same player,
# so exact duplicate (Player, player_id) pairs are removed from the lookup first;
# without that the left join repeats each award row once per roster row.
_award_player_lookup = team_rosters_season_df[['Player', 'player_id']].drop_duplicates()
all_award_voting_df = pd.merge(all_award_voting, _award_player_lookup, on='Player', how='left')

# Re-attach player_id from the custom player index to every player stat table.
# Any player_id / player_id_season columns already in a table are removed first
# so that the merged player_id is the only one.
# errors='ignore' plus re-binding (rather than an in-place drop) keeps this step
# re-runnable: dropping a column that is already gone would otherwise raise
# KeyError, e.g. on a second run of the cell or when a file lacks one of the two.
_stale_player_id_cols = ['player_id', 'player_id_season']
_player_lookup = custom_player_index[['Player', 'player_id']]

# Playoff tables
PO_Totals_Player = PO_Totals_Player.drop(columns=_stale_player_id_cols, errors='ignore')
PO_Totals_Player_df = pd.merge(PO_Totals_Player, _player_lookup, on='Player', how='left')

PO_Advanced_Player = PO_Advanced_Player.drop(columns=_stale_player_id_cols, errors='ignore')
PO_Advanced_Player_df = pd.merge(PO_Advanced_Player, _player_lookup, on='Player', how='left')

PO_Per_Game_Player = PO_Per_Game_Player.drop(columns=_stale_player_id_cols, errors='ignore')
PO_Per_Game_Player_df = pd.merge(PO_Per_Game_Player, _player_lookup, on='Player', how='left')

# Regular-season tables
RS_Totals_Player = RS_Totals_Player.drop(columns=_stale_player_id_cols, errors='ignore')
RS_Totals_Player_df = pd.merge(RS_Totals_Player, _player_lookup, on='Player', how='left')

RS_Advanced_Player = RS_Advanced_Player.drop(columns=_stale_player_id_cols, errors='ignore')
RS_Advanced_Player_df = pd.merge(RS_Advanced_Player, _player_lookup, on='Player', how='left')

RS_Per_Game_Player = RS_Per_Game_Player.drop(columns=_stale_player_id_cols, errors='ignore')
RS_Per_Game_Player_df = pd.merge(RS_Per_Game_Player, _player_lookup, on='Player', how='left')

# Playoff advanced team stats: tidy the team label, attach team_id, flag champions.
# 'Team' falls back to 'Tm' where it is missing, 'Tm' is then removed, and rows
# labelled 'League Average' are excluded.
PO_Advanced_Team = (
    PO_Advanced_Team
    .assign(Team=PO_Advanced_Team['Team'].fillna(PO_Advanced_Team['Tm']))
    .drop(columns=['Tm'])
    .loc[lambda stats: stats['Team'] != 'League Average']
)

# Left join on 'Team' to bring in team_id from the franchise index.
PO_Advanced_Team_df = PO_Advanced_Team.merge(
    custom_team_franchise_index[['Team', 'team_id']],
    on='Team',
    how='left',
)

# team_id stored as whole numbers.
PO_Advanced_Team_df['team_id'] = PO_Advanced_Team_df['team_id'].round().astype(int)
# New feature 'champion': 1 where champion_share equals 1, otherwise 0.
PO_Advanced_Team_df['champion'] = PO_Advanced_Team_df['champion_share'].eq(1).astype('int64')


# Regular-season team tables: attach team_id from the franchise index, matched
# on 'Team' with a left join (unmatched teams keep a missing team_id).
_team_lookup = custom_team_franchise_index[['Team', 'team_id']]
_stale_team_id_cols = ['team_id', 'team_id_season']

# Advanced team stats are merged exactly as loaded.
# Disabled season filter left by the author (their note: remove it for future use):
#RS_Advanced_Team = RS_Advanced_Team[RS_Advanced_Team['season'] != 2025]
RS_Advanced_Team_df = pd.merge(RS_Advanced_Team, _team_lookup, on='Team', how='left')

# Opponent per-game and per-game stats: remove any team_id / team_id_season
# columns already present, then merge the id back in.
# errors='ignore' plus re-binding (rather than an in-place drop) keeps the step
# re-runnable; dropping an already-missing column would otherwise raise KeyError.
RS_Opp_Per_Game_Team = RS_Opp_Per_Game_Team.drop(columns=_stale_team_id_cols, errors='ignore')
RS_Opp_Per_Game_Team_df = pd.merge(RS_Opp_Per_Game_Team, _team_lookup, on='Team', how='left')

RS_Per_Game_Team = RS_Per_Game_Team.drop(columns=_stale_team_id_cols, errors='ignore')
RS_Per_Game_Team_df = pd.merge(RS_Per_Game_Team, _team_lookup, on='Team', how='left')

In [5]:
# Store id / rank columns as whole numbers: entries that cannot be parsed as a
# number become NaN, values are rounded, and missing values end up as 0.
for _frame, _column in (
    (PO_Per_Game_Player_df, 'player_id'),
    (RS_Opp_Per_Game_Team_df, 'Rk'),
    (PO_Totals_Player_df, 'player_id'),
    (PO_Advanced_Player_df, 'player_id'),
    (all_award_voting_df, 'player_id'),
    (team_rosters_season_df, 'player_id'),
):
    _rounded = pd.to_numeric(_frame[_column], errors='coerce').round(0)
    _frame[_column] = _rounded.fillna(0).astype(int)

# Copy 'Team' into a 'Tm' column on the roster table (kept "if needed").
team_rosters_season_df['Tm'] = team_rosters_season_df['Team']

In [6]:
def rename_season_column(df):
    """Make sure the season column is spelled 'season'.

    A column named 'Season' is renamed to 'season' in place. A frame that
    already uses 'season', or has neither spelling, is left as it is.

    Returns the same DataFrame object that was passed in.
    """
    if 'Season' not in df.columns:
        # Nothing to rename: either already 'season' or no season column at all.
        return df
    df.rename(columns={'Season': 'season'}, inplace=True)
    return df

# Registry of the cleaned tables whose season column is normalised, keyed by
# the name of the variable that holds each table.
dfs = dict(
    coaches_season_df=coaches_season_df,
    RS_exp_stand_team_df=RS_exp_stand_team_df,
    PSO_Team_df=PSO_Team_df,
    team_rosters_season_df=team_rosters_season_df,
    all_award_coach_voting_df=all_award_coach_voting_df,
    all_award_voting_df=all_award_voting_df,
    PO_Totals_Player_df=PO_Totals_Player_df,
    PO_Advanced_Player_df=PO_Advanced_Player_df,
    PO_Per_Game_Player_df=PO_Per_Game_Player_df,
    PO_Advanced_Team_df=PO_Advanced_Team_df,
    RS_Advanced_Team_df=RS_Advanced_Team_df,
    RS_Opp_Per_Game_Team_df=RS_Opp_Per_Game_Team_df,
    RS_Per_Game_Team_df=RS_Per_Game_Team_df,
    RS_Totals_Player_df=RS_Totals_Player_df,
    RS_Advanced_Player_df=RS_Advanced_Player_df,
    RS_Per_Game_Player_df=RS_Per_Game_Player_df,
)

# rename_season_column renames in place and hands back the same frame, so the
# variables listed above see the new column name as well as the dict entries.
for name in dfs:
    df = dfs[name]
    dfs[name] = rename_season_column(df)

In [7]:
def update_teams_and_filter(df, po_df):
    """Reduce a player stat table to one row per (Player, season) and label teams.

    Parameters
    ----------
    df : pandas.DataFrame
        Stat table with at least the columns 'Player', 'season', 'player_id',
        'Tm' and 'G'. It is not modified.
    po_df : pandas.DataFrame
        Table with at least 'player_id', 'season' and 'Tm'; its 'Tm' is used as
        the replacement team for rows whose own 'Tm' is 'TOT'.

    Returns
    -------
    pandas.DataFrame
        For every (Player, season) group, the row with the largest 'G' (the
        first such row on ties), ordered by the group keys. 'Tm' is renamed to
        'team_before_td' and a new last column 'team_after_td' holds:
        * the original 'Tm' when it is not 'TOT';
        * otherwise the matching 'Tm' from ``po_df``, or 'TOT' itself when
          ``po_df`` has no row for that player_id and season.
    """
    # Left-join the po_df team; the suffixes turn its 'Tm' into 'Tm_po' while
    # the left-hand 'Tm' keeps its name.
    merged_df = df.merge(
        po_df[['player_id', 'season', 'Tm']],
        on=['player_id', 'season'],
        how='left',
        suffixes=('', '_po'),
    )

    # po_df team where one was found, otherwise the row's own 'Tm'.
    team_update = merged_df['Tm_po'].combine_first(merged_df['Tm'])
    merged_df = merged_df.drop(columns=['Tm_po'])

    # One row per (Player, season): the one with the most games. The explicit
    # copy makes the selection an independent frame before a column is added.
    most_games_rows = merged_df.groupby(['Player', 'season'])['G'].idxmax()
    filtered_df = merged_df.loc[most_games_rows].copy()

    # Vectorized replacement for a row-wise apply: keep 'Tm' unless it is the
    # 'TOT' marker, in which case take the value from team_update.
    is_total_row = filtered_df['Tm'] == 'TOT'
    filtered_df['team_after_td'] = filtered_df['Tm'].where(
        ~is_total_row, team_update.loc[filtered_df.index]
    )

    return filtered_df.rename(columns={'Tm': 'team_before_td'})

# Collapse each regular-season player table to one row per player-season,
# always taking the replacement team from the playoff per-game table.
RS_Per_Game_Player_df, RS_Advanced_Player_df, RS_Totals_Player_df = [
    update_teams_and_filter(rs_table, PO_Per_Game_Player_df)
    for rs_table in (RS_Per_Game_Player_df, RS_Advanced_Player_df, RS_Totals_Player_df)
]

In [8]:
# Award columns that are averaged per player-season.
columns_to_average = [
    '1st_team', '2nd_team', '3rd_team',
    'mvp_share', 'dpoy_share', 'roy_share', 'smoy_share', 'mip_share', 'cpoy_share',
    'leading_all_nba_1st_team', 'leading_all_nba_2nd_team', 'leading_all_nba_3rd_team',
    'leading_all_defense_1st_team', 'leading_all_defense_2nd_team',
    'leading_all_rookie_1st_team', 'leading_all_rookie_2nd_team',
    'count_all_nba', 'count_all_defense', 'count_all_rookie',
    'won_mvp', 'won_roy', 'won_dpoy', 'won_smoy', 'won_mip', 'won_cpoy',
]

# One row per (Player, season) holding the mean of each listed column. Only the
# two grouping keys and the averaged columns appear in the result.
# NOTE(review): player_id is therefore not part of the aggregated table; confirm
# that is intended.
all_award_voting_df = (
    all_award_voting_df
    .groupby(['Player', 'season'])[columns_to_average]
    .mean()
    .reset_index()
)

# Every averaged column except the '*_share' ones is turned into a 0/1 flag.
columns_to_convert = [
    column for column in columns_to_average if not column.endswith('_share')
]

# A flag is 1 when the averaged value is greater than 0, otherwise 0.
for col in columns_to_convert:
    all_award_voting_df[col] = (all_award_voting_df[col] > 0).astype('int64')

In [9]:
# Final export: write each cleaned table to CSV (no index column, UTF-8 with
# BOM). The file names are the same ones the loading cell reads, so these
# writes replace those files.
_final_outputs = {
    # team / coach tables
    'coaches_season.csv': coaches_season_df,
    'RS_exp_stand_team.csv': RS_exp_stand_team_df,
    'PSO_Team.csv': PSO_Team_df,
    'team_rosters_season.csv': team_rosters_season_df,
    # award voting
    'all_award_coach_voting.csv': all_award_coach_voting_df,
    'all_award_voting.csv': all_award_voting_df,
    # playoff player stats
    'PO_Totals_Player.csv': PO_Totals_Player_df,
    'PO_Advanced_Player.csv': PO_Advanced_Player_df,
    'PO_Per_Game_Player.csv': PO_Per_Game_Player_df,
    # playoff team stats
    'PO_Advanced_Team.csv': PO_Advanced_Team_df,
    # regular-season team stats
    'RS_Advanced_Team.csv': RS_Advanced_Team_df,
    'RS_Opp_Per_Game_Team.csv': RS_Opp_Per_Game_Team_df,
    'RS_Per_Game_Team.csv': RS_Per_Game_Team_df,
    # regular-season player stats
    'RS_Totals_Player.csv': RS_Totals_Player_df,
    'RS_Advanced_Player.csv': RS_Advanced_Player_df,
    'RS_Per_Game_Player.csv': RS_Per_Game_Player_df,
}

for _csv_name, _table in _final_outputs.items():
    _table.to_csv(_csv_name, index=False, encoding="utf-8-sig")

# 16 files written