Installing required packages 

In [28]:
%pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


Importing required packages

In [29]:
import pandas as pd
from io import StringIO
import json
from sklearn.preprocessing import LabelEncoder


Creating LabelEncoder object for use within our main function

In [13]:
# Single shared encoder: generate_dataframe_by_year calls le.fit_transform on each
# categorical column, so it is re-fitted on every use rather than holding one fixed mapping.
le = LabelEncoder()

Creating two sets to make sure we only include **active players** in the final CSV

In [14]:
# Load the player records; only the values of the top-level JSON object are used.
with open("data/players.json") as players_file:
    players = json.load(players_file)

# Names and codes of the listed players, used later to keep only active players.
active_players_name = {record['Player'] for record in players.values()}
active_players_code = {record['PlayerCode'] for record in players.values()}


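Load the player/ID mappings and find the duplicates in them, then define a function that adds the opponent's defensive ratings (`opp_def_rtg` and `opp_def_rtg_adj`) to each row of our new dataframe, as well as a function that standardizes players who appear under more than one name so they share the same name and ID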

In [15]:
def _group_keys_by_value(mapping):
    """Invert ``mapping`` into ``{value: [keys mapping to it]}``, keeping key order."""
    grouped = {}
    for key, value in mapping.items():
        grouped.setdefault(value, []).append(key)
    return grouped


def _only_duplicates(grouped):
    """Keep only the entries of ``grouped`` whose value is shared by more than one key."""
    return {value: keys for value, keys in grouped.items() if len(keys) > 1}


# presumably ID -> player name, going by the file name; verify against the data
with open("data/id_to_player_mapping.json", "r") as file:
    player_dict = json.load(file)

# Values of player_dict that appear under more than one key.
player_value_counts = _group_keys_by_value(player_dict)
duplicate_players = _only_duplicates(player_value_counts)

# Player name -> ID (the displayed duplicate_ids output is keyed by ID with name lists).
with open("data/player_to_id_mapping.json", "r") as file:
    id_dict = json.load(file)

# IDs that are shared by more than one player-name spelling.
id_value_counts = _group_keys_by_value(id_dict)
duplicate_ids = _only_duplicates(id_value_counts)

# p_t_c holds the same content as id_dict; copy it instead of re-reading the file.
p_t_c = dict(id_dict)

duplicate_ids


{'2789': ['Larry Nance Jr.', 'Larry Nance'],
 '124084': ['Gary Payton', 'Gary Payton II'],
 '663874': ['Moe Wagner', 'Moritz Wagner'],
 '10014159': ["Jae'sean Tate", "Jae'Sean Tate"],
 '10012663': ['Isaiah Stewart', 'Isaiah Stewart II'],
 '10009546': ['Nathaniel Hinton', 'Nate Hinton'],
 '40722952': ['Herb Jones', 'Herbert Jones'],
 '47882246': ['Kenny Lofton', 'Kenneth Lofton'],
 '47267129': ['Vincent Williams', 'Vince Williams'],
 '47267136': ['MarJon Beauchamp', 'Marjon Beauchamp'],
 '40722937': ['BJ Boston', 'Brandon Boston'],
 '52566219': ['Dereck Lively', 'Dereck Lively II']}

In [16]:
# Parsed contents of data/def_rating.json. Loaded lazily on first use so the file
# is not re-opened and re-parsed for every row passed through DataFrame.apply.
# Re-running this cell resets the cache.
_def_rtgs_cache = None


def _load_def_ratings():
    """Return the parsed defensive-ratings JSON, reading the file only once."""
    global _def_rtgs_cache
    if _def_rtgs_cache is None:
        with open('data/def_rating.json', 'r') as file:
            _def_rtgs_cache = json.load(file)
    return _def_rtgs_cache


def get_def_ratings(row):
    """Look up the opponent's defensive ratings for the year of a game.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'Opponent'`` (a string; it is upper-cased before the
        lookup) and ``'GameDay'`` (a string whose text before the first ``-``
        is the year).

    Returns
    -------
    tuple
        ``(def_rtg, def_rtg_adj)`` from the entry whose ``'year'`` equals the
        game's year, or ``(-1, -1)`` when the team has no entry for that year.

    Raises
    ------
    KeyError
        If the upper-cased opponent is not a key in ``data/def_rating.json``.
    """
    team_seasons = _load_def_ratings()[row['Opponent'].upper()]
    year = int(row['GameDay'].split("-")[0])
    for season in team_seasons:
        if season['year'] == year:
            return season['def_rtg'], season['def_rtg_adj']

    # No rating recorded for this team in that year: return the sentinel pair.
    return -1, -1

def standardize_player_id(row):
    """Return the standardized ``(Player, PlayerID)`` pair for a statline row.

    If the row's ``PlayerID`` is a key of ``duplicate_ids``, the first name
    listed for that ID is used as the player name; otherwise the row's own
    ``Player`` value is kept. The ID is always looked up in ``p_t_c`` using the
    row's original ``Player`` value.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'Player'`` and ``'PlayerID'``.

    Returns
    -------
    tuple
        ``(player_name, player_id)``. When a lookup raises ``KeyError`` the
        error is printed and the row's own ``(Player, PlayerID)`` is returned,
        so the caller's two-column ``result_type="expand"`` assignment always
        receives a pair instead of ``None``.
    """
    try:
        player_id = row['PlayerID']
        if player_id in duplicate_ids:
            # The first spelling listed for a shared ID is used as the standard name.
            name = duplicate_ids[player_id][0]
        else:
            name = row['Player']
        return name, p_t_c[row['Player']]
    except KeyError as e:
        print(e, row["Player"])
        # Leave the row unchanged rather than implicitly returning None.
        return row['Player'], row['PlayerID']


Main function to generate and return a dataframe for each year specified, as well as player and team mappings for later analysis

In [17]:
def generate_dataframe_by_year(year):
    """Build the numeric and raw statline frames for one year's CSV export.

    Reads ``data/NatStat-NBA{year}-Player_Statlines-2024-09-17-h13.csv``, adds
    the opponent defensive ratings, keeps only rows whose ``PlayerCode`` or
    ``Player`` is in the active-player sets, standardizes player names/IDs,
    then produces an all-numeric version of the frame.

    Parameters
    ----------
    year : int
        Year used in the CSV file name and in the progress messages.

    Returns
    -------
    tuple
        ``(df, current_player_mappings, current_team_mappings, df_raw)`` where
        ``df`` is the numeric frame (descriptive columns dropped, four
        categorical columns label-encoded, everything coerced to numbers),
        the two mappings are ``PlayerID -> Player`` and ``TeamID -> Team``
        dicts, and ``df_raw`` is a copy of the frame taken before columns were
        dropped or encoded.
    """
    print(f"Processing year: {year}")

    # Every listed column is read with the pandas "string" dtype.
    string_columns = [
        "GameDay", "GameID", "Player", "PlayerID", "PlayerCode", "TeamID", "Team", "OpponentID",
        "Opponent", "Location", "Division", "Conference", "Playoffs", "WinOrLoss", "Starter",
        "PlayerType", "PerfScore", "MIN", "PTS", "FGM", "FGA", "3FM", "3FA", "FTM", "FTA", "REB",
        "AST", "STL", "BLK", "OREB", "TO", "PF",
    ]

    # Parse straight from the open handle; no need to buffer the whole file in a
    # string and wrap it in StringIO first.
    with open(f"data/NatStat-NBA{year}-Player_Statlines-2024-09-17-h13.csv", "r") as file:
        df = pd.read_csv(file, dtype=dict.fromkeys(string_columns, "string"))

    if year < 2022:
        # NOTE(review): presumably removes an older player who shares a name with a
        # current one -- confirm. .copy() so later column assignments hit a real frame.
        df = df[~(df['Player'] == 'Jabari Smith')].copy()

    # Some years' files have no 'PlayerCode' column; create it filled with 0.
    if 'PlayerCode' not in df.columns:
        df['PlayerCode'] = 0

    # Add the opponent's defensive ratings for the game's year to every row.
    df[['opp_def_rtg', 'opp_def_rtg_adj']] = df.apply(get_def_ratings, axis=1, result_type="expand")

    # Keep only active players, matched by PlayerCode or by Player name.
    # .copy() detaches the filtered rows so the assignment below does not write to a slice.
    is_active = (df['PlayerCode'].isin(active_players_code)) | (df['Player'].isin(active_players_name))
    df = df[is_active].copy()

    # Give players listed under several names one standard name and ID.
    df[['Player', 'PlayerID']] = df.apply(standardize_player_id, axis=1, result_type="expand")

    # Snapshot before the descriptive columns are dropped and values are encoded.
    df_raw = df.copy()

    # PlayerID -> Player and TeamID -> Team lookups for this year.
    current_player_mappings = df.set_index('PlayerID')['Player'].to_dict()
    current_team_mappings = df.set_index('TeamID')['Team'].to_dict()

    # Drop the descriptive columns that are not kept in the numeric frame.
    df = df.drop(columns=['GameDay', 'Player', 'PlayerCode', 'Team', 'Opponent', 'Division', 'Conference'])

    # Fill missing values (NaNs) with '0'.
    df = df.fillna('0')

    # Label-encode the categorical columns with the shared module-level encoder.
    # NOTE(review): `le` is re-fitted per column and per call, so the integer codes
    # depend on the values present in this year's data -- confirm that is intended.
    for column in ('Location', 'Playoffs', 'WinOrLoss', 'Starter'):
        df[column] = le.fit_transform(df[column])

    # Convert every column to numeric; values that cannot be parsed become 0.
    df = df.apply(pd.to_numeric, errors="coerce").fillna(0)

    print(f"Finished year: {year}")

    return df, current_player_mappings, current_team_mappings, df_raw

In [18]:
player_mappings = {}
team_mappings = {}
years = list(range(2004, 2025))

# Collect the per-year frames and concatenate once at the end; calling pd.concat
# inside the loop re-copies everything accumulated so far on every iteration.
yearly_frames = []
yearly_raw_frames = []

for year in years:
    # Call function to get df and mappings
    df, current_player_mappings, current_team_mappings, df_raw = generate_dataframe_by_year(year)
    yearly_frames.append(df)
    yearly_raw_frames.append(df_raw)
    # Update the two mappings with (potentially) new values; later years overwrite earlier ones
    player_mappings.update(current_player_mappings)
    team_mappings.update(current_team_mappings)

# Frames that are later saved as CSV
final = pd.concat(yearly_frames, ignore_index=True)
final_raw = pd.concat(yearly_raw_frames, ignore_index=True)



Processing year: 2004
Finished year: 2004
Processing year: 2005
Finished year: 2005
Processing year: 2006
Finished year: 2006
Processing year: 2007
Finished year: 2007
Processing year: 2008
Finished year: 2008
Processing year: 2009
Finished year: 2009
Processing year: 2010
Finished year: 2010
Processing year: 2011
Finished year: 2011
Processing year: 2012
Finished year: 2012
Processing year: 2013
Finished year: 2013
Processing year: 2014
Finished year: 2014
Processing year: 2015
Finished year: 2015
Processing year: 2016
Finished year: 2016
Processing year: 2017
Finished year: 2017
Processing year: 2018
Finished year: 2018
Processing year: 2019
Finished year: 2019
Processing year: 2020
Finished year: 2020
Processing year: 2021
Finished year: 2021
Processing year: 2022
Finished year: 2022
Processing year: 2023
Finished year: 2023
Processing year: 2024
Finished year: 2024


Write our player and team mappings to JSON files for later use, and save the final CSV for later

In [19]:
# with open("data/player_mappings.json", "w") as file: 
#     json.dump(player_mappings, file)
# with open("data/team_mappings.json", "w") as file:
#     json.dump(team_mappings, file)

# Swap player and codes to create mapping
# with open('data/id_to_player_mapping.json', 'r') as file:
#     id_to_player_mapping = json.load(file)

# player_to_id_mapping = {value: key for key, value in id_to_player_mapping.items()}

# with open('data/player_to_id_mapping.json', 'w') as file:
#     json.dump(player_to_id_mapping, file)

In [20]:
# Save the numeric frame and the raw frame, both without the index column.
for frame, destination in ((final, 'data/out.csv'), (final_raw, 'data/out_raw.csv')):
    frame.to_csv(destination, index=False)

In [32]:
# One slice of final_raw per name in duplicate_players, in that order. A boolean
# mask is used instead of a string-built query so names containing quotes cannot
# break the expression.
duplicate_player_rows = [final_raw[final_raw['Player'] == player] for player in duplicate_players]

if duplicate_player_rows:
    # Concatenate once rather than growing the frame inside a loop.
    df = pd.concat(duplicate_player_rows)
else:
    # No duplicated names: keep an empty frame that still has final_raw's columns.
    df = final_raw.iloc[0:0]

# Keep the first row found for each PlayerID.
df = df.drop_duplicates(subset="PlayerID")


In [33]:
# Every listed column is read back with the pandas "string" dtype.
raw_string_columns = [
    "GameDay", "GameID", "Player", "PlayerID", "PlayerCode", "TeamID", "Team", "OpponentID",
    "Opponent", "Location", "Division", "Conference", "Playoffs", "WinOrLoss", "Starter",
    "PlayerType", "PerfScore", "MIN", "PTS", "FGM", "FGA", "3FM", "3FA", "FTM", "FTA", "REB",
    "AST", "STL", "BLK", "OREB", "TO", "PF",
]

# Reload the raw export from disk, replacing the in-memory final_raw.
with open('data/out_raw.csv', 'r') as raw_file:
    raw_content = raw_file.read()

raw_data = StringIO(raw_content)
final_raw = pd.read_csv(raw_data, dtype=dict.fromkeys(raw_string_columns, "string"))

In [35]:
players_by_year = {}
# Keep only the first row seen for each player name, so every player
# contributes a single GameDay to the window checks below.
unique_raw = final_raw.drop_duplicates(subset="Player")

for year in range(2003, 2025):
    # NOTE(review): GameDay is a string column, so these are lexicographic comparisons
    # against non-zero-padded bounds ("{year}-1-8", "{year + 1}-7-1"). If GameDay is
    # zero-padded (YYYY-MM-DD) the window does not match calendar order -- confirm the
    # intended window before relying on this file.
    players_in_window = unique_raw.query(f'GameDay > "{year}-1-8" & GameDay < "{year + 1}-7-1"')['Player']
    # The result is held in a local name; the original rebound `pd` here, which
    # shadowed the pandas module for every cell run afterwards.
    players_by_year[year] = players_in_window.tolist()

# json.dump writes the integer year keys as strings.
with open('data/players_by_year.json', 'w') as file:
    json.dump(players_by_year, file)