# Data Merger

In this notebook, we will merge the counting statistics, advanced statistics and 
transaction data.

Note: much of the code in this notebook is not optimized and could be 
significantly improved. It would be much better to avoid looping through pandas 
dataframes row by row and to use more built-in pandas functions instead, but for 
now it works. 

In [64]:
import pandas as pd
import numpy as np
from copy import deepcopy
import datetime

In [65]:
# This ensures side effects do not occur when dealing with views and copies of
# dataframes. See
# https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html.

pd.options.mode.copy_on_write = True

In [66]:
pd.options.display.max_columns = None

Load data. 

In [67]:
stats = pd.read_csv('data_to_merge/season_counting_stats.csv')

In [68]:
advanced_stats = pd.read_csv('data_to_merge/advanced_stats.csv')

In [69]:
# Note I made one change to transaction_data.csv. There was previously a row
# with Danny Young being waived on October ?, 1993. I changed this to
# October 1 for simplicity. This shouldn't affect anything drastically.
transaction_data = pd.read_csv('data_to_merge/transaction_data.csv')

In [70]:
nba_key_dates = pd.read_csv('data_to_merge/nba_key_dates.csv')

Remove data from stats and advanced stats prior to the 1990 season because our transaction data only begins in 1990. 

In [71]:
stats = stats[stats['SEASON_START']>1989]
advanced_stats = advanced_stats[advanced_stats['SEASON_START']>1989]
# Below we will keep nba key dates in 1989 because we want to know about 1990
# offseason.
nba_key_dates = nba_key_dates[nba_key_dates['SEASON_START']>1988]

Organize stats columns.

In [72]:
stats_column_names = stats.columns.values.tolist()

In [73]:
# Strip the identifier columns we do not keep, and temporarily pull out NAME
# and SEASON_START so they can be re-inserted at fixed positions below.
stats_columns_to_remove = [
    'SEASON_ID',
    'LEAGUE_ID',
    'TEAM_ID',
    'NAME',
    'SEASON_START',
]
for unwanted_column in stats_columns_to_remove:
    stats_column_names.remove(unwanted_column)

# NAME becomes the first column and SEASON_START the third.
for position, column in ((0, 'NAME'), (2, 'SEASON_START')):
    stats_column_names.insert(position, column)

In [74]:
stats = stats[stats_column_names]

Organize advanced stats columns. 

In [75]:
advanced_stats = advanced_stats.rename(columns={'tm': 'TEAM_ABBREVIATION'})

In [76]:
# Columns of the advanced stats that are not carried into the merge.
advanced_stats_columns_to_remove = ['player', 'age', 'lg', 'g', 'mp']

# Start from the full column list and remove the unwanted names one by one
# (list.remove raises ValueError if a name is unexpectedly absent).
advanced_stats_column_names = advanced_stats.columns.values.tolist()
for dropped_column in advanced_stats_columns_to_remove:
    advanced_stats_column_names.remove(dropped_column)

advanced_stats = advanced_stats.loc[:, advanced_stats_column_names]

In [77]:
advanced_stats_col_rename_dict = {x: x.upper() for x in advanced_stats_column_names}
advanced_stats = advanced_stats.rename(columns=advanced_stats_col_rename_dict)

Fixing mismatched team abbreviations. 

In [78]:
advanced_stats_teams = list(set(advanced_stats.TEAM_ABBREVIATION.tolist()))
stats_teams = list(set(stats.TEAM_ABBREVIATION.tolist()))

In [79]:
len(advanced_stats_teams)

39

In [80]:
len(stats_teams)

41

In [81]:
# Map every advanced-stats team code to itself when the counting stats use the
# same code, and to the placeholder 'no match' otherwise.
adv_stats_to_stats_team_dict = {
    abbreviation: (abbreviation if abbreviation in stats_teams else 'no match')
    for abbreviation in advanced_stats_teams
}

# Same comparison in the opposite direction: counting-stats codes that the
# advanced stats do not use are flagged with 'no match'.
stats_to_adv_stats_team_dict = {
    abbreviation: (abbreviation if abbreviation in advanced_stats_teams else 'no match')
    for abbreviation in stats_teams
}

In [82]:
adv_stats_to_stats_team_dict

{'TOT': 'TOT',
 'NOP': 'NOP',
 'IND': 'IND',
 'ATL': 'ATL',
 'LAC': 'LAC',
 'SAC': 'SAC',
 'CLE': 'CLE',
 'NOK': 'NOK',
 'POR': 'POR',
 'OKC': 'OKC',
 'WSB': 'no match',
 'CHA': 'CHA',
 'MIA': 'MIA',
 'DET': 'DET',
 'WAS': 'WAS',
 'MEM': 'MEM',
 'GSW': 'GSW',
 'LAL': 'LAL',
 'TOR': 'TOR',
 'CHI': 'CHI',
 'VAN': 'VAN',
 'BOS': 'BOS',
 'HOU': 'HOU',
 'NJN': 'NJN',
 'ORL': 'ORL',
 'DEN': 'DEN',
 'CHO': 'no match',
 'PHO': 'no match',
 'UTA': 'UTA',
 'SEA': 'SEA',
 'DAL': 'DAL',
 'PHI': 'PHI',
 'CHH': 'CHH',
 'MIL': 'MIL',
 'BRK': 'no match',
 'MIN': 'MIN',
 'NOH': 'NOH',
 'SAS': 'SAS',
 'NYK': 'NYK'}

In [83]:
stats_to_adv_stats_team_dict

{'TOT': 'TOT',
 'NOP': 'NOP',
 'IND': 'IND',
 'ATL': 'ATL',
 'LAC': 'LAC',
 'SAC': 'SAC',
 'CLE': 'CLE',
 'BKN': 'no match',
 'SAN': 'no match',
 'NOK': 'NOK',
 'POR': 'POR',
 'OKC': 'OKC',
 'PHL': 'no match',
 'CHA': 'CHA',
 'MIA': 'MIA',
 'DET': 'DET',
 'MEM': 'MEM',
 'WAS': 'WAS',
 'GSW': 'GSW',
 'VAN': 'VAN',
 'LAL': 'LAL',
 'CHI': 'CHI',
 'TOR': 'TOR',
 'BOS': 'BOS',
 'PHX': 'no match',
 'GOS': 'no match',
 'HOU': 'HOU',
 'NJN': 'NJN',
 'ORL': 'ORL',
 'UTA': 'UTA',
 'DEN': 'DEN',
 'SEA': 'SEA',
 'CHH': 'CHH',
 'DAL': 'DAL',
 'PHI': 'PHI',
 'UTH': 'no match',
 'MIL': 'MIL',
 'MIN': 'MIN',
 'NOH': 'NOH',
 'SAS': 'SAS',
 'NYK': 'NYK'}

In [84]:
adv_stats_to_stats_team_dict['PHO'] = 'PHX'
adv_stats_to_stats_team_dict['CHO'] = 'CHA'
adv_stats_to_stats_team_dict['BRK'] = 'BKN'
# Note, the advanced stats use WSB for the Washington Bullets and WAS for the
# Washington Wizards, but the counting stats use WAS for both. So, for
# simplicity, we will map both WAS and WSB from advanced stats to WAS.
adv_stats_to_stats_team_dict['WSB'] = 'WAS'

def fix_adv_name(name):
    """Translate an advanced-stats team code into the counting-stats code.

    The translation table is the module-level ``adv_stats_to_stats_team_dict``;
    a code that is missing from the table raises ``KeyError``.
    """
    counting_stats_code = adv_stats_to_stats_team_dict[name]
    return counting_stats_code

advanced_stats['TEAM_ABBREVIATION'] = advanced_stats['TEAM_ABBREVIATION'].apply(fix_adv_name)

# Note, the counting stats use both PHL and PHI for the Philadelphia 76ers,
# but the advanced stats only use PHI. So we need to make stats use only PHI.
# Similarly, the counting stats use GOS and GSW for the Golden State Warriors,
# but the advanced stats only use GSW.
# Similarly, the counting stats use SAS and SAN for the San Antonio Spurs,
# but the advanced stats only use SAS.
# Similarly, the counting stats use UTA and UTH for the Utah Jazz,
# but the advanced stats only use UTA.
def fix_stats_name(name):
    """Return the canonical counting-stats team code for ``name``.

    The aliases PHL, GOS, SAN and UTH are rewritten to PHI, GSW, SAS and UTA
    respectively; every other value is returned unchanged.
    """
    alias_to_canonical = (
        ('PHL', 'PHI'),
        ('GOS', 'GSW'),
        ('SAN', 'SAS'),
        ('UTH', 'UTA'),
    )
    for alias, canonical in alias_to_canonical:
        if name == alias:
            return canonical
    return name
stats['TEAM_ABBREVIATION'] = stats['TEAM_ABBREVIATION'].apply(fix_stats_name)

Let's do some final checks to see if the stats and advanced stats match up
before merging. 

In [85]:
stats.shape

(19614, 26)

In [86]:
advanced_stats.shape

(19611, 25)

Assuming every (PLAYER_ID, SEASON_START, TEAM_ABBREVIATION) tuple of advanced 
stats is in stats, it appears we are missing 3 rows from advanced stats. 
Let's find them. 

Sarunas Marciulionis (36, 1996, CLE) is missing from advanced stats. This is
strange because every reference I have found (except for NBA.com) does not list 
him as ever having played for Cleveland. NBA.com lists him as playing 1 game for 
Cleveland and 16 games for Denver in the 1996 season, but most other references
have him playing all 17 games with Denver. So in stats we will drop the CLE and 
DEN rows and relabel the TOT row as DEN (see a few cells below).


In [87]:
stats[stats['PLAYER_ID']==36]

Unnamed: 0,NAME,PLAYER_ID,SEASON_START,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
206,Sarunas Marciulionis,36,1990,GSW,27.0,50,10.0,987.0,183,365,0.501,1.0,6.0,0.167,178,246,0.724,51.0,67.0,118.0,85,62.0,4.0,75.0,136,545
207,Sarunas Marciulionis,36,1991,GSW,28.0,72,5.0,2117.0,491,912,0.538,3.0,10.0,0.3,376,477,0.788,68.0,140.0,208.0,243,116.0,10.0,193.0,237,1361
208,Sarunas Marciulionis,36,1992,GSW,29.0,30,8.0,836.0,178,328,0.543,3.0,15.0,0.2,162,213,0.761,40.0,57.0,97.0,105,51.0,2.0,76.0,92,521
209,Sarunas Marciulionis,36,1994,SEA,31.0,66,4.0,1194.0,216,457,0.473,35.0,87.0,0.402,145,198,0.732,17.0,51.0,68.0,110,72.0,3.0,98.0,126,612
210,Sarunas Marciulionis,36,1995,SAC,32.0,53,0.0,1039.0,176,389,0.452,64.0,157.0,0.408,155,200,0.775,20.0,57.0,77.0,118,52.0,4.0,96.0,112,571
211,Sarunas Marciulionis,36,1996,CLE,33.0,1,0.0,30.0,3,9,0.333,1.0,3.0,0.333,0,0,0.0,2.0,2.0,4.0,2,0.0,0.0,2.0,3,7
212,Sarunas Marciulionis,36,1996,DEN,33.0,16,0.0,238.0,35,92,0.38,10.0,27.0,0.37,29,36,0.806,10.0,16.0,26.0,23,12.0,1.0,38.0,35,109
213,Sarunas Marciulionis,36,1996,TOT,33.0,17,0.0,268.0,38,101,0.376,11.0,30.0,0.367,29,36,0.806,12.0,18.0,30.0,25,12.0,1.0,40.0,38,116


In [88]:
advanced_stats[advanced_stats['PLAYER_ID']==36]

Unnamed: 0,PLAYER_ID,SEASON_START,POS,EXPERIENCE,TEAM_ABBREVIATION,PER,TS_PERCENT,X3P_AR,F_TR,ORB_PERCENT,DRB_PERCENT,TRB_PERCENT,AST_PERCENT,STL_PERCENT,BLK_PERCENT,TOV_PERCENT,USG_PERCENT,OWS,DWS,WS,WS_48,OBPM,DBPM,BPM,VORP
16783,36,1996,SG,7,DEN,8.1,0.496,0.297,0.356,5.4,8.0,6.7,16.7,2.4,0.3,25.5,27.5,-0.6,0.1,-0.5,-0.085,-4.0,-0.8,-4.8,-0.2
17328,36,1995,SG,6,SAC,16.7,0.599,0.404,0.514,2.2,6.4,4.3,19.5,2.6,0.3,16.8,24.2,1.9,0.7,2.6,0.12,1.6,-0.3,1.3,0.9
17780,36,1994,SG,5,SEA,14.2,0.562,0.19,0.433,1.7,5.0,3.4,14.1,3.0,0.2,15.3,23.3,1.1,1.2,2.3,0.092,-1.2,0.0,-1.2,0.3
18711,36,1992,SF,4,GSW,20.4,0.618,0.046,0.649,5.3,7.6,6.5,19.0,2.9,0.1,15.3,24.3,2.3,0.6,3.0,0.17,2.4,0.4,2.8,1.0
19169,36,1991,SG,3,GSW,18.8,0.607,0.011,0.523,3.6,7.1,5.4,16.0,2.5,0.3,14.7,24.9,4.9,1.4,6.4,0.144,1.5,-0.2,1.3,1.7
19610,36,1990,SG,2,GSW,16.3,0.576,0.016,0.674,5.7,7.5,6.6,12.0,2.9,0.2,13.7,22.2,1.8,0.7,2.5,0.121,0.0,0.1,0.1,0.5


Chance Comanche played exactly one game in the 2022-2023 season and did not play
in any other seasons. There is no row in advanced stats for this player. I think 
it is fine to drop this player from the stats data. 

In [89]:
stats[stats['PLAYER_ID']==1628435]

Unnamed: 0,NAME,PLAYER_ID,SEASON_START,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
27943,Chance Comanche,1628435,2022,POR,27.0,1,0.0,21.0,3,5,0.6,0.0,0.0,0.0,1,4,0.25,2.0,1.0,3.0,0,0.0,1.0,0.0,0,7


In [90]:
advanced_stats[advanced_stats['PLAYER_ID']==1628435]

Unnamed: 0,PLAYER_ID,SEASON_START,POS,EXPERIENCE,TEAM_ABBREVIATION,PER,TS_PERCENT,X3P_AR,F_TR,ORB_PERCENT,DRB_PERCENT,TRB_PERCENT,AST_PERCENT,STL_PERCENT,BLK_PERCENT,TOV_PERCENT,USG_PERCENT,OWS,DWS,WS,WS_48,OBPM,DBPM,BPM,VORP


In [91]:
stats = stats.drop([211, 212, 27943])

In [92]:
stats.at[213, 'TEAM_ABBREVIATION'] = 'DEN'

In [93]:
stats.at[213, 'TEAM_ABBREVIATION']

'DEN'

Now we check the number of rows of stats and advanced stats and see they are 
equal, which is what we want. 

In [94]:
stats.shape

(19611, 26)

In [95]:
advanced_stats.shape

(19611, 25)

Merge stats and advanced stats. 

In [96]:
merged_data = pd.merge(stats, advanced_stats, on=['PLAYER_ID', 'SEASON_START', 'TEAM_ABBREVIATION'], how='left')

We check the shape and see it has the right number of rows. 

In [97]:
merged_data.shape

(19611, 48)

Organize merged data columns. 

In [98]:
merged_data_columns = merged_data.columns.values.tolist()

In [99]:
merged_data_columns.remove('EXPERIENCE')
merged_data_columns.insert(5, 'EXPERIENCE')
merged_data_columns.remove('POS')
merged_data_columns.insert(6, 'POS')

In [100]:
merged_data = merged_data[merged_data_columns]

Next, we collapse the rows from (PLAYER, SEASON, TEAM) to (PLAYER, SEASON). 
We will preserve the teams by creating a list of all teams a player has played 
for during each season.

One could skip this collapse and keep more data, but 
to simplify some of the analysis I will collapse the rows. 

In [101]:
merged_data = merged_data.reset_index(drop=True)

In [102]:
merged_data_collapsed_teams = pd.DataFrame()

In [103]:
merged_data

Unnamed: 0,NAME,PLAYER_ID,SEASON_START,TEAM_ABBREVIATION,PLAYER_AGE,EXPERIENCE,POS,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PER,TS_PERCENT,X3P_AR,F_TR,ORB_PERCENT,DRB_PERCENT,TRB_PERCENT,AST_PERCENT,STL_PERCENT,BLK_PERCENT,TOV_PERCENT,USG_PERCENT,OWS,DWS,WS,WS_48,OBPM,DBPM,BPM,VORP
0,Byron Scott,2,1990,LAL,30.0,8,SG,82,82.0,2630.0,501,1051,0.477,71.0,219.0,0.324,118,148,0.797,54.0,192.0,246.0,177,95.0,21.0,85.0,146,1191,14.2,0.534,0.208,0.141,2.5,8.1,5.4,10.3,1.8,0.5,7.1,19.9,3.6,2.9,6.6,0.120,0.3,0.6,0.9,2.0
1,Byron Scott,2,1991,LAL,31.0,9,SG,82,82.0,2679.0,460,1005,0.458,54.0,157.0,0.344,244,291,0.838,74.0,236.0,310.0,226,105.0,28.0,119.0,140,1218,15.5,0.537,0.156,0.290,3.1,10.2,6.6,13.4,2.0,0.6,9.5,20.4,3.9,2.1,6.0,0.107,1.0,0.1,1.0,2.0
2,Byron Scott,2,1992,LAL,32.0,10,SG,58,53.0,1677.0,296,659,0.449,44.0,135.0,0.326,156,184,0.848,27.0,107.0,134.0,157,55.0,13.0,70.0,98,792,14.4,0.535,0.205,0.279,1.8,7.3,4.6,14.2,1.6,0.5,8.6,20.7,2.4,0.9,3.3,0.094,0.3,-1.0,-0.7,0.5
3,Byron Scott,2,1993,IND,33.0,11,SG,67,2.0,1197.0,256,548,0.467,27.0,74.0,0.365,157,195,0.805,19.0,91.0,110.0,133,62.0,9.0,103.0,80,696,17.6,0.549,0.135,0.356,1.9,8.5,5.3,18.9,2.7,0.5,14.0,27.0,1.7,1.4,3.1,0.123,1.3,0.2,1.5,1.1
4,Byron Scott,2,1994,IND,34.0,12,PG,80,1.0,1528.0,265,583,0.455,79.0,203.0,0.389,193,227,0.850,18.0,133.0,151.0,108,61.0,13.0,119.0,123,802,15.8,0.587,0.348,0.389,1.5,10.3,6.0,12.2,2.1,0.7,14.8,24.0,2.1,1.9,3.9,0.124,1.1,0.3,1.4,1.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19606,Maozinha Pereira,1641970,2023,MEM,23.0,1,SF,7,1.0,122.0,18,35,0.514,5.0,13.0,0.385,7,10,0.700,11.0,26.0,37.0,2,6.0,4.0,5.0,10,48,15.7,0.609,0.371,0.286,9.5,24.3,16.6,2.5,2.4,3.1,11.3,15.6,0.1,0.2,0.3,0.123,-1.2,0.7,-0.5,0.0
19607,Trey Jemison,1641998,2023,WAS,24.0,1,C,2,0.0,1.0,0,0,0.000,0.0,0.0,0.000,0,0,0.000,0.0,1.0,1.0,0,0.0,0.0,1.0,0,0,-36.1,,,,0.0,100.0,53.5,0.0,0.0,0.0,100.0,42.1,0.0,0.0,0.0,-1.200,-30.3,-3.6,-33.9,0.0
19608,Trey Jemison,1641998,2023,MEM,24.0,1,C,23,14.0,573.0,75,136,0.551,0.0,0.0,0.000,21,25,0.840,64.0,69.0,133.0,27,12.0,28.0,33.0,69,171,11.9,0.582,0.000,0.184,11.8,13.7,12.7,7.1,1.0,4.6,18.3,13.5,0.3,0.6,0.9,0.075,-4.7,-0.4,-5.1,-0.4
19609,Trey Jemison,1641998,2023,TOT,24.0,1,C,25,14.0,574.0,75,136,0.551,0.0,0.0,0.000,21,25,0.840,64.0,70.0,134.0,27,12.0,28.0,34.0,69,171,11.8,0.582,0.000,0.184,11.7,13.9,12.8,7.1,1.0,4.6,18.8,13.5,0.3,0.6,0.9,0.073,-4.7,-0.4,-5.1,-0.5


In [104]:
# Collapse (PLAYER, SEASON, TEAM) rows into one row per (PLAYER, SEASON).
#
# A "run" is a maximal block of consecutive rows that share the same PLAYER_ID
# and SEASON_START. For every run we keep its LAST row (for a player with
# several teams in one season the data shown above places the combined 'TOT'
# row last, e.g. Trey Jemison in 2023) and replace TEAM_ABBREVIATION with the
# list of teams found in the run.
#
# Compared with a row-by-row while loop that concatenates one row at a time:
#   * the result is rebuilt from scratch, so re-running this cell cannot
#     append a second copy of every row;
#   * there is no pd.concat inside a loop (which is quadratic in the number
#     of rows);
#   * the team column is addressed by name instead of by position 3;
#   * 'TOT' is never recorded as a team, even if it is the first row of a run.
season_keys = merged_data[['PLAYER_ID', 'SEASON_START']]
# A row starts a new run when either key differs from the row directly above
# it (the first row always differs from the NaN produced by shift()).
starts_new_run = season_keys.ne(season_keys.shift()).any(axis=1)
run_ids = starts_new_run.cumsum()

# Gather the teams of every run, in row order, skipping the 'TOT' summary row.
teams_by_run = {}
for run_id, team in zip(run_ids, merged_data['TEAM_ABBREVIATION']):
    run_teams = teams_by_run.setdefault(run_id, [])
    if team != 'TOT':
        run_teams.append(team)

# Keep only the last row of each run; the original index labels are preserved.
is_last_row_of_run = ~run_ids.duplicated(keep='last')
merged_data_collapsed_teams = merged_data[is_last_row_of_run].copy()
merged_data_collapsed_teams['TEAM_ABBREVIATION'] = pd.Series(
    [teams_by_run[run_id] for run_id in run_ids[is_last_row_of_run]],
    index=merged_data_collapsed_teams.index,
    dtype=object,
)

In [105]:
merged_data_collapsed_teams

Unnamed: 0,NAME,PLAYER_ID,SEASON_START,TEAM_ABBREVIATION,PLAYER_AGE,EXPERIENCE,POS,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PER,TS_PERCENT,X3P_AR,F_TR,ORB_PERCENT,DRB_PERCENT,TRB_PERCENT,AST_PERCENT,STL_PERCENT,BLK_PERCENT,TOV_PERCENT,USG_PERCENT,OWS,DWS,WS,WS_48,OBPM,DBPM,BPM,VORP
0,Byron Scott,2,1990,[LAL],30.0,8,SG,82,82.0,2630.0,501,1051,0.477,71.0,219.0,0.324,118,148,0.797,54.0,192.0,246.0,177,95.0,21.0,85.0,146,1191,14.2,0.534,0.208,0.141,2.5,8.1,5.4,10.3,1.8,0.5,7.1,19.9,3.6,2.9,6.6,0.120,0.3,0.6,0.9,2.0
1,Byron Scott,2,1991,[LAL],31.0,9,SG,82,82.0,2679.0,460,1005,0.458,54.0,157.0,0.344,244,291,0.838,74.0,236.0,310.0,226,105.0,28.0,119.0,140,1218,15.5,0.537,0.156,0.290,3.1,10.2,6.6,13.4,2.0,0.6,9.5,20.4,3.9,2.1,6.0,0.107,1.0,0.1,1.0,2.0
2,Byron Scott,2,1992,[LAL],32.0,10,SG,58,53.0,1677.0,296,659,0.449,44.0,135.0,0.326,156,184,0.848,27.0,107.0,134.0,157,55.0,13.0,70.0,98,792,14.4,0.535,0.205,0.279,1.8,7.3,4.6,14.2,1.6,0.5,8.6,20.7,2.4,0.9,3.3,0.094,0.3,-1.0,-0.7,0.5
3,Byron Scott,2,1993,[IND],33.0,11,SG,67,2.0,1197.0,256,548,0.467,27.0,74.0,0.365,157,195,0.805,19.0,91.0,110.0,133,62.0,9.0,103.0,80,696,17.6,0.549,0.135,0.356,1.9,8.5,5.3,18.9,2.7,0.5,14.0,27.0,1.7,1.4,3.1,0.123,1.3,0.2,1.5,1.1
4,Byron Scott,2,1994,[IND],34.0,12,PG,80,1.0,1528.0,265,583,0.455,79.0,203.0,0.389,193,227,0.850,18.0,133.0,151.0,108,61.0,13.0,119.0,123,802,15.8,0.587,0.348,0.389,1.5,10.3,6.0,12.2,2.1,0.7,14.8,24.0,2.1,1.9,3.9,0.124,1.1,0.3,1.4,1.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19604,Dexter Dennis,1641926,2023,[DAL],25.0,1,SG,4,0.0,30.0,10,19,0.526,1.0,8.0,0.125,1,3,0.333,1.0,8.0,9.0,4,0.0,1.0,3.0,1,22,18.3,0.541,0.421,0.158,3.7,29.0,16.4,23.7,0.0,2.9,12.9,33.3,0.0,0.0,0.0,0.004,1.7,-0.1,1.7,0.0
19605,Onuralp Bitim,1641931,2023,[CHI],25.0,1,SG,23,1.0,268.0,32,84,0.381,12.0,44.0,0.273,4,5,0.800,6.0,26.0,32.0,13,3.0,2.0,10.0,24,80,5.0,0.464,0.524,0.060,2.5,11.1,6.7,6.5,0.6,0.8,10.4,15.8,-0.3,0.1,-0.1,-0.027,-5.4,-2.0,-7.4,-0.4
19606,Maozinha Pereira,1641970,2023,[MEM],23.0,1,SF,7,1.0,122.0,18,35,0.514,5.0,13.0,0.385,7,10,0.700,11.0,26.0,37.0,2,6.0,4.0,5.0,10,48,15.7,0.609,0.371,0.286,9.5,24.3,16.6,2.5,2.4,3.1,11.3,15.6,0.1,0.2,0.3,0.123,-1.2,0.7,-0.5,0.0
19609,Trey Jemison,1641998,2023,"[WAS, MEM]",24.0,1,C,25,14.0,574.0,75,136,0.551,0.0,0.0,0.000,21,25,0.840,64.0,70.0,134.0,27,12.0,28.0,34.0,69,171,11.8,0.582,0.000,0.184,11.7,13.9,12.8,7.1,1.0,4.6,18.8,13.5,0.3,0.6,0.9,0.073,-4.7,-0.4,-5.1,-0.5


Next, we add the transaction data. We will create the following new columns:

- WAIVED_OFF
    - 1 if player was waived during this offseason, 0 otherwise. 
- WAIVED_REG
    - 1 if player was waived during this regular season, 0 otherwise. 
- WAIVED_POST
     - 1 if player was waived during this postseason, 0 otherwise. 
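- WAIVED_NBA_YEAR
     - 1 if player was waived at any point during this NBA year, and 0 
otherwise. Note: as computed below, this column is 1 exactly when at least one 
of WAIVED_OFF, WAIVED_REG, or WAIVED_POST is 1, so the window actually runs from 
the start of this season's offseason through the end of this season's 
postseason, rather than from the first day of this regular season (inclusive) to 
the first day of the next regular season (exclusive).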
- WAIVED_NEXT_OFF
    - 1 if player was waived during next offseason, 0 otherwise. 
- WAIVED_NEXT_REG
    - 1 if player was waived during next regular season, 0 otherwise. 
- WAIVED_NEXT_POST
    - 1 if player was waived during next postseason, 0 otherwise. 
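- WAIVED_NEXT_NBA_YEAR
    - 1 if player was waived at any point during the next NBA year, and 0 
otherwise. Note: as computed below, this column is 1 exactly when at least one 
of WAIVED_NEXT_OFF, WAIVED_NEXT_REG, or WAIVED_NEXT_POST is 1, so the window 
actually runs from the start of next season's offseason through the end of next 
season's postseason, rather than from the first day of the next regular season 
(inclusive) to the first day of the next-next regular season (exclusive). 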

Same as above for TRADED. 

The definition I am using for the offseason is the first day after the last NBA
finals game of the PREVIOUS SEASON to the day before the first regular season game.
For example, the 2022 offseason was from 2022-06-17 to 2022-10-17.

The definition I am using for postseason is the first day after the last regular 
season game to the last day of the NBA finals. 

To create these columns we will need to read the transaction dates and place
them into the appropriate columns. To do this sorting, we will need to know the 
dates of each regular, post, and off-seasons. I have compiled this in 
nba_key_dates.csv. 

We will also create a new column IN_LEAGUE_NEXT which yields a 1 if we have 
an entry for that player in our stats data for the following season, and 0 
otherwise. If we don't consider this column, some misleading 
analysis could occur. For example, a 0 for being waived in the following 
season could simply mean the player was no longer in the league, and 
not that they were performing well. 

In [106]:
# Keep every transaction column except the four listed here; list.remove
# raises ValueError if one of them is unexpectedly absent.
transaction_cols = transaction_data.columns.values.tolist()
for unneeded_column in ('SEASON_START', 'PLAYER_NAME', 'BBREF_ID', 'TEAM_ABBREVIATION'):
    transaction_cols.remove(unneeded_column)
transaction_data = transaction_data.loc[:, transaction_cols]

In [107]:
def transaction_type(date, transaction):
    """Classify a transaction date as offseason, regular season or postseason.

    Reads the module-level ``nba_key_dates`` frame by position: column 0 is
    returned as the season label, and columns 2, 3 and 4 are parsed as the
    regular-season start, regular-season end and postseason end dates
    (``"%Y-%m-%d"`` strings).

    Parameters
    ----------
    date : str
        Transaction date formatted like ``"July 5, 1990"`` (``"%B %d, %Y"``).
    transaction : int
        Flag from the transaction table; the date is only classified when
        this equals 1.

    Returns
    -------
    tuple, int or None
        ``(period, season_start)`` with ``period`` one of ``'off'``,
        ``'reg'`` or ``'post'`` when the date falls inside a known window;
        ``0`` when ``transaction`` is not 1; ``None`` when the date lies
        outside every window covered by ``nba_key_dates``.

    Notes
    -----
    The offseason of a season starts the day after the PREVIOUS season's
    postseason ends, matching the definition given in the notebook text.
    Using the previous regular-season end instead would label transactions
    made during the postseason of the first row of ``nba_key_dates`` as
    offseason moves of the following season.
    """
    if transaction != 1:
        return 0

    def parse_key_date(text):
        # Key dates are stored as ISO-style strings in nba_key_dates.
        return datetime.datetime.strptime(text, "%Y-%m-%d").date()

    date = datetime.datetime.strptime(date, "%B %d, %Y").date()

    # Row 0 is only used to supply the previous postseason end for row 1.
    for row in range(1, nba_key_dates.shape[0]):
        season_start = nba_key_dates.iloc[row, 0]
        previous_post_end = parse_key_date(nba_key_dates.iloc[row - 1, 4])
        reg_start = parse_key_date(nba_key_dates.iloc[row, 2])
        reg_end = parse_key_date(nba_key_dates.iloc[row, 3])
        post_end = parse_key_date(nba_key_dates.iloc[row, 4])

        if previous_post_end < date < reg_start:
            return 'off', season_start
        if reg_start <= date <= reg_end:
            return 'reg', season_start
        if reg_end < date <= post_end:
            return 'post', season_start

    # The date is not covered by any season in nba_key_dates.
    return None


In [108]:
transaction_data['WAIVED_CONSTRUCTOR'] = transaction_data.apply(lambda x: transaction_type(x.DATE, x.WAIVED), axis=1)

In [109]:
transaction_data['TRADED_CONSTRUCTOR'] = transaction_data.apply(lambda x: transaction_type(x.DATE, x.TRADED), axis=1)

In [110]:
transaction_data

Unnamed: 0,DATE,WAIVED,TRADED,PLAYER_ID,WAIVED_CONSTRUCTOR,TRADED_CONSTRUCTOR
0,"July 5, 1990",1,0,194,"(off, 1990)",0
1,"July 9, 1990",1,0,76025,"(off, 1990)",0
2,"July 9, 1990",1,0,77104,"(off, 1990)",0
3,"July 24, 1990",1,0,78386,"(off, 1990)",0
4,"August 10, 1990",1,0,76911,"(off, 1990)",0
...,...,...,...,...,...,...
7884,"June 26, 2024",0,1,1642346,0,"(off, 2024)"
7885,"June 26, 2024",0,1,1641747,0,"(off, 2024)"
7886,"May 11, 1992",0,1,76616,0,"(post, 1991)"
7887,"September 1, 1995",0,1,77962,0,"(off, 1995)"


In [111]:
def transaction_builder(player_id, season_start, transaction_type, transaction_period, future):
    """Return 1 if the player has a matching transaction in the given period.

    Looks in the module-level ``transaction_data`` frame for rows belonging to
    ``player_id`` whose ``transaction_type`` flag column equals 1, and checks
    whether any of them is labelled
    ``(transaction_period, season_start + future)`` in the corresponding
    ``*_CONSTRUCTOR`` column.

    Parameters
    ----------
    player_id : int
        Value matched against ``transaction_data['PLAYER_ID']``.
    season_start : int
        Season of the row being annotated.
    transaction_type : str
        Name of the flag column, ``'WAIVED'`` or ``'TRADED'``. Inside this
        function the parameter shadows the module-level function of the same
        name.
    transaction_period : str
        Period label to look for: ``'off'``, ``'reg'`` or ``'post'``.
    future : int
        Offset added to ``season_start`` (0 for this season, 1 for the next).

    Returns
    -------
    int
        1 if a matching transaction exists, otherwise 0.
    """
    # Select the label column by name so the lookup does not depend on the
    # column order of transaction_data. As before, anything other than
    # 'WAIVED' falls back to the traded labels.
    if transaction_type == 'WAIVED':
        constructor_col = 'WAIVED_CONSTRUCTOR'
    else:
        constructor_col = 'TRADED_CONSTRUCTOR'

    is_player = transaction_data['PLAYER_ID'] == player_id
    is_flagged = transaction_data[transaction_type] == 1
    labels = transaction_data.loc[is_player & is_flagged, constructor_col]

    target = (transaction_period, season_start + future)
    # Labels are tuples, 0, or None; only an equal tuple counts as a match.
    return 1 if any(label == target for label in labels) else 0

In [112]:
def nba_year_builder(off, reg, post):
    """Combine three period flags into one: 1 if any of them equals 1, else 0."""
    period_flags = (off, reg, post)
    return int(any(flag == 1 for flag in period_flags))

In [113]:
merged_data_collapsed_teams['WAIVED_OFF'] = merged_data_collapsed_teams.apply(lambda x: transaction_builder(x.PLAYER_ID, x.SEASON_START, 'WAIVED', 'off', 0), axis=1)
merged_data_collapsed_teams['WAIVED_REG'] = merged_data_collapsed_teams.apply(lambda x: transaction_builder(x.PLAYER_ID, x.SEASON_START, 'WAIVED', 'reg', 0), axis=1)
merged_data_collapsed_teams['WAIVED_POST'] = merged_data_collapsed_teams.apply(lambda x: transaction_builder(x.PLAYER_ID, x.SEASON_START, 'WAIVED', 'post', 0), axis=1)

In [114]:
merged_data_collapsed_teams['WAIVED_NBA_YEAR'] = merged_data_collapsed_teams.apply(lambda x: nba_year_builder(x.WAIVED_OFF, x.WAIVED_REG, x.WAIVED_POST), axis=1)

In [115]:
merged_data_collapsed_teams['WAIVED_NEXT_OFF'] = merged_data_collapsed_teams.apply(lambda x: transaction_builder(x.PLAYER_ID, x.SEASON_START, 'WAIVED', 'off', 1), axis=1)
merged_data_collapsed_teams['WAIVED_NEXT_REG'] = merged_data_collapsed_teams.apply(lambda x: transaction_builder(x.PLAYER_ID, x.SEASON_START, 'WAIVED', 'reg', 1), axis=1)
merged_data_collapsed_teams['WAIVED_NEXT_POST'] = merged_data_collapsed_teams.apply(lambda x: transaction_builder(x.PLAYER_ID, x.SEASON_START, 'WAIVED', 'post', 1), axis=1)

In [116]:
merged_data_collapsed_teams['WAIVED_NEXT_NBA_YEAR'] = merged_data_collapsed_teams.apply(lambda x: nba_year_builder(x.WAIVED_NEXT_OFF, x.WAIVED_NEXT_REG, x.WAIVED_NEXT_POST), axis=1)

In [117]:
merged_data_collapsed_teams['TRADED_OFF'] = merged_data_collapsed_teams.apply(lambda x: transaction_builder(x.PLAYER_ID, x.SEASON_START, 'TRADED', 'off', 0), axis=1)
merged_data_collapsed_teams['TRADED_REG'] = merged_data_collapsed_teams.apply(lambda x: transaction_builder(x.PLAYER_ID, x.SEASON_START, 'TRADED', 'reg', 0), axis=1)
merged_data_collapsed_teams['TRADED_POST'] = merged_data_collapsed_teams.apply(lambda x: transaction_builder(x.PLAYER_ID, x.SEASON_START, 'TRADED', 'post', 0), axis=1)

In [118]:
merged_data_collapsed_teams['TRADED_NBA_YEAR'] = merged_data_collapsed_teams.apply(lambda x: nba_year_builder(x.TRADED_OFF, x.TRADED_REG, x.TRADED_POST), axis=1)

In [119]:
merged_data_collapsed_teams['TRADED_NEXT_OFF'] = merged_data_collapsed_teams.apply(lambda x: transaction_builder(x.PLAYER_ID, x.SEASON_START, 'TRADED', 'off', 1), axis=1)
merged_data_collapsed_teams['TRADED_NEXT_REG'] = merged_data_collapsed_teams.apply(lambda x: transaction_builder(x.PLAYER_ID, x.SEASON_START, 'TRADED', 'reg', 1), axis=1)
merged_data_collapsed_teams['TRADED_NEXT_POST'] = merged_data_collapsed_teams.apply(lambda x: transaction_builder(x.PLAYER_ID, x.SEASON_START, 'TRADED', 'post', 1), axis=1)

In [120]:
merged_data_collapsed_teams['TRADED_NEXT_NBA_YEAR'] = merged_data_collapsed_teams.apply(lambda x: nba_year_builder(x.TRADED_NEXT_OFF, x.TRADED_NEXT_REG, x.TRADED_NEXT_POST), axis=1)

In [121]:
def in_league_next(player_id, season_start):
    """Return 1 if the player has a row for the following season, else 0.

    The lookup is made against the module-level
    ``merged_data_collapsed_teams`` frame, matching ``PLAYER_ID`` and
    ``SEASON_START == season_start + 1``.
    """
    same_player = merged_data_collapsed_teams['PLAYER_ID'] == player_id
    following_season = merged_data_collapsed_teams['SEASON_START'] == season_start + 1
    next_season_rows = merged_data_collapsed_teams[same_player & following_season]
    return 0 if next_season_rows.empty else 1

In [122]:
merged_data_collapsed_teams['IN_LEAGUE_NEXT'] = merged_data_collapsed_teams.apply(lambda x: in_league_next(x.PLAYER_ID, x.SEASON_START), axis=1)

In [123]:
merged_data_collapsed_teams

Unnamed: 0,NAME,PLAYER_ID,SEASON_START,TEAM_ABBREVIATION,PLAYER_AGE,EXPERIENCE,POS,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PER,TS_PERCENT,X3P_AR,F_TR,ORB_PERCENT,DRB_PERCENT,TRB_PERCENT,AST_PERCENT,STL_PERCENT,BLK_PERCENT,TOV_PERCENT,USG_PERCENT,OWS,DWS,WS,WS_48,OBPM,DBPM,BPM,VORP,WAIVED_OFF,WAIVED_REG,WAIVED_POST,WAIVED_NBA_YEAR,WAIVED_NEXT_OFF,WAIVED_NEXT_REG,WAIVED_NEXT_POST,WAIVED_NEXT_NBA_YEAR,TRADED_OFF,TRADED_REG,TRADED_POST,TRADED_NBA_YEAR,TRADED_NEXT_OFF,TRADED_NEXT_REG,TRADED_NEXT_POST,TRADED_NEXT_NBA_YEAR,IN_LEAGUE_NEXT
0,Byron Scott,2,1990,[LAL],30.0,8,SG,82,82.0,2630.0,501,1051,0.477,71.0,219.0,0.324,118,148,0.797,54.0,192.0,246.0,177,95.0,21.0,85.0,146,1191,14.2,0.534,0.208,0.141,2.5,8.1,5.4,10.3,1.8,0.5,7.1,19.9,3.6,2.9,6.6,0.120,0.3,0.6,0.9,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,Byron Scott,2,1991,[LAL],31.0,9,SG,82,82.0,2679.0,460,1005,0.458,54.0,157.0,0.344,244,291,0.838,74.0,236.0,310.0,226,105.0,28.0,119.0,140,1218,15.5,0.537,0.156,0.290,3.1,10.2,6.6,13.4,2.0,0.6,9.5,20.4,3.9,2.1,6.0,0.107,1.0,0.1,1.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,Byron Scott,2,1992,[LAL],32.0,10,SG,58,53.0,1677.0,296,659,0.449,44.0,135.0,0.326,156,184,0.848,27.0,107.0,134.0,157,55.0,13.0,70.0,98,792,14.4,0.535,0.205,0.279,1.8,7.3,4.6,14.2,1.6,0.5,8.6,20.7,2.4,0.9,3.3,0.094,0.3,-1.0,-0.7,0.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,Byron Scott,2,1993,[IND],33.0,11,SG,67,2.0,1197.0,256,548,0.467,27.0,74.0,0.365,157,195,0.805,19.0,91.0,110.0,133,62.0,9.0,103.0,80,696,17.6,0.549,0.135,0.356,1.9,8.5,5.3,18.9,2.7,0.5,14.0,27.0,1.7,1.4,3.1,0.123,1.3,0.2,1.5,1.1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,Byron Scott,2,1994,[IND],34.0,12,PG,80,1.0,1528.0,265,583,0.455,79.0,203.0,0.389,193,227,0.850,18.0,133.0,151.0,108,61.0,13.0,119.0,123,802,15.8,0.587,0.348,0.389,1.5,10.3,6.0,12.2,2.1,0.7,14.8,24.0,2.1,1.9,3.9,0.124,1.1,0.3,1.4,1.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19604,Dexter Dennis,1641926,2023,[DAL],25.0,1,SG,4,0.0,30.0,10,19,0.526,1.0,8.0,0.125,1,3,0.333,1.0,8.0,9.0,4,0.0,1.0,3.0,1,22,18.3,0.541,0.421,0.158,3.7,29.0,16.4,23.7,0.0,2.9,12.9,33.3,0.0,0.0,0.0,0.004,1.7,-0.1,1.7,0.0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
19605,Onuralp Bitim,1641931,2023,[CHI],25.0,1,SG,23,1.0,268.0,32,84,0.381,12.0,44.0,0.273,4,5,0.800,6.0,26.0,32.0,13,3.0,2.0,10.0,24,80,5.0,0.464,0.524,0.060,2.5,11.1,6.7,6.5,0.6,0.8,10.4,15.8,-0.3,0.1,-0.1,-0.027,-5.4,-2.0,-7.4,-0.4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19606,Maozinha Pereira,1641970,2023,[MEM],23.0,1,SF,7,1.0,122.0,18,35,0.514,5.0,13.0,0.385,7,10,0.700,11.0,26.0,37.0,2,6.0,4.0,5.0,10,48,15.7,0.609,0.371,0.286,9.5,24.3,16.6,2.5,2.4,3.1,11.3,15.6,0.1,0.2,0.3,0.123,-1.2,0.7,-0.5,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19609,Trey Jemison,1641998,2023,"[WAS, MEM]",24.0,1,C,25,14.0,574.0,75,136,0.551,0.0,0.0,0.000,21,25,0.840,64.0,70.0,134.0,27,12.0,28.0,34.0,69,171,11.8,0.582,0.000,0.184,11.7,13.9,12.8,7.1,1.0,4.6,18.8,13.5,0.3,0.6,0.9,0.073,-4.7,-0.4,-5.1,-0.5,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [124]:
merged_data_collapsed_teams = merged_data_collapsed_teams.rename(columns={'TEAM_ABBREVIATION': 'TEAMS_LIST'})

In [125]:
merged_data_collapsed_teams.to_csv('merged_data_collapsed_teams.csv', index=False)