In [225]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import seaborn as sns; sns.set()

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Build lineup dataset that has the pts_diff and then 300+ cols for merged numerical stat profile of each player wrt previous season

Steps
- Isolate numeric features for player dataset
- cull lineup data to lineups where all players have entries for the prev season in the player dataset
- get the player stat profile for each lineup
- combine valid lineup dataframe with pts_diff with the lineup combined player stat profiles 
  - keep the lineup mins played data
- Write out dataset

In [230]:
df = pd.read_csv('datasets/master_players_stats_bio_clusters_pca.csv', index_col=0)
df.shape

(6541, 145)

### Isolate numeric features that can be used

In [232]:
dropcols = list(df.columns[df.columns.str.contains('pc_')]) + list(df.columns[df.columns.str.contains('cluster_')]) 
df_num = df.drop(dropcols, axis=1)
df_num.shape

df_num.dtypes.value_counts()

df_num.select_dtypes('O').columns
df_num.drop(['pos', 'team_id', 'salary', 'nationality'], axis=1, inplace=True)
## keep player_id for merging later
df_num.shape

(6541, 92)

float64    77
int64      10
object      5
dtype: int64

Index(['player_id', 'pos', 'team_id', 'salary', 'nationality'], dtype='object')

(6541, 88)

In [226]:
sorted(df_num.columns)

## leaving in clustering label

['age',
 'and1s_per_min',
 'ast_pct',
 'ast_per_poss',
 'astd_pts_per_min',
 'avg_dist',
 'blk_pct',
 'blk_per_poss',
 'bpm',
 'dbpm',
 'def_rtg',
 'drawn_shooting_per_min',
 'drb_pct',
 'drb_per_poss',
 'dws',
 'fg2_pct',
 'fg2_per_poss',
 'fg2a_per_poss',
 'fg3_heave',
 'fg3_pct',
 'fg3_per_poss',
 'fg3a_heave',
 'fg3a_per_fga_pct',
 'fg3a_per_poss',
 'fg_dunk_per_min',
 'fg_pct',
 'fg_pct_00_03',
 'fg_pct_03_10',
 'fg_pct_10_16',
 'fg_pct_16_xx',
 'fg_pct_corner3',
 'fg_pct_fg2a',
 'fg_pct_fg3a',
 'fg_per_poss',
 'fga_per_poss',
 'fouls_offensive_per_min',
 'fouls_shooting_per_min',
 'ft_pct',
 'ft_per_poss',
 'fta_per_fga_pct',
 'fta_per_poss',
 'g',
 'gs',
 'height',
 'label',
 'mp',
 'obpm',
 'off_rtg',
 'orb_pct',
 'orb_per_poss',
 'own_shots_blk_per_min',
 'ows',
 'pct_ast_fg2',
 'pct_ast_fg3',
 'pct_c',
 'pct_fg3a_corner3',
 'pct_fga_00_03',
 'pct_fga_03_10',
 'pct_fga_10_16',
 'pct_fga_16_xx',
 'pct_fga_dunk',
 'pct_fga_fg2a',
 'pct_fga_fg3a',
 'pct_pf',
 'pct_pg',
 'pct_sf',

In [245]:
df_line = pd.read_csv('datasets/master_4man_lineup_clusters_2000_2020.csv', index_col=0)
df_line.shape

(5222, 51)

### Drop 2000 and all columns but `diff_pts` as player season data starts in 2000 and I'm using a 1 year lag

In [246]:
df_line = df_line[df_line['year'] > 2000].copy()
df_line = df_line[['diff_pts', 'year', 'player_ids', 'mp']].copy()
df_line

Unnamed: 0_level_0,diff_pts,year,player_ids,mp
lineup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
R. McLeod | D. Mutombo | J. Terry | L. Wright ATL 2001,3.6,2001,"mcleoro01, mutomdi01, terryja01, wrighlo02",410.583333
S. Abdur-Rahim | D. Johnson | N. Mohammed | J. Terry ATL 2002,1.2,2002,"abdursh01, johnsde03, mohamna01, terryja01",665.516667
S. Abdur-Rahim | N. Mohammed | I. Newble | J. Terry ATL 2002,-2.7,2002,"abdursh01, mohamna01, newblir01, terryja01",591.433333
S. Abdur-Rahim | D. Johnson | I. Newble | J. Terry ATL 2002,3.4,2002,"abdursh01, johnsde03, newblir01, terryja01",566.183333
S. Abdur-Rahim | D. Johnson | N. Mohammed | I. Newble ATL 2002,-0.6,2002,"abdursh01, johnsde03, mohamna01, newblir01",492.583333
...,...,...,...,...
T. Ariza | B. Beal | T. Bryant | T. Satoranský WAS 2019,-0.5,2019,"arizatr01, bealbr01, bryanth01, satorto01",460.150000
B. Beal | T. Bryant | J. Green | T. Satoranský WAS 2019,-1.3,2019,"bealbr01, bryanth01, greenje02, satorto01",443.250000
T. Ariza | T. Bryant | J. Green | T. Satoranský WAS 2019,-0.4,2019,"arizatr01, bryanth01, greenje02, satorto01",359.783333
B. Beal | M. Morris | O. Porter | J. Wall WAS 2019,-0.8,2019,"bealbr01, morrima02, porteot01, walljo01",342.833333


## Func for building merged player stat profile

In [247]:
def check_if_valid_lineup(lineup, df_players):
    player_ids = lineup['player_ids'].split(', ')
    year = lineup['year'] - 1    
    for pid in player_ids:
        if df_players[(df_players['player_id']==pid) & (df_players['year']==year)].empty:
            return False
    return True

# l1 = df_line.iloc[0]
# check_if_valid_lineup(l1, df)    

In [248]:
df_players = df_num.copy()
indices = df_line.apply(lambda lineup: check_if_valid_lineup(lineup, df_players), axis=1)

In [249]:
indices.value_counts()

True     3356
False    1579
dtype: int64

In [199]:
## checking that a player is actually missing
pid = 'abdursh01'
pid = 'mohamna01'
pid = 'newblir01'  # missing

year = 2001
player = df_players[(df_players['player_id']==pid) & (df_players['year']==year)]
player

Unnamed: 0_level_0,player_id,age,g,mp,per,ts_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,...,off_rtg,orb_per_poss,pf_per_poss,pts_per_poss,stl_per_poss,tov_per_poss,trb_per_poss,height,weight,label
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [250]:
df_valid_line = df_line[indices].copy()
df_valid_line.shape

(3356, 4)

## Need to build the player stat profile now for each valid lineup

In [255]:
def build_lineup_stat_profile(lineup, df_players):
    player_ids = lineup['player_ids'].split(', ')
    year = lineup['year'] - 1
    stats = []
    for pid in player_ids:
        player = df_players[(df_players['player_id']==pid) & (df_players['year']==year)]
#         stats.append(player.drop('player_id', axis=1).values)
        stats.append(player.values)
    return reduce(lambda left, right: np.append(left, right), stats)

### Build lineup series and combine with valid lineup df

In [253]:
lineup_stat_profile_series = df_valid_line.apply(lambda lineup: build_lineup_stat_profile(lineup, df_players), axis=1)
lineup_stat_profiles = [np.array(lineup_stat_profile_series[row]) for row in range(len(lineup_stat_profile_series))]
df_lineup_stat_profiles = pd.DataFrame(lineup_stat_profiles, index=df_valid_line.index)
df_lineups_with_player_stats = df_valid_line.merge(df_lineup_stat_profiles, left_index=True, right_index=True, sort=False)


In [256]:
df_lineups_with_player_stats.shape
df_lineups_with_player_stats.head()
# df_lineups_with_player_stats.drop('year', axis=1, inplace=True)
df_lineups_with_player_stats.drop(df_lineups_with_player_stats.select_dtypes('O').columns, axis=1, inplace=True)
df_lineups_with_player_stats.shape
df_lineups_with_player_stats.head()



(3368, 351)

Unnamed: 0_level_0,diff_pts,year,mp,1,2,3,4,5,6,7,...,342,343,344,345,346,347,348,349,350,351
lineup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A. Afflalo | C. Anthony | C. Billups | K. Martin DEN 2010,7.2,2010,598.85,23,74,1234,8.9,0.548,0.357,0.237,...,104.0,2.0,4.8,18.5,2.3,2.5,9.5,81,234,2
A. Afflalo | C. Anthony | C. Billups | K. Martin DEN 2011,7.8,2011,351.216667,24,82,2221,10.9,0.576,0.426,0.168,...,101.0,3.6,4.7,17.0,1.8,2.3,13.9,81,234,2
A. Afflalo | C. Anthony | C. Billups | N. Hilário DEN 2010,4.8,2010,1073.5,23,74,1234,8.9,0.548,0.357,0.237,...,120.0,3.7,5.6,22.8,1.9,3.0,12.1,73,250,0
A. Afflalo | C. Anthony | C. Billups | N. Hilário DEN 2011,4.0,2011,807.683333,24,82,2221,10.9,0.576,0.426,0.168,...,124.0,3.1,5.2,20.9,2.1,2.3,11.5,73,250,7
A. Afflalo | C. Anthony | J. Calderón | R. Lopez NYK 2016,-3.8,2016,953.283333,29,78,2502,10.7,0.533,0.377,0.224,...,117.0,5.9,3.8,17.6,0.5,2.3,12.3,84,281,7


(3368, 351)

Unnamed: 0_level_0,diff_pts,year,mp,1,2,3,4,5,6,7,...,342,343,344,345,346,347,348,349,350,351
lineup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A. Afflalo | C. Anthony | C. Billups | K. Martin DEN 2010,7.2,2010,598.85,23,74,1234,8.9,0.548,0.357,0.237,...,104.0,2.0,4.8,18.5,2.3,2.5,9.5,81,234,2
A. Afflalo | C. Anthony | C. Billups | K. Martin DEN 2011,7.8,2011,351.216667,24,82,2221,10.9,0.576,0.426,0.168,...,101.0,3.6,4.7,17.0,1.8,2.3,13.9,81,234,2
A. Afflalo | C. Anthony | C. Billups | N. Hilário DEN 2010,4.8,2010,1073.5,23,74,1234,8.9,0.548,0.357,0.237,...,120.0,3.7,5.6,22.8,1.9,3.0,12.1,73,250,0
A. Afflalo | C. Anthony | C. Billups | N. Hilário DEN 2011,4.0,2011,807.683333,24,82,2221,10.9,0.576,0.426,0.168,...,124.0,3.1,5.2,20.9,2.1,2.3,11.5,73,250,7
A. Afflalo | C. Anthony | J. Calderón | R. Lopez NYK 2016,-3.8,2016,953.283333,29,78,2502,10.7,0.533,0.377,0.224,...,117.0,5.9,3.8,17.6,0.5,2.3,12.3,84,281,7


## Write out

In [257]:
df_lineups_with_player_stats.to_csv('datasets/4man_lineups_with_player_stat_profiles.csv')

In [258]:
df_lineup_stat_profiles.select_dtypes('O').head()
# lineup_stat_profile_series


Unnamed: 0_level_0,0,88,176,264
lineup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
R. McLeod | D. Mutombo | J. Terry | L. Wright ATL 2001,mcleoro01,mutomdi01,terryja01,wrighlo02
S. Abdur-Rahim | D. Johnson | N. Mohammed | J. Terry ATL 2002,abdursh01,johnsde03,mohamna01,terryja01
S. Abdur-Rahim | N. Mohammed | J. Terry | J. Vaughn ATL 2002,abdursh01,mohamna01,terryja01,vaughja01
S. Abdur-Rahim | D. Glover | N. Mohammed | J. Terry ATL 2002,abdursh01,glovedi01,mohamna01,terryja01
S. Abdur-Rahim | T. Kukoč | N. Mohammed | J. Terry ATL 2002,abdursh01,kukocto01,mohamna01,terryja01


### Check that stats and index match wrt player ids

In [204]:
df_lineup_player_stats.select_dtypes('O')

Unnamed: 0_level_0,player_ids,0,88,176,264
lineup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A. Afflalo | C. Anthony | C. Billups | K. Martin DEN 2010,"afflaar01, anthoca01, billuch01, martike01",afflaar01,anthoca01,billuch01,martike01
A. Afflalo | C. Anthony | C. Billups | K. Martin DEN 2011,"afflaar01, anthoca01, billuch01, martike01",afflaar01,anthoca01,billuch01,martike01
A. Afflalo | C. Anthony | C. Billups | N. Hilário DEN 2010,"afflaar01, anthoca01, billuch01, hilarne01",afflaar01,anthoca01,billuch01,hilarne01
A. Afflalo | C. Anthony | C. Billups | N. Hilário DEN 2011,"afflaar01, anthoca01, billuch01, hilarne01",afflaar01,anthoca01,billuch01,hilarne01
A. Afflalo | C. Anthony | J. Calderón | R. Lopez NYK 2016,"afflaar01, anthoca01, caldejo01, lopezro01",afflaar01,anthoca01,caldejo01,lopezro01
...,...,...,...,...,...
Ö. Aşık | J. Harden | C. Parsons | P. Patterson HOU 2013,"asikom01, hardeja01, parsoch01, pattepa01",asikom01,hardeja01,parsoch01,pattepa01
Ö. Aşık | J. Harden | J. Lin | C. Parsons HOU 2013,"asikom01, hardeja01, linje01, parsoch01",asikom01,hardeja01,linje01,parsoch01
Ö. Aşık | J. Harden | J. Lin | C. Parsons HOU 2014,"asikom01, hardeja01, linje01, parsoch01",asikom01,hardeja01,linje01,parsoch01
Ö. Aşık | J. Harden | J. Lin | P. Patterson HOU 2013,"asikom01, hardeja01, linje01, pattepa01",asikom01,hardeja01,linje01,pattepa01


In [188]:
df_lineup_player_stats = df_valid_line.copy()
df_lineup_player_stats = df_lineup_player_stats.merge(df_lineup_stat_profiles, left_index=True, right_index=True)
df_lineup_player_stats.shape
df_lineup_player_stats.head()



(3368, 355)

Unnamed: 0_level_0,diff_pts,year,player_ids,0,1,2,3,4,5,6,...,342,343,344,345,346,347,348,349,350,351
lineup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A. Afflalo | C. Anthony | C. Billups | K. Martin DEN 2010,7.2,2010,"afflaar01, anthoca01, billuch01, martike01",afflaar01,23,74,1234,8.9,0.548,0.357,...,104.0,2.0,4.8,18.5,2.3,2.5,9.5,81,234,2
A. Afflalo | C. Anthony | C. Billups | K. Martin DEN 2011,7.8,2011,"afflaar01, anthoca01, billuch01, martike01",afflaar01,24,82,2221,10.9,0.576,0.426,...,101.0,3.6,4.7,17.0,1.8,2.3,13.9,81,234,2
A. Afflalo | C. Anthony | C. Billups | N. Hilário DEN 2010,4.8,2010,"afflaar01, anthoca01, billuch01, hilarne01",afflaar01,23,74,1234,8.9,0.548,0.357,...,120.0,3.7,5.6,22.8,1.9,3.0,12.1,73,250,0
A. Afflalo | C. Anthony | C. Billups | N. Hilário DEN 2011,4.0,2011,"afflaar01, anthoca01, billuch01, hilarne01",afflaar01,24,82,2221,10.9,0.576,0.426,...,124.0,3.1,5.2,20.9,2.1,2.3,11.5,73,250,7
A. Afflalo | C. Anthony | J. Calderón | R. Lopez NYK 2016,-3.8,2016,"afflaar01, anthoca01, caldejo01, lopezro01",afflaar01,29,78,2502,10.7,0.533,0.377,...,117.0,5.9,3.8,17.6,0.5,2.3,12.3,84,281,7


In [173]:
df_valid_line


Unnamed: 0_level_0,diff_pts,year,player_ids
lineup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
R. McLeod | D. Mutombo | J. Terry | L. Wright ATL 2001,3.6,2001,"mcleoro01, mutomdi01, terryja01, wrighlo02"
S. Abdur-Rahim | D. Johnson | N. Mohammed | J. Terry ATL 2002,1.2,2002,"abdursh01, johnsde03, mohamna01, terryja01"
S. Abdur-Rahim | N. Mohammed | J. Terry | J. Vaughn ATL 2002,-10.4,2002,"abdursh01, mohamna01, terryja01, vaughja01"
S. Abdur-Rahim | D. Glover | N. Mohammed | J. Terry ATL 2002,-13.8,2002,"abdursh01, glovedi01, mohamna01, terryja01"
S. Abdur-Rahim | T. Kukoč | N. Mohammed | J. Terry ATL 2002,-5.2,2002,"abdursh01, kukocto01, mohamna01, terryja01"
...,...,...,...
B. Beal | K. Oubre | O. Porter | J. Wall WAS 2018,14.8,2018,"bealbr01, oubreke01, porteot01, walljo01"
B. Beal | M. Gortat | K. Oubre | O. Porter WAS 2018,4.1,2018,"bealbr01, gortama01, oubreke01, porteot01"
T. Ariza | B. Beal | J. Green | T. Satoranský WAS 2019,-2.5,2019,"arizatr01, bealbr01, greenje02, satorto01"
B. Beal | M. Morris | O. Porter | J. Wall WAS 2019,-0.8,2019,"bealbr01, morrima02, porteot01, walljo01"


In [149]:
len(df_players.columns) * 4 - 4


348

In [148]:
l1 = df_valid_line.iloc[0]
l1

l1_profile = build_lineup_stat_profile(l1, df_players)
len(l1_profile)

diff_pts                                             3.6
year                                                2001
player_ids    mcleoro01, mutomdi01, terryja01, wrighlo02
Name: R. McLeod | D. Mutombo | J. Terry | L. Wright ATL 2001, dtype: object

348

In [105]:
df_players = df_num.copy()
df_players.dtypes.value_counts()

float64    77
int64      10
object      1
dtype: int64

In [124]:
pid = 'mohamna01'
year = 2001
player = df_players[(df_players['player_id']==pid) & (df_players['year']==year)]
# a= []
# a.append(player.to_numpy()[0])
a = player.to_numpy()

In [137]:
b = np.append(a, ['Yooo', 'Hello, there'])
pid = 'mohamna01'
year = 2002
c = df_players[(df_players['player_id']==pid) & (df_players['year']==year)].values
pid = 'mohamna01'
year = 2004
g = df_players[(df_players['player_id']==pid) & (df_players['year']==year)].values
# c.values

# d = np.append(b, c.values)

h = [a,c,g]
from functools import reduce

i = reduce(lambda left, right: np.append(left, right), h)
i

# f = np.append(a, c)
# f


array(['mohamna01', 23, 58, 912, 19.2, 0.52, 0.003, 0.341, 14.1, 23.6,
       18.8, 3.9, 1.7, 2.8, 13.1, 24.0, 1.1, 1.4, 2.5, 0.129, -0.2, -1.3,
       -1.5, 0.1, 2001, 5.3, 0, 0, 0.03399122807017544,
       0.47700000000000004, 0.642, 0.316, 0.342, 0.278, 0.0, 0.478, 0.0,
       0.636, 0.0, 0.0, 0.485, 0.266, 0.198, 0.049, 0.08900000000000001,
       0.997, 0.003, 0.015350877192982457, 0.0449561403508772,
       0.0668859649122807, 0.009868421052631578, 0.0756578947368421,
       0.042763157894736836, 0.0, 0.0, 0.0, 22.0, 78.0, -0.5, -1.0,
       0.01644736842105263, 0.03179824561403509, 1.1, 2.0, 99.0, 11.0,
       0.478, 10.1, 21.1, 0.0, 0.0, 0.1, 10.1, 21.2, 0.706, 5.1, 7.2, 22,
       103.0, 6.6, 6.5, 25.3, 1.7, 3.7, 17.6, 72, 221, 7, 'mohamna01', 24,
       82, 2168, 14.0, 0.49, 0.001, 0.311, 12.7, 21.6, 17.1, 2.6, 1.5,
       2.1, 12.7, 19.4, 0.9, 2.4, 3.3, 0.07200000000000001, -2.2, -1.6,
       -3.8, -1.0, 2002, 5.8, 0, 1, 0.026752767527675275, 0.461, 0.583,
       0.332999999