In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

## Steps

1. drop columns
2. enforce minutes threshold: keep rows with more than 700 total minutes OR (at least 20 MPG and at least 20 GP)
3. write data as cleaned_data

- Resulting shape: (7246, 67) (86 original columns, minus the 20 dropped, plus the added MPG column)
- Marcus Williams (NJN) 07 is the only remaining player who originally had the double-entry problem. The row is left as is to indicate that these are partial stats; it shouldn't matter.

In [88]:
## Load in all data
# Build the path to the master CSV inside the season data directory.
data_dir = 'season_data/'
master_data = data_dir + 'master_1996_2019.csv'
# index_col=0: the first CSV column becomes the row index.
df_master = pd.read_csv(master_data, index_col=0)
# Show the dimensions, then preview the first rows.
print(df_master.shape)
df_master.head()

(11149, 86)


Unnamed: 0,%3PA,%3PM,%AST,%BLK,%BLKA,%DREB,%FGA,%FGA 2PT,%FGA 3PT,%FGM,...,REB%,STL,TD3,TEAM,TO RATIO,TOT MIN,TOV,TS%,USG%,W
A.C. Green 96,2.6,0.4,6.8,8.4,15.5,29.9,11.8,95.9,4.1,13.3,...,13.5,1.5,0,DAL,10.3,2494.0,1.6,52.3,11.8,23
Aaron McKie 96,16.6,19.6,25.2,12.4,22.6,17.7,14.4,71.8,28.2,13.2,...,7.1,2.5,0,DET,13.5,1624.0,2.9,52.4,14.2,48
Aaron Williams 96,0.5,0.0,6.3,40.8,11.9,24.3,16.6,99.3,0.7,22.4,...,12.9,1.5,0,VAN,15.0,562.0,3.0,59.9,16.1,4
Acie Earl 96,2.9,0.0,11.1,49.1,14.5,20.5,22.3,97.2,2.8,20.5,...,9.4,1.5,0,MIL,12.7,500.0,3.5,43.5,22.0,14
Adam Keefe 96,0.5,0.0,6.7,16.9,16.5,24.4,11.3,99.4,0.6,11.6,...,12.9,1.7,0,UTA,15.9,917.0,2.5,57.2,12.4,48


In [91]:
### Columns to drop

# Open question from the original notes: is it redundant to keep both
# '%FGA 2PT' and '%PTS 2PT' for players? The differences between them are
# interesting, so neither is listed here.

# Labels removed from the master table; order is kept as originally written.
drop_cols = [
    'TEAM', 'AGE', 'W', 'L', '+/-',
    '2ND PTS', 'DD2', 'DEFRTG', 'DEF WS', 'FP',
    'NETRTG', 'OFFRTG', 'OPP FBPS', 'OPP PTS OFF TO', 'OPP 2ND PTS',
    'PACE', 'PIE',
    'PFD',  # no data before 2005
    'PTS OFF TO', 'TD3',
]
len(drop_cols)

20

In [92]:
# Build a new frame without the unwanted columns; df_master itself is not modified.
df_reduced = df_master.drop(columns=drop_cols)

In [24]:
# Print the dimensions of the reduced frame, then preview its first rows.
print(df_reduced.shape)
df_reduced.head()

(11149, 67)


Unnamed: 0,%3PA,%3PM,%AST,%BLK,%BLKA,%DREB,%FGA,%FGA 2PT,%FGA 3PT,%FGM,...,PITP,PTS,REB,REB%,STL,TO RATIO,TOT MIN,TOV,TS%,USG%
A.C. Green 96,2.6,0.4,6.8,8.4,15.5,29.9,11.8,95.9,4.1,13.3,...,6.3,12.7,14.0,13.5,1.5,10.3,2494.0,1.6,52.3,11.8
Aaron McKie 96,16.6,19.6,25.2,12.4,22.6,17.7,14.4,71.8,28.2,13.2,...,5.0,14.0,7.2,7.1,2.5,13.5,1624.0,2.9,52.4,14.2
Aaron Williams 96,0.5,0.0,6.3,40.8,11.9,24.3,16.6,99.3,0.7,22.4,...,15.6,19.1,13.5,12.9,1.5,15.0,562.0,3.0,59.9,16.1
Acie Earl 96,2.9,0.0,11.1,49.1,14.5,20.5,22.3,97.2,2.8,20.5,...,10.1,19.0,9.7,9.4,1.5,12.7,500.0,3.5,43.5,22.0
Adam Keefe 96,0.5,0.0,6.7,16.9,16.5,24.4,11.3,99.4,0.6,11.6,...,7.6,13.3,12.2,12.9,1.7,15.9,917.0,2.5,57.2,12.4


In [25]:
# List the column labels that remain in the reduced frame.
df_reduced.columns

Index(['%3PA', '%3PM', '%AST', '%BLK', '%BLKA', '%DREB', '%FGA', '%FGA 2PT',
       '%FGA 3PT', '%FGM', '%FTA', '%FTM', '%OREB', '%PF', '%PFD', '%PTS',
       '%PTS 2PT', '%PTS 2PT MR', '%PTS 3PT', '%PTS FBPS', '%PTS FT',
       '%PTS OFFTO', '%PTS PITP', '%REB', '%STL', '%TOV', '2FGM %AST',
       '2FGM %UAST', '3FGM %AST', '3FGM %UAST', '3P%', '3PA', '3PM', 'AST',
       'AST RATIO', 'AST%', 'AST/TO', 'BLK', 'BLKA', 'DREB', 'DREB%', 'EFG%',
       'FBPS', 'FG%', 'FGA', 'FGM', 'FGM %AST', 'FGM %UAST', 'FT%', 'FTA',
       'FTM', 'GP', 'OPP PITP', 'OREB', 'OREB%', 'PF', 'PFD', 'PITP', 'PTS',
       'REB', 'REB%', 'STL', 'TO RATIO', 'TOT MIN', 'TOV', 'TS%', 'USG%'],
      dtype='object')

In [94]:
## Add minutes per game (MPG)
# MPG is total minutes divided by games played, computed element-wise per row.
df_reduced['MPG'] = df_reduced['TOT MIN'].div(df_reduced['GP'])
# Display the new column.
df_reduced['MPG']

A.C. Green 96          30.048193
Aaron McKie 96         19.566265
Aaron Williams 96      17.030303
Acie Earl 96           10.638298
Adam Keefe 96          14.790323
                         ...    
Zach LaVine 19         34.750000
Zach Norvell Jr. 19     8.200000
Zhaire Smith 19         4.571429
Zion Williamson 19     27.833333
Zylan Cheatham 19      12.750000
Name: MPG, Length: 11149, dtype: float64

In [95]:
# Keep a row when it clears either playing-time bar:
#   - more than 700 total minutes, or
#   - at least 20 minutes per game over at least 20 games played.
enough_total_minutes = df_reduced['TOT MIN'] > 700
enough_per_game = (df_reduced['MPG'] >= 20) & (df_reduced['GP'] >= 20)
df_reduced_700min = df_reduced[enough_total_minutes | enough_per_game]
df_reduced_700min.shape

(7246, 67)

In [79]:
# Alternative, stricter cut for comparison: more than 700 total minutes, or
# more than 30 minutes per game over more than 20 games played.
alt_total_minutes = df_reduced['TOT MIN'] > 700
alt_per_game = (df_reduced['MPG'] > 30) & (df_reduced['GP'] > 20)
alt = df_reduced[alt_total_minutes | alt_per_game]
alt.shape

(7166, 68)

In [73]:
# Rows that pass the main threshold but are absent from the alternative cut,
# shown with just the two columns that decide the per-game rule.
missing_from_alt = ~df_reduced_700min.index.isin(alt.index)
df_reduced_700min.loc[missing_from_alt, ['MPG', 'GP']]


Unnamed: 0,MPG,GP
Brian Grant 96,25.416667,24
Dee Brown 96,24.857143,21
Greg Minor 96,23.826087,23
Billy Owens 98,21.476190,21
Dennis Rodman 98,28.695652,23
...,...,...
Jake Layman 19,21.956522,23
Nicolas Batum 19,22.954545,22
Reggie Bullock 19,23.586207,29
Rodney Hood 19,29.476190,21


In [96]:
# Recompute the thresholded frame (same rule as the earlier filtering cell):
# more than 700 total minutes, or at least 20 MPG over at least 20 games.
passes_total_minutes = df_reduced['TOT MIN'] > 700
passes_per_game = (df_reduced['MPG'] >= 20) & (df_reduced['GP'] >= 20)
df_reduced_700min = df_reduced[passes_total_minutes | passes_per_game]

In [97]:
# Write the thresholded table into the same directory as the master CSV.
cleaned_path = data_dir + 'cleaned_data.csv'
df_reduced_700min.to_csv(cleaned_path)

In [83]:
# Show rows whose index label contains a closing parenthesis followed by a
# space, e.g. "Marcus Williams (NJN) 07".
# The pattern is a regular expression, so it is written as a raw string: in a
# normal string literal "\)" is an invalid escape sequence, which newer Python
# versions warn about. The matched text is unchanged.
df_reduced_700min[df_reduced_700min.index.str.contains(r'\) ')]

Unnamed: 0,%3PA,%3PM,%AST,%BLK,%BLKA,%DREB,%FGA,%FGA 2PT,%FGA 3PT,%FGM,...,PTS,REB,REB%,STL,TO RATIO,TOT MIN,TOV,TS%,USG%,MPG
Marcus Williams (NJN) 07,38.7,40.6,38.5,3.5,15.3,17.2,21.1,51.5,48.5,18.4,...,18.7,6.0,6.2,1.5,14.2,854.0,4.5,49.9,20.7,16.113208


In [30]:
# Scratch arithmetic — presumably 680 total minutes over 24 games expressed as
# minutes per game (TODO confirm what this was checking).
680/24

28.333333333333332

In [27]:
# Ten largest '%3PA' values in the reduced (unthresholded) frame.
df_reduced['%3PA'].sort_values(ascending=False).head(10)

Josh Akognon 12              100.0
Naz Mitrou-Long 17           100.0
Donte Grantham 18            100.0
Darius Johnson-Odom 12       100.0
Von Wafer 06                 100.0
Chris Boucher 17             100.0
Thanasis Antetokounmpo 15    100.0
Alvin Williams 06            100.0
Royal Ivey 13                100.0
Blake Ahearn 11               75.0
Name: %3PA, dtype: float64

In [None]:
### Columns to add:
# MPG


In [18]:
# List every column label in the unreduced master table.
df_master.columns

Index(['%3PA', '%3PM', '%AST', '%BLK', '%BLKA', '%DREB', '%FGA', '%FGA 2PT',
       '%FGA 3PT', '%FGM', '%FTA', '%FTM', '%OREB', '%PF', '%PFD', '%PTS',
       '%PTS 2PT', '%PTS 2PT MR', '%PTS 3PT', '%PTS FBPS', '%PTS FT',
       '%PTS OFFTO', '%PTS PITP', '%REB', '%STL', '%TOV', '+/-', '2FGM %AST',
       '2FGM %UAST', '2ND PTS', '3FGM %AST', '3FGM %UAST', '3P%', '3PA', '3PM',
       'AGE', 'AST', 'AST RATIO', 'AST%', 'AST/TO', 'BLK', 'BLKA', 'DD2',
       'DEF WS', 'DEFRTG', 'DREB', 'DREB%', 'EFG%', 'FBPS', 'FG%', 'FGA',
       'FGM', 'FGM %AST', 'FGM %UAST', 'FP', 'FT%', 'FTA', 'FTM', 'GP', 'L',
       'NETRTG', 'OFFRTG', 'OPP 2ND PTS', 'OPP FBPS', 'OPP PITP',
       'OPP PTS OFF TO', 'OREB', 'OREB%', 'PACE', 'PF', 'PFD', 'PIE', 'PITP',
       'PTS', 'PTS OFF TO', 'REB', 'REB%', 'STL', 'TD3', 'TEAM', 'TO RATIO',
       'TOT MIN', 'TOV', 'TS%', 'USG%', 'W'],
      dtype='object')

In [10]:
# Look at a handful of '%'-prefixed columns from the master table side by side.
preview_cols = ['%FGA 2PT', '%FGA 3PT', '%FGM', '%PTS FBPS', '%PTS PITP']
df_master[preview_cols]

Unnamed: 0,%FGA 2PT,%FGA 3PT,%FGM,%PTS FBPS,%PTS PITP
A.C. Green 96,95.9,4.1,13.3,13.4,49.6
Aaron McKie 96,71.8,28.2,13.2,18.9,35.6
Aaron Williams 96,99.3,0.7,22.4,9.9,81.8
Acie Earl 96,97.2,2.8,20.5,11.2,53.2
Adam Keefe 96,99.4,0.6,11.6,17.9,57.0
...,...,...,...,...,...
Zach LaVine 19,59.6,40.4,31.7,13.9,39.6
Zach Norvell Jr. 19,33.3,66.7,8.3,30.0,0.0
Zhaire Smith 19,72.7,27.3,11.1,0.0,75.0
Zion Williamson 19,96.1,3.9,33.4,15.9,74.4
