In [1]:
#Import dependencies
import numpy as np
import pandas as pd

# Data Pre-Processing

In [2]:
# Load the season-by-season player statistics from disk into a DataFrame
nba_players = pd.read_csv(filepath_or_buffer='Data/Seasons_Stats.csv')

# Preview the first five rows as the cell output
nba_players.head(n=5)

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,...,0.705,,,,176.0,,,,217.0,458.0
1,1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,...,0.708,,,,109.0,,,,99.0,279.0
2,2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,...,0.698,,,,140.0,,,,192.0,438.0
3,3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,...,0.559,,,,20.0,,,,29.0,63.0
4,4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,...,0.548,,,,20.0,,,,27.0,59.0


In [3]:
# Clean the loaded frame in a single pipeline; each step matches one stage of
# the pre-processing, applied in this exact order.
nba_players = (
    nba_players
    # Remove rows whose team code is 'TOT' to prevent duplicate data
    # (presumably the combined season line of a player who also has
    # per-team rows for the same year -- verify against the data source)
    .loc[lambda frame: frame['Tm'] != 'TOT']
    # Discard columns in which every remaining value is null
    .dropna(axis='columns', how='all')
    # Keep seasons from 1982 onward (data prior to the rise of metrics is
    # dropped); rows with a missing Year fail the comparison and are removed too
    .loc[lambda frame: frame['Year'] >= 1982]
    # Replace the remaining nulls with 0
    .fillna(0)
    # Renumber the rows from 0 and throw the old index away
    .reset_index(drop=True)
)

# Preview the cleaned frame
nba_players.head()

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,6449,1982.0,Kareem Abdul-Jabbar*,C,34.0,LAL,76.0,76.0,2677.0,23.4,...,0.706,172.0,487.0,659.0,225.0,63.0,207.0,230.0,224.0,1818.0
1,6450,1982.0,Alvan Adams,C,27.0,PHO,79.0,75.0,2393.0,18.6,...,0.781,138.0,448.0,586.0,356.0,114.0,78.0,196.0,269.0,1196.0
2,6451,1982.0,Mark Aguirre,SF,22.0,DAL,51.0,20.0,1468.0,17.3,...,0.68,89.0,160.0,249.0,164.0,37.0,22.0,135.0,152.0,955.0
3,6452,1982.0,Danny Ainge,SG,22.0,BOS,53.0,1.0,564.0,10.1,...,0.862,25.0,31.0,56.0,87.0,37.0,3.0,53.0,86.0,219.0
4,6453,1982.0,Tiny Archibald*,PG,33.0,BOS,68.0,51.0,2167.0,14.3,...,0.747,25.0,91.0,116.0,541.0,52.0,3.0,178.0,131.0,858.0


In [5]:
# Display the column labels of the nba_players dataframe.
# Nothing is printed explicitly: the Index object is the cell's last
# expression, so the notebook renders it as the cell output.
nba_players.columns

Index(['Unnamed: 0', 'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP',
       'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
       'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM',
       'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA',
       '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

# Feature Selection

In [11]:
# Columns used by the percentage-based models, grouped by kind of stat
percent_important_columns = [
    'Year', 'Pos',
    'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    '3P%', '2P%', 'FT%',
]

# Build a dataframe holding only those columns, in the order listed above
percent_nba_players = nba_players.loc[:, percent_important_columns]

# Write the percentage-based dataframe to its own CSV file.
# The row index is written as well (to_csv default), so the file starts
# with an unnamed index column.
percent_nba_players.to_csv(path_or_buf='Data/percent_stats.csv')

# Display the dataframe as the cell output
percent_nba_players

Unnamed: 0,Year,Pos,TRB%,AST%,STL%,BLK%,TOV%,USG%,3P%,2P%,FT%
0,1982.0,C,13.4,11.9,1.1,4.1,13.3,25.6,0.000,0.580,0.706
1,1982.0,C,13.6,22.1,2.3,1.9,14.8,22.8,0.000,0.494,0.781
2,1982.0,SF,9.7,18.6,1.2,0.9,12.7,29.8,0.352,0.475,0.680
3,1982.0,SG,5.5,19.7,3.1,0.3,17.5,21.5,0.294,0.363,0.862
4,1982.0,PG,2.9,31.9,1.1,0.1,18.4,17.9,0.375,0.475,0.747
...,...,...,...,...,...,...,...,...,...,...,...
16629,2017.0,PF,12.9,9.1,1.8,3.0,10.9,15.5,0.000,0.572,0.679
16630,2017.0,C,13.2,12.2,0.7,3.3,10.2,16.5,0.000,0.497,0.564
16631,2017.0,C,17.6,5.3,0.9,3.7,8.3,14.8,0.000,0.323,0.600
16632,2017.0,SF,8.0,6.1,0.9,1.5,14.4,14.4,0.333,0.451,0.775


In [12]:
# Columns used by the raw-valued (counting stat) models, grouped by kind of stat
raw_important_columns = [
    'Year', 'Pos',
    'G', 'MP',
    'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
    'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]

# Build a dataframe holding only those columns, in the order listed above
raw_nba_players = nba_players.loc[:, raw_important_columns]

# Write the raw-valued dataframe to its own CSV file.
# The row index is written as well (to_csv default), so the file starts
# with an unnamed index column.
raw_nba_players.to_csv(path_or_buf='Data/raw_stats.csv')

# Display the dataframe as the cell output
raw_nba_players

Unnamed: 0,Year,Pos,G,MP,FG,FGA,3P,3PA,2P,2PA,FT,FTA,TRB,AST,STL,BLK,TOV,PF,PTS
0,1982.0,C,76.0,2677.0,753.0,1301.0,0.0,3.0,753.0,1298.0,312.0,442.0,659.0,225.0,63.0,207.0,230.0,224.0,1818.0
1,1982.0,C,79.0,2393.0,507.0,1027.0,0.0,1.0,507.0,1026.0,182.0,233.0,586.0,356.0,114.0,78.0,196.0,269.0,1196.0
2,1982.0,SF,51.0,1468.0,381.0,820.0,25.0,71.0,356.0,749.0,168.0,247.0,249.0,164.0,37.0,22.0,135.0,152.0,955.0
3,1982.0,SG,53.0,564.0,79.0,221.0,5.0,17.0,74.0,204.0,56.0,65.0,56.0,87.0,37.0,3.0,53.0,86.0,219.0
4,1982.0,PG,68.0,2167.0,308.0,652.0,6.0,16.0,302.0,636.0,236.0,316.0,116.0,541.0,52.0,3.0,178.0,131.0,858.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16629,2017.0,PF,62.0,1725.0,253.0,443.0,0.0,1.0,253.0,442.0,133.0,196.0,405.0,99.0,62.0,58.0,65.0,189.0,639.0
16630,2017.0,C,51.0,525.0,78.0,158.0,0.0,1.0,78.0,157.0,22.0,39.0,124.0,42.0,7.0,21.0,20.0,61.0,178.0
16631,2017.0,C,19.0,108.0,10.0,31.0,0.0,0.0,10.0,31.0,3.0,5.0,35.0,4.0,2.0,5.0,3.0,17.0,23.0
16632,2017.0,SF,44.0,843.0,88.0,221.0,33.0,99.0,55.0,122.0,31.0,40.0,125.0,36.0,15.0,16.0,40.0,78.0,240.0
