In [2]:
#Import dependencies
import numpy as np
import pandas as pd

# Data Pre-Processing

In [3]:
# Load the season-by-season player statistics from CSV and preview the first rows.
# NOTE(review): the file brings in an 'Unnamed: 0' column (it shows up in the
# nba_players.columns listing) — presumably a previously saved row index; confirm.
nba_players = pd.read_csv(filepath_or_buffer='Seasons_Stats.csv')
nba_players.head()

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,...,0.705,,,,176.0,,,,217.0,458.0
1,1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,...,0.708,,,,109.0,,,,99.0,279.0
2,2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,...,0.698,,,,140.0,,,,192.0,438.0
3,3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,...,0.559,,,,20.0,,,,29.0,63.0
4,4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,...,0.548,,,,20.0,,,,27.0,59.0


In [4]:
# Clean the loaded table in a single pipeline:
#   1. drop rows whose team is 'TOT' so a player-season is not counted twice
#      (presumably the combined line for players listed with several teams — confirm)
#   2. drop columns that contain no values at all
#   3. keep seasons from 1982 onward (the cutoff chosen for "the rise of metrics")
#   4. replace every remaining missing value with 0
#   5. renumber the rows from 0, discarding the old index
nba_players = (
    nba_players
    .loc[lambda frame: frame['Tm'] != 'TOT']
    .dropna(axis='columns', how='all')
    .loc[lambda frame: frame['Year'] >= 1982]
    .fillna(0)
    .reset_index(drop=True)
)

nba_players.head()

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,6449,1982.0,Kareem Abdul-Jabbar*,C,34.0,LAL,76.0,76.0,2677.0,23.4,...,0.706,172.0,487.0,659.0,225.0,63.0,207.0,230.0,224.0,1818.0
1,6450,1982.0,Alvan Adams,C,27.0,PHO,79.0,75.0,2393.0,18.6,...,0.781,138.0,448.0,586.0,356.0,114.0,78.0,196.0,269.0,1196.0
2,6451,1982.0,Mark Aguirre,SF,22.0,DAL,51.0,20.0,1468.0,17.3,...,0.68,89.0,160.0,249.0,164.0,37.0,22.0,135.0,152.0,955.0
3,6452,1982.0,Danny Ainge,SG,22.0,BOS,53.0,1.0,564.0,10.1,...,0.862,25.0,31.0,56.0,87.0,37.0,3.0,53.0,86.0,219.0
4,6453,1982.0,Tiny Archibald*,PG,33.0,BOS,68.0,51.0,2167.0,14.3,...,0.747,25.0,91.0,116.0,541.0,52.0,3.0,178.0,131.0,858.0


In [5]:
# NOTE(review): this whole cell is disabled, yet the (also disabled) train/test
# split cell further down reads `one_hot_y`, which is only produced here.
# The target is picked by position (`data[:, 3]`); per the nba_players.columns
# listing, position 3 is 'Pos' — selecting the column by name would be less
# fragile. Confirm before re-enabling.
# #Convert the string value for the "Pos" columns into a usable data type
# # Step 0: Reformat data
# data = nba_players.values
# y = data[:, 3]

# from sklearn.preprocessing import LabelEncoder
# # Step 1: Label-encode data set
# label_encoder = LabelEncoder()
# label_encoder.fit(y)
# encoded_y = label_encoder.transform(y)
    
# from keras.utils import to_categorical
# # Step 2: One-hot encoding
# one_hot_y = to_categorical(encoded_y)
# one_hot_y

In [6]:
# Show the column labels that remain after cleaning
nba_players.columns

Index(['Unnamed: 0', 'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP',
       'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
       'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM',
       'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA',
       '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

# Feature Selection

In [7]:
# Rate-style columns (plus the season year) kept for the percentage-based model
percent_important_columns = [
    'Year',
    'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    '3P%', '2P%', 'FT%',
]
percent_nba_players = nba_players.loc[:, percent_important_columns]

# Save the subset to disk.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column — confirm that downstream readers expect it.
percent_nba_players.to_csv('percent_stats.csv')

In [8]:
# Counting-stat columns (plus the season year) kept as the "raw" feature set
raw_important_columns = [
    'Year', 'G', 'MP',
    'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
    'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]
raw_nba_players = nba_players.loc[:, raw_important_columns]

# Save the subset to disk.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column — confirm that downstream readers expect it.
raw_nba_players.to_csv('raw_stats.csv')

# Test/Train Split for Percentage Model

In [43]:
# NOTE(review): this cell is disabled. It needs `one_hot_y` from the (also
# disabled) label-encoding cell, so both must be re-enabled together.
# The table shown beneath this cell is saved output from an earlier run and
# will not be regenerated while the code stays commented out.
# #Define the selected features for the percentage model
# percent_selected_features = ['TRB%','AST%','STL%','BLK%','TOV%','USG%','3P%','2P%','FT%']

# #Define X and y sets
# X = percent_nba_players[percent_selected_features]
# y = one_hot_y

# # Split data into training and testing
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# #Print the X_train dataframe
# X_train.head()

Unnamed: 0,TRB%,AST%,STL%,BLK%,TOV%,USG%,3P%,2P%,FT%
11173,11.0,7.7,2.0,0.9,10.4,13.1,0.375,0.514,0.827
9174,5.7,15.1,2.9,0.3,16.6,22.0,0.154,0.373,0.724
3412,14.2,4.6,1.8,1.4,12.8,22.5,0.0,0.513,0.813
5939,2.7,22.3,0.0,0.0,63.6,23.4,0.5,0.5,0.0
4719,3.3,37.0,0.0,0.0,33.3,7.5,1.0,0.0,0.0


# Data Scaling for Percentage Model

In [44]:
# NOTE(review): this cell is disabled. When active, the scaler is fitted on the
# training split only and then applied to both splits.
# The only consumer visible in this notebook is a random forest, which is
# typically insensitive to feature scaling — confirm whether this step is needed.
# #Import the StandardScaler for scaling the dataset
# from sklearn.preprocessing import StandardScaler

# #Generate the scaling function for the features
# X_scaler = StandardScaler().fit(X_train)

# #Apply the scaling function to the features
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

# Random Forest Classifier for Percentage Model

In [50]:
# NOTE(review): this cell is disabled; the importances printed beneath it are
# saved output from an earlier run.
# No random_state is passed to RandomForestClassifier, so re-running is not
# expected to reproduce those exact numbers — consider fixing a seed.
# #Import the Random Forest Classifier
# from sklearn.ensemble import RandomForestClassifier

# #Create a random forest classifier
# rf = RandomForestClassifier(n_estimators=200)
# rf = rf.fit(X_train_scaled, y_train)

# importances = rf.feature_importances_

# #Print the importances of the features
# for i in range(len(percent_selected_features)):
#     print(f'{percent_selected_features[i]}: {importances[i]}')

TRB%: 0.2264579122900254
AST%: 0.19051163577241728
STL%: 0.07520344392392657
BLK%: 0.10882300031166571
TOV%: 0.0887739461493585
USG%: 0.08653012554000612
3P%: 0.07108993545879962
2P%: 0.07818832695739068
FT%: 0.07442167359641003
