In [64]:
# 1. Create dummy or indicator features for categorical variables
# 2. Standardize the magnitude of numeric features using a scaler
# 3. Split your data into testing and training datasets

# Capstone 2 Exploratory Data Analysis
## NBA Salary Predictor and Trade Suggestion
## Austin Cody

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# increasing the number of displayable columns to better examine all dummy variables 
pd.set_option('display.max_columns', 31)

In [66]:
df = pd.read_csv('nba_salaries_cleaned.csv')

In [67]:
df.head(3)

Unnamed: 0,name,salary,position,age,team,games_played,games_started,minutes_pg,field_goals_pg,field_goal_attempts_pg,field_goal_%,3_pointers_pg,3_point_attempts_pg,3_point_%,2_pointers_pg,2_point_attempts_pg,2_point_%,effective_field_goal_%,free_throws_pg,free_throw_attempts_pg,free_throw_%,o_rebounds_pg,d_rebounds_pg,total_rebounds_pg,assists_pg,steals_pg,blocks_pg,turnovers_pg,personal_fouls_pg,points_pg
0,Stephen Curry,48070014,['PG'],34,['GSW'],56,56,34.7,10.0,20.2,0.493,4.9,11.4,0.427,5.1,8.8,0.579,0.614,4.6,5.0,0.915,0.7,5.4,6.1,6.3,0.9,0.4,3.2,2.1,29.4
1,John Wall,47345760,['PG'],32,['LAC'],34,3,22.2,4.1,9.9,0.408,1.0,3.2,0.303,3.1,6.7,0.459,0.457,2.3,3.3,0.681,0.4,2.3,2.7,5.2,0.8,0.4,2.4,1.7,11.4
2,Russell Westbrook,47080179,['PG'],34,"['LAL', 'LAC']",73,24,29.1,5.9,13.6,0.436,1.2,3.9,0.311,4.7,9.7,0.487,0.481,2.8,4.3,0.656,1.2,4.6,5.8,7.5,1.0,0.5,3.5,2.2,15.9


## Dummy Variable Creation
1. converting categorical data into lists because some rows belong to multiple classes in each category
2. creating dummy variables
3. concatenating dummy variables to our original dataframe and removing the original categorical columns

In [None]:
# our categorical variables were an issue before due to the conversion of list objects to strings
# during the dataframe to .csv process.
print(type(df['position'].iloc[0]), type(df['team'].iloc[0]))

In [None]:
# converting strings back into lists
# eval() function concisely converts strings formatted as '['team1','team2']' back into a list object
df['position'] = df['position'].apply(eval)
df['team'] = df['team'].apply(eval)

In [None]:
# confirming our position and team columns are now list objects
print(type(df['position'].iloc[0]), type(df['team'].iloc[0]))

In [None]:
# because each category is a list I need to handle pd.get_dummies differently:
# 1. turn the list objects into Series
# 2. stack it to change it from wide format to long format
# 3. adding team prefix (this part is standard)
# 4. groupby(level=0) gets around the multi indexing
team_dummies = pd.get_dummies(df['team'].apply(pd.Series).stack(), prefix='team').groupby(level=0).sum()

In [None]:
# finding player with 2 positions and 2 teams for verification of dummy variables
df[df['name'] == 'Mikal Bridges'][['name','position','team']]
# noting index of 58

In [None]:
# examining 58th index -> player whom we know has multiple teams
# you can see this was successful because multiple 1s appear in the third row for LAC and LAL
print(team_dummies.iloc[58][['team_BRK','team_PHO']])
team_dummies.head(3)

In [68]:
# repeating for position
position_dummies = pd.get_dummies(df['position'].apply(pd.Series).stack(), prefix='position').groupby(level=0).sum()
print(position_dummies.iloc[58][['position_SF','position_SG']])
position_dummies.head(3)

KeyError: "None of [Index(['position_SF', 'position_SG'], dtype='object')] are in the [index]"