In [None]:
# importing modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Read nba_salaries.csv from the working directory into a pandas DataFrame.
salaries_csv = 'nba_salaries.csv'
salaries = pd.read_csv(salaries_csv)

In [None]:
# Basic info on each column (pandas' info() report: column names, non-null counts and dtypes)
salaries.info()

In [None]:
# Lift pandas' cap on the number of displayed rows so the entire DataFrame can be
# examined later in the notebook without truncation.
pd.set_option('display.max_rows', None)

# Display the first 10 rows. head() returns only 5 rows by default, so the count
# is passed explicitly.
salaries.head(10)

In [None]:
# Pull out one player's row and flip it so that every column reads as its own labelled line.
is_finney_smith = salaries['Player Name'] == 'Dorian Finney-Smith'
salaries.loc[is_finney_smith].transpose()

In [None]:
# Two columns, one at each end of the DataFrame, look unnecessary:
#   'Unnamed: 0'        - appears to be a running number from zero to len(salaries).
#   'Player-additional' - some sort of player id.
#
# 'Unnamed: 0' is removed here; it is not useful, even as an index.
salaries = salaries.drop(columns=['Unnamed: 0'])

In [None]:
# 'Player-additional' looks like a player id. An id column like that is only
# essential when two different players share a name, so name uniqueness is
# checked BEFORE the column is removed.
duplicate_names = salaries.duplicated('Player Name').sum()
print(duplicate_names)

# Enforce the check instead of only printing it: if the data ever contains
# duplicated names, stop here rather than discard the only disambiguating column.
assert duplicate_names == 0, (
    f"{duplicate_names} duplicated player name(s) found; "
    "keep 'Player-additional' to tell those players apart"
)

# No duplicated names, so the id column is redundant and can be dropped.
salaries = salaries.drop(columns=['Player-additional'])

In [None]:
# Most headers are terse stat abbreviations, so they are swapped for readable names.
# The suffix 'pg' ("per game") is kept because it appears in so many of these stats.
# Glossary for every stat: https://www.basketball-reference.com/about/glossary.html
readable_names = {
    'Player Name': 'name',
    'GP': 'games_played',
    'GS': 'games_started',
    'MP': 'minutes_pg',
    # field goals: any shot worth 2 or 3 points that is not a free throw
    'FG': 'field_goals_pg',
    'FGA': 'field_goal_attempts_pg',
    'FG%': 'field_goal_%',
    # 3 point field goals
    '3P': '3_pointers_pg',
    '3PA': '3_point_attempts_pg',
    '3P%': '3_point_%',
    # 2 point field goals
    '2P': '2_pointers_pg',
    '2PA': '2_point_attempts_pg',
    '2P%': '2_point_%',
    # field goal % adjusted for 3 point attempts being worth more than 2 point attempts
    'eFG%': 'effective_field_goal_%',
    'FT': 'free_throws_pg',
    'FTA': 'free_throw_attempts_pg',
    'FT%': 'free_throw_%',
    # rebounds: offensive, defensive, total
    'ORB': 'o_rebounds_pg',
    'DRB': 'd_rebounds_pg',
    'TRB': 'total_rebounds_pg',
    'AST': 'assists_pg',
    'STL': 'steals_pg',
    'BLK': 'blocks_pg',
    'TOV': 'turnovers_pg',
    'PF': 'personal_fouls_pg',
    'PTS': 'points_pg',
}
salaries = salaries.rename(columns=readable_names)

# Lowercase every header so the columns that were not renamed above match the same style.
salaries.columns = salaries.columns.str.lower()

In [None]:
# Review every column after the renaming by viewing a single player's row transposed.
is_finney_smith = salaries['name'] == 'Dorian Finney-Smith'
salaries.loc[is_finney_smith].transpose()

In [None]:
# Some players have played for multiple teams; their team value uses the format 'ABC/DEF'.
# Some players have played multiple positions; their position value uses the format 'AB-CD'.
# This might cause trouble later, so every value in both columns is turned into a list
# by splitting on '/' (team) or '-' (position).


def split_if_text(value, delimiter):
    """Split a string on ``delimiter``; return any non-string value unchanged.

    Leaving non-strings alone keeps this cell safe to re-run: calling
    ``.str.split`` on values that are already lists yields NaN for every row,
    which would silently wipe both columns on a second execution.
    """
    return value.split(delimiter) if isinstance(value, str) else value


# Confirm that both columns are of data type 'object' (they are).
print(salaries['team'].dtype)
print(salaries['position'].dtype)

# Turn each value into a list (values that are already lists, or missing, are left as they are).
salaries['team'] = salaries['team'].map(lambda value: split_if_text(value, '/'))
salaries['position'] = salaries['position'].map(lambda value: split_if_text(value, '-'))

# Check that the team split worked.
print(salaries[salaries['name'] == 'Dorian Finney-Smith'].team)  # player who played for two teams this season
print(salaries[salaries['name'] == 'LeBron James'].team)         # player who played for one team this season

# Check that the position split worked.
print(salaries[salaries['name'] == 'Mikal Bridges'].position)  # player who played two positions this season
print(salaries[salaries['name'] == 'LeBron James'].position)   # player who played one position this season

# all of this worked ^

In [None]:
# Re-examine the first rows of the DataFrame after the cleaning steps above
salaries.head()

## Missing Values

In [None]:
# Count the missing values in every column and print the per-column totals.
missing_per_column = salaries.isna().sum()
print(missing_per_column)

In [None]:
# Every column containing missing values is a percentage column - interesting.
# My first hypothesis is that these players played so little that they didn't take shots.
# This would result in zero division when calculating the percentages and thus explain our missing values.
# Let's investigate.

In [None]:
# Start with the rows where field_goal_% is missing.
no_field_goal_pct = salaries['field_goal_%'].isna()
salaries.loc[no_field_goal_pct].T

# Observation from the output: Alondes Williams is the player with a missing field_goal_%.
# He does not appear to have shot any field goals; in fact he played only 5 minutes in a single game.

In [None]:
# Next: the rows where 3_point_% is missing.
no_three_point_pct = salaries['3_point_%'].isna()
salaries.loc[no_three_point_pct].T

# Observations from the output:
# - 13 players have a missing 3_point_%, and all of them have 0 3_pointers_pg. What do they have in common?
# - Six of these players have 5 or fewer games_played, which is a feasible explanation.
# - The other seven share a position: they all play Center. Centers do not typically get the
#   opportunity to shoot 3 pointers because they stay so close to the basket.
#
# The hypothesis is starting to make sense.

In [None]:
# Next: the rows where 2_point_% is missing.
no_two_point_pct = salaries['2_point_%'].isna()
salaries.loc[no_two_point_pct]

# Observation from the output: once again, these four players all played very few games.

In [None]:
# Next: the row where effective_field_goal_% is missing.
no_effective_fg_pct = salaries['effective_field_goal_%'].isna()
salaries.loc[no_effective_fg_pct]

# Observation from the output: Alondes Williams again - the player who didn't take any shots.

In [None]:
# Lastly: the rows where free_throw_% is missing.
no_free_throw_pct = salaries['free_throw_%'].isna()
salaries.loc[no_free_throw_pct].T

# Observation from the output: these players again have a relatively low number of games_played.

In [None]:
#### Missing Values conclusion

# Every missing value was the result of zero division from a player not taking a field goal or a free throw
# However, not every missing value was the result of these players playing few games (the Centers taking no 3 point shots)
# I think imputing the values as 0% makes the most sense in this case because they didn't make any attempts.
# Deleting the entire player's row doesn't make sense because they may have compensated by having a high 2 point percentage.
# Also I think dropping players who have played 10 games or fewer will reduce the skew in our data. 

In [None]:
#impute missing values as 0

In [None]:
# TODO: drop players with fewer than 10 games played (this cell does not drop anything yet).
# NOTE(review): the missing-values conclusion earlier in this notebook says "10 games or fewer"
# (<= 10), while this cell counts strictly fewer than 10 - confirm the intended cutoff.

# Count how many players have fewer than 10 games.
salaries['games_played'].lt(10).sum()

In [None]:
# check for outliers

In [None]:
#dummy variable for position and team

In [None]:
#import injury dataset?