In [None]:
import numpy as np
import pandas as pd
import pickle

## Cleaning the Scraped NBA Player Data

In [None]:
# Read the previously saved (scraped) player records back from disk.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you created yourself.
with open('saved_final_player_info.pickle', mode='rb') as pickle_file:
    final_player_info = pickle.load(pickle_file)

In [None]:
# Labels for the 84 scraped fields of each player record, in positional order.
# 'random1' and 'random2' are filler fields that get dropped further down.
names = [
    # player details
    'position', 'height', 'weight', 'D_of_B', 'Country', 'Exp', 'College',
    # first stats section (per-game numbers)
    'Age', 'games_played', 'games_started', 'min_played/G',
    'fg_made/G', 'fg_att/G', 'fg%',
    '3PM/G', '3PA/G', '3P%',
    '2PM/G', '2PA/G', '2P%', 'eFG%',
    'FTM/G', 'FTA/G', 'FT%',
    'ORB/G', 'DRB/G', 'TRB/G', 'AST/G', 'STL/G', 'BLK/G', 'TOV/G', 'PF/G', 'PTS/G',
    # second stats section, starting at 'Age2' (appears to be the advanced stats)
    'Age2', 'games_played2', 'min_played_total', 'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'random1', 'OWS', 'DWS', 'WS', 'WS/48',
    'random2', 'OBPM', 'DBPM', 'BPM', 'VORP',
    # third stats section, starting at 'Age3' (appears to be the shooting stats)
    'Age3', 'games_played3', 'min_played_total2', 'FG%2', 'Dist',
    '%_shots_2PA', '2PA%_0-3', '2PA%_3-10', '2PA%_10-16', '2PA%_16<3', '3PA%',
    '%_shots_2PM', '2PM%_0-3', '2PM%_3-10', '2PM%_10-16', '2PM%_16<3', '3PM%',
    '%Astd_forFGM', '%_FGA_dunks', 'dunks', '%Astd_for3FGM',
    '%_3PA_corner', '3PM%_corner', 'heaves_att', 'heaves_made',
    # pay
    'salary',
]

In [None]:
# The field labels in `names` index the rows of the first frame, so transpose it
# to get one column per field (and one row per entry of final_player_info).
player_df = pd.DataFrame(data=final_player_info, index=names)
player_df_original = player_df.transpose()
player_df_original.tail()

In [None]:
# Remove both filler columns ('random1' and 'random2') in a single call.

player_df_original = player_df_original.drop(columns=['random1', 'random2'])

In [None]:
# Display the column labels of the frame, in positional order.
player_df_original.columns

In [None]:
# For some players the salary string ended up in the 'Age2' or 'Age3' column
#   instead of 'salary': on basketball-reference.com these players had no advanced
#   and/or shooting stats available, since those statistics weren't tracked during
#   those particular seasons.
#
# The positions are looked up by label (instead of hard-coding 33 and 56) so they
#   stay correct whenever columns are added to or removed from the frame.

AGE2 = player_df_original.columns.get_loc('Age2')
AGE3 = player_df_original.columns.get_loc('Age3')

In [None]:
# Move a salary string that landed in the 'Age2' or 'Age3' column into 'salary',
# putting whatever 'salary' held into the column the salary string came from.
#
# The salary column is located by label: the former iloc[i, -1] only pointed at
# it while 'salary' happened to be the last column, which stops being true as
# soon as later cells append columns (so re-running this cell corrupted data).
salary_col = player_df_original.columns.get_loc('salary')

for i in range(len(player_df_original)):
    # Check 'Age2' first, then 'Age3'; at most one swap is made per row.
    for stray_col in (AGE2, AGE3):
        stray_value = player_df_original.iloc[i, stray_col]
        # Non-string cells cannot contain '$'; skip them rather than raise TypeError.
        if isinstance(stray_value, str) and '$' in stray_value:
            player_df_original.iloc[i, stray_col] = player_df_original.iloc[i, salary_col]
            player_df_original.iloc[i, salary_col] = stray_value
            break

In [None]:
# Strip every '$' and ',' character out of the salary values.

player_df_original['salary'] = [
    ''.join(ch for ch in raw_salary if ch != '$' and ch != ',')
    for raw_salary in player_df_original['salary']
]

In [None]:
# Each index label packs three fields together: characters 0-2 are the team
# abbreviation, characters 4-7 the season (parsed as an int), and everything
# from character 9 onwards is the player's name.

index_labels = list(player_df_original.index)
player_df_original['Team'] = [label[:3] for label in index_labels]
player_df_original['Season'] = [int(label[4:8]) for label in index_labels]
player_df_original['Name'] = [label[9:] for label in index_labels]

In [None]:
# Birth year = the last four characters of the date-of-birth string, as an int.

player_df_original['Birth_Year'] = [
    int(date_of_birth[-4:]) for date_of_birth in player_df_original['D_of_B']
]

In [None]:
# Create the cleaned 'Height' and 'Weight' columns, pre-filled with the 'NA' marker.
for placeholder_column in ('Height', 'Weight'):
    player_df_original[placeholder_column] = 'NA'

In [None]:
# Fill the cleaned 'Weight' (int) and 'Height' (total inches) columns from the raw
# 'weight' and 'height' strings; blank raw values become the 'NA' marker.
#
# Columns are addressed by label instead of by position (1, 2, 86, 87), so this
# cell no longer depends on the exact number and order of columns created before it.

def _weight_to_int(raw_weight):
    """Return the raw weight string as an int, or 'NA' when it is blank."""
    return 'NA' if raw_weight == '' else int(raw_weight)


def _height_to_inches(raw_height):
    """Convert a 'feet-inches' string (e.g. '6-7' -> 79) to inches, or 'NA' when blank."""
    if raw_height == '':
        return 'NA'
    dimensions = raw_height.split('-')
    return int(dimensions[0]) * 12 + int(dimensions[1])


# dtype=object lets ints and the 'NA' marker share a column, as before.
player_df_original['Weight'] = pd.Series(
    [_weight_to_int(raw_weight) for raw_weight in player_df_original['weight']],
    index=player_df_original.index, dtype=object)
player_df_original['Height'] = pd.Series(
    [_height_to_inches(raw_height) for raw_height in player_df_original['height']],
    index=player_df_original.index, dtype=object)

In [None]:
# Add a 'Position' column filled with empty strings, as a placeholder for the numeric position code.
player_df_original['Position'] = ''

In [None]:
# Changing the position values to numbers corresponding to the position:
# single positions map to 1-5, the generic 'G' / 'F' to the two-digit codes
# 12 / 34, and the hyphenated combinations to the 'NA' marker.

positional_dict = {'PG':1, 'SG':2, 'G':12, 'SF':3, 'PF':4, 'F':34, 'C':5, 'G-F':'NA', 'F-G':'NA', 'F-C':'NA', 'C-F':'NA'}

# A direct dictionary lookup by column label replaces the scan over every dict
# entry per row and the hard-coded positions 0 / 88. Raw positions missing from
# the dictionary get '' (the value the 'Position' column is created with).
# dtype=object lets the numeric codes and the 'NA' marker share a column.
player_df_original['Position'] = pd.Series(
    [positional_dict.get(raw_position, '') for raw_position in player_df_original['position']],
    index=player_df_original.index, dtype=object)

In [None]:
# Add an 'Experience' column filled with empty strings, as a placeholder for the numeric experience value.
player_df_original['Experience'] = ''

In [None]:
# Convert the raw 'Exp' strings to integers; the rookie marker 'R' becomes 0.
#
# Columns are addressed by label rather than by the hard-coded positions 5 / 89,
# and a blank value becomes the 'NA' marker (the same convention used for
# height and weight) instead of crashing in int('').

def _experience_to_int(raw_exp):
    """Return 0 for the rookie marker 'R', 'NA' for a blank value, else int(raw_exp)."""
    if raw_exp == 'R':
        return 0
    if raw_exp == '':
        return 'NA'
    return int(raw_exp)


# dtype=object lets ints and the 'NA' marker share a column.
player_df_original['Experience'] = pd.Series(
    [_experience_to_int(raw_exp) for raw_exp in player_df_original['Exp']],
    index=player_df_original.index, dtype=object)

In [None]:
# Changing all the statistics from string types to float types.
# Blank strings become the 'NA' marker; values that already are 'NA' are kept.
#
# The statistic columns are every column from 'Age' through 'heaves_made',
# selected by label. The former positional range(7, 80) stopped at position 79
# ('heaves_att') and therefore left 'heaves_made' (position 80) unconverted.
# 'salary' is intentionally not touched here and stays a cleaned string.

def _to_stat_value(raw_value):
    """Return 'NA' for '' or 'NA', otherwise the value converted to float."""
    if raw_value == '':
        return 'NA'
    if raw_value == 'NA':
        return raw_value
    return float(raw_value)


all_columns = player_df_original.columns
first_stat = all_columns.get_loc('Age')
last_stat = all_columns.get_loc('heaves_made')

for stat_column in all_columns[first_stat:last_stat + 1]:
    # dtype=object lets floats and the 'NA' marker share a column, as before.
    player_df_original[stat_column] = pd.Series(
        [_to_stat_value(raw_value) for raw_value in player_df_original[stat_column]],
        index=player_df_original.index, dtype=object)


In [None]:
# Digit assigned to each capital letter. The groups are the same as on a
# telephone keypad: ABC -> 2, DEF -> 3, GHI -> 4, JKL -> 5, MNO -> 6,
# PQRS -> 7, TUV -> 8, WXYZ -> 9.
keypad_groups = {2: 'ABC', 3: 'DEF', 4: 'GHI', 5: 'JKL', 6: 'MNO', 7: 'PQRS', 8: 'TUV', 9: 'WXYZ'}
alphabet_dict = {letter: digit for digit, letters in keypad_groups.items() for letter in letters}

# Placeholder column that will hold the numeric team code.
player_df_original['Team_Number'] = ''

# Position of the 'Team' column, looked up by label instead of hard-coding 82 so
# it stays correct if the set of columns created earlier ever changes.
team_name = player_df_original.columns.get_loc('Team')

In [None]:
# Fill the Team_Number column: every letter of the abbreviated team name is
# replaced by its digit from alphabet_dict and the digits are read as one
# integer (e.g. 'LAL' -> 525).
#
# The result is written to 'Team_Number' by label; the former iloc[i, -1] only
# hit that column while it happened to be the last one in the frame.
# dtype=object keeps the values as plain Python ints in an object column, as before.

player_df_original['Team_Number'] = pd.Series(
    [
        int(''.join(str(alphabet_dict[character]) for character in abbreviation))
        for abbreviation in player_df_original.iloc[:, team_name]
    ],
    index=player_df_original.index, dtype=object)


In [None]:
# Turn the frame back into a nested dictionary. Transposing first makes each
# row label an outer key, mapped to that row's {column label: value} dictionary.

usable_player_dict = player_df_original.transpose().to_dict()

In [None]:
# Save the cleaned dictionary to a pickle file so it can be loaded later
# without repeating the cleaning steps.

with open('saved_usable_player_info.pickle', mode='wb') as pickle_file:
    pickle.dump(usable_player_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Rebuild a DataFrame from the cleaned dictionary; its outer keys become the columns.
clean_player_df = pd.DataFrame(usable_player_dict)

In [None]:
# Orient the frame with one row per outer key of usable_player_dict and one
# column per field. It is rebuilt from the dictionary rather than transposing
# clean_player_df in place, so re-running this cell cannot flip the frame back.
clean_player_df = pd.DataFrame(usable_player_dict).T
clean_player_df.tail()