# Cleaning the NBA dataset

## Initial Inspection and Formatting

In [1]:
# Import dependencies
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read in the raw CSV file
raw_NBA=pd.read_csv("../../Data/DataSets/NBA.csv")

In [3]:
# Inspect the dataframe columns/structure
raw_NBA.head()

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college
0,Alaa Abdelnaby,1991,1995,F-C,10-Jun,240.0,24-Jun-68,Duke University
1,Zaid Abdul-Aziz,1969,1978,C-F,9-Jun,235.0,7-Apr-46,Iowa State University
2,Kareem Abdul-Jabbar,1970,1989,C,2-Jul,225.0,16-Apr-47,"University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,1991,2001,G,1-Jun,162.0,9-Mar-69,Louisiana State University
4,Tariq Abdul-Wahad,1998,2003,F,6-Jun,223.0,3-Nov-74,San Jose State University


In [4]:
# Drop the unecessary columns
del raw_NBA['name']
del raw_NBA['height']
del raw_NBA['weight']
del raw_NBA['college']

In [5]:
# Determine counts per column
raw_NBA.count()

year_start    4550
year_end      4550
position      4549
birth_date    4519
dtype: int64

#### The counts are not equal to eachother, which means there are likely null values for some entries

In [6]:
# Determine if there are null values
raw_NBA.isnull().sum()

year_start     0
year_end       0
position       1
birth_date    31
dtype: int64

#### From this result we can see that one entry does not have a value for 'position' and 'birthdate' and will need to be dropped.

In [7]:
# Drop the rows with null values
NBA_df = raw_NBA.dropna()

In [8]:
# We check the count again
NBA_df.isnull().sum()

year_start    0
year_end      0
position      0
birth_date    0
dtype: int64

In [9]:
NBA_df.count()

year_start    4518
year_end      4518
position      4518
birth_date    4518
dtype: int64

#### Now we have only the columns we need and no null entries

In [10]:
NBA_df.head()

Unnamed: 0,year_start,year_end,position,birth_date
0,1991,1995,F-C,24-Jun-68
1,1969,1978,C-F,7-Apr-46
2,1970,1989,C,16-Apr-47
3,1991,2001,G,9-Mar-69
4,1998,2003,F,3-Nov-74


## Data Analysis

### Position Column

#### For our code to run, I am going to take the position column and reformat as a list of indices, so we have a numeric equivalent: first let's check to see which positions are present in the dataset

In [11]:
NBA_df['position'].unique()

array(['F-C', 'C-F', 'C', 'G', 'F', 'G-F', 'F-G'], dtype=object)

#### Based on the above, the following indecies will be applied:
##### 0 = multiple positions
##### 1 = Center
##### 2 = Guard
##### 3 = Forward

In [12]:
for index, row in NBA_df.iterrows():
    
    if len(row['position']) > 1:
        NBA_df['position'][index] = 0
        
    elif row['position'] == 'C':
        NBA_df['position'][index] = 1
    
    elif row['position'] == 'G':
        NBA_df['position'][index] = 2
        
    else: 
        NBA_df['position'][index] = 3

In [13]:
# Make dataframe column from result
posns = NBA_df['position']

In [14]:
# Check the dataframe
posns.head()

0    0
1    0
2    1
3    2
4    3
Name: position, dtype: object

### Retirement Year/Years played

#### The retirement year is already included as a column, but we can use it along with the start year to calculate the career length 

In [15]:
# Extract the columns into an array
year_start = NBA_df['year_start']
year_end = NBA_df['year_end']

# Calculate career length from start/end year (add one to account for players who start/stop in same year)
years_played = year_end - year_start + 1

In [16]:
# Check values
years_tot = pd.DataFrame(years_played)

### Age of retirement

#### For the final calculation, we will need to extract the birth year and reformat the age of retirement.

In [17]:
# Temporary 'dummy' array to hold last two charachters of birth_date string
temp=[]

# Iterate through 'birth_date' column 
for index, row in NBA_df.iterrows():
    
    # Grab the last two characters, convert to int and add 1900 to make it compatible with retirement age year
    temp.append(int(row['birth_date'][-2:])+ 1900)

In [18]:
len(temp)

4518

In [19]:
# Make a dataFrame from the result
birth_year=pd.DataFrame(temp)
birth_year.columns = ['birth_year']

In [20]:
len(birth_year)

4518

In [21]:
len(NBA_df['year_end'])

4518

In [22]:
# Check dataframe
birth_year.head()

Unnamed: 0,birth_year
0,1968
1,1946
2,1947
3,1969
4,1974


In [23]:
# Calculate the age of retirement
age_retired = NBA_df['year_end'] - birth_year['birth_year']

In [24]:
temp2 = []

for index, row in NBA_df.iterrows():
    
    temp2.append('NBA')

In [25]:
sport_list = pd.DataFrame(temp2)
sport_list.columns = ['Sport']
sport_name=sport_list['Sport']

In [26]:
NBA_summary_df = pd.DataFrame({
    'sport'       : sport_name,
    'position'    : posns,
    'retire_year' : year_end,
    'retire_age'  : age_retired,
    'years_played': years_played})

In [27]:
NBA_summary_df=NBA_summary_df.dropna()

In [28]:
NBA_summary_df

Unnamed: 0,sport,position,retire_year,retire_age,years_played
0,NBA,0,1995.0,27.0,5.0
1,NBA,0,1978.0,32.0,10.0
2,NBA,1,1989.0,42.0,20.0
3,NBA,2,2001.0,32.0,11.0
4,NBA,3,2003.0,29.0,6.0
...,...,...,...,...,...
4513,NBA,0,2003.0,6.0,1.0
4514,NBA,0,1960.0,7.0,7.0
4515,NBA,3,1972.0,24.0,1.0
4516,NBA,1,1962.0,-35.0,1.0


In [29]:
NBA_summary_df['retire_year'] = NBA_summary_df['retire_year'].map("{:.0f}".format)

NBA_summary_df['retire_age'] = NBA_summary_df['retire_age'].map("{:.0f}".format)

NBA_summary_df['years_played'] = NBA_summary_df['years_played'].map("{:.0f}".format)

In [30]:
NBA_summary_df.head()

Unnamed: 0,sport,position,retire_year,retire_age,years_played
0,NBA,0,1995,27,5
1,NBA,0,1978,32,10
2,NBA,1,1989,42,20
3,NBA,2,2001,32,11
4,NBA,3,2003,29,6


In [31]:
# Export as CSV
NBA_summary_df.to_csv("../../Data/Clean_Data/clean_retire_nba.csv")