In [1]:
#Load libraries for analysis 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import plotly.figure_factory as ff
from plotly import subplots
from plotly.subplots import make_subplots

In [2]:
# Read the All-Star roster spreadsheet into a DataFrame and preview the first 25 rows.

nba_df = pd.read_excel(io="./Resource/NBA All Star Games (1).xlsx")
nba_df.head(n=25)

Unnamed: 0,Year,Player,Pos,HT,HT.1,WT,Team,Selection Type,NBA Draft Status,Nationality
0,2016,Stephen Curry,G,6'3,2017-06-03 00:00:00,190,Golden State Warriors,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 7,United States
1,2016,James Harden,SG,6'5,2017-06-05 00:00:00,220,Houston Rockets,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 3,United States
2,2016,Kevin Durant,SF,6'9,2017-06-09 00:00:00,240,Golden State Warriors,Western All-Star Fan Vote Selection,2007 Rnd 1 Pick 2,United States
3,2016,Kawhi Leonard,F,6'7,2017-06-07 00:00:00,230,San Antonio Spurs,Western All-Star Fan Vote Selection,2011 Rnd 1 Pick 15,United States
4,2016,Anthony Davis,PF,6'11,2017-06-11 00:00:00,253,New Orleans Pelicans,Western All-Star Fan Vote Selection,2012 Rnd 1 Pick 1,United States
5,2016,Russell Westbrook,G,6'3,2017-06-03 00:00:00,200,Oklahoma City Thunder,Western All-Star Coaches Selection,2008 Rnd 1 Pick 4,United States
6,2016,DeMarcus Cousins,C,6'11,2017-06-11 00:00:00,270,Sacramento Kings,Western All-Star Coaches Selection,2010 Rnd 1 Pick 5,United States
7,2016,Klay Thompson,G,6'7,2017-06-07 00:00:00,215,Golden State Warriors,Western All-Star Coaches Selection,2011 Rnd 1 Pick 11,United States
8,2016,Draymond Green,F,6'7,2017-06-07 00:00:00,230,Golden State Warriors,Western All-Star Coaches Selection,2012 Rnd 2 Pick 5,United States
9,2016,Marc Gasol,C,7'1,2017-07-01 00:00:00,255,Memphis Grizzlies,Western All-Star Coaches Selection,2007 Rnd 2 Pick 18,Spain


In [3]:
# HT.1 shows up as a date rather than a height, so build the working frame without it.

nba_clean = nba_df.drop(columns=['HT.1'])
nba_clean.head(5)

Unnamed: 0,Year,Player,Pos,HT,WT,Team,Selection Type,NBA Draft Status,Nationality
0,2016,Stephen Curry,G,6'3,190,Golden State Warriors,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 7,United States
1,2016,James Harden,SG,6'5,220,Houston Rockets,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 3,United States
2,2016,Kevin Durant,SF,6'9,240,Golden State Warriors,Western All-Star Fan Vote Selection,2007 Rnd 1 Pick 2,United States
3,2016,Kawhi Leonard,F,6'7,230,San Antonio Spurs,Western All-Star Fan Vote Selection,2011 Rnd 1 Pick 15,United States
4,2016,Anthony Davis,PF,6'11,253,New Orleans Pelicans,Western All-Star Fan Vote Selection,2012 Rnd 1 Pick 1,United States


In [4]:
# Number of rows (records) in the working frame.

nba_clean.shape[0]


439

In [5]:
# Examine the data type and non-null count of each column.
# info() already lists every column's dtype; the earlier bare `nba_df.dtypes` line was
# not the last expression of the cell, so it never displayed anything and was removed.
# Note: this inspects the raw frame (nba_df), so HT.1 is still listed here.
nba_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439 entries, 0 to 438
Data columns (total 10 columns):
Year                439 non-null int64
Player              439 non-null object
Pos                 439 non-null object
HT                  439 non-null object
HT.1                439 non-null object
WT                  439 non-null int64
Team                439 non-null object
Selection Type      439 non-null object
NBA Draft Status    439 non-null object
Nationality         439 non-null object
dtypes: int64(2), object(8)
memory usage: 34.4+ KB


In [6]:
# Count missing values per column in the working frame.
nba_clean.isna().sum()


Year                0
Player              0
Pos                 0
HT                  0
WT                  0
Team                0
Selection Type      0
NBA Draft Status    0
Nationality         0
dtype: int64

# Data Cleaning 

1. The `NBA Draft Status` column can be split into three separate columns for easier analysis: Draft Year, Round, and Pick Number.
2. The `Pos` (position) column is inconsistent: the same position appears under several different labels (e.g. F-C vs. FC, both of which stand for forward-center).
3. Some players went undrafted, so the 'Round' column will have to be adjusted to hold the right values for them.



In [7]:
# Split "NBA Draft Status" (e.g. "2009 Rnd 1 Pick 7") into Draft Year, Round and Pick.
# Tokenise each string once and reuse the token lists for all three columns.
draft_tokens = nba_clean["NBA Draft Status"].str.split(" ")

# .str[i] indexes into each token list and yields NaN (instead of raising) when a row
# is missing or has fewer tokens than expected.
nba_clean['Draft Year'] = draft_tokens.str[0]
nba_clean['Round'] = draft_tokens.str[2]
nba_clean['Pick'] = draft_tokens.str[-1]

# The original "NBA Draft Status" column is kept alongside the derived columns.
# NOTE(review): the three new columns hold strings, not numbers; convert them before
# doing any numeric analysis.

# Preview only; avoid rendering the whole frame.
nba_clean.head(10)

Unnamed: 0,Year,Player,Pos,HT,WT,Team,Selection Type,NBA Draft Status,Nationality,Draft Year,Round,Pick
0,2016,Stephen Curry,G,6'3,190,Golden State Warriors,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 7,United States,2009,1,7
1,2016,James Harden,SG,6'5,220,Houston Rockets,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 3,United States,2009,1,3
2,2016,Kevin Durant,SF,6'9,240,Golden State Warriors,Western All-Star Fan Vote Selection,2007 Rnd 1 Pick 2,United States,2007,1,2
3,2016,Kawhi Leonard,F,6'7,230,San Antonio Spurs,Western All-Star Fan Vote Selection,2011 Rnd 1 Pick 15,United States,2011,1,15
4,2016,Anthony Davis,PF,6'11,253,New Orleans Pelicans,Western All-Star Fan Vote Selection,2012 Rnd 1 Pick 1,United States,2012,1,1
5,2016,Russell Westbrook,G,6'3,200,Oklahoma City Thunder,Western All-Star Coaches Selection,2008 Rnd 1 Pick 4,United States,2008,1,4
6,2016,DeMarcus Cousins,C,6'11,270,Sacramento Kings,Western All-Star Coaches Selection,2010 Rnd 1 Pick 5,United States,2010,1,5
7,2016,Klay Thompson,G,6'7,215,Golden State Warriors,Western All-Star Coaches Selection,2011 Rnd 1 Pick 11,United States,2011,1,11
8,2016,Draymond Green,F,6'7,230,Golden State Warriors,Western All-Star Coaches Selection,2012 Rnd 2 Pick 5,United States,2012,2,5
9,2016,Marc Gasol,C,7'1,255,Memphis Grizzlies,Western All-Star Coaches Selection,2007 Rnd 2 Pick 18,Spain,2007,2,18


In [14]:
# Four franchises have since relocated or rebranded: New Jersey Nets, New Orleans Hornets,
# Charlotte Bobcats and Seattle SuperSonics. Map them to their current names so that
# each franchise appears under one consistent label.

def change_team(df, renames=None):
    """Rewrite historical franchise names in ``df['Team']`` to their current names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Team' column. The column is overwritten on ``df`` itself.
    renames : dict, optional
        Mapping of old team name -> current team name. When omitted, the four
        default renames are used (Nets, Hornets, Bobcats, SuperSonics).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if renames is None:
        renames = {
            'New Jersey Nets': 'Brooklyn Nets',
            'New Orleans Hornets': 'New Orleans Pelicans',
            'Charlotte Bobcats': 'Charlotte Hornets',
            'Seattle SuperSonics': 'Oklahoma City Thunder',
        }

    # One exact-match pass over the column instead of four separate np.where scans;
    # team names that are not keys of the mapping are left untouched.
    df['Team'] = df['Team'].replace(renames)

    return df

# change_team overwrites the 'Team' column on the frame it is given and returns that same
# frame; bind the result explicitly so the data lineage is visible in this cell.
nba_clean = change_team(nba_clean)

# Preview only; avoid rendering the whole frame.
nba_clean.head(10)

Unnamed: 0,Year,Player,Pos,HT,WT,Team,Selection Type,NBA Draft Status,Nationality,Draft Year,Round,Pick
0,2016,Stephen Curry,G,6'3,190,Golden State Warriors,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 7,United States,2009,1,7
1,2016,James Harden,SG,6'5,220,Houston Rockets,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 3,United States,2009,1,3
2,2016,Kevin Durant,SF,6'9,240,Golden State Warriors,Western All-Star Fan Vote Selection,2007 Rnd 1 Pick 2,United States,2007,1,2
3,2016,Kawhi Leonard,F,6'7,230,San Antonio Spurs,Western All-Star Fan Vote Selection,2011 Rnd 1 Pick 15,United States,2011,1,15
4,2016,Anthony Davis,PF,6'11,253,New Orleans Pelicans,Western All-Star Fan Vote Selection,2012 Rnd 1 Pick 1,United States,2012,1,1
5,2016,Russell Westbrook,G,6'3,200,Oklahoma City Thunder,Western All-Star Coaches Selection,2008 Rnd 1 Pick 4,United States,2008,1,4
6,2016,DeMarcus Cousins,C,6'11,270,Sacramento Kings,Western All-Star Coaches Selection,2010 Rnd 1 Pick 5,United States,2010,1,5
7,2016,Klay Thompson,G,6'7,215,Golden State Warriors,Western All-Star Coaches Selection,2011 Rnd 1 Pick 11,United States,2011,1,11
8,2016,Draymond Green,F,6'7,230,Golden State Warriors,Western All-Star Coaches Selection,2012 Rnd 2 Pick 5,United States,2012,2,5
9,2016,Marc Gasol,C,7'1,255,Memphis Grizzlies,Western All-Star Coaches Selection,2007 Rnd 2 Pick 18,Spain,2007,2,18


In [20]:
# Collapse Nationality into two groups so US players can be compared with everyone else.

def change_nationality(df):
    """Relabel every nationality other than 'United States' as 'World'.

    Overwrites ``df['Nationality']`` and returns the same DataFrame object.
    """
    is_us = df['Nationality'] == 'United States'
    # Keep the existing value for US rows; every other row becomes 'World'.
    df['Nationality'] = df['Nationality'].where(is_us, 'World')
    return df

# change_nationality overwrites the 'Nationality' column on the frame it is given and
# returns that same frame; bind the result explicitly so the data lineage is visible.
nba_clean = change_nationality(nba_clean)

# Preview only; avoid rendering the whole frame.
nba_clean.head(10)

Unnamed: 0,Year,Player,Pos,HT,WT,Team,Selection Type,NBA Draft Status,Nationality,Draft Year,Round,Pick
0,2016,Stephen Curry,G,6'3,190,Golden State Warriors,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 7,United States,2009,1,7
1,2016,James Harden,G,6'5,220,Houston Rockets,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 3,United States,2009,1,3
2,2016,Kevin Durant,F,6'9,240,Golden State Warriors,Western All-Star Fan Vote Selection,2007 Rnd 1 Pick 2,United States,2007,1,2
3,2016,Kawhi Leonard,F,6'7,230,San Antonio Spurs,Western All-Star Fan Vote Selection,2011 Rnd 1 Pick 15,United States,2011,1,15
4,2016,Anthony Davis,F,6'11,253,New Orleans Pelicans,Western All-Star Fan Vote Selection,2012 Rnd 1 Pick 1,United States,2012,1,1
5,2016,Russell Westbrook,G,6'3,200,Oklahoma City Thunder,Western All-Star Coaches Selection,2008 Rnd 1 Pick 4,United States,2008,1,4
6,2016,DeMarcus Cousins,C,6'11,270,Sacramento Kings,Western All-Star Coaches Selection,2010 Rnd 1 Pick 5,United States,2010,1,5
7,2016,Klay Thompson,G,6'7,215,Golden State Warriors,Western All-Star Coaches Selection,2011 Rnd 1 Pick 11,United States,2011,1,11
8,2016,Draymond Green,F,6'7,230,Golden State Warriors,Western All-Star Coaches Selection,2012 Rnd 2 Pick 5,United States,2012,2,5
9,2016,Marc Gasol,C,7'1,255,Memphis Grizzlies,Western All-Star Coaches Selection,2007 Rnd 2 Pick 18,World,2007,2,18


In [22]:
# Collapse the position labels into three groups: G (guard), F (forward), C (center).
# Each label is grouped by the first position it lists, so hyphenated and unhyphenated
# spellings of the same position end up in the same group.

real_position = {
    'G': 'G', 'PG': 'G', 'SG': 'G',
    # NOTE(review): 'GF' used to map to 'F' while 'G-F' mapped to 'G'; both spellings now
    # map to 'G' so they agree. Confirm this is the intended grouping.
    'G-F': 'G', 'GF': 'G',
    'F': 'F', 'SF': 'F', 'PF': 'F',
    'F-C': 'F', 'FC': 'F',
    'C': 'C',
}

# Series.map turns any label that is missing from the dict into NaN, so fail loudly
# instead of silently losing positions.
unmapped_pos = sorted(set(nba_clean['Pos'].dropna()) - set(real_position))
assert not unmapped_pos, "Unmapped position labels: {}".format(unmapped_pos)

nba_clean['Pos'] = nba_clean['Pos'].map(real_position)
nba_clean.head(10)

Unnamed: 0,Year,Player,Pos,HT,WT,Team,Selection Type,NBA Draft Status,Nationality,Draft Year,Round,Pick
0,2016,Stephen Curry,G,6'3,190,Golden State Warriors,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 7,United States,2009,1,7
1,2016,James Harden,G,6'5,220,Houston Rockets,Western All-Star Fan Vote Selection,2009 Rnd 1 Pick 3,United States,2009,1,3
2,2016,Kevin Durant,F,6'9,240,Golden State Warriors,Western All-Star Fan Vote Selection,2007 Rnd 1 Pick 2,United States,2007,1,2
3,2016,Kawhi Leonard,F,6'7,230,San Antonio Spurs,Western All-Star Fan Vote Selection,2011 Rnd 1 Pick 15,United States,2011,1,15
4,2016,Anthony Davis,F,6'11,253,New Orleans Pelicans,Western All-Star Fan Vote Selection,2012 Rnd 1 Pick 1,United States,2012,1,1
5,2016,Russell Westbrook,G,6'3,200,Oklahoma City Thunder,Western All-Star Coaches Selection,2008 Rnd 1 Pick 4,United States,2008,1,4
6,2016,DeMarcus Cousins,C,6'11,270,Sacramento Kings,Western All-Star Coaches Selection,2010 Rnd 1 Pick 5,United States,2010,1,5
7,2016,Klay Thompson,G,6'7,215,Golden State Warriors,Western All-Star Coaches Selection,2011 Rnd 1 Pick 11,United States,2011,1,11
8,2016,Draymond Green,F,6'7,230,Golden State Warriors,Western All-Star Coaches Selection,2012 Rnd 2 Pick 5,United States,2012,2,5
9,2016,Marc Gasol,C,7'1,255,Memphis Grizzlies,Western All-Star Coaches Selection,2007 Rnd 2 Pick 18,World,2007,2,18


In [19]:
# Count missing values per column again, now that the derived columns exist.
nba_clean.isna().sum()

Year                0
Player              0
Pos                 0
HT                  0
WT                  0
Team                0
Selection Type      0
NBA Draft Status    0
Nationality         0
Draft Year          0
Round               0
Pick                0
dtype: int64

In [18]:
# Summarise the working frame: column names, non-null counts and dtypes.
nba_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439 entries, 0 to 438
Data columns (total 12 columns):
Year                439 non-null int64
Player              439 non-null object
Pos                 439 non-null object
HT                  439 non-null object
WT                  439 non-null int64
Team                439 non-null object
Selection Type      439 non-null object
NBA Draft Status    439 non-null object
Nationality         439 non-null object
Draft Year          439 non-null object
Round               439 non-null object
Pick                439 non-null object
dtypes: int64(2), object(10)
memory usage: 41.2+ KB
