In [26]:
import pandas as pd
import numpy as np

In [60]:
df = pd.read_csv('nba.csv')

In [61]:
df.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


### Describe the dataset

In [62]:
df.shape

(458, 9)

In [63]:
df.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


### Check for missing values

In [65]:
df.isnull().sum()

Name         1
Team         1
Number       1
Position     1
Age          1
Height       1
Weight       1
College     85
Salary      12
dtype: int64

### Remove missing values

In [66]:
# 1. Forward Fill
# df.ffill()

In [67]:
# 2. Backward Fill
# df.bfill()

In [68]:
# 3. Fill with mean or median - only for numeric columns
# df.fillna(df.median())

In [69]:
# 4, Drop the column/row
# df.dropna(axis=0, inplace=True)

In [70]:
df.dropna(axis=0)
df.isnull().sum()

Name         1
Team         1
Number       1
Position     1
Age          1
Height       1
Weight       1
College     85
Salary      12
dtype: int64

### Summarize variable types

In [71]:
df.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

### Convert categorical variables to numerical

In [79]:
newdf = df.copy()
newdf2 = df.copy()

In [76]:
# METHOD 1

positions = newdf.Position.unique()
positions

array(['PG', 'SF', 'SG', 'PF', 'C', nan], dtype=object)

In [77]:
numeric_positions = [0,1,2,3,4,5]
newdf['Position'].replace(positions, numeric_positions, inplace=True)

In [78]:
newdf

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,0.0,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,1.0,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,2.0,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,2.0,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,3.0,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,0.0,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,0.0,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,4.0,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,4.0,26.0,7-0,231.0,Kansas,947276.0


In [80]:
# METHOD 2

dummies = pd.get_dummies(newdf2.Position)
dummies

Unnamed: 0,C,PF,PG,SF,SG
0,0,0,1,0,0
1,0,0,0,1,0
2,0,0,0,0,1
3,0,0,0,0,1
4,0,1,0,0,0
...,...,...,...,...,...
453,0,0,1,0,0
454,0,0,1,0,0
455,1,0,0,0,0
456,1,0,0,0,0


In [81]:
merged = pd.concat([newdf2, dummies], axis='columns')
merged

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,C,PF,PG,SF,SG
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,0,0,1,0,0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,0,0,0,1,0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,0,0,0,0,1
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,0,0,0,0,1
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0,0,0,1,0,0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0,0,0,1,0,0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0,1,0,0,0,0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0,1,0,0,0,0


In [83]:
merged.drop(['Position'], axis=1, inplace=True)

In [84]:
merged

Unnamed: 0,Name,Team,Number,Age,Height,Weight,College,Salary,C,PF,PG,SF,SG
0,Avery Bradley,Boston Celtics,0.0,25.0,6-2,180.0,Texas,7730337.0,0,0,1,0,0
1,Jae Crowder,Boston Celtics,99.0,25.0,6-6,235.0,Marquette,6796117.0,0,0,0,1,0
2,John Holland,Boston Celtics,30.0,27.0,6-5,205.0,Boston University,,0,0,0,0,1
3,R.J. Hunter,Boston Celtics,28.0,22.0,6-5,185.0,Georgia State,1148640.0,0,0,0,0,1
4,Jonas Jerebko,Boston Celtics,8.0,29.0,6-10,231.0,,5000000.0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,26.0,6-3,203.0,Butler,2433333.0,0,0,1,0,0
454,Raul Neto,Utah Jazz,25.0,24.0,6-1,179.0,,900000.0,0,0,1,0,0
455,Tibor Pleiss,Utah Jazz,21.0,26.0,7-3,256.0,,2900000.0,1,0,0,0,0
456,Jeff Withey,Utah Jazz,24.0,26.0,7-0,231.0,Kansas,947276.0,1,0,0,0,0
