# Python for Data Analysis - Workbook 3 (Data Cleaning and Preparation)

### Preliminaries

In [1]:
import numpy as np
import pandas as pd

## Handling Missing Data

### Dropping null values

In [18]:
# pandas displays missing numeric values as NaN; Python's None is also treated as NA ("not available").
# isnull() returns a boolean mask that is True wherever a value is missing.

string_data = pd.Series([
    'aardvark',
    'artichoke',
    None,
    'avocado',
])
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
# dropna() on a Series returns a new Series holding only the non-null entries;
# the surviving values keep their original index labels (note the gap at 1 in the result).

data = pd.Series([1.0, np.nan, 3.5, 7.0])
print(data)
data.dropna()

0    1.0
1    NaN
2    3.5
3    7.0
dtype: float64


0    1.0
2    3.5
3    7.0
dtype: float64

In [9]:
# DataFrames are less straightforward: by default dropna() discards every row that has at least one NA value

df = pd.DataFrame([
    [1.0, 6.5, 3.0],
    [1.0, np.nan, np.nan],
    [np.nan, np.nan, np.nan],
])
print(df)
df.dropna()

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [11]:
# With how='all', a row is dropped only when every value in it is NA

df.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,


In [15]:
# Dropping can also be done column-wise rather than row-wise.
# Column 4 is added as an all-NA column so that there is something to drop.

df[4] = np.nan
print(df)
df.dropna(axis='columns', how='all')

     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,


In [17]:
# thresh sets the minimum number of non-NA values a row/column must contain in order to be KEPT
# (it is not a count of NAs). Here, any column with fewer than 2 non-NA values is dropped.

df.dropna(axis=1, thresh=2)

Unnamed: 0,0
0,1.0
1,1.0
2,


### Filling Null Values

In [19]:
# fillna() replaces nulls instead of dropping them; a scalar is used for every missing entry

df.fillna(value=0)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0


In [21]:
# Passing a dict gives a different fill value per column: for a DataFrame the keys are column labels.
# Columns not listed in the dict are left unfilled.

df.fillna({0: 0, 1: -1, 2: -2, 4: -4})

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,-4.0
1,1.0,-1.0,-2.0,-4.0
2,0.0,-1.0,-2.0,-4.0


In [24]:
# The same interpolation methods used with reindexing (ffill / bfill) are available for filling.
# Forward-fill along each row (axis=1): an NA takes the last valid value to its left,
# and limit=2 caps a run of consecutive fills at two.
# fillna(method=...) is deprecated since pandas 2.1; the dedicated ffill()/bfill() methods replace it.

df.ffill(axis=1, limit=2)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,3.0
1,1.0,1.0,1.0,
2,,,,


In [27]:
# The fill value can itself be computed from the data, e.g. each column's mean.
# df.mean() gives one value per column (NaNs are skipped) and fillna() applies them column by column.
# NOTE: the random values differ on every run because no seed is set.

df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[4:, 2] = np.nan
df.iloc[2:, 1] = np.nan
print(df)

df.fillna(value=df.mean())


          0         1         2
0 -0.766894  0.128638  0.460981
1 -3.014465 -1.856439  1.800927
2 -0.091198       NaN  0.817156
3 -1.282198       NaN -2.058563
4  0.053238       NaN       NaN
5  0.060676       NaN       NaN


Unnamed: 0,0,1,2
0,-0.766894,0.128638,0.460981
1,-3.014465,-1.856439,1.800927
2,-0.091198,-0.8639,0.817156
3,-1.282198,-0.8639,-2.058563
4,0.053238,-0.8639,0.255125
5,0.060676,-0.8639,0.255125


### Data Transformation