# Chapter 7: Data Cleaning and Preparation
## 7.1 Handling Missing Data

In [3]:
import pandas as pd
import numpy as np

# Small Series of strings with one missing entry (np.nan) at position 2.
string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado']
)

In [4]:
# Display the Series; the np.nan entry is rendered as NaN.
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [6]:
# Boolean mask: True wherever a value is missing.
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [8]:
# Python's None is also treated as a missing value by pandas.
string_data.loc[0] = None

In [9]:
# Position 0 (set to None) is now reported as missing alongside the NaN entry.
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

### Filtering Out Missing Data

In [10]:
from numpy import nan as NA

# Numeric Series with a single missing value at position 1.
data = pd.Series([1.0, NA, 3.5, 7.0])

In [11]:
# Return a new Series containing only the non-missing values (original index labels kept).
data.dropna()

0    1.0
2    3.5
3    7.0
dtype: float64

In [12]:
# Same result as dropna(), written as explicit boolean-mask selection.
data.loc[data.notna()]

0    1.0
2    3.5
3    7.0
dtype: float64

In [17]:
# 4x3 frame: one complete row, two partially missing rows, one fully missing row.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])

In [18]:
# Default dropna behaviour, spelled out: drop every row that has any missing value.
cleaned = data.dropna(axis=0, how='any')

In [19]:
# Display the original frame; dropna() returned a new object and left `data` untouched.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [20]:
# Display the result of dropna(): only the row with no missing values remains.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [24]:
# Drop only those rows in which every value is missing.
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [21]:
# Add a column labelled 4 that is missing in every row.
data[4] = NA

In [22]:
# Display the frame, which now includes the all-missing column 4.
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [23]:
# Column-wise variant: drop only the columns whose values are all missing.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [25]:
# 7x3 frame of standard-normal filler values for the dropna examples.
# Drawn from a locally seeded generator so a fresh "Restart & Run All"
# reproduces the same numbers, without touching NumPy's global random state.
df = pd.DataFrame(np.random.RandomState(12345).randn(7, 3))

In [26]:
# Blank out the first four rows (positions 0-3) of the column at position 1.
df.iloc[0:4, 1] = NA

In [27]:
# Blank out the first two rows (positions 0-1) of the column at position 2.
df.iloc[0:2, 2] = NA

In [28]:
# Display the frame after the two iloc assignments introduced missing values.
df

Unnamed: 0,0,1,2
0,-0.740883,,
1,0.432511,,
2,-1.111026,,-2.30434
3,0.219839,,0.606394
4,-0.812791,1.296446,-2.596612
5,-0.918143,0.801093,-1.017625
6,0.453696,0.613075,1.074116


In [29]:
# Drop every row containing at least one missing value.
df.dropna()

Unnamed: 0,0,1,2
4,-0.812791,1.296446,-2.596612
5,-0.918143,0.801093,-1.017625
6,0.453696,0.613075,1.074116


In [30]:
# Keep only the rows that have at least two non-missing values.
df.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2
2,-1.111026,,-2.30434
3,0.219839,,0.606394
4,-0.812791,1.296446,-2.596612
5,-0.918143,0.801093,-1.017625
6,0.453696,0.613075,1.074116


### Filling in Missing Data