# Chapter 7

# 7.1 Handling Missing Data

In [3]:
import numpy as np
import pandas as pd

# Sample Series of strings with a single missing entry (NaN) at position 2.
string_data = pd.Series(
    data=['aardvark', 'artichoke', np.nan, 'avocado'],
)

In [4]:
# Display the Series; the NaN at label 2 marks the missing entry.
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
# Boolean mask: True wherever a value is missing (isna is the same check as isnull).
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
# Python's None is also treated as missing; overwrite the entry labelled 0 with it.
string_data.loc[0] = None

In [7]:
# Re-check the mask: label 0 (set to None) now reports as missing too.
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

## Filtering Out Missing Data

In [8]:
from numpy import nan as NA

In [9]:
# Numeric Series with two missing values (labels 1 and 3).
data = pd.Series(
    [1, NA, 3.5, NA, 7],
)

In [10]:
# Keep only the non-missing entries (original labels are preserved).
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [15]:

# 4x3 frame: row 0 is complete, row 2 is entirely missing,
# rows 1 and 3 are partially filled.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])

In [16]:
# Default behaviour spelled out: drop every row that contains any missing value.
cleaned = data.dropna(axis=0, how='any')

In [17]:
# The original frame is unchanged: dropna() returned a new object.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
# Only row 0 survives, since it is the only row with no missing values.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [19]:
# Drop a row only when every value in it is missing.
cleaned = data.dropna(axis='index', how='all')

In [20]:
# Row 2 (all NaN) is gone; the partially filled rows 1 and 3 are kept.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [21]:
# Add a column labelled 4 that is entirely missing, to demonstrate
# column-wise dropping below. Note this mutates `data` in place.
data[4] = NA

In [22]:
# `data` now has an extra column 4 that contains no values.
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [23]:
# Same idea along the other axis: drop columns that are entirely missing.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [25]:
# 7x3 frame of standard-normal draws from NumPy's global random state.
df = pd.DataFrame(np.random.standard_normal((7, 3)))

In [26]:
# Blank out the first four rows of the second column.
df.iloc[0:4, 1] = NA

In [27]:
# Blank out the first two rows of the third column.
df.iloc[0:2, 2] = NA

In [28]:
# Column 1 is NaN in rows 0-3 and column 2 is NaN in rows 0-1.
df

Unnamed: 0,0,1,2
0,-0.20133,,
1,1.676363,,
2,0.191662,,1.179946
3,2.032223,,-0.892933
4,0.531431,0.998173,-1.190202
5,-0.538407,1.160787,-0.497341
6,-1.187278,0.675565,-0.986333


In [29]:
# Keep only the fully populated rows.
df.dropna(axis=0, how='any')

Unnamed: 0,0,1,2
4,0.531431,0.998173,-1.190202
5,-0.538407,1.160787,-0.497341
6,-1.187278,0.675565,-0.986333


In [30]:
# Keep rows that have at least two non-missing values.
df.dropna(axis='index', thresh=2)

Unnamed: 0,0,1,2
2,0.191662,,1.179946
3,2.032223,,-0.892933
4,0.531431,0.998173,-1.190202
5,-0.538407,1.160787,-0.497341
6,-1.187278,0.675565,-0.986333


## Filling In Missing Data

In [32]:
# Replace every missing value with the constant 0 (returns a new frame).
df.fillna(value=0)

Unnamed: 0,0,1,2
0,-0.20133,0.0,0.0
1,1.676363,0.0,0.0
2,0.191662,0.0,1.179946
3,2.032223,0.0,-0.892933
4,0.531431,0.998173,-1.190202
5,-0.538407,1.160787,-0.497341
6,-1.187278,0.675565,-0.986333


In [33]:
# A dict gives a per-column fill value: 0.5 for column 1, 0 for column 2.
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.20133,0.5,0.0
1,1.676363,0.5,0.0
2,0.191662,0.5,1.179946
3,2.032223,0.5,-0.892933
4,0.531431,0.998173,-1.190202
5,-0.538407,1.160787,-0.497341
6,-1.187278,0.675565,-0.986333


In [35]:
# Persist the fill by rebinding `df` to the filled frame. This avoids
# assigning to `_` (IPython's "last output" variable) and does not rely on
# the return value of an inplace=True call. Re-running the cell is harmless.
df = df.fillna(0)

In [36]:
# `df` itself now holds the filled values; no NaN entries remain.
df

Unnamed: 0,0,1,2
0,-0.20133,0.0,0.0
1,1.676363,0.0,0.0
2,0.191662,0.0,1.179946
3,2.032223,0.0,-0.892933
4,0.531431,0.998173,-1.190202
5,-0.538407,1.160787,-0.497341
6,-1.187278,0.675565,-0.986333


In [39]:
# Fresh 6x3 frame of standard-normal draws for the forward-fill examples.
df = pd.DataFrame(np.random.standard_normal((6, 3)))

In [40]:
# Blank out the second column from the third row to the end.
df.iloc[2:, 1] = NA

In [41]:
# Blank out the third column from the fifth row to the end.
df.iloc[4:, 2] = NA

In [42]:
# Column 1 is NaN from row 2 onward; column 2 is NaN from row 4 onward.
df

Unnamed: 0,0,1,2
0,0.807543,0.04891,-1.070463
1,0.076123,-1.863747,1.212111
2,1.156676,,0.693217
3,0.361469,,1.170709
4,-1.312615,,
5,-0.280294,,


In [43]:
# Forward-fill: carry the last valid value in each column downward.
# DataFrame.ffill() replaces fillna(method='ffill'); the `method` keyword
# of fillna is deprecated as of pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,0.807543,0.04891,-1.070463
1,0.076123,-1.863747,1.212111
2,1.156676,-1.863747,0.693217
3,0.361469,-1.863747,1.170709
4,-1.312615,-1.863747,1.170709
5,-0.280294,-1.863747,1.170709


In [44]:
# Forward-fill, but propagate each value across at most two consecutive
# missing rows. Uses DataFrame.ffill() because the `method` keyword of
# fillna is deprecated as of pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.807543,0.04891,-1.070463
1,0.076123,-1.863747,1.212111
2,1.156676,-1.863747,0.693217
3,0.361469,-1.863747,1.170709
4,-1.312615,,1.170709
5,-0.280294,,1.170709


In [45]:
# Rebuild the small numeric Series with missing values at labels 1 and 3.
data = pd.Series(
    [1., NA, 3.5, NA, 7],
)

In [None]:
# Display the Series built in the previous cell (1, NaN, 3.5, NaN, 7).
data