# Introduction and 7.1 Handling Missing Data

In [1]:
import numpy as np
import pandas as pd

Data preparation: loading, cleaning, transforming, and rearranging.

---

For numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data. We call this a sentinel value.

In [2]:
# A Series of strings with one missing entry (np.nan) at position 2.
string_data = pd.Series(
    ["aardvark", "artichoke", np.nan, "avocado"],
)

In [4]:
# Display the Series; the np.nan placed at index 2 is rendered as NaN.
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
# Boolean mask: True wherever a value is missing.
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
string_data[0] = None  # None, Python's built-in null value, is also treated as missing

In [7]:
# Index 0 (set to None) is now reported as missing alongside index 2.
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

---

## Filtering Out Missing Data

In [11]:
from numpy import nan as NA

In [8]:
# Numeric Series with missing values at positions 1 and 3.
data = pd.Series(
    [1, np.nan, 3.5, np.nan, 7],
)

In [9]:
# dropna() returns a new Series with only the non-missing values, keeping their index labels.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
# Equivalent to data.dropna(): boolean indexing with the not-null mask.
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
# 4x3 frame mixing complete, partially missing, and fully missing rows.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)

In [13]:
# With no arguments, dropna() drops every row that contains at least one NaN.
cleaned = data.dropna()

In [14]:
# Only row 0 has no missing values, so it is the only row left.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [15]:
# The original frame is untouched: dropna() returned a new object.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
# how='all' drops only rows in which every value is NaN (row 2 here).
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [17]:
data.dropna(how='any')  # 'any' is the default: drop rows with at least one NaN

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [18]:
# Add a column labelled 4 that is entirely NaN.
data[4] = NA

In [19]:
# Column 4 now appears, filled with NaN in every row.
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [20]:
# axis=1 applies the rule to columns: drop columns whose values are all NaN (column 4).
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
# Seven rows of standard-normal draws in three columns.
# NOTE(review): no seed is set, so these values differ on every run.
df = pd.DataFrame(np.random.standard_normal((7, 3)))

In [22]:
# Blank out rows 0-3 of column 1 and rows 0-1 of column 2 (positional slices).
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA

In [23]:
# Column 1 is NaN in its first four rows, column 2 in its first two.
df

Unnamed: 0,0,1,2
0,0.080216,,
1,-0.130821,,
2,0.02577,,-0.140144
3,1.019692,,-0.190472
4,0.608137,1.256682,0.158163
5,-1.15792,-0.007712,1.388783
6,0.839929,0.710213,-0.150898


In [24]:
# Default dropna(): only rows with no NaN at all survive (rows 4-6).
df.dropna()

Unnamed: 0,0,1,2
4,0.608137,1.256682,0.158163
5,-1.15792,-0.007712,1.388783
6,0.839929,0.710213,-0.150898


In [25]:
df.dropna(thresh=2)  # thresh: keep only rows with at least this many non-null values

Unnamed: 0,0,1,2
2,0.02577,,-0.140144
3,1.019692,,-0.190472
4,0.608137,1.256682,0.158163
5,-1.15792,-0.007712,1.388783
6,0.839929,0.710213,-0.150898


---

## Filling in Missing Data

In [26]:
# Starting point for the fill examples: the frame still has NaN in columns 1 and 2.
df

Unnamed: 0,0,1,2
0,0.080216,,
1,-0.130821,,
2,0.02577,,-0.140144
3,1.019692,,-0.190472
4,0.608137,1.256682,0.158163
5,-1.15792,-0.007712,1.388783
6,0.839929,0.710213,-0.150898


In [28]:
df.fillna(0)  # returns a new frame with every NaN replaced by 0

Unnamed: 0,0,1,2
0,0.080216,0.0,0.0
1,-0.130821,0.0,0.0
2,0.02577,0.0,-0.140144
3,1.019692,0.0,-0.190472
4,0.608137,1.256682,0.158163
5,-1.15792,-0.007712,1.388783
6,0.839929,0.710213,-0.150898


In [29]:
# A dict maps column label -> fill value: 0.5 for column 1, 0 for column 2.
df.fillna({1 : 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.080216,0.5,0.0
1,-0.130821,0.5,0.0
2,0.02577,0.5,-0.140144
3,1.019692,0.5,-0.190472
4,0.608137,1.256682,0.158163
5,-1.15792,-0.007712,1.388783
6,0.839929,0.710213,-0.150898


In [30]:
# Keep the filled result by rebinding df. fillna(..., inplace=True) returns
# None, so assigning its result (e.g. to `_`) stores nothing useful.
df = df.fillna(0)

In [31]:
# df itself now holds the filled values.
df

Unnamed: 0,0,1,2
0,0.080216,0.0,0.0
1,-0.130821,0.0,0.0
2,0.02577,0.0,-0.140144
3,1.019692,0.0,-0.190472
4,0.608137,1.256682,0.158163
5,-1.15792,-0.007712,1.388783
6,0.839929,0.710213,-0.150898


In [32]:
# Six rows of standard-normal draws in three columns (unseeded).
df = pd.DataFrame(np.random.standard_normal((6, 3)))

# Leave gaps at the bottom of columns 1 and 2 for the fill examples.
df.iloc[2:, 1] = NA  # rows 2-5 of column 1
df.iloc[4:, 2] = NA  # rows 4-5 of column 2

In [33]:
# Column 1 is NaN from row 2 onward, column 2 from row 4 onward.
df

Unnamed: 0,0,1,2
0,0.794444,1.187216,-0.524884
1,-0.151492,0.38244,0.25248
2,-1.244361,,0.479646
3,-0.207705,,0.410415
4,1.212453,,
5,-0.198713,,


In [34]:
# Forward-fill: each NaN takes the last valid value above it in its column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,0.794444,1.187216,-0.524884
1,-0.151492,0.38244,0.25248
2,-1.244361,0.38244,0.479646
3,-0.207705,0.38244,0.410415
4,1.212453,0.38244,0.410415
5,-0.198713,0.38244,0.410415


In [35]:
# limit caps how many consecutive NaN values are forward-filled in each gap;
# here only the first two NaN of column 1 are replaced.
# DataFrame.ffill() is used because fillna's `method` keyword is deprecated
# in recent pandas releases.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.794444,1.187216,-0.524884
1,-0.151492,0.38244,0.25248
2,-1.244361,0.38244,0.479646
3,-0.207705,0.38244,0.410415
4,1.212453,,0.410415
5,-0.198713,,0.410415


In [36]:
# Float Series with gaps at positions 1 and 3, used for mean imputation.
data = pd.Series(
    [1.0, NA, 3.5, NA, 7.0],
)

In [37]:
# Fill the gaps with the mean of the non-missing values (mean() skips NaN by default).
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [40]:
# df.mean() is computed per column, so each NaN is filled with its own column's mean.
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,0.794444,1.187216,-0.524884
1,-0.151492,0.38244,0.25248
2,-1.244361,0.784828,0.479646
3,-0.207705,0.784828,0.410415
4,1.212453,0.784828,0.154414
5,-0.198713,0.784828,0.154414
