## Data Cleaning

In [2]:
# Notebook setup: inline plots, automatic module reloading, and the shared helper import.
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import sys
# NOTE: '__file__' is a quoted string literal, so dirname() returns '' and
# p is simply the relative path '..' (resolved against the working directory).
p = os.path.join(os.path.dirname('__file__'), '..')
sys.path.append(p)
# NOTE(review): `np` and `pd` used in later cells are presumably re-exported by
# this wildcard import -- confirm against common.py.
from common import *

### Numpy

Missing Data in Pandas

Pandas chose to use sentinels for missing data, and further chose to use two already-existing Python null values: the special floating-point NaN value, and the Python None object. This choice has some side effects, as we will see, but in practice ends up being a good compromise in most cases of interest.

In [3]:
# NaN is an IEEE floating-point value, so an array containing it
# is stored as float64 even though the other elements are integers.
vals2 = np.array([1, np.nan, 3, 4]) 
vals2.dtype

dtype('float64')

In [7]:
# Regardless of the operation, the result of arithmetic with NaN will be another NaN.
# The plain aggregates (sum, min, max) propagate NaN as well.
1 + np.nan, vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan, nan)

In [9]:
# NumPy's nan-prefixed aggregates (nansum, nanmin, nanmax) skip NaN values;
# the regular aggregates in the previous cell do not.
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

### Pandas

In [11]:
# NaN and None both have their place
# Pandas is built to handle the two of them nearly interchangeably:
# in this numeric Series, None is stored as NaN and the dtype is float64.
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [13]:
# Pandas will convert the data type to accommodate IEEE floating point NaN values:
# the Series starts as int64 and becomes float64 once a missing value is assigned.
# NOTE(review): recent pandas versions deprecate silent upcasting on item
# assignment -- confirm this cell still runs warning-free on the target version.
x = pd.Series(range(2), dtype=int)
print(x)
x[0] = None
x

0    0
1    1
dtype: int64


0    NaN
1    1.0
dtype: float64

### Mask Missing Values

* isnull(): Generate a boolean mask indicating missing values
* notnull(): Opposite of isnull()
* dropna(): Return a filtered version of the data
* fillna(): Return a copy of the data with missing values filled or imputed

In [28]:
# 3x3 example frame with one NaN in column 0 and one NaN in column 1.
df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])
# isnull() returns a same-shaped boolean frame: True where a value is missing.
df.isnull()

Unnamed: 0,0,1,2
0,False,True,False
1,False,False,False
2,True,False,False


In [29]:
# Indexing with a boolean frame keeps the shape and shows NaN wherever the mask
# is False, so the missing cells stay NaN here (this is not row filtering).
df[df.notnull()]

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


### Drop Missing Values

In [30]:
# Remove every row that holds at least one NaN.
# dropna() returns a new frame; `df` itself is left untouched.
df.dropna(axis='index')

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [31]:
# Remove every column that holds at least one NaN (only column 2 survives).
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


### Fill Missing Values

In [32]:
# Example Series mixing a number, NaN, a string and None (object dtype).
# Defined here so the fill examples run on a fresh kernel.
data = pd.Series([1, np.nan, 'hello', None])

# Fill with constant value: every missing entry (NaN or None) becomes 0.
data.fillna(0)

0        1
1        0
2    hello
3        0
dtype: object

In [34]:
# forward-fill (take previous value in column)
# Good for time-series like stocks
# Series.ffill() replaces fillna(method='ffill'); the `method` keyword of
# fillna is deprecated in recent pandas releases.
data.ffill()

0        1
1        1
2    hello
3    hello
dtype: object

In [35]:
# back-fill (take the next valid value in the column)
# A trailing missing value has nothing after it, so it stays missing.
# Series.bfill() replaces fillna(method='bfill'); the `method` keyword of
# fillna is deprecated in recent pandas releases.
data.bfill()

0        1
1    hello
2    hello
3     None
dtype: object