# 7.1 Handling Missing Data

- Missing data occurs commonly in many data analysis applications
- For numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data
    - this is called a *sentinel value*: a special value that can be easily detected
    

In [3]:
# dependency for this notebook
import pandas
import numpy

In [2]:
import pandas as pd
import numpy as np

# one of the four entries is NaN, the marker pandas uses for a missing value
words = ['aardvark', 'artichoke', np.nan, 'avocado']
str_data = pd.Series(words)
str_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
# boolean mask: True where the entry is missing
pd.isnull(str_data)

0    False
1    False
2     True
3    False
dtype: bool

- pandas refers to missing data as NA, which stands for *not available*
- NA data may either be data that does not exist or that exists but was not observed
- when cleaning up data for analysis, it is often important to do analysis on the missing data itself to identify data collection problems or potential biases in the data caused by missing data

In [4]:
# Python's built-in None is detected as missing as well
str_data[0] = None
pd.isnull(str_data)

0     True
1    False
2     True
3    False
dtype: bool

### Null handling methods

- `dropna`: filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate
- `fillna`: fill in missing data with some value or using an interpolation method such as `ffill` or `bfill`


## Filtering Out Missing Data

- use `dropna`, or do it by hand using `pandas.isnull` / `pandas.notnull` and boolean indexing

In [5]:
from numpy import nan as NA

# dropna returns only the non-null values together with their index labels
values = [1, NA, 3.5, NA, 7]
data = pd.Series(values)
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [6]:
# same result as dropna: keep the entries where the not-null mask is True
data[pd.notnull(data)]

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
# rows 1-3 contain missing cells; row 2 is entirely NA
rows = [
    [1.0, 6.5, 3.0],
    [1.0, NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.0],
]
data = pd.DataFrame(rows)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [8]:
# default behaviour spelled out: drop every row that contains at least one NA
cleaned = data.dropna(axis=0, how='any')
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [9]:
# how='all' drops only the rows whose values are all NA
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [11]:
# drop columns that are entirely NA; no column of `data` qualifies here,
# so the frame comes back unchanged
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [13]:
# Seed NumPy's global RNG so that Restart & Run All regenerates the same
# random frames (this one and any later np.random.randn call).
# Outputs recorded before the seed was added will differ until the
# notebook is re-executed.
np.random.seed(0)
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,0.262773,0.253127,0.126695
1,0.176128,-0.481652,-1.394004
2,-0.23942,-0.777343,-0.96522
3,0.766141,-0.213277,0.820087
4,-1.031533,1.083553,-0.501898
5,-0.281323,0.508774,0.196938
6,-0.293224,-0.848254,-0.313888


In [14]:
# first four rows of column 1, selected by position
df.iloc[0:4, 1]

0    0.253127
1   -0.481652
2   -0.777343
3   -0.213277
Name: 1, dtype: float64

In [15]:
# blank out rows 0-3 of column 1 and rows 0-1 of column 2 (positional slices);
# this modifies df in place
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.262773,,
1,0.176128,,
2,-0.23942,,-0.96522
3,0.766141,,0.820087
4,-1.031533,1.083553,-0.501898
5,-0.281323,0.508774,0.196938
6,-0.293224,-0.848254,-0.313888


In [16]:
# `thresh` is the minimum number of non-NA values a row needs in order
# to be kept; with thresh=2, rows holding fewer than two observations
# are dropped

df.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2
2,-0.23942,,-0.96522
3,0.766141,,0.820087
4,-1.031533,1.083553,-0.501898
5,-0.281323,0.508774,0.196938
6,-0.293224,-0.848254,-0.313888


## Filling In Missing Data

- call `fillna` with a constant to replace missing values

In [17]:
# every NA becomes 0; a new frame is returned and df itself is unchanged
df.fillna(value=0)

Unnamed: 0,0,1,2
0,0.262773,0.0,0.0
1,0.176128,0.0,0.0
2,-0.23942,0.0,-0.96522
3,0.766141,0.0,0.820087
4,-1.031533,1.083553,-0.501898
5,-0.281323,0.508774,0.196938
6,-0.293224,-0.848254,-0.313888


In [18]:
# a dict maps column label -> fill value, so each column
# can be filled with a different constant

fill_values = {1: 0.5, 2: 0}
df.fillna(value=fill_values)

Unnamed: 0,0,1,2
0,0.262773,0.5,0.0
1,0.176128,0.5,0.0
2,-0.23942,0.5,-0.96522
3,0.766141,0.5,0.820087
4,-1.031533,1.083553,-0.501898
5,-0.281323,0.508774,0.196938
6,-0.293224,-0.848254,-0.313888


In [19]:
# fillna can also modify the existing object in place instead of
# returning a new one. The call's result is deliberately not assigned:
# binding it to `_` would shadow IPython's last-output variable for the
# rest of the session, and nothing is displayed anyway because the call
# is not the last expression of the cell.

df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.262773,0.0,0.0
1,0.176128,0.0,0.0
2,-0.23942,0.0,-0.96522
3,0.766141,0.0,0.820087
4,-1.031533,1.083553,-0.501898
5,-0.281323,0.508774,0.196938
6,-0.293224,-0.848254,-0.313888


In [20]:
df = pd.DataFrame(np.random.randn(6, 3))
# leave column 1 populated only in rows 0-1 and column 2 only in rows 0-3,
# so each of those columns ends in a run of NA values for the fill examples
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,1.47432,-0.315785,-1.977004
1,-1.324743,0.241188,-0.875778
2,-0.366567,,-0.914421
3,-0.438641,,-0.556353
4,-1.777648,,
5,0.368239,,


In [21]:
# forward filling: propagate the last valid observation downwards.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method`
# keyword is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,1.47432,-0.315785,-1.977004
1,-1.324743,0.241188,-0.875778
2,-0.366567,0.241188,-0.914421
3,-0.438641,0.241188,-0.556353
4,-1.777648,0.241188,-0.556353
5,0.368239,0.241188,-0.556353


In [22]:
# forward filling with limit: fill at most 2 consecutive NA values per gap.
# DataFrame.ffill(limit=...) replaces fillna(method='ffill', limit=...),
# whose `method` keyword is deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,1.47432,-0.315785,-1.977004
1,-1.324743,0.241188,-0.875778
2,-0.366567,0.241188,-0.914421
3,-0.438641,0.241188,-0.556353
4,-1.777648,,-0.556353
5,0.368239,,-0.556353


# 7.2 Data Transformation

## Remove Duplicates

In [23]:
# the last row repeats the (k1, k2) pair of the row before it
k1_values = ['one', 'two'] * 3 + ['two']
k2_values = [1, 1, 2, 3, 3, 4, 4]
data = pd.DataFrame({'k1': k1_values, 'k2': k2_values})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [25]:
# `duplicated` returns a boolean Series flagging each row that repeats an
# earlier row; with keep='first' (the default) the first occurrence of a
# row is not flagged
data.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [26]:
# `drop_duplicates` returns a DataFrame holding exactly the rows for
# which `duplicated` is False
data.drop_duplicates(keep='first')

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [27]:
# compare rows on both key columns and keep the last occurrence of each
# combination instead of the first
data.drop_duplicates(subset=['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
6,two,4


## Transforming Data Using a Function or Mapping

- for many datasets, you may wish to perform some transformation based on the values in an array, Series, or column in a DataFrame

In [3]:
# food names are deliberately inconsistent in case ('Pastrami', 'Bacon')
foods = [
    'bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
    'Bacon', 'pastrami', 'honey ham', 'nova lox',
]
ounces = [4, 3, 12, 6, 7.5, 8, 3, 5, 6]
data = pd.DataFrame({'food': foods, 'ounces': ounces})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [4]:
# Lookup table from (lowercase) meat name to the animal it comes from,
# used to add a column giving the source animal of each food

meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])


In [14]:
# lowercase first: the food column mixes cases ('Pastrami', 'Bacon') while
# the keys of meat_to_animal are all lowercase
lowercased = data['food'].str.lower()
# Series.map with a dict looks each value up in the mapping
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon
