# Chapter 2 - Data Preparation Basics
## Segment 2 - Treating missing values

In [2]:
import numpy as np
import pandas as pd 

from pandas import Series, DataFrame

### Figuring out what data is missing

In [3]:
# Sentinel that marks an absent entry everywhere in this notebook
missing = np.nan

# Eight-slot Series whose third and seventh entries are absent
series_obj = Series(
    data=['row 1', 'row 2', missing, 'row 4', 'row 5', 'row 6', missing, 'row 8']
)
series_obj

0    row 1
1    row 2
2      NaN
3    row 4
4    row 5
5    row 6
6      NaN
7    row 8
dtype: object

In [4]:
# Flag which entries are null: True marks a missing value.
# (isna() is the same check as isnull(); the two names are aliases.)
series_obj.isna()


0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool

### Filling in for missing values

In [5]:
# Build a reproducible 6x6 frame of random numbers, one labelled row per line
np.random.seed(25)
DF_obj = DataFrame(
    data=np.random.rand(36).reshape(6, 6),
    index=[f'row {n}' for n in range(1, 7)],
)
DF_obj


Unnamed: 0,0,1,2,3,4,5
row 1,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
row 2,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
row 3,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
row 4,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
row 5,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
row 6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [6]:
# Creating missing values.
# The row index holds string labels ('row 1' ... 'row 6'), so integer slices
# must go through the positional indexer .iloc. Using .loc[3:5, 0] here relies
# on a deprecated positional fallback in older pandas and raises a TypeError
# in current releases.
DF_obj.iloc[3:5, 0] = missing   # positions 3-4 ('row 4', 'row 5') of column 0
DF_obj.iloc[1:4, 5] = missing   # positions 1-3 ('row 2' .. 'row 4') of column 5
DF_obj

  indexer = self._get_setitem_indexer(key)


Unnamed: 0,0,1,2,3,4,5
row 1,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
row 2,0.684969,0.437611,0.556229,0.36708,0.402366,
row 3,0.447031,0.585445,0.161985,0.520719,0.326051,
row 4,,0.836375,0.481343,0.516502,0.383048,
row 5,,0.559053,0.03445,0.71993,0.421004,0.436935
row 6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [7]:
# Replace every NaN with zero; the result is a new frame bound to filled_DF
filled_DF = DF_obj.fillna(value=0)
filled_DF

Unnamed: 0,0,1,2,3,4,5
row 1,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
row 2,0.684969,0.437611,0.556229,0.36708,0.402366,0.0
row 3,0.447031,0.585445,0.161985,0.520719,0.326051,0.0
row 4,0.0,0.836375,0.481343,0.516502,0.383048,0.0
row 5,0.0,0.559053,0.03445,0.71993,0.421004,0.436935
row 6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [8]:
# Per-column replacements: the dict maps a column label to its fill value,
# so NaNs in column 0 become 0.1 and NaNs in column 5 become 1.25
filled_DF = DF_obj.fillna(value={0: 0.1, 5: 1.25})
filled_DF

Unnamed: 0,0,1,2,3,4,5
row 1,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
row 2,0.684969,0.437611,0.556229,0.36708,0.402366,1.25
row 3,0.447031,0.585445,0.161985,0.520719,0.326051,1.25
row 4,0.1,0.836375,0.481343,0.516502,0.383048,1.25
row 5,0.1,0.559053,0.03445,0.71993,0.421004,0.436935
row 6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [9]:
# Forward fill: each NaN takes the last non-null value above it in its column.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in recent pandas releases.
fill_DF = DF_obj.ffill()
fill_DF

Unnamed: 0,0,1,2,3,4,5
row 1,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
row 2,0.684969,0.437611,0.556229,0.36708,0.402366,0.117376
row 3,0.447031,0.585445,0.161985,0.520719,0.326051,0.117376
row 4,0.447031,0.836375,0.481343,0.516502,0.383048,0.117376
row 5,0.447031,0.559053,0.03445,0.71993,0.421004,0.436935
row 6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


### Counting missing values

In [10]:
# Rebuild the same seeded 6x6 frame so this section starts from known data
np.random.seed(25)
DF_obj = DataFrame(np.random.rand(36).reshape((6,6)), index=['row 1', 'row 2', 'row 3', 'row 4', 'row 5', 'row 6'])

# Creating missing values.
# The row index holds string labels, so integer slices must use the positional
# indexer .iloc; .loc with integer bounds is not valid label-based slicing on
# this index and fails in current pandas.
DF_obj.iloc[3:5, 0] = missing   # positions 3-4 ('row 4', 'row 5') of column 0
DF_obj.iloc[1:4, 5] = missing   # positions 1-3 ('row 2' .. 'row 4') of column 5
DF_obj

  indexer = self._get_setitem_indexer(key)


Unnamed: 0,0,1,2,3,4,5
row 1,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
row 2,0.684969,0.437611,0.556229,0.36708,0.402366,
row 3,0.447031,0.585445,0.161985,0.520719,0.326051,
row 4,,0.836375,0.481343,0.516502,0.383048,
row 5,,0.559053,0.03445,0.71993,0.421004,0.436935
row 6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [11]:
# Per-column tally of missing entries: the boolean mask is summed down each
# column, with every True contributing 1
DF_obj.isna().sum()


0    2
1    0
2    0
3    0
4    0
5    3
dtype: int64

### Filtering out missing values

In [12]:
# Keep only the complete rows, i.e. discard any row holding at least one NaN
DF_no_NaN = DF_obj.dropna(axis=0)
DF_no_NaN

Unnamed: 0,0,1,2,3,4,5
row 1,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
row 6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [13]:
# Keep only the complete columns, i.e. discard any column holding at least one NaN
DF_no_NaN = DF_obj.dropna(axis='columns')
DF_no_NaN


Unnamed: 0,1,2,3,4
row 1,0.582277,0.278839,0.185911,0.4111
row 2,0.437611,0.556229,0.36708,0.402366
row 3,0.585445,0.161985,0.520719,0.326051
row 4,0.836375,0.481343,0.516502,0.383048
row 5,0.559053,0.03445,0.71993,0.421004
row 6,0.900274,0.669612,0.456069,0.289804
