# Data Preparation Basics
## Segment 2 - Treating missing values

In [3]:
import numpy as np
import pandas as pd 

from pandas import Series, DataFrame

### Figuring out what data is missing

In [4]:
# Sentinel reused by the cells below to mark an absent value.
missing = np.nan

# Ten labelled rows; the 3rd, 6th and 9th entries are left empty on purpose.
series_obj = Series([
    missing if row_number in (3, 6, 9) else 'row {}'.format(row_number)
    for row_number in range(1, 11)
])
series_obj

0     row 1
1     row 2
2       NaN
3     row 4
4     row 5
5       NaN
6     row 7
7     row 8
8       NaN
9    row 10
dtype: object

In [5]:
# Boolean mask: True wherever the Series holds a missing value.
series_obj.isna()

0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8     True
9    False
dtype: bool

### Filling in for missing values

In [12]:
# Fix the seed so the random frame is identical on every run.
np.random.seed(25)

# rand(7, 7) draws the 49 uniform samples directly in row-major 7x7 layout.
DF_obj = DataFrame(data=np.random.rand(7, 7))
DF_obj

Unnamed: 0,0,1,2,3,4,5,6
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376,0.684969
1,0.437611,0.556229,0.36708,0.402366,0.113041,0.447031,0.585445
2,0.161985,0.520719,0.326051,0.699186,0.366395,0.836375,0.481343
3,0.516502,0.383048,0.997541,0.514244,0.559053,0.03445,0.71993
4,0.421004,0.436935,0.281701,0.900274,0.669612,0.456069,0.289804
5,0.525819,0.559242,0.745284,0.828346,0.823694,0.07714,0.644862
6,0.309258,0.524254,0.958092,0.883201,0.295432,0.512376,0.088702


In [13]:
# Punch gaps into two columns so the fill strategies below have something to fill.
# .loc slices are label-based and include both endpoints.
DF_obj.loc[1:4, 1] = missing  # rows 1 through 4 of column 1
DF_obj.loc[3:6, 5] = missing  # rows 3 through 6 of column 5
DF_obj

Unnamed: 0,0,1,2,3,4,5,6
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376,0.684969
1,0.437611,,0.36708,0.402366,0.113041,0.447031,0.585445
2,0.161985,,0.326051,0.699186,0.366395,0.836375,0.481343
3,0.516502,,0.997541,0.514244,0.559053,,0.71993
4,0.421004,,0.281701,0.900274,0.669612,,0.289804
5,0.525819,0.559242,0.745284,0.828346,0.823694,,0.644862
6,0.309258,0.524254,0.958092,0.883201,0.295432,,0.088702


In [14]:
# Replace every missing entry with the constant 0; DF_obj itself is left unchanged.
filled_DF = DF_obj.fillna(value=0)
filled_DF

Unnamed: 0,0,1,2,3,4,5,6
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376,0.684969
1,0.437611,0.0,0.36708,0.402366,0.113041,0.447031,0.585445
2,0.161985,0.0,0.326051,0.699186,0.366395,0.836375,0.481343
3,0.516502,0.0,0.997541,0.514244,0.559053,0.0,0.71993
4,0.421004,0.0,0.281701,0.900274,0.669612,0.0,0.289804
5,0.525819,0.559242,0.745284,0.828346,0.823694,0.0,0.644862
6,0.309258,0.524254,0.958092,0.883201,0.295432,0.0,0.088702


In [16]:
# Per-column replacement values, keyed by column label.
# Columns not listed in the mapping keep their missing entries.
filled_DF = DF_obj.fillna(
    value={
        0: 0.1,
        1: 0.2,
        5: 1.33,
    }
)
filled_DF

Unnamed: 0,0,1,2,3,4,5,6
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376,0.684969
1,0.437611,0.2,0.36708,0.402366,0.113041,0.447031,0.585445
2,0.161985,0.2,0.326051,0.699186,0.366395,0.836375,0.481343
3,0.516502,0.2,0.997541,0.514244,0.559053,1.33,0.71993
4,0.421004,0.2,0.281701,0.900274,0.669612,1.33,0.289804
5,0.525819,0.559242,0.745284,0.828346,0.823694,1.33,0.644862
6,0.309258,0.524254,0.958092,0.883201,0.295432,1.33,0.088702


In [17]:
# Forward fill: each missing entry takes the last valid value above it in the
# same column. DataFrame.ffill() replaces fillna(method='ffill'), whose
# `method` keyword is deprecated in recent pandas releases.
fill = DF_obj.ffill()
fill

Unnamed: 0,0,1,2,3,4,5,6
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376,0.684969
1,0.437611,0.582277,0.36708,0.402366,0.113041,0.447031,0.585445
2,0.161985,0.582277,0.326051,0.699186,0.366395,0.836375,0.481343
3,0.516502,0.582277,0.997541,0.514244,0.559053,0.836375,0.71993
4,0.421004,0.582277,0.281701,0.900274,0.669612,0.836375,0.289804
5,0.525819,0.559242,0.745284,0.828346,0.823694,0.836375,0.644862
6,0.309258,0.524254,0.958092,0.883201,0.295432,0.836375,0.088702


### Counting missing values

In [28]:
# Rebuild the same seeded 7x7 frame, then blank out a fresh pattern of cells.
np.random.seed(25)
DF_obj = DataFrame(data=np.random.rand(49).reshape((7, 7)))

# Column 0: rows 1, 3, 4 and 5.
DF_obj.loc[[1, 3, 4, 5], 0] = missing
# Column 4: row 4 only.
DF_obj.loc[4, 4] = missing
# Column 6: rows 1 through 6 (label slices include both endpoints).
DF_obj.loc[1:6, 6] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5,6
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376,0.684969
1,,0.556229,0.36708,0.402366,0.113041,0.447031,
2,0.161985,0.520719,0.326051,0.699186,0.366395,0.836375,
3,,0.383048,0.997541,0.514244,0.559053,0.03445,
4,,0.436935,0.281701,0.900274,,0.456069,
5,,0.559242,0.745284,0.828346,0.823694,0.07714,
6,0.309258,0.524254,0.958092,0.883201,0.295432,0.512376,


In [29]:
# Number of missing values in each column (sum the boolean mask down the rows).
DF_obj.isna().sum(axis=0)

0    4
1    0
2    0
3    0
4    1
5    0
6    6
dtype: int64

In [31]:
# Number of missing values in each row (sum the boolean mask across the columns).
DF_obj.isna().sum(axis='columns')

0    0
1    2
2    1
3    2
4    3
5    2
6    1
dtype: int64

### Filtering out missing values

In [32]:
# Keep only the rows that contain no missing value at all; DF_obj is not modified.
DF_no_NaN = DF_obj.dropna(axis=0, how='any')
DF_no_NaN

Unnamed: 0,0,1,2,3,4,5,6
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376,0.684969


In [33]:
# Keep only the columns that contain no missing value at all; DF_obj is not modified.
DF_no_NaN = DF_obj.dropna(axis='columns')
DF_no_NaN

Unnamed: 0,1,2,3,5
0,0.582277,0.278839,0.185911,0.117376
1,0.556229,0.36708,0.402366,0.447031
2,0.520719,0.326051,0.699186,0.836375
3,0.383048,0.997541,0.514244,0.03445
4,0.436935,0.281701,0.900274,0.456069
5,0.559242,0.745284,0.828346,0.07714
6,0.524254,0.958092,0.883201,0.512376
