In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
missing = np.nan # NaN value

# emulate test data object
series_obj = Series([
    'row 1',
    'row 2',
    missing,
    'row 4',
    'row 5',
    'row 6',
    missing,
    'row 8'
])
series_obj

0    row 1
1    row 2
2      NaN
3    row 4
4    row 5
5    row 6
6      NaN
7    row 8
dtype: object

In [3]:
# check non-empty cells (NaNs are considered to be empty cells)
series_obj.isnull()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool

In [4]:
# emulate a random test dataset of 6 x 6 matrix table
np.random.seed(25) # create random numbers seed
# randn - generate numbers with normal (Gaussian) distribution (mean = 0, var = 1)
DF_obj = DataFrame(np.random.randn(36).reshape(6,6))
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.232312
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-1.049942
3,1.05661,-0.419678,2.294842,-2.594487,2.822756,0.680889
4,-1.577693,-1.976254,0.53334,-0.29087,-0.51352,1.982626
5,0.226001,-1.839905,1.607671,0.388292,0.399732,0.405477


In [5]:
# emulate empty values in our test dataset
DF_obj.loc[3:5, 0] = missing # 3-5 lines, 0 column
DF_obj.loc[1:4, 5] = missing # 1-4 lines, 5 column
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,
3,,-0.419678,2.294842,-2.594487,2.822756,
4,,-1.976254,0.53334,-0.29087,-0.51352,
5,,-1.839905,1.607671,0.388292,0.399732,0.405477


In [6]:
# 1st approach: fill everything with 0s
# Cons: Excel will calculate incorrect averages, because it treats empty & 0s differently
filled_DF = DF_obj.fillna(0)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,0.0
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,0.0
3,0.0,-0.419678,2.294842,-2.594487,2.822756,0.0
4,0.0,-1.976254,0.53334,-0.29087,-0.51352,0.0
5,0.0,-1.839905,1.607671,0.388292,0.399732,0.405477


In [7]:
# 2nd approach: fill with numbers
# good approach - to fill values with average value

# fill 0st column empty values with 0.1
# fill 5th column empty values with 1.25
# let's assume they're average values
filled_DF = DF_obj.fillna({ 0: 0.1, 5: 1.25 })
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,1.25
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,1.25
3,0.1,-0.419678,2.294842,-2.594487,2.822756,1.25
4,0.1,-1.976254,0.53334,-0.29087,-0.51352,1.25
5,0.1,-1.839905,1.607671,0.388292,0.399732,0.405477


In [8]:
# 3rd approach - use 'ffill' built-in method
# it fills empty cells with last met value
fill_DF = DF_obj.fillna(method='ffill')
fill_DF

  fill_DF = DF_obj.fillna(method='ffill')


Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.222326
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-0.222326
3,2.152957,-0.419678,2.294842,-2.594487,2.822756,-0.222326
4,2.152957,-1.976254,0.53334,-0.29087,-0.51352,-0.222326
5,2.152957,-1.839905,1.607671,0.388292,0.399732,0.405477


In [9]:
np.random.seed(25)
DF_obj = DataFrame(np.random.randn(36).reshape(6,6))

DF_obj.loc[3:5, 0] = missing
DF_obj.loc[1:4, 5] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,
3,,-0.419678,2.294842,-2.594487,2.822756,
4,,-1.976254,0.53334,-0.29087,-0.51352,
5,,-1.839905,1.607671,0.388292,0.399732,0.405477


In [10]:
# calculate empty cells count by columns (they're returned as 'col_index - empty count'
DF_obj.isnull().sum()

0    3
1    0
2    0
3    0
4    0
5    4
dtype: int64

In [11]:
# 4st approach - drop all rows with empty values
DF_no_NAN = DF_obj.dropna()
DF

NameError: name 'DF' is not defined