## Handling Missing Data with Pandas

In [2]:
import numpy as np
import pandas as pd

In [3]:
# NaN counts as a missing value.
pd.isnull(np.nan)

True

In [4]:
# None counts as a missing value too.
pd.isnull(None)

True

In [5]:
# pd.isna gives the same answer as pd.isnull for NaN.
pd.isna(np.nan)

True

In [6]:
# ...and the same answer as pd.isnull for None.
pd.isna(None)

True

In [7]:
# notnull is the opposite check: NaN is missing, so this is False.
pd.notnull(np.nan)

False

In [8]:
# An ordinary number is not missing.
pd.notnull(3)

True

In [9]:
# On a Series the check is applied element-wise and yields a boolean Series.
pd.isnull(pd.Series([1, np.nan,7]))

0    False
1     True
2    False
dtype: bool

In [12]:
# On a DataFrame the check is element-wise as well: the result has the same
# rows and columns, with True wherever the input holds NaN.
pd.isnull(pd.DataFrame({
    'Column A': [1, np.nan, 7],
    'Column B': [np.nan, 2, 3],
    'Column C': [np.nan, 2, np.nan]
}))

Unnamed: 0,Column A,Column B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


## Filtering Missing Data

In [17]:
# Example Series used throughout the notebook: four values and two NaNs
# (at positions 3 and 4).
s = pd.Series(
    data=[1, 2, 3, np.nan, np.nan, 4],
)

In [18]:
# Boolean mask: True where s holds a value, False at the NaN positions.
pd.notnull(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [19]:
# Careful: count() counts the entries of the boolean mask (True and False
# alike), so this is the length of s (6), NOT the number of non-null values.
pd.notnull(s).count()

6

In [20]:
# Boolean indexing with the mask keeps only the non-null entries;
# the original index labels are preserved.
s[pd.notnull(s)]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [21]:
# True counts as 1 when summed, so this is the number of non-null values.
pd.notnull(s).sum()

4

In [22]:
# Method form of the null check: True at the NaN positions.
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [23]:
# Method form of the not-null check: same mask as pd.notnull(s).
s.notnull()

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [24]:
# Same filter as s[pd.notnull(s)], written with the method form.
s[s.notnull()]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

## Dropping null values

In [26]:
s.dropna()  # returns a new Series without the NaNs; s itself is not modified

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [27]:
# s still contains its NaNs: the dropna() call did not change it.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

### Dropping null values on DataFrames

In [28]:
# Example frame: NaNs are scattered over columns A-C, Column D is complete.
df = pd.DataFrame(
    data={
        "Column A": [1, np.nan, 30, np.nan],
        "Column B": [2, 8, 31, np.nan],
        "Column C": [np.nan, 9, 32, 100],
        "Column D": [5, 8, 34, 110],
    }
)

In [29]:
# Show the example frame.
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [30]:
# Element-wise null mask of the frame.
df.isnull()

Unnamed: 0,Column A,Column B,Column C,Column D
0,False,False,True,False
1,True,False,False,False
2,False,False,False,False
3,True,True,False,False


In [31]:
# Summing the mask per column gives the number of NaNs in each column.
df.isnull().sum()

Column A    2
Column B    1
Column C    1
Column D    0
dtype: int64

In [32]:
# info() lists the non-null count and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column A  2 non-null      float64
 1   Column B  3 non-null      float64
 2   Column C  3 non-null      float64
 3   Column D  4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 260.0 bytes


In [34]:
# (number of rows, number of columns)
df.shape

(4, 4)

In [35]:
df.dropna()  # drops every row that contains at least one NaN

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [36]:
df.dropna(axis=1)  # axis=1: drops every column that contains at least one NaN

Unnamed: 0,Column D
0,5
1,8
2,34
3,110


In [37]:
# NOTE(review): df2 holds exactly the same data as df and is not used by any
# of the cells visible below (they all operate on df). Presumably it was meant
# to be a different frame for the how='all' demo -- confirm or remove.
df2 = pd.DataFrame({
    'Column A': [1, np.nan, 30, np.nan],
    'Column B': [2, 8, 31, np.nan],
    'Column C': [np.nan, 9, 32, 100],
    'Column D': [5, 8, 34, 110],
})

In [40]:
# df is unchanged by the dropna() calls above; they returned new frames.
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [41]:
# how='all' drops a row only if ALL of its values are NaN (rows, because the
# default axis is 0). No row of df is entirely NaN, so nothing is dropped here.
df.dropna(how = 'all')

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [42]:
# how='any' drops a row as soon as it has a single NaN; the result matches
# the plain df.dropna() call shown earlier.
df.dropna(how = 'any')

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [43]:
df.dropna(thresh=3)  # keep only rows that have at least 3 non-NaN values

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34


In [44]:
# thresh=4: a row needs 4 non-NaN values, so only the fully populated row is kept.
df.dropna(thresh=4) 

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


## Filling null values

In [45]:
# Reminder of the Series to be filled: NaN at positions 3 and 4.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [46]:
# Replace every NaN with a constant.
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [47]:
# Replace every NaN with the mean of the existing values
# (mean() ignores NaN: (1 + 2 + 3 + 4) / 4 = 2.5).
s.fillna(s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [48]:
# The fillna() calls returned new Series; s itself still has its NaNs.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [51]:
# Forward fill: each NaN takes the last valid value that precedes it.
# Uses .ffill() because fillna(method='ffill') is deprecated since pandas 2.1.
s.ffill()

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [53]:
# Backward fill: each NaN takes the next valid value that follows it.
# Uses .bfill() because fillna(method='bfill') is deprecated since pandas 2.1.
s.bfill()

0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

In [55]:
# A leading NaN has no earlier value to copy from, so forward fill leaves it as NaN.
# Uses .ffill() because fillna(method='ffill') is deprecated since pandas 2.1.
pd.Series([np.nan, 3, np.nan, 9]).ffill()

0    NaN
1    3.0
2    3.0
3    9.0
dtype: float64

In [56]:
# Reminder of the frame to be filled.
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [57]:
# Forward fill down each column (axis=0). A NaN in the first row has nothing
# above it and therefore stays NaN.
# Uses .ffill() because fillna(method='ffill') is deprecated since pandas 2.1.
df.ffill(axis=0)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,110


In [59]:
# Forward fill across each row, left to right (axis=1). NaNs in the leftmost
# columns have nothing to their left and therefore stay NaN.
# Uses .ffill() because fillna(method='ffill') is deprecated since pandas 2.1.
df.ffill(axis=1)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,2.0,5.0
1,,8.0,9.0,8.0
2,30.0,31.0,32.0,34.0
3,,,100.0,110.0


## Checking if there are NAs

In [60]:
# Number of values that remain once the NaNs are dropped.
s.dropna().count()

4

In [62]:
# s has missing data exactly when dropping the NaNs makes it shorter.
missing_values = s.size != s.dropna().size
missing_values

True

The methods `any` and `all` check whether at least one value in a Series is `True`, or whether all of its values are `True`, respectively. They work the same way as Python's built-in `any()` and `all()`.

In [63]:
pd.Series([True, False, False]).any()  # True: at least one element is True

True

In [64]:
pd.Series([True, False, False]).all()  # False: not every element is True

False

In [66]:
# True: every element is True.
pd.Series([True, True, True]).all() 

True