In [1]:
import numpy as np
import pandas as pd

In [9]:
# funções do Pandas
# valor não presente, fora do domínio dos valores
pd.isnull(None), pd.isnull(np.nan), pd.isna(None)

(True, True, True)

In [8]:
pd.notnull(None), pd.notnull(np.nan), pd.notnull(3)

(False, False, True)

In [10]:
pd.isnull(pd.Series([1, np.nan, 7]))

0    False
1     True
2    False
dtype: bool

In [11]:
pd.notnull(pd.Series([1, np.nan, 7]))

0     True
1    False
2     True
dtype: bool

In [12]:
pd.isnull(pd.DataFrame({
    "A": [1, np.nan, 7],
    "B": [np.nan, 2, 3],
    "C": [np.nan, 2, np.nan]
}))

Unnamed: 0,A,B,C
0,False,True,True
1,True,False,False
2,False,False,True


### Filtrando dados faltantes

In [13]:
s = pd.Series([1, 2, 3, np.nan, np.nan, 4])

In [15]:
pd.notnull(s), pd.notnull(s).sum()

(0     True
 1     True
 2     True
 3    False
 4    False
 5     True
 dtype: bool,
 4)

In [16]:
pd.isnull(s), pd.isnull(s).sum()

(0    False
 1    False
 2    False
 3     True
 4     True
 5    False
 dtype: bool,
 2)

In [17]:
s[pd.notnull(s)]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [18]:
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [19]:
s.notnull()

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [20]:
s[s.notnull()]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

### Dropping null values

In [21]:
s.dropna()  # exclui valores faltando (não modifca a serie)

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

### Dropping null values no DF

In [22]:
df = pd.DataFrame({
    'Column A': [1, np.nan, 30, np.nan],
    'Column B': [2, 8, 31, np.nan],
    'Column C': [np.nan, 9, 32, 100],
    'Column D': [5, 8, 34, 110],
})

In [25]:
df

(   Column A  Column B  Column C  Column D
 0       1.0       2.0       NaN         5
 1       NaN       8.0       9.0         8
 2      30.0      31.0      32.0        34
 3       NaN       NaN     100.0       110,
 (4, 4))

In [26]:
df.shape, df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column A  2 non-null      float64
 1   Column B  3 non-null      float64
 2   Column C  3 non-null      float64
 3   Column D  4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 256.0 bytes


((4, 4), None)

In [24]:
df.isnull()

Unnamed: 0,Column A,Column B,Column C,Column D
0,False,False,True,False
1,True,False,False,False
2,False,False,False,False
3,True,True,False,False


In [27]:
df.isnull().sum()

Column A    2
Column B    1
Column C    1
Column D    0
dtype: int64

In [29]:
df.dropna()  # remove linhas que tem pelo menos um NaN

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [30]:
df.dropna(axis=1)  # remove coluna

Unnamed: 0,Column D
0,5
1,8
2,34
3,110


In [31]:
df2 = pd.DataFrame({
    'Column A': [1, np.nan, 30],
    'Column B': [2, np.nan, 31],
    'Column C': [np.nan, np.nan, 100]
})

In [32]:
df2.dropna(how="any")  # comportamento padrao

Unnamed: 0,Column A,Column B,Column C
2,30.0,31.0,100.0


In [35]:
df.dropna(thresh=3)  # precisa de ao menos 3 dados válidos para não dropar

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34


### Preenchendo valores nulos

In [36]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [37]:
s.fillna(0)  # ou qualquer outro valor

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [38]:
s.fillna(s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [39]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [41]:
 # argumento method de fillna é a forma que vai preencher em relação ao mais próximo do nan
 # pode deixar valores nulos nas pontas da Serie/DF
s.fillna(method="ffill")  # preenche de cima para baixo

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [43]:
s.fillna(method="bfill")  # preenche de baixo para cima

0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

In [44]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [46]:
df.fillna({"Column A": 0, "Column B": 99, "Column C": df["Column C"].mean()})

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,47.0,5
1,0.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,0.0,99.0,100.0,110


In [47]:
df.fillna(method="ffill", axis=0)  # coluna

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,110


In [48]:
df.fillna(method="ffill", axis=1)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,2.0,5.0
1,,8.0,9.0,8.0
2,30.0,31.0,32.0,34.0
3,,,100.0,110.0
