<a href="https://colab.research.google.com/github/anicelysantos/book-python-para-analise-de-dados/blob/main/limpeza_preparacao_dados.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# **Tratando dados ausentes**

**NaN** = Não é um número<br>
**NA** = Não disponível (*not available*)

In [3]:
# Series of strings with one missing entry (NaN) at position 2.
string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado'],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
# Boolean mask: True where the entry is missing (isna is the alias of isnull).
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Overwrite the first element with None; the isnull() check in the next cell
# reports it as missing as well.
string_data[0] = None

In [6]:
# Re-check the missing-value mask after position 0 was set to None.
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

**Filtrando dados ausentes**

In [7]:
from numpy import nan as NA

In [8]:
# Float Series with missing values at positions 1 and 3.
data = pd.Series([1.0, NA, 3.5, NA, 7.0])

In [11]:
# Return a new Series without the missing entries (original index labels are kept).
data.dropna(axis=0)

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
# Equivalent to dropna(): select the entries whose not-missing mask is True.
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
# 4x3 frame with one complete row, two partially missing rows and one row
# that is entirely missing; used to show how DataFrame.dropna() treats rows.
data = pd.DataFrame(
    [
        [1., 6.5, 3.],
        [1., NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.],
    ]
)


In [14]:
# Default dropna(): discard every row that contains at least one missing value.
# The original frame is left untouched and displayed for comparison.
cleaned = data.dropna(axis=0, how='any')
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
# Display the result of data.dropna() computed in the previous cell.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [16]:
# how='all' drops only the rows in which every value is missing;
# rows that still hold at least one value are kept.
data.dropna(axis=0, how='all')


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [17]:
# Add a new column labelled 4 whose values are all NA, then display the frame.
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [18]:
# Same idea along the other axis: drop only the columns that are entirely missing.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
# 7x3 frame of standard-normal draws (no seed is set in the visible cells,
# so the values differ on every run).
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))
# Blank out the first four rows of column 1 and the first two rows of column 2.
df.iloc[0:4, 1] = NA
df.iloc[0:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.695373,,
1,0.767821,,
2,1.609004,,0.220196
3,-0.504546,,-0.732475
4,0.120819,-1.56533,-0.082433
5,-2.334512,1.80634,-0.900132
6,0.688283,-1.162494,-1.491648


In [22]:
# Keep only the rows that have no missing value at all.
df.dropna(how='any')

Unnamed: 0,0,1,2
4,0.120819,-1.56533,-0.082433
5,-2.334512,1.80634,-0.900132
6,0.688283,-1.162494,-1.491648


In [23]:
# thresh counts non-missing values: keep only the rows that have at least
# 2 non-NA entries (with 3 columns, that means at most one NaN per row).
df.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2
2,1.609004,,0.220196
3,-0.504546,,-0.732475
4,0.120819,-1.56533,-0.082433
5,-2.334512,1.80634,-0.900132
6,0.688283,-1.162494,-1.491648


**Preenchendo valores ausentes**

In [26]:
# Replace every missing value with the constant passed as the fill value.
df.fillna(value=0)

Unnamed: 0,0,1,2
0,0.695373,0.0,0.0
1,0.767821,0.0,0.0
2,1.609004,0.0,0.220196
3,-0.504546,0.0,-0.732475
4,0.120819,-1.56533,-0.082433
5,-2.334512,1.80634,-0.900132
6,0.688283,-1.162494,-1.491648


In [27]:
# A dict maps column label -> fill value, so each column gets its own replacement.
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.695373,0.5,0.0
1,0.767821,0.5,0.0
2,1.609004,0.5,0.220196
3,-0.504546,0.5,-0.732475
4,0.120819,-1.56533,-0.082433
5,-2.334512,1.80634,-0.900132
6,0.688283,-1.162494,-1.491648


In [28]:
# Modify the existing DataFrame instead of producing a new one: inplace=True
# makes fillna write into df itself. The call's return value is bound to the
# throwaway name `_`; df is then displayed to show the change.
_ = df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.695373,0.0,0.0
1,0.767821,0.0,0.0
2,1.609004,0.0,0.220196
3,-0.504546,0.0,-0.732475
4,0.120819,-1.56533,-0.082433
5,-2.334512,1.80634,-0.900132
6,0.688283,-1.162494,-1.491648


In [29]:
# Fresh 6x3 frame of standard-normal draws for the forward-fill examples.
df = pd.DataFrame(np.random.standard_normal(size=(6, 3)))


In [30]:
# Mark column 1 as missing from row position 2 onward.
df.iloc[2:,1] = NA

In [31]:
# Mark column 2 as missing from row position 4 onward.
df.iloc[4:,2] = NA

In [32]:
# Display the frame with the missing values inserted by the two previous cells.
df

Unnamed: 0,0,1,2
0,0.848341,-0.39619,0.889161
1,0.684247,0.043599,-2.883327
2,2.595659,,1.677131
3,-0.236061,,0.854327
4,1.456005,,
5,-1.792206,,


In [33]:
# Forward fill: propagate the last valid observation down each column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword
# is deprecated since pandas 2.1 and rejected by newer releases.
df.ffill()

Unnamed: 0,0,1,2
0,0.848341,-0.39619,0.889161
1,0.684247,0.043599,-2.883327
2,2.595659,0.043599,1.677131
3,-0.236061,0.043599,0.854327
4,1.456005,0.043599,0.854327
5,-1.792206,0.043599,0.854327


In [34]:
# limit caps how many consecutive missing values are filled after each valid
# observation (here at most 2 per gap). ffill(limit=...) is used instead of the
# deprecated fillna(method='ffill', limit=...).
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.848341,-0.39619,0.889161
1,0.684247,0.043599,-2.883327
2,2.595659,0.043599,1.677131
3,-0.236061,0.043599,0.854327
4,1.456005,,0.854327
5,-1.792206,,0.854327


In [36]:
# Fill the gaps of a Series with the mean of its non-missing values.
data = pd.Series([1.0, NA, 3.5, NA, 7.0])
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# **Transformação de dados**

In [40]:
# Two-column frame whose last row repeats the row before it (two, 4).
data = pd.DataFrame(
    {
        'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [41]:
# Boolean Series: True for each row that repeats an earlier row.
data.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [42]:
# Return a DataFrame holding only the rows for which duplicated() is False.
data.drop_duplicates(keep='first')

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [43]:
# Add a column v1 holding 0..6, one distinct value per row.
data['v1'] = range(7)


In [44]:
# Detect duplicates using only column k1; the first row for each k1 value is kept.
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [45]:
# keep='last' retains the last occurrence of each duplicated (k1, k2)
# combination instead of the first one.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


**Transformando dados usando uma função ou um mapeamento**