In [24]:
import pandas as pd
import numpy as np

In [3]:
# Small frame whose (k1, k2) rows repeat, used by the duplicate-handling
# cells that follow.
data = pd.DataFrame(
    {
        'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


## Identificando e removendo duplicatas

In [4]:
# Boolean mask: True for every row that repeats an earlier row
# (keep='first' is the default, spelled out here for clarity).
data.duplicated(keep='first')

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [5]:
# Return a new frame keeping only the first occurrence of each repeated row;
# `data` itself is left untouched.
data.drop_duplicates(keep='first')

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [6]:
# Attach a running counter column `v1` (0..6) so that rows that were full
# duplicates on (k1, k2) become distinguishable.
data = data.assign(v1=range(7))
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [7]:
# Judge duplicates on column `k1` only; the first row of each k1 value is kept.
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


## Preservando a última duplicata

In [9]:
# Judge duplicates on the (k1, k2) pair and keep the LAST occurrence of each
# pair instead of the first.
data.drop_duplicates(
    subset=['k1', 'k2'],
    keep='last',
)

Unnamed: 0,k1,k2,v1
1,one,1,1
2,one,2,2
4,two,3,4
6,two,4,6


## Transformando dados usando uma função ou mapping

In [11]:
# Food names (with inconsistent capitalisation) paired with a weight in ounces.
data2 = pd.DataFrame(
    {
        'food': [
            'bacon', 'pulled pork', 'bacon',
            'Pastrami', 'corned beef', 'Bacon',
            'pastrami', 'honey ham', 'nova lox',
        ],
        'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
data2

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [20]:
# Lookup table: lower-case food name -> animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

# Normalise capitalisation once and reuse it below. The vectorised `.str`
# accessor passes missing values through, whereas `map(str.lower)` raises
# TypeError as soon as one entry is not a string.
food_lower = data2['food'].str.lower()
display(food_lower)

# Approach 1 - map with a dict: each lower-cased food is looked up in
# `meat_to_animal`; foods absent from the dict come out as NaN.
data2['animal'] = food_lower.map(meat_to_animal)
display(data2)

# Approach 2 - map with a function. `dict.get` is used instead of `[...]`
# so that an unknown food produces a missing value rather than a KeyError,
# matching the behaviour of the dict-based mapping above.
display(food_lower.map(lambda food: meat_to_animal.get(food)))

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

## Substituindo valores

In [30]:
# Series containing the codes -999 and -1000 that the calls below swap out.
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])

codes = [-999, -1000]

display(
    # the untouched series
    data,
    # one value -> NaN
    data.replace(-999, np.nan),
    # several values -> the same replacement
    data.replace(codes, np.nan),
    # several values -> a different replacement each (paired by position)
    data.replace(codes, [np.nan, 0]),
    # same as above, expressed as an {old: new} dict
    data.replace({-999: np.nan, -1000: 0}),
)


0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## Renomeando índices

In [33]:
# 3x4 frame filled with 0..11, with string row and column labels to rename
# in the cells that follow.
row_labels = ['Ohio', 'Colorado', 'New York']
column_labels = ['one', 'two', 'three', 'four']
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=row_labels,
    columns=column_labels,
)
display(data)

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [37]:
# Index.map applies a function to every label and returns a new Index;
# shown here as a plain list. `data` is not modified.
data.index.map(str.upper).tolist()

['OHIO', 'COLORADO', 'NEW YORK']

In [38]:
# Assigning to `.index` changes the row labels of `data` itself.
upper_labels = data.index.map(str.upper)
data.index = upper_labels
display(data)

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [39]:
# rename() returns a transformed copy and leaves `data` unchanged:
# row labels are title-cased, column labels upper-cased.
data.rename(
    index=str.title,
    columns=str.upper,
)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [42]:
# Passing dicts to rename() changes only the listed labels; every other
# label is carried over as-is.
row_renames = {'OHIO': 'INDIANA'}
column_renames = {'three': 'peekaboo'}
display(data.rename(index=row_renames, columns=column_renames))

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11
