In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small demo frame. The "D" in Sex and the 290 in Age are the entries that the
# cells further down replace / treat as invalid.
sex_values = ["M", "F", "F", "D", "?"]
age_values = [29, 30, 23, 290, 24]
df = pd.DataFrame({"Sex": sex_values, "Age": age_values})
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,23
3,D,290
4,?,24


### Encontrando valores únicos

In [3]:
# List the distinct values present in the Sex column.
df["Sex"].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [4]:
# Count how many times each Sex value occurs.
df["Sex"].value_counts()

F    2
M    1
D    1
?    1
Name: Sex, dtype: int64

In [5]:
# replace() returns a new Series with "D" mapped to "F". The result is not
# assigned back, so df itself still contains "D" afterwards.
df["Sex"].replace("D", "F")

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

### Valores inválidos

In [9]:
# Boolean-mask filter: show only the rows whose Age exceeds 100.
df[df["Age"] > 100]

Unnamed: 0,Sex,Age
3,D,290


In [10]:
# Ages above 100 are treated as invalid here and divided by 10.
# df.loc[age_over_100, "Age"] selects the Age values of exactly the rows
# matched by the df["Age"] > 100 mask.
age_over_100 = df["Age"] > 100
df.loc[age_over_100, "Age"] = df.loc[age_over_100, "Age"] / 10

In [11]:
# Display the frame again to check the corrected Age value.
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,23
3,D,29
4,?,24


### Dados duplicados 

In [12]:
# Series indexed by ambassador name, with the country as the value. Several
# countries repeat, which is what the duplicate-handling cells operate on.
# The last label previously had a trailing space ('Klaus Scharioth '), which
# made a lookup by the real name fail; it is stored without the stray space.
ambassadors = pd.Series([
    'France',
    'United Kingdom',
    'United Kingdom',
    'Italy',
    'Germany',
    'Germany',
    'Germany',
], index=[
    'Gérard Araud',
    'Kim Darroch',
    'Peter Westmacott',
    'Armando Varricchio',
    'Peter Wittig',
    'Peter Ammon',
    'Klaus Scharioth',
])

In [13]:
# Display the Series (index: ambassador name, value: country).
ambassadors

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [14]:
ambassadors.duplicated()  # the first occurrence seen of each value is not flagged as a duplicate
# in this case, the value being compared is the country

Gérard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [15]:
ambassadors.duplicated(keep="last")  # the last occurrence seen of each value is the one not flagged

Gérard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [16]:
# keep=False flags every member of a repeated group, so only values that occur
# exactly once come back as False.
ambassadors.duplicated(keep=False)

Gérard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [23]:
# Negate the duplicated() mask to keep only the entries not flagged as duplicates.
ambassadors[~ambassadors.duplicated()]

Gérard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [22]:
# Drop repeated values, keeping the first occurrence of each.
ambassadors.drop_duplicates()

Gérard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [24]:
# Drop repeated values, keeping the last occurrence of each instead of the first.
ambassadors.drop_duplicates(keep="last")

Gérard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [25]:
# keep=False drops every member of a repeated group, leaving only the values
# that occur exactly once. `keep` must be passed by keyword: the positional
# form drop_duplicates(False) was deprecated in pandas 1.x (FutureWarning) and
# raises TypeError from pandas 2.0 on, where the arguments are keyword-only.
ambassadors.drop_duplicates(keep=False)

  ambassadors.drop_duplicates(False)


Gérard Araud          France
Armando Varricchio     Italy
dtype: object

### Dados duplicados em DF

In [27]:
# One (Name, Pos) pair per row. "Kobe Bryant" appears three times, twice with
# the same position and once with a different one.
player_rows = [
    ('Kobe Bryant', 'SG'),
    ('LeBron James', 'SF'),
    ('Kobe Bryant', 'SG'),
    ('Carmelo Anthony', 'SF'),
    ('Kobe Bryant', 'SF'),
]
players = pd.DataFrame(player_rows, columns=['Name', 'Pos'])

In [28]:
# Display the players frame.
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [29]:
# With no subset given, a row is flagged only when every column matches an
# earlier row (row 4 differs in Pos, so it is not flagged).
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [30]:
players.duplicated(subset=["Name"])  # subset: the column(s) compared when checking for duplicates

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [31]:
# Same Name-only check, but the last occurrence of each name is the one left unflagged.
players.duplicated(subset=["Name"], keep="last")

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [36]:
# Keep one row per Name: the first occurrence of each name survives.
players.drop_duplicates(subset=["Name"], keep="first")

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
