<a href="https://colab.research.google.com/github/anicelysantos/book-python-para-analise-de-dados/blob/main/limpeza_preparacao_dados.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import pandas as pd
import numpy as np

# **Tratando dados ausentes**

**NaN** = Não é um número<br>
**NA** = Não avaliável (indisponível)

In [None]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan,'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [None]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [None]:
string_data[0] = None

In [None]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

**Filtrando dados ausentes**

In [None]:
from numpy import nan as NA

In [None]:
data = pd.Series([1,NA, 3.5,NA,7])

In [None]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [None]:
#mesma coisa que o anterior
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [None]:
#descarta qualquer linha contendo valor ausente
data = pd.DataFrame([[1., 6.5, 3.],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]])


In [None]:
cleaned = data.dropna()
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [None]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [None]:
#descartar apenas as colunas que contenham NAs
data.dropna(how='all')


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [None]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [None]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [None]:
df = pd.DataFrame(np.random.randn(7,3))
df.iloc[:4,1] = NA
df.iloc[:2,2] = NA
df

Unnamed: 0,0,1,2
0,0.695373,,
1,0.767821,,
2,1.609004,,0.220196
3,-0.504546,,-0.732475
4,0.120819,-1.56533,-0.082433
5,-2.334512,1.80634,-0.900132
6,0.688283,-1.162494,-1.491648


In [None]:
df.dropna()

Unnamed: 0,0,1,2
4,0.120819,-1.56533,-0.082433
5,-2.334512,1.80634,-0.900132
6,0.688283,-1.162494,-1.491648


In [None]:
#deixar somente as linhas que tenham menos de 2 NaN
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,1.609004,,0.220196
3,-0.504546,,-0.732475
4,0.120819,-1.56533,-0.082433
5,-2.334512,1.80634,-0.900132
6,0.688283,-1.162494,-1.491648


**Preenchendo valores ausentes**

In [None]:
#passa por parametro o valor a ser preenchido
df.fillna(0)

Unnamed: 0,0,1,2
0,0.695373,0.0,0.0
1,0.767821,0.0,0.0
2,1.609004,0.0,0.220196
3,-0.504546,0.0,-0.732475
4,0.120819,-1.56533,-0.082433
5,-2.334512,1.80634,-0.900132
6,0.688283,-1.162494,-1.491648


In [None]:
#preencher com valores diferentes
df.fillna({1:0.5, 2:0})

Unnamed: 0,0,1,2
0,0.695373,0.5,0.0
1,0.767821,0.5,0.0
2,1.609004,0.5,0.220196
3,-0.504546,0.5,-0.732475
4,0.120819,-1.56533,-0.082433
5,-2.334512,1.80634,-0.900132
6,0.688283,-1.162494,-1.491648


In [None]:
#modificar o dataframe sem gerar uma cópia
_ = df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.695373,0.0,0.0
1,0.767821,0.0,0.0
2,1.609004,0.0,0.220196
3,-0.504546,0.0,-0.732475
4,0.120819,-1.56533,-0.082433
5,-2.334512,1.80634,-0.900132
6,0.688283,-1.162494,-1.491648


In [None]:
df = pd.DataFrame(np.random.randn(6,3))


In [None]:
df.iloc[2:,1] = NA

In [None]:
df.iloc[4:,2] = NA

In [None]:
df

Unnamed: 0,0,1,2
0,0.848341,-0.39619,0.889161
1,0.684247,0.043599,-2.883327
2,2.595659,,1.677131
3,-0.236061,,0.854327
4,1.456005,,
5,-1.792206,,


In [None]:
df.fillna(method ='ffill')

Unnamed: 0,0,1,2
0,0.848341,-0.39619,0.889161
1,0.684247,0.043599,-2.883327
2,2.595659,0.043599,1.677131
3,-0.236061,0.043599,0.854327
4,1.456005,0.043599,0.854327
5,-1.792206,0.043599,0.854327


In [None]:
#numero máximo de valores a ser preenchido pra frente ou pra trás
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,0.848341,-0.39619,0.889161
1,0.684247,0.043599,-2.883327
2,2.595659,0.043599,1.677131
3,-0.236061,0.043599,0.854327
4,1.456005,,0.854327
5,-1.792206,,0.854327


In [None]:
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# **Transformação de dados**

In [None]:
data = pd.DataFrame({'k1':['one','two'] * 3 + ['two'],
                     'k2':[1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [None]:
#Esse metodo devolve um boolean
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [None]:
#devolve um dataframe onde o duplicado em false
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [None]:
data['v1'] = range(7)


In [None]:
#identificando valores a partir de uma coluna
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [None]:
#mantem a ultima combinação de valores duplicados ao invés da primeira que aparece
data.drop_duplicates(['k1','k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


**Transformando dados usando uma função ou um mapeamento**

In [None]:
data = pd.DataFrame({'food': ['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon', 'pastrami','honey ham','nova lox'],
                     'ounces': [4,3,12,6,7.5,8,3,5,6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [None]:
meat_to_animal = {
    'bacon':'pig',
    'pulled pork':'pig',
    'pastrami':'cow',
    'corned beef':'cow',
    'honey ham':'pig',
    'nova lox':'salmon'
}

In [None]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [None]:
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [None]:
#Uma função fazendo a mesma coisa
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

**Substituindo valores**

In [None]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [None]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [None]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [None]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [None]:
data.replace({-999: np.nan, -1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

**Renomeando os índices dos eixos**

In [None]:
data = pd.DataFrame(np.arange(12).reshape(3,4), index=['Ohio','Colorado','New York'], columns=['one','two','three','four'])

In [None]:
transform = lambda x: x[:4].upper()

In [None]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [None]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [None]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [None]:
data.rename(index={'OHIO':'INDIANA'}, columns={'three':'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [None]:
data.rename(index={'OHIO':'INDIANA'}, columns={'three':'peekaboo'}, inplace=True)
data

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


**Discretização e compartimentalização (binning)**

In [None]:
ages = [20,22, 25,27,21,23,37,31,61,45,41,32]

In [None]:
bins = [18,25,35,60,100]

In [None]:
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [None]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [None]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [None]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [None]:
pd.cut(ages, [18,26,36,61,100], right = False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [None]:
group_names = ['Jovem', 'Jovem adulto', 'Meia idade','Terceira idade']

In [None]:
pd.cut(ages, bins, labels=group_names)

['Jovem', 'Jovem', 'Jovem', 'Jovem adulto', 'Jovem', ..., 'Jovem adulto', 'Terceira idade', 'Meia idade', 'Meia idade', 'Jovem adulto']
Length: 12
Categories (4, object): ['Jovem' < 'Jovem adulto' < 'Meia idade' < 'Terceira idade']

In [None]:
data = np.random.rand(20)

In [None]:
#precision limita a precisão decimal de acordo com o valor do argumento, nesse caso aqui é 2 digitos
pd.cut(data, 4, precision=2)

[(0.5, 0.74], (0.5, 0.74], (0.0071, 0.25], (0.0071, 0.25], (0.25, 0.5], ..., (0.25, 0.5], (0.5, 0.74], (0.5, 0.74], (0.5, 0.74], (0.74, 0.99]]
Length: 20
Categories (4, interval[float64]): [(0.0071, 0.25] < (0.25, 0.5] < (0.5, 0.74] < (0.74, 0.99]]

In [None]:
data = np.random.randn(1000) #Normalmente distribuidos

In [None]:
cats = pd.qcut(data, 4) #Separa em quantis
cats

[(-3.179, -0.673], (-3.179, -0.673], (-3.179, -0.673], (0.735, 3.579], (-3.179, -0.673], ..., (-3.179, -0.673], (0.0206, 0.735], (0.0206, 0.735], (0.735, 3.579], (0.0206, 0.735]]
Length: 1000
Categories (4, interval[float64]): [(-3.179, -0.673] < (-0.673, 0.0206] < (0.0206, 0.735] <
                                    (0.735, 3.579]]

In [None]:
pd.value_counts(cats)

(0.735, 3.579]      250
(0.0206, 0.735]     250
(-0.673, 0.0206]    250
(-3.179, -0.673]    250
dtype: int64

In [None]:
pd.qcut(data,[0, 0.1, 0.5, 0.9, 1.])

[(-1.207, 0.0206], (-1.207, 0.0206], (-3.179, -1.207], (1.263, 3.579], (-3.179, -1.207], ..., (-1.207, 0.0206], (0.0206, 1.263], (0.0206, 1.263], (1.263, 3.579], (0.0206, 1.263]]
Length: 1000
Categories (4, interval[float64]): [(-3.179, -1.207] < (-1.207, 0.0206] < (0.0206, 1.263] <
                                    (1.263, 3.579]]

**Detectando e filtrando valores discrepantes**

In [None]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.014372,0.012546,0.013221,-0.001468
std,0.979038,1.003474,1.004417,1.011125
min,-3.194518,-3.889709,-2.936595,-3.155329
25%,-0.667454,-0.645327,-0.670687,-0.655365
50%,-0.022961,-0.032683,0.029464,-0.022404
75%,0.662527,0.658337,0.710523,0.726073
max,2.61989,3.288959,3.200848,2.951847


In [None]:
#Encontrar valores que excedem 3 na coluna
col =  data[2]
col[np.abs(col) > 3]

886    3.200848
Name: 2, dtype: float64

In [None]:
#Selecionar todas as linhas que exceda o valor 3 ou -3
data[(np.abs(data) > 3). any(1)]

Unnamed: 0,0,1,2,3
117,-3.194518,0.686649,0.240275,1.324565
176,0.50556,3.288959,-1.579313,1.055773
209,-0.029916,-1.143328,-1.080277,-3.155329
429,-0.863958,-3.889709,-0.276041,0.63555
743,1.547451,-3.101,-0.38378,-1.37803
836,-1.1225,-3.035373,0.325446,-0.590685
886,-0.962866,0.566082,3.200848,-1.032252
922,2.169612,3.010049,0.1228,1.808348
931,-3.165009,-1.156416,-1.379186,1.10876
949,0.691143,3.058347,-1.765345,-0.787384


In [None]:
#Eliminar os valores fora do intervalo -3 e 3
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.014012,0.013215,0.01302,-0.001313
std,0.977906,0.998871,1.003799,1.010652
min,-3.0,-3.0,-2.936595,-3.0
25%,-0.667454,-0.645327,-0.670687,-0.655365
50%,-0.022961,-0.032683,0.029464,-0.022404
75%,0.662527,0.658337,0.710523,0.726073
max,2.61989,3.0,3.0,2.951847


In [None]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,-1.0,-1.0,-1.0
1,-1.0,-1.0,1.0,1.0
2,1.0,1.0,-1.0,-1.0
3,1.0,-1.0,-1.0,-1.0
4,1.0,-1.0,-1.0,1.0


**Permutação e amostragem aleatória**

In [None]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5,4)))


In [None]:
sampler = np.random.permutation(5)
sampler

array([1, 0, 2, 3, 4])

In [None]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [None]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [None]:
#trás linhas aleatórias do dataframe, a quantidade pode ser passada por argumento. Nesse exemplo são 3
df.sample(n=3)

Unnamed: 0,0,1,2,3
2,8,9,10,11
1,4,5,6,7
0,0,1,2,3


In [None]:
choices = pd.Series([5,7,-1,6,4])

In [None]:
#Gerar uma amostra com substituição
draws = choices.sample(n=10, replace=True)
draws

3    6
3    6
1    7
4    4
0    5
1    7
0    5
2   -1
4    4
2   -1
dtype: int64

**Calculando variáveis indicadoras/dummy**

In [None]:
df = pd.DataFrame({'key': ['b','b','a','c','a','b'], 'data': range(6)})

In [None]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [None]:
dummie = pd.get_dummies(df['key'], prefix='key')

In [None]:
df_with_dummy = df[['data']].join(dummie)
df_with_dummy

Unnamed: 0,data,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


**Separando muitas informações dentro de uma mesma célula**

In [None]:
mnames = ['movie_id','title','genres']

In [None]:
movies = pd.read_table('/content/drive/MyDrive/dados_pandas/pydata-book-2nd-edition/datasets/movielens/movies.dat', sep='::', header=None, names=mnames)

  return read_csv(**locals())


In [None]:
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [None]:
all_genres = []

In [None]:
for x in movies.genres:
  all_genres.extend(x.split('|'))

In [None]:
genres = pd.unique(all_genres)

In [None]:
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [None]:
zero_matrix = np.zeros((len(movies), len(genres)))

In [None]:
dummies = pd.DataFrame(zero_matrix, columns=genres)

In [None]:
gen = movies.genres[0]

In [None]:
gen.split('|')

['Animation', "Children's", 'Comedy']

In [None]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2])

In [None]:
for i, gen in enumerate(movies.genres):
  indices = dummies.columns.get_indexer(gen.split('|'))
  dummies.iloc[i, indices] = 1

In [None]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))

In [None]:
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

In [None]:
np.random.seed(12345)

In [None]:
values = np.random.rand(10)

In [None]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [None]:
bins=[0, 0.2, 0.4, 0.6, 0.8, 1]

In [None]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


# **Manipulação de strings**

**Métodos de objetos string**

In [None]:
val = 'a,b, guido'

In [None]:
val.split(',')

['a', 'b', ' guido']

In [None]:
pieces = [x.strip() for x in val.split(',')]

In [None]:
pieces

['a', 'b', 'guido']

In [None]:
first, second, thrid = pieces

In [None]:
first + '::' + second + '::' + thrid

'a::b::guido'

In [None]:
#outro jeito de fazer a mesma coisa
'::'.join(pieces)


'a::b::guido'

In [None]:
'guido' in val

True

In [None]:
val.index(',')

1

In [None]:
val.find(':')

-1

In [None]:
val.count(',')

2

In [None]:
val.replace(',','::')

'a::b:: guido'

In [None]:
val.replace(',','')

'ab guido'

**Expressões regulares**

In [None]:
import re

Suponha que quiséssemos separar uma string com um número variável de caracteres de espaços em brnco (tabulações, espaços e quebras de linha). A regex que descreve um ou mais caracteres para espaços em branco é `\s+`.

In [None]:
text = "foo bar\t baz \tqux"

In [None]:
re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

In [None]:
#outro jeito de fazer a mesma coisa
regex = re.compile('\s+')

In [None]:
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [None]:
#findall mostra uma lista de padrões que tem a vê com regex
regex.findall(text)

[' ', '\t ', ' \t']

In [None]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com"""

In [None]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [None]:
#IGNORECASE faz a regex não diferenciar maiuscula de minuscula
regex = re.compile(pattern, flags=re.IGNORECASE)

In [None]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [None]:
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [None]:
text[m.start():m.end()]

'dave@google.com'

In [None]:
#Aparece None se o padrão não existir no inicio da string
print(regex.match(text))

None


In [None]:
#Sub devolve uma nova strig com as ocorrencias do padrão
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED


Suponha que quiséssemos encontrar os endereços de email e, simultaneamente, segmentar cada endereço em seus três componentes: nome do usuário, nome do domínio e sufico do domínio.

In [None]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [None]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [None]:
m = regex.match('wesm@bright.net')

In [None]:
m.groups()

('wesm', 'bright', 'net')

In [None]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [None]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com


**Funções de string vetorizadas no pandas**

In [None]:
data = {'Dave': 'dave@google.com', 'Steve':'steve@gmail.com','Rob':'rod@gmail.com','Wes':np.nan}

In [None]:
data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rod@gmail.com
Wes                  NaN
dtype: object

In [None]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [None]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [None]:
pattern

'[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}'

In [None]:
#outro jeito de fazer usando expressões regulares
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [dave@google.com]
Steve    [steve@gmail.com]
Rob        [rod@gmail.com]
Wes                    NaN
dtype: object

In [None]:
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [None]:
#erro do próprio livro. Aguardando correção do autor do livro e criador do Pandas
matches.str.get(1)
matches.str[0]

AttributeError: ignored

In [48]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rod@g
Wes        NaN
dtype: object