## String Object Methods

In [6]:
import numpy as np
import pandas as pd

In [24]:
val = 'a,b,  guido'

In [25]:
val.split(',')

['a', 'b', '  guido']

In [29]:
pices = [x.strip() for x in val.split(',')]

In [30]:
pices

['a', 'b', 'guido']

In [31]:
first, second, third = pices
first + '::' + second + '::' + third

'a::b::guido'

In [32]:
'::'.join(pices)

'a::b::guido'

In [33]:
'guido' in val

True

In [35]:
'a' in val

True

In [36]:
val.index('b')

2

In [38]:
val.find(':') # Returns -1 because ':' does not occur in val; unlike index(), find() does not raise.

-1

In [39]:
val.index(':')

ValueError: substring not found

In [43]:
val.count(',') # Returns the number of occurrences of ',' in val.

2

In [44]:
val.replace(',', '::')

'a::b::  guido'

In [45]:
val.replace(',', '')

'ab  guido'

## Regular Expressions

In [46]:
import re

In [47]:
text = 'foo       bar\t baz   \tqux'

In [51]:
re.split(r'\s+', text) # \s+ matches a run of one or more whitespace characters (spaces, tabs, ...); text is split on each run. Raw string avoids the invalid '\s' escape warning.

['foo', 'bar', 'baz', 'qux']

In [54]:
# Compile once so the same whitespace pattern can be reused by the cells below.
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
regex = re.compile(r'\s+')

In [55]:
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [56]:
regex.findall(text)

['       ', '\t ', '   \t']

In [60]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

# Email pattern: local part, '@', domain, '.', then a 2-4 letter suffix.
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE makes the regex case-insensitive, so the upper-case
# character classes above also match the lower-case addresses in text.
regex = re.compile(pattern, flags=re.IGNORECASE)

# Returns every non-overlapping match in text as a list of strings.
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [63]:
m = regex.search(text) # search() scans the whole string and returns a match object for the first match only.

In [64]:
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [65]:
text[m.start():m.end()]

'dave@google.com'

In [68]:
print(regex.match(text)) # match() only succeeds if the pattern matches at the very start of the string; text starts with 'Dave ', so this prints None.

None


In [69]:
print(regex.sub('REACTED', text))

Dave REACTED
Steve REACTED
Rob REACTED
Ryan REACTED



In [72]:
# Email pattern with parentheses added to capture three groups:
# username, domain name, and domain suffix.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

regex = re.compile(pattern, flags=re.IGNORECASE)

# match() anchors at the start of the string, which is where this address begins.
m = regex.match('bhavikjadav@gmail.com')

# groups() returns the captured pieces as a tuple.
m.groups()

('bhavikjadav', 'gmail', 'com')

In [73]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [75]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



## Vectorized String Functions in Pandas

In [76]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', 'Rob': 'rob@gmail.com', 'Wes': np.nan}

In [78]:
data = pd.Series(data)

In [79]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [80]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [81]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [82]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [83]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [84]:
matches = data.str.match(pattern, flags=re.IGNORECASE)

In [85]:
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [99]:
data.str.get(1)

Dave       a
Steve      t
Rob        o
Wes      NaN
dtype: object

In [98]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object