# String Object Methods


**Python built-in string methods**


**.split('delimiter'): Break string into list of substrings using passed delimiter.**

**.strip()/ .rstrip()/ .lstrip(): Trim whitespace, including newlines, from both ends of the string
(strip), from the right end only (rstrip), or from the left end only (lstrip).**

**.join()**

**.find()**

**.replace()**



In [18]:
 val = 'a,b, guido'

In [19]:
val.split(',')

['a', 'b', ' guido']

In [29]:
pieces = [x.strip() for x in val.split(',')]

In [30]:
pieces

['a', 'b', 'guido']

In [5]:
first, second, third = pieces

In [6]:
first + '::' + second + '::' + third

'a::b::guido'

In [7]:
'::'.join(pieces)

'a::b::guido'

In [9]:
'guido' in val

True

In [10]:
val.find(':')

-1

In [13]:
val.count(',')

2

In [11]:
val.replace(',', '::')

'a::b:: guido'

In [12]:
val.replace(',', '')

'ab guido'

# Regular expressions

**re.split()**

Regular expressions provide a flexible way to search or match string patterns in text. A
single expression, commonly called a **regex**, is a string formed according to the regular
expression language. The re module functions fall into three categories: pattern matching,
substitution, and splitting.

In [12]:
 import re

In [13]:
 text = "Mahi The\t Awesome \tMan"

In [14]:
re.split(r'\s+', text)  # split the text on runs of whitespace (raw string keeps \s a regex escape)

['Mahi', 'The', 'Awesome', 'Man']

In [15]:
regex = re.compile(r'\s+')  # precompile the whitespace pattern so it can be reused below

In [16]:
regex.split(text)

['Mahi', 'The', 'Awesome', 'Man']

In [17]:
regex.findall(text) #Return all non-overlapping matching patterns in a string

[' ', '\t ', ' \t']

In [18]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
# E-mail pattern: local part, "@", domain, ".", then a 2-4 letter suffix
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)
# re.compile(pattern, flags=0) returns a compiled pattern object that can be reused

In [19]:
 regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [20]:
m = regex.search(text) #Scan string for match to pattern; returning a match object if so.

In [21]:
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [22]:
text[m.start():m.end()]

'dave@google.com'

In [23]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [24]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [25]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [26]:
m = regex.match('wesm@bright.net')

In [27]:
m.groups()

('wesm', 'bright', 'net')

In [28]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [29]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [30]:
# Same e-mail pattern, written with named groups (username, domain, suffix).
# re.VERBOSE allows the pattern to be spread over several lines.
regex = re.compile(r"""
(?P<username>[A-Z0-9._%+-]+)
@
(?P<domain>[A-Z0-9.-]+)
\.
(?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE|re.VERBOSE)

In [31]:
m = regex.match('wesm@bright.net')
# match() tries the pattern at the start of the string and segments the matched text into the
# pattern's groups. If the pattern matches, it returns a match object; otherwise it returns None.

In [32]:
 m.groupdict()

{'username': 'wesm', 'domain': 'bright', 'suffix': 'net'}

# Vectorized string functions in pandas

**.str.contains(pattern): Return boolean array indicating whether each string contains pattern/regex**

**.isnull(): Return boolean array indicating which values are missing (NaN)**

**.str.findall(pattern, flags=re.IGNORECASE):Compute list of all occurrences of pattern/regex for each string**

**.str.match(pattern, flags=re.IGNORECASE): Use re.match with the passed regular expression on each element, returning a
boolean indicating whether each string matches (NaN for missing values).**

In [71]:
import numpy as np
import pandas as pd

In [72]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com','Rob': 'rob@gmail.com', 'Wes': np.nan}

In [73]:
data = pd.Series(data)

In [74]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [75]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [76]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [77]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [84]:
matches=data.str.findall(pattern, flags=re.IGNORECASE)
matches

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [85]:
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [96]:
data.str[0:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

# Example: USDA Food Database

In [185]:
json_data="""
{"Country":["Canada"],
 "Products": ["Home Appliances","Grocery","Travel Services"],
 "Location": ["Brittish Colombia"],
 "Contact": null

}
"""

In [186]:
import json  # json is not imported anywhere earlier in this notebook; json.loads needs it

# Parse the JSON text into Python objects (JSON null becomes None)
company_data=json.loads(json_data)

In [187]:
company_data

{'Country': ['Canada'],
 'Products': ['Home Appliances', 'Grocery', 'Travel Services'],
 'Location': ['Brittish Colombia'],
 'Contact': None}

In [194]:
data="""
{
"id": 21441,
"description": ["KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY,Wing, meat and skin with breading"],
"tags": ["KFC"],
"manufacturer": ["Kentucky Fried Chicken"],
"group": ["Fast Foods"],
"portions": [
{
"amount": 1,
"unit": ["wing, with skin"],
"grams": 68.0
}
],
"nutrients": [
{
"value": 20.8,
"units": ["g"],
"description": ["Protein"],
"group": ["Composition"]
}
]
}
"""

In [197]:
db=json.loads(data)

In [213]:
db

{'id': 21441,
 'description': ['KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY,Wing, meat and skin with breading'],
 'tags': ['KFC'],
 'manufacturer': ['Kentucky Fried Chicken'],
 'group': ['Fast Foods'],
 'portions': [{'amount': 1, 'unit': ['wing, with skin'], 'grams': 68.0}],
 'nutrients': [{'value': 20.8,
   'units': ['g'],
   'description': ['Protein'],
   'group': ['Composition']}]}

In [214]:
len(db)

7