# Searching for Basic Patterns

In [1]:
text = "The person's phone number is 408-555-1234. Call soon!"

In [2]:
'phone' in text

True

In [3]:
import re  # regular expressions (Python standard library)

In [4]:
pattern = 'phone'

In [5]:
re.search(pattern,text)

<re.Match object; span=(13, 18), match='phone'>

In [6]:
pattern = 'wrong'

In [7]:
# 'wrong' does not occur in text, so re.search returns None and the cell displays nothing.
re.search(pattern,text)

In [8]:
pattern = 'phone'

In [9]:
match = re.search(pattern,text)

In [10]:
match.start()

13

In [11]:
match.end()

18

In [12]:
text = "my phone is a new phone"

In [13]:
# re.search reports only the first occurrence, even though "phone" appears twice in text.
match = re.search("phone",text)

In [14]:
match.span()

(3, 8)

In [15]:
# re.findall returns every non-overlapping occurrence as a list of strings.
matches = re.findall("phone",text)

In [16]:
matches

['phone', 'phone']

In [17]:
len(matches)

2

In [18]:
# re.finditer yields one Match object per occurrence, so positions are available too
# (call match.span() on each one to get just the (start, end) pair).
# After the loop, `match` remains bound to the last Match object.
for match in re.finditer("phone",text):
    print(match)

<re.Match object; span=(3, 8), match='phone'>
<re.Match object; span=(18, 23), match='phone'>


In [19]:
# `match` here is the last Match object left over from the finditer loop above.
match.group()

'phone'

# Patterns

Identifiers for Characters in Patterns

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >\d</span></td><td>A digit</td><td>file_\d\d</td><td>file_25</td></tr>

<tr ><td><span >\w</span></td><td>Alphanumeric (letters, digits, and underscore)</td><td>\w-\w\w\w</td><td>A-b_1</td></tr>



<tr ><td><span >\s</span></td><td>White space</td><td>a\sb\sc</td><td>a b c</td></tr>



<tr ><td><span >\D</span></td><td>A non digit</td><td>\D\D\D</td><td>ABC</td></tr>

<tr ><td><span >\W</span></td><td>Non-alphanumeric</td><td>\W\W\W\W\W</td><td>*-+=)</td></tr>

<tr ><td><span >\S</span></td><td>Non-whitespace</td><td>\S\S\S\S</td><td>Yoyo</td></tr></table>

In [20]:
# Spell the phone format out one digit at a time: three, three, then four `\d` tokens
# separated by literal hyphens.
text = "My telephone number is 408-555-1234"
phone = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d').search(text)
phone


<re.Match object; span=(23, 35), match='408-555-1234'>

In [21]:
phone.group()

'408-555-1234'

Quantifiers

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >+</span></td><td>Occurs one or more times</td><td>Version \w-\w+</td><td>Version A-b1_1</td></tr>

<tr ><td><span >{3}</span></td><td>Occurs exactly 3 times</td><td>\D{3}</td><td>abc</td></tr>



<tr ><td><span >{2,4}</span></td><td>Occurs 2 to 4 times</td><td>\d{2,4}</td><td>123</td></tr>



<tr ><td><span >{3,}</span></td><td>Occurs 3 or more</td><td>\w{3,}</td><td>anycharacters</td></tr>

<tr ><td><span >\*</span></td><td>Occurs zero or more times</td><td>A\*B\*C*</td><td>AAACC</td></tr>

<tr ><td><span >?</span></td><td>Once or none</td><td>plurals?</td><td>plural</td></tr></table>

In [22]:
# Same phone-number pattern as the earlier cell, written with {n} quantifiers
# instead of repeating \d for every digit.
re.search(r'\d{3}-\d{3}-\d{4}',text)

<re.Match object; span=(23, 35), match='408-555-1234'>

## Groups

What if we wanted to do two tasks at once: find phone numbers, and also quickly extract their area code (the first three digits)? We can use groups for any task that involves grouping parts of a regular expression together so that we can later break the match down.

Using the phone number example, we can separate groups of regular expressions using parentheses:

In [23]:
# Each pair of parentheses defines one capture group:
# group 1 = first three digits, group 2 = next three, group 3 = last four.
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

In [24]:
# Search directly through the compiled pattern object.
results = phone_pattern.search(text)

In [25]:
# Calling group() with no argument returns the entire match.
results.group()

'408-555-1234'

In [26]:
# Individual groups can also be retrieved by position.
# Remember that groups are delimited by parentheses () in the pattern.
# Group numbering starts at 1; passing 0 returns the entire match.
results.group(1)

'408'

In [27]:
results.group(2)

'555'

In [28]:
results.group(3)

'1234'

In [29]:
# The pattern defines only three groups, so asking for a fourth raises IndexError.
# Catch the exception and print its message so the demonstration is still visible
# while the notebook keeps running from top to bottom.
try:
    results.group(4)
except IndexError as err:
    print("IndexError:", err)

IndexError: no such group

# Additional Regex Syntax
### Or operator |

In [30]:
# | separates alternatives: the pattern matches either "man" or "woman".
re.search(r"man|woman","This man was here.")

<re.Match object; span=(5, 8), match='man'>

In [31]:
re.search(r"man|woman","This woman was here.")

<re.Match object; span=(5, 10), match='woman'>

## The Wildcard Character

In [32]:
# A dot matches any single character (except a newline by default).
re.findall(r".at","The cat in the hat sat here.")

['cat', 'hat', 'sat']

In [33]:
# One dot captures only one character before "at", so "splat" yields just "lat".
re.findall(r".at","The bat went splat")

['bat', 'lat']

In [34]:
# Three dots grab exactly three characters of any kind, including the space in "e bat".
re.findall(r"...at","The bat went splat")

['e bat', 'splat']

In [35]:
# One or more non-whitespace that ends with 'at'
re.findall(r'\S+at',"The bat went splat")

['bat', 'splat']

## Starts With and Ends With

In [36]:
# $ anchors the match to the end of the string: find a digit in the last position.
re.findall(r'\d$','This ends with a number 2')

['2']

In [37]:
# ^ anchors the match to the start of the string: find a digit in the first position.
re.findall(r'^\d','1 is the loneliest number.')

['1']

## Exclusion ^

In [38]:
phrase = "there are 3 numbers 34 inside 5 this sentence."


In [39]:
re.findall(r'[^\d]',phrase) # ^ inside [] negates the set: match every character that is not a digit

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e',
 '.']

In [40]:
re.findall(r'[^\d]+',phrase) # + glues consecutive non-digit characters back together


['there are ', ' numbers ', ' inside ', ' this sentence.']

In [41]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

In [42]:
re.findall('[^!.? ]+',test_phrase)  # keep runs of characters that are not '!', '.', '?' or a space

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [43]:
# Re-join the extracted words with single spaces to rebuild the sentence without punctuation.
clean = ' '.join(re.findall('[^!.? ]+',test_phrase))

In [44]:
clean

'This is a string But it has punctuation How can we remove it'

## Brackets for Grouping

In [56]:
text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are'

In [57]:
# [\w]+ matches runs of word characters, so hyphenated words are split at the hyphen.
re.findall(r'[\w]+',text)

['Only',
 'find',
 'the',
 'hypen',
 'words',
 'in',
 'this',
 'sentence',
 'But',
 'you',
 'do',
 'not',
 'know',
 'how',
 'long',
 'ish',
 'they',
 'are']

In [58]:
# Requiring a literal hyphen between two runs of word characters keeps only the hyphenated words.
re.findall(r'[\w]+-[\w]+',text)

['hypen-words', 'long-ish']

## Parentheses for Multiple Options

In [59]:
# Find words that start with 'cat' and end with one of these options: 'fish', 'nap', or 'claw'.
# The first two sentences contain such a word; the third ('caterpillar') does not.
text = 'Hello, would you like some catfish?'
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

In [60]:
re.search(r'cat(fish|nap|claw)',text)

<re.Match object; span=(27, 34), match='catfish'>

In [61]:
re.search(r'cat(fish|nap|claw)',texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [62]:
# 'caterpillar' starts with 'cat', but none of fish/nap/claw follows it,
# so re.search returns None and the cell displays nothing.
re.search(r'cat(fish|nap|claw)',textthree)