## Python Regular Expression

### Part: 1

In [43]:
text = "The agent's phone number is 408-555-1234. Call soon!"

In [44]:
'phone' in text


True

In [45]:
import re

In [46]:
pattern  = 'phone'

In [47]:
re.search(pattern, text)

<re.Match object; span=(12, 17), match='phone'>

In [48]:
pattern = 'Not in text'

In [49]:
re.search(pattern, text)

In [50]:
pattern = 'phone'

In [51]:
match = re.search(pattern, text)

In [52]:
match

<re.Match object; span=(12, 17), match='phone'>

In [53]:
match.span()

(12, 17)

In [54]:
match.start()

12

In [55]:
match.end()

17

In [56]:
text = 'my phone once, my phone twice'

In [57]:
match = re.search(pattern, text)

In [58]:
match

<re.Match object; span=(3, 8), match='phone'>

In [59]:
matches = re.findall(pattern, text)

In [60]:
matches

['phone', 'phone']

In [61]:
len(matches)

2

In [62]:
# finditer yields one Match object per occurrence of the pattern;
# group() returns the text that each Match object matched.
for match in re.finditer(pattern, text):
    print(match.group())


phone
phone


### Part: 2

###### Character Identifiers

| Character | Description      | Example Pattern Code | Example Match |
|-----------|------------------|----------------------|---------------|
| \d        | A Digit          | file_\d\d            | file_25       |
| \w        | Alphanumeric     | \w-\w\w\w            | A-b_1         |
| \s        | White Space      | a\sb\sc              | a b c         |
| \D        | A non Digit      | \D\D\D               | ABC           |
| \W        | Non-Alphanumeric | \W\W\W\W\W           | *-+=)         |
| \S        | Non-Whitespace   | \S\S\S\S             | Yoyo          |

In [63]:
text = "My phone nimber is 408-555-1234"

In [64]:
phone = re.search('408-555-1234', text)

In [65]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [66]:
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d', text)

In [67]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [68]:
phone.group()

'408-555-1234'

#### Quantifiers

| Character | Description               | Example Pattern Code | Example Match  |
|-----------|---------------------------|----------------------|----------------|
| +         | Occurs one or more times  | Version \w-\w+       | Version A-b1_1 |
| {3}       | Occurs exactly 3 times    | \D{3}                | abc            |
| {2,4}     | Occurs 2 to 4 times       | \d{2,4}              | 123            |
| {3,}      | Occurs 3 or more          | \w{3,}               | anycharacters  |
| *         | Occurs zero or more times | A*B*C*               | AAACC          |
| ?         | Once or none              | plurals?             | plural         |

In [69]:
phone = re.search(r'\d{3}-\d{3}-\d{4}', text)

In [70]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [71]:
phone.group()

'408-555-1234'

In [72]:
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

In [73]:
results = re.search(phone_pattern,text)

In [74]:
results.group()

'408-555-1234'

In [75]:
results.group(1)

'408'

In [76]:
results.group(2)

'555'

In [77]:
results.group(3)

'1234'

In [78]:
results.group(4)  # Raises IndexError: the pattern defines only three capture groups,
                  # so there is no group 4 to return.


IndexError: no such group

### Additional Regex Syntax

- OR operator or pipe operator

In [79]:
re.search(r'cat','The cat is here')

<re.Match object; span=(4, 7), match='cat'>

In [80]:
re.search(r'cat','The dog is here')  # returns None (nothing is displayed) because 'cat' does not occur in the string

In [81]:
# The pipe (|) acts as OR: the pattern matches either 'cat' or 'dog'.
re.search(r'cat|dog','The dog is here')

<re.Match object; span=(4, 7), match='dog'>

In [82]:
re.search(r'cat|dog','The cat is here')

<re.Match object; span=(4, 7), match='cat'>

- Wild Card Operator

In [83]:
re.findall(r'at', 'The cat in the hat sat there')

['at', 'at', 'at']

In [84]:
# The wildcard (.) matches any single character (except a newline by default),
# so '.at' also captures the one character that precedes 'at'.
re.findall(r'.at', 'The cat in the hat sat there')

['cat', 'hat', 'sat']

In [85]:
re.findall(r'...at', 'The cat in the hat went splat')

['e cat', 'e hat', 'splat']

In [86]:
re.findall(r'^\d', '1 is the number')

['1']

- ^: anchors the match to the start of the string (here, a digit that begins the sentence)
- $: anchors the match to the end of the string (here, a digit that ends the sentence)

In [88]:
re.findall(r'^\d', 'The 1 is the number')

[]

In [89]:
re.findall(r'\d$', 'The number is 2')

['2']

In [90]:
re.findall(r'\d$', '2 is the number')

[]

In [91]:
phrase = 'there are 3 numbers 34 inside 5 this sentence'

### Bracket notation for exclusion and inclusion

- Grouping for exclusion

- [^...]: matches any single character that is NOT listed inside the brackets

In [92]:
pattern  = r'[^\d]'

In [93]:
re.findall(pattern, phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e']

In [94]:
pattern  = r'[^\d]+'  # one or more consecutive non-digit characters: returns the text in chunks split at the digits

In [95]:
re.findall(pattern,phrase)

['there are ', ' numbers ', ' inside ', ' this sentence']

In [96]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

In [99]:
re.findall(r'[^!.?]+', test_phrase)

['This is a string', ' But it has punctuation', ' How can we remove it']

In [100]:
re.findall(r'[^!.? ]+', test_phrase)

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [101]:
clean = re.findall(r'[^!.?]+', test_phrase)

In [102]:
' '.join(clean)

'This is a string  But it has punctuation  How can we remove it'

- Grouping for inclusion

In [103]:
text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are.'

In [106]:
pattern = r'[\w]+-[\w]+'

In [107]:
re.findall(pattern,text)

['hypen-words', 'long-ish']