## Finding Patterns of Text Without Regular Expressions

In [1]:
def isPhoneNumber(text):
    if len(text) != 12:
        return False
    for i in range(0, 3):
        if not text[i].isdecimal():
            return False
    if text[3] != '-':
        return False
    for i in range(4, 7):
        if not text[i].isdecimal():
            return False
    if text[7] != '-':
        return False
    for i in range(8, 12):
        if not text[i].isdecimal():
            return False
    return True

print('Is 415-555-4242 a phone number?')
print(isPhoneNumber('415-555-4242'))
print('Is Moshi moshi a phone number?')
print(isPhoneNumber('Moshi moshi'))

Is 415-555-4242 a phone number?
True
Is Moshi moshi a phone number?
False


In [2]:
message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'
for i in range(len(message)):
    chunk = message[i:i+12]
    if isPhoneNumber(chunk):
        print('Phone number found:' + chunk)
print('Done')

Phone number found:415-555-1011
Phone number found:415-555-9999
Done


## Finding Patterns of Text with Regular Expressions

In [3]:
import re

phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = phoneNumRegex.search('My number is 415-555-4242.')
print('Phone number found:' + mo.group())

Phone number found:415-555-4242


In [5]:
# Grouping with Parentheses
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is 415-555-4242.')
print(mo.group(1))
print(mo.group(2))
print(mo.group(0))
print(mo.group())

415
555-4242
415-555-4242
415-555-4242


In [6]:
# If you would like to retrieve all the groups at once, use the groups()
# method—note the plural form for the name

print(mo.groups())
areaCode, mainNumber = mo.groups()
print(areaCode)
print(mainNumber)

('415', '555-4242')
415
555-4242


In [8]:
phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My phone number is (415) 555-4242.')
print(mo.group(1))
print(mo.group(2))

(415)
555-4242


## Matching Multiple Groups with the Pipe " | "

In [10]:
heroRegex = re.compile(r'Batman|Tina Fey')
mo1 = heroRegex.search('Batman and Tina Fey')
print(mo1.group())
mo2 = heroRegex.search('Tina Fey and Batman')
print(mo2.group())

Batman
Tina Fey


In [13]:
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
print(mo.group())
print(mo.group(1))

Batmobile
mobile


## Optional Matching with the Question Mark

In [14]:
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The Adventures of Batman')
print(mo1.group())
mo2 = batRegex.search('The Adventures of Batwoman')
print(mo2.group())

Batman
Batwoman


In [15]:
# Using the earlier phone number example, you can make the regex look 
# for phone numbers that do or do not have an area code.

phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo1 = phoneRegex.search('My number is 415-555-4242')
print(mo1.group())
mo2 = phoneRegex.search('My number is 555-4242')
print(mo2.group())

415-555-4242
555-4242


## Matching Zero or More with the Star

In [19]:
batRegex = re.compile(r'Bat(wo)*man')
mo1 = batRegex.search('The Adventures of Batman')
print(mo1.group())

mo2 = batRegex.search('The Adventures of Batwoman')
print(mo2.group())

mo3 = batRegex.search('The Adventures of Batwowowowoman')
print(mo3.group())

Batman
Batwoman
Batwowowowoman


## Matching One or More with the Plus
While * means “match zero or more,” the + (or plus) means “match one or 
more.” Unlike the star, which does not require its group to appear in the 
matched string, the group preceding a plus must appear at least once.

In [20]:
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('The Adventures of Batwoman')
print(mo1.group())

mo2 = batRegex.search('The Adventures of Batwowowowoman')
print(mo2.group())

mo3 = batRegex.search('The Adventures of Batman')
print(mo3 == None)

Batwoman
Batwowowowoman
True


## Matching Specific Repetitions with Braces

In [21]:
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search('HaHaHa')
print(mo1.group())

mo2 = haRegex.search('Ha')
print(mo2 == None)

HaHaHa
True


## Greedy and Non-greedy Matching
Python’s regular expressions are greedy by default, which means that in 
ambiguous situations they will match the longest string possible. The nongreedy (also called lazy) version of the braces, which matches the shortest 
string possible, has the closing brace followed by a question mark

In [23]:
greedyHaRegex = re.compile(r'(Ha){3,5}')
mo1 = greedyHaRegex.search('HaHaHaHaHa')
print(mo1.group())

nongreedyHaRegex = re.compile(r'(Ha){3,5}?')
mo2 = nongreedyHaRegex.search('HaHaHaHaHa')
print(mo2.group())

HaHaHaHaHa
HaHaHa


## The findall() Method

In [27]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
print(phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000'))

phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)')
print(phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000'))

['415-555-9999', '212-555-0000']
[('415', '555', '9999'), ('212', '555', '0000')]


## Character Classes

In [29]:
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall('12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge')

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

## Making Your Own Character Classes

In [30]:
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

In [31]:
# By placing a caret character (^) just after the character class’s opening 
# bracket, you can make a negative character class. A negative character class 
# will match all the characters that are not in the character class.

consonantRegex = re.compile(r'[^aeiouAEIOU]')
consonantRegex.findall('RoboCop eats baby food. BABY FOOD.')

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

## The Caret and Dollar Sign Characters
You can also use the caret symbol (^) at the start of a regex to indicate that 
a match must occur at the beginning of the searched text. Likewise, you can 
put a dollar sign ($) at the end of the regex to indicate the string must end
with this regex pattern. And you can use the ^ and $ together to indicate 
that the entire string must match the regex

In [33]:
beginsWithHello = re.compile(r'^Hello')
print(beginsWithHello.search('Hello, world!'))

print(beginsWithHello.search('He said hello.') == None)

<re.Match object; span=(0, 5), match='Hello'>
True


In [36]:
endsWithNumber = re.compile(r'\d$')
print(endsWithNumber.search('Your number is 42'))
print(endsWithNumber.search('Your number is forty two.') == None)

<re.Match object; span=(16, 17), match='2'>
True


In [37]:
wholeStringIsNum = re.compile(r'^\d+$')
print(wholeStringIsNum.search('1234567890'))
print(wholeStringIsNum.search('12345xyz67890') == None)
print(wholeStringIsNum.search('12 34567890') == None)

<re.Match object; span=(0, 10), match='1234567890'>
True
True


## The Wildcard Character
The . (or dot) character in a regular expression is called a wildcard and will 
match any character except for a newline

In [38]:
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

### Matching Everything with Dot-Star

In [3]:
import re

nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Anupam Last Name: Kumawat')
print(mo.group(1))
print(mo.group(2))
print(nameRegex.findall('First Name: Anupam Last Name: Kumawat'))

Anupam
Kumawat
[('Anupam', 'Kumawat')]


In [41]:
nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search('<To serve man> for dinner.>')
print(mo.group())

greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To serve man> for dinner.>')
print(mo.group())

<To serve man>
<To serve man> for dinner.>


### Matching Newlines with the Dot Character

In [45]:
 noNewlineRegex = re.compile('.*')
print(noNewlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group())

print('\n')

newlineRegex = re.compile('.*', re.DOTALL)
print(newlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group())

Serve the public trust.


Serve the public trust.
Protect the innocent.
Uphold the law.


## Case-Insensitive Matching

In [46]:
robocop = re.compile(r'robocop', re.I)
print(robocop.search('RoboCop is part man, part machine, all cop.').group())

print(robocop.search('ROBOCOP protects the innocent.').group())

print(robocop.search('Al, why does your programming book talk about robocop so much?').group())

RoboCop
ROBOCOP
robocop


## Substituting Strings with the sub() Method

In [47]:
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

In [48]:
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Even knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

## Managing Complex Regexes
Regular expressions are fine if the text pattern you need to match is simple. 
But matching complicated text patterns might require long, convoluted regular expressions. You can mitigate this by telling the re.compile() function 
to ignore whitespace and comments inside the regular expression string. 
This “verbose mode” can be enabled by passing the variable re.VERBOSE as 
the second argument to re.compile().

In [49]:
# Now instead of a hard-to-read regular expression like this:
#   phoneRegex = re.compile(r'((\d{3}|\(\d{3}\))?(\s|-|\.)?\d{3}(\s|-|\.)\d{4}(\s*(ext|x|ext.)\s*\d{2,5})?)')


phoneRegex = re.compile(r'''(
 (\d{3}|\(\d{3}\))? # area code
 (\s|-|\.)? # separator
 \d{3} # first 3 digits
 (\s|-|\.) # separator
 \d{4} # last 4 digits
Pattern Matching with Regular Expressions 179
 (\s*(ext|x|ext.)\s*\d{2,5})? # extension
 )''', re.VERBOSE)

## Combining re.IGNORECASE, re.DOTALL, and re.VERBOSE
What if you want to use re.VERBOSE to write comments in your regular 
expression but also want to use re.IGNORECASE to ignore capitalization? 
Unfortunately, the re.compile() function takes only a single value as its 
second argument. You can get around this limitation by combining the 
re.IGNORECASE, re.DOTALL, and re.VERBOSE variables using the pipe character 
(|), which in this context is known as the bitwise or operator

In [50]:
someRegexValue = re.compile('foo', re.IGNORECASE|re.DOTALL|re.VERBOSE)

In [17]:
import re

phoneNumber = re.compile(r'(\+\d\d)?(\d\d\d\d\d\d\d\d\d\d)')
ph1 = phoneNumber.search('+918955211055')

print(ph1.groups())

('+91', '8955211055')


# Project: Phone Number and Email Address Extractor

### Step 1: Create a Regex for Phone Numbers

In [9]:
import pyperclip, re

phoneRegex = re.compile(r'''(
(\d{3}|\(\d{3}\))?   # area code
(\s|-|\.)?           # separator
(\d{3})              # first 3 digits
(\s|-|\.)            # separator
(\d{4})              # last 4 digits
(\s*(ext|x|ext.)\s*(\d{2,5}))?  # extension
)''',re.VERBOSE)

### Step 2: Create a Regex for Email Addresses

In [10]:
emailRegex = re.compile(r'''(
[a-zA-Z-0-9._%+-]+     # username
@                      # @ symbol
[a-zA-Z0-9.-]+         # domain name
(\.[a-zA-Z]{2,4})      # dot-something 
)''',re.VERBOSE)

### Step 3: Find All Matches in the Clipboard Text

In [11]:
# Find matches in clipboard text.

text = str(pyperclip.paste())

matches = []
for groups in phoneRegex.findall(text):
    phoneNum = '-'.join([groups[1],groups[3],groups[5]])
    if groups[8] != '':
        phoneNum += ' x' + groups[8]
    matches.append(phoneNum)
for groups in emailRegex.findall(text):
    matches.append(groups[0])

### Step 4: Join the Matches into a String for the Clipboard

In [16]:
for groups in emailRegex.findall(text):
    matches.append(groups[0])
    
if len(matches) > 0:
    pyperclip.copy('\n'.join(matches))
    print('Copied to clipboard:')
    print('\n'.join(matches))
else:
    print('No phone numbers or email addresses found.')

Copied to clipboard:
support@nostarch.com
academic@nostarch.com
sales@nostarch.com
conferences@nostarch.com
errata@nostarch.com
info@nostarch.com
media@nostarch.com
editors@nostarch.com
rights@nostarch.com
support@nostarch.com
academic@nostarch.com
sales@nostarch.com
conferences@nostarch.com
errata@nostarch.com
info@nostarch.com
media@nostarch.com
editors@nostarch.com
rights@nostarch.com
support@nostarch.com
academic@nostarch.com
sales@nostarch.com
conferences@nostarch.com
errata@nostarch.com
info@nostarch.com
media@nostarch.com
editors@nostarch.com
rights@nostarch.com
support@nostarch.com
academic@nostarch.com
sales@nostarch.com
conferences@nostarch.com
errata@nostarch.com
info@nostarch.com
media@nostarch.com
editors@nostarch.com
rights@nostarch.com
support@nostarch.com
academic@nostarch.com
sales@nostarch.com
conferences@nostarch.com
errata@nostarch.com
info@nostarch.com
media@nostarch.com
editors@nostarch.com
rights@nostarch.com
