# Finding Patterns of Text Using Regular Expressions.

# Basic phone number searching with RegEx.

In [1]:
import re

In [2]:
# Pattern: two digits, a dash, five digits, a dash, five digits.
# Written as a raw string so the backslashes in \d reach the regex engine
# untouched; in a normal string \d is an unrecognized escape sequence that
# Python only tolerates with a warning.
phoneNumRegEx = re.compile(r"\d\d-\d\d\d\d\d-\d\d\d\d\d")

In [3]:
mo = phoneNumRegEx.search("Hey, Bhavik my phone number is 91-12345-12345")

In [4]:
print("Phone Number Found: " + mo.group())

Phone Number Found: 91-12345-12345


# Grouping with parentheses.

In [5]:
import re

In [6]:
# Group 1 captures the two-digit prefix, group 2 the ten-digit number.
# Written as a raw string so the backslashes in \d reach the regex engine
# untouched; in a normal string \d is an unrecognized escape sequence that
# Python only tolerates with a warning.
phoneNumRegEx = re.compile(r"(\d\d)-(\d\d\d\d\d-\d\d\d\d\d)")

In [7]:
mo = phoneNumRegEx.search("Hey, Gopal my phone number is 91-12356-89789")

In [8]:
print("Group 1 of Mo : " + mo.group(1))

Group 1 of Mo : 91


In [9]:
print("Group 2 of Mo : " + mo.group(2))

Group 2 of Mo : 12356-89789


In [11]:
print("Group 0 of Mo : " + mo.group(0))

Group 0 of Mo : 91-12356-89789


In [10]:
print("Entire Mobile Number : " + mo.group())

Entire Mobile Number : 91-12356-89789


# groups() method.

In [12]:
mo.groups() # Returns a Tuple with multiple values.

('91', '12356-89789')

In [13]:
areaCode, MobNo = mo.groups()

In [15]:
print("Area Code : " + areaCode)

Area Code : 91


In [16]:
print("Mobile Number : " + MobNo)

Mobile Number : 12356-89789


# Finding a phone number with parentheses.

In [1]:
import re

In [13]:
phoneNumRegEx = re.compile(r'(\(\d\d\)) (\d\d\d\d\d-\d\d\d\d\d)')

In [14]:
moNumber = phoneNumRegEx.search("Oh, My phone is (91) 52525-52123 ?")

In [15]:
print("Mobile Number found : " + moNumber.group())

Mobile Number found : (91) 52525-52123


In [16]:
areaCode, mainNumber = moNumber.groups()

In [17]:
print("Area Code : " + areaCode)

Area Code : (91)


In [18]:
print("Main Mobile number : " + mainNumber)

Main Mobile number : 52525-52123


In [19]:
print("Full Mobile Number : " + moNumber.group())

Full Mobile Number : (91) 52525-52123


# Matching Multiple Groups with the Pipe.

In [16]:
import re

In [17]:
singerRegEx = re.compile(r'Arijit|Tansen')

In [18]:
singer1 = singerRegEx.search("Oh, No another song of Arijit Singh.")

In [19]:
print("favorite Singer : " + singer1.group())

favorite Singer : Arijit


In [20]:
singer2 = singerRegEx.search("Among 9 diamonds of Akbar the Tansen is one of them.")

In [21]:
print("Favorite Singer : " + singer2.group())

Favorite Singer : Tansen


# Optional matching with Question Mark (?)

In [1]:
import re

In [5]:
batRegEx = re.compile(r'Bat(wo)?man')

In [6]:
bat1 = batRegEx.search('The Adventures of Batman.')

In [7]:
print("Found : " + bat1.group())

Found : Batman


In [8]:
bat2 = batRegEx.search("The Adventures of Batwoman")

In [10]:
print("Found : " + bat2.group())

Found : Batwoman


# More optional matching with Question Mark (?)

In [2]:
import re

In [3]:
nameRegEx = re.compile(r"Bhavi(i)?k")

In [4]:
name1 = nameRegEx.search("Hello, My name is Bhavik Jadav")

In [5]:
name1.group()

'Bhavik'

In [6]:
name2 = nameRegEx.search("Hello, My name is Bhaviik Jadav")

In [7]:
name2.group()

'Bhaviik'

# Matching Zero or More with star (*)

In [8]:
import re

In [13]:
batRegEx = re.compile(r"Bat(wo)*man")

In [27]:
bat1 = batRegEx.search("The Adventures of Batman")

In [28]:
bat2 = batRegEx.search("The Adventures of Batwowowoman")

In [29]:
bat3 = batRegEx.search("The Adventures of Batwowowowowowowowowoman")

In [30]:
bat1.group()

'Batman'

In [31]:
bat2.group()

'Batwowowoman'

In [32]:
bat3.group()

'Batwowowowowowowowowoman'

# Matching One or More with the Plus (+)

In [60]:
import re

In [92]:
baburaoRegEx = re.compile(r"Baburao (Ganpatrao)+ Apte")

In [94]:
baburao1 = baburaoRegEx.search("Name is Baburao Ganpatrao Apte")

In [95]:
baburao1.group()

'Baburao Ganpatrao Apte'

In [96]:
baburao2 = baburaoRegEx.search("Name is Baburao Apte")

In [1]:
# (Ganpatrao)+ requires at least one "Ganpatrao", so the search above on
# "Name is Baburao Apte" finds no match and there is nothing to call group() on.
# NOTE(review): the NameError recorded below presumably comes from running this
# cell in a fresh session (its cell counter is 1) before baburao2 was assigned;
# with the previous cell run first, the failure would instead be about calling
# group() on the None that search() returned -- confirm by re-running in order.
baburao2.group() # Raises an error because there is no match object. :)

NameError: name 'baburao2' is not defined

In [100]:
baburao3 = baburaoRegEx.search("Name is Baburao GanpatraoGanpatraoGanpatrao Apte")

In [101]:
baburao3.group()

'Baburao GanpatraoGanpatraoGanpatrao Apte'

# findall() method.

In [2]:
import re

In [3]:
# search() stops at the first phone number it finds, even though the text
# contains two of them.
phoneNumRegEx = re.compile(
    r'\d\d\d-\d\d\d-\d\d\d\d'
)
mo = phoneNumRegEx.search(
    "Cell: 123-987-5665 Work: 210-562-7894"
)
mo.group(0)

'123-987-5665'

In [4]:
# The cell above returns only the first match found in the searched string.

In [5]:
# Let's use the findall() method to find every matching string.

In [10]:
# No groups in the pattern, so findall() yields each full match as a string.
phoneNumRegEx = re.compile(
    r'\d\d-\d\d\d\d\d-\d\d\d\d\d'
)
phoneNumRegEx.findall(
    "Cell: 91-12345-56789 Work: 91-15151-53535"
)

['91-12345-56789', '91-15151-53535']

In [7]:
# The cell above uses findall(), which returns all matched strings as a list.

In [11]:
# Let's match phone numbers using groups (parentheses).

In [12]:
# Three groups in the pattern, so findall() yields one 3-tuple per match
# (one string per group) instead of the full matched text.
phoneNumRegEx = re.compile(
    r'(\d\d)-(\d\d\d\d\d)-(\d\d\d\d\d)'
)
phoneNumRegEx.findall(
    "Cell: 91-12345-56789 Work: 91-23232-45452"
)

[('91', '12345', '56789'), ('91', '23232', '45452')]

In [13]:
# To summarize what the findall() method returns, remember the following:
# 1. When called on a regex with no groups, such as \d\d\d-\d\d\d-\d\d\d\d,
# the method findall() returns a list of string matches, such as
# ['415-555-9999', '212-555-0000'].
# 2. When called on a regex that has groups, such as
# (\d\d\d)-(\d\d\d)-(\d\d\d\d), the method findall() returns a list of tuples
# of strings (one string for each group), such as
# [('415', '555', '1122'), ('212', '555', '0000')].

# Character Classes

In [16]:
# \d+ : one or more digits.
# \s  : a single whitespace character.
# \w+ : one or more word characters (a letter, a digit, or an underscore).
xmasRegEx = re.compile(
    r'\d+\s\w+'
)
xmasRegEx.findall(
    "There are so many interesting things books in their book-shelf like, 12 Cooking, 10 Novels, 5 Spiritual etc."
)

['12 Cooking', '10 Novels', '5 Spiritual']

In [18]:
"""
\d - 0 to 9 digit.
\D - other than 0 to 9 digit.
\w - A letter, a digit, an underscore. (Simply it will match the words.)
\W - Other than a letter, a digit, and an underscore.
\s - A white-space, a tab etc. (Simply it will match the spaces.)
\S - Other than white-space, a tab etc.
"""

'\\d - 0 to 9 digit.\n\\D - other than 0 to 9 digit.\n\\w - A letter, a digit, an underscore. (Simply it will match the words.)\n\\W - Other than a letter, a digit, and an underscore.\n\\s - A white-space, a tab etc. (Simply it will match the spaces.)\n\\S - Other than white-space, a tab etc.'

# Greedy and Non-Greedy matching.

In [21]:
# Greedy matching: {3,5} accepts three to five repetitions of "Ha" and takes
# as many as it can, so all five are returned here.
greedyHaRegEx = re.compile(
    r'(Ha){3,5}'
)
mo = greedyHaRegEx.search(
    "HaHaHaHaHa"
)
mo.group(0)

'HaHaHaHaHa'

In [25]:
# Non-greedy matching: the trailing "?" after {3,5} makes the regex settle for
# the fewest repetitions allowed, so only three "Ha" are returned even though
# the text has four. Do not forget the "?".
nongreedyRegEx = re.compile(
    r'(Ha){3,5}?'
)
mo = nongreedyRegEx.search(
    "HaHaHaHa"
)
mo.group(0)

'HaHaHa'

# Making Your Own Character Class

In [26]:
# A custom character class: matches any single vowel, lowercase or uppercase.
vowelRegEx = re.compile(
    r'[aeiouAEIOU]'
)
vowelRegEx.findall(
    "My name is Bhavik Chandrakant Jadav and I want to be a Machine Learning Engioneer in the Future."
)

['a',
 'e',
 'i',
 'a',
 'i',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'I',
 'a',
 'o',
 'e',
 'a',
 'a',
 'i',
 'e',
 'e',
 'a',
 'i',
 'E',
 'i',
 'o',
 'e',
 'e',
 'i',
 'e',
 'u',
 'u',
 'e']

In [41]:
# A caret (^) placed right after the opening bracket negates the character
# class: this matches every single character that is NOT a vowel.
vowelRegEx = re.compile(
    r'[^aeiouAEIOU]'
)
vowelRegEx.findall(
    "My name is Bhavik Chandrakant Jadab and I want to be a Machine Learning Engioneer in the Future."
)

['M',
 'y',
 ' ',
 'n',
 'm',
 ' ',
 's',
 ' ',
 'B',
 'h',
 'v',
 'k',
 ' ',
 'C',
 'h',
 'n',
 'd',
 'r',
 'k',
 'n',
 't',
 ' ',
 'J',
 'd',
 'b',
 ' ',
 'n',
 'd',
 ' ',
 ' ',
 'w',
 'n',
 't',
 ' ',
 't',
 ' ',
 'b',
 ' ',
 ' ',
 'M',
 'c',
 'h',
 'n',
 ' ',
 'L',
 'r',
 'n',
 'n',
 'g',
 ' ',
 'n',
 'g',
 'n',
 'r',
 ' ',
 'n',
 ' ',
 't',
 'h',
 ' ',
 'F',
 't',
 'r',
 '.']

# The Caret and Dollar sign.

In [None]:
"""
You can also use the caret symbol (^) at the start of a regex to indicate that
a match must occur at the beginning of the searched text. Likewise, you can
put a dollar sign ($) at the end of the regex to indicate the string must end
with this regex pattern. And you can use the ^ and $ together to indicate
that the entire string must match the regex—that is, it’s not enough for a
match to be made on some subset of the string.
"""

In [49]:
import re
beginsWithHelloRegEx = re.compile(r'^Hello')

In [53]:
# "Hello" is not at the start of this text, so the ^-anchored search finds
# nothing. None is a singleton, so test for it by identity ("is"), not "==".
beginsWithHelloRegEx.search("Bhavik Jadav Hello World") is None

True

In [56]:
beginsWithHelloRegEx.search("Hello Bhavik Jadav")

<re.Match object; span=(0, 5), match='Hello'>

In [57]:
# Let's use the ($) sign.

In [1]:
import re

In [5]:
endsWithNumber = re.compile(r'\d$')

In [13]:
endsWithNumber.search("My roll number is 45")

<re.Match object; span=(19, 20), match='5'>

In [10]:
# Let's find a long number.

In [11]:
# NOTE(review): despite the name, this pattern has no leading ^, so it only
# requires the text to END with one or more digits; the search in the next cell
# still matches inside a longer sentence. A whole-string check would presumably
# need ^ at the front as well -- confirm the intent before renaming or changing.
wholeStringIsNum = re.compile(r'\d+$')

In [12]:
wholeStringIsNum.search("My Mobile Number is 123343543654")

<re.Match object; span=(20, 32), match='123343543654'>