# 1. Import Libraries

In [2]:
import re

# 2. Raw Strings

In [23]:
print("Hello\nWorld") #normal string

Hello
World


In [None]:
print(r"Hello\nWorld") #raw string

# 3. Searching Operations

In [43]:
test_string = "123abc456789abc123ABC"

### 3.1 re.match()

In [39]:
pattern = re.compile(r"abc")

match = re.match(pattern, test_string)

print(match) #it returned None bc it was not at the beginning of the string

None


In [47]:
# re.match()
pattern = re.compile(r"123")

match = re.match(pattern, test_string)

print(match) #it shows indexes of 123 and what it matches

<re.Match object; span=(0, 3), match='123'>


In [51]:
match.span() #you can now use function associated with it

(0, 3)

In [59]:
start_index, end_index = match.span()
test_string[start_index: end_index]

'123'

In [61]:
print(match.end())
print(match.start())
print(match.group())

3
0
123


### 3.2 re.search()

In [67]:
test_string

'123abc456789abc123ABC'

In [81]:
# This will only return the first occurrence of the string
pattern = re.compile(r"abc")

match = re.search(pattern, test_string)  
print(match)

<re.Match object; span=(3, 6), match='abc'>


In [79]:
start_index, end_index = match.span()
test_string[start_index: end_index] 

'abc'

### 3.3 re.findall()

In [84]:
# This will only return all occurrences of the string
pattern = re.compile(r"abc")

match = re.findall(pattern, test_string)  
print(match)

['abc', 'abc']


### 3.4 re.finditer()

In [95]:
# This will return the objects of all the occurrences pattern. They are more efficient than re.findall() 
pattern = re.compile(r"abc")

matches = re.finditer(pattern, test_string)  
print(matches)

<callable_iterator object at 0x000001D162D69C90>


In [97]:
# So you need to iterate over them
for match in matches:
    print(i)

<re.Match object; span=(12, 15), match='abc'>
<re.Match object; span=(12, 15), match='abc'>


# Meta Charcters

* `.` - Any character (except a newline character)
* `^` - Beginning of a string
* `$` - Ending of a string
* `*` - Zero or more occurrences
* `+` - One or more occurrences

### 4.1 dot (.)

In [582]:
test_string_1 = "123abc.456789abc123defGHI.ABC \n xyz345WXYZ"


test_string_2 = "The cat sat on the mat with a bat and a hat."


test_string_3 = "The cat sat on the mat with a bat and a hat. $at, 8at, at."

 
text = """Hello World!
This is a test.
Hello again!
Python is fun.
Hello Python"""

In [129]:
# It captures every character except whatever beyond the new line character("/n")
print(test_string_1)
pattern = re.compile(r".")

matches = re.finditer(pattern, test_string)

for match in matches:
    print(match)

123abc.456789abc123defGHI.ABC 
 xyz345WXYZ
<re.Match object; span=(0, 1), match='1'>
<re.Match object; span=(1, 2), match='2'>
<re.Match object; span=(2, 3), match='3'>
<re.Match object; span=(3, 4), match='a'>
<re.Match object; span=(4, 5), match='b'>
<re.Match object; span=(5, 6), match='c'>
<re.Match object; span=(6, 7), match='4'>
<re.Match object; span=(7, 8), match='5'>
<re.Match object; span=(8, 9), match='6'>
<re.Match object; span=(9, 10), match='7'>
<re.Match object; span=(10, 11), match='8'>
<re.Match object; span=(11, 12), match='9'>
<re.Match object; span=(12, 13), match='a'>
<re.Match object; span=(13, 14), match='b'>
<re.Match object; span=(14, 15), match='c'>
<re.Match object; span=(15, 16), match='1'>
<re.Match object; span=(16, 17), match='2'>
<re.Match object; span=(17, 18), match='3'>
<re.Match object; span=(18, 19), match='A'>
<re.Match object; span=(19, 20), match='B'>
<re.Match object; span=(20, 21), match='C'>


### 4.2 carat (^)

In [148]:
print(text, end='\n\n')

pattern = re.compile(r'Hello')
matches = re.finditer(pattern, text)

for i in matches:
    print(i)

Hello World!
This is a test.
Hello again!
Python is fun.
Hello Python

<re.Match object; span=(0, 5), match='Hello'>
<re.Match object; span=(29, 34), match='Hello'>
<re.Match object; span=(57, 62), match='Hello'>


In [150]:
# It returns only if it matches "Hello" only if it appears at the start of a line or string. But only 1st occurrence from the beginning 
print(text, end='\n\n')

pattern = re.compile(r'^Hello')
matches = re.finditer(pattern, text)

for i in matches:
    print(i)

Hello World!
This is a test.
Hello again!
Python is fun.
Hello Python

<re.Match object; span=(0, 5), match='Hello'>


### 4.3 dollar ($)

In [584]:
# It returns only if it matches "Python" only if it appears at the end of a line or string. But only 1st occurrence from the end 
print(text, end='\n\n')

pattern = re.compile(r'Python$')
matches = re.finditer(pattern, text)

for i in matches:
    print(i)

Hello World!
This is a test.
Hello again!
Python is fun.
Hello Python

<re.Match object; span=(63, 69), match='Python'>


# 5. Character Sets

### 5.1 Custom Character Sets
 * `{ }` - Exactly the specified number of occurrences

 * `[]` - A set of characters

 * `\` - Signals a special sequence (can also be used to escape special characters)

 * `|` - Either Or

 * `( )` - Capture a group

####  [] brackets

In [180]:
test_string = "123abc.4567XYZ$89abc123  def%GHIxyz.ABC#i."

In [190]:
# Extract vowels from the string. The [] will return all the matched characters mentioned in []
print(test_string, end="\n\n")

pattern = re.compile(r'[aieouAEIOU]')

matches = re.finditer(pattern, test_string)

for match in matches:
    print(match)

123abc.4567XYZ$89abc123  def%GHIxyz.ABC#i.

<re.Match object; span=(3, 4), match='a'>
<re.Match object; span=(17, 18), match='a'>
<re.Match object; span=(26, 27), match='e'>
<re.Match object; span=(31, 32), match='I'>
<re.Match object; span=(36, 37), match='A'>
<re.Match object; span=(40, 41), match='i'>


In [195]:
matches = re.findall(pattern, test_string)
matches

['a', 'a', 'e', 'I', 'A', 'i']

In [197]:
# Extract numbers from the string
print(test_string, end="\n\n")

pattern = re.compile(r'[1234567890]')

matches = re.finditer(pattern, test_string)

for match in matches:
    print(match)

123abc.4567XYZ$89abc123  def%GHIxyz.ABC#i.

<re.Match object; span=(0, 1), match='1'>
<re.Match object; span=(1, 2), match='2'>
<re.Match object; span=(2, 3), match='3'>
<re.Match object; span=(7, 8), match='4'>
<re.Match object; span=(8, 9), match='5'>
<re.Match object; span=(9, 10), match='6'>
<re.Match object; span=(10, 11), match='7'>
<re.Match object; span=(15, 16), match='8'>
<re.Match object; span=(16, 17), match='9'>
<re.Match object; span=(20, 21), match='1'>
<re.Match object; span=(21, 22), match='2'>
<re.Match object; span=(22, 23), match='3'>


#### character ranges [0-9]

In [200]:
# range of digits
print(test_string, end="\n\n")

pattern = re.compile(r'[0-9]')

matches = re.finditer(pattern, test_string)

for match in matches:
    print(match)

123abc.4567XYZ$89abc123  def%GHIxyz.ABC#i.

<re.Match object; span=(0, 1), match='1'>
<re.Match object; span=(1, 2), match='2'>
<re.Match object; span=(2, 3), match='3'>
<re.Match object; span=(7, 8), match='4'>
<re.Match object; span=(8, 9), match='5'>
<re.Match object; span=(9, 10), match='6'>
<re.Match object; span=(10, 11), match='7'>
<re.Match object; span=(15, 16), match='8'>
<re.Match object; span=(16, 17), match='9'>
<re.Match object; span=(20, 21), match='1'>
<re.Match object; span=(21, 22), match='2'>
<re.Match object; span=(22, 23), match='3'>


In [202]:
# range of alphabets
print(test_string, end="\n\n")

pattern = re.compile(r'[a-zA-Z]')

matches = re.finditer(pattern, test_string)

for match in matches:
    print(match)

123abc.4567XYZ$89abc123  def%GHIxyz.ABC#i.

<re.Match object; span=(3, 4), match='a'>
<re.Match object; span=(4, 5), match='b'>
<re.Match object; span=(5, 6), match='c'>
<re.Match object; span=(11, 12), match='X'>
<re.Match object; span=(12, 13), match='Y'>
<re.Match object; span=(13, 14), match='Z'>
<re.Match object; span=(17, 18), match='a'>
<re.Match object; span=(18, 19), match='b'>
<re.Match object; span=(19, 20), match='c'>
<re.Match object; span=(25, 26), match='d'>
<re.Match object; span=(26, 27), match='e'>
<re.Match object; span=(27, 28), match='f'>
<re.Match object; span=(29, 30), match='G'>
<re.Match object; span=(30, 31), match='H'>
<re.Match object; span=(31, 32), match='I'>
<re.Match object; span=(32, 33), match='x'>
<re.Match object; span=(33, 34), match='y'>
<re.Match object; span=(34, 35), match='z'>
<re.Match object; span=(36, 37), match='A'>
<re.Match object; span=(37, 38), match='B'>
<re.Match object; span=(38, 39), match='C'>
<re.Match object; span=(40, 41), match

In [204]:
# hexa decimal charactes
print(test_string, end="\n\n")

pattern = re.compile(r'[0-9a-fA-F]')

matches = re.finditer(pattern, test_string)

for match in matches:
    print(match)

123abc.4567XYZ$89abc123  def%GHIxyz.ABC#i.

<re.Match object; span=(0, 1), match='1'>
<re.Match object; span=(1, 2), match='2'>
<re.Match object; span=(2, 3), match='3'>
<re.Match object; span=(3, 4), match='a'>
<re.Match object; span=(4, 5), match='b'>
<re.Match object; span=(5, 6), match='c'>
<re.Match object; span=(7, 8), match='4'>
<re.Match object; span=(8, 9), match='5'>
<re.Match object; span=(9, 10), match='6'>
<re.Match object; span=(10, 11), match='7'>
<re.Match object; span=(15, 16), match='8'>
<re.Match object; span=(16, 17), match='9'>
<re.Match object; span=(17, 18), match='a'>
<re.Match object; span=(18, 19), match='b'>
<re.Match object; span=(19, 20), match='c'>
<re.Match object; span=(20, 21), match='1'>
<re.Match object; span=(21, 22), match='2'>
<re.Match object; span=(22, 23), match='3'>
<re.Match object; span=(25, 26), match='d'>
<re.Match object; span=(26, 27), match='e'>
<re.Match object; span=(27, 28), match='f'>
<re.Match object; span=(36, 37), match='A'>
<re.M

#### Character Negation [^]

In [223]:
# Extract consonants (everything except vowels). [^] You have to mention it in the brackets in the beginning
# We removed all the vowels, capital vowes, digits, and special symbols
print(test_string, end="\n\n")

pattern = re.compile(r'[^aieouAIEOU0-9$#%. ]')

matches = re.finditer(pattern, test_string)

for match in matches:
    print(match)

123abc.4567XYZ$89abc123  def%GHIxyz.ABC#i.

<re.Match object; span=(4, 5), match='b'>
<re.Match object; span=(5, 6), match='c'>
<re.Match object; span=(11, 12), match='X'>
<re.Match object; span=(12, 13), match='Y'>
<re.Match object; span=(13, 14), match='Z'>
<re.Match object; span=(18, 19), match='b'>
<re.Match object; span=(19, 20), match='c'>
<re.Match object; span=(25, 26), match='d'>
<re.Match object; span=(27, 28), match='f'>
<re.Match object; span=(29, 30), match='G'>
<re.Match object; span=(30, 31), match='H'>
<re.Match object; span=(32, 33), match='x'>
<re.Match object; span=(33, 34), match='y'>
<re.Match object; span=(34, 35), match='z'>
<re.Match object; span=(37, 38), match='B'>
<re.Match object; span=(38, 39), match='C'>


## 5.2 Pre-defined Character Sets

#### It is like a short-hand for what we learned before

* `\d`: Matches any digit; equivalent to [0-9]
* `\D`: Matches any non-digit character; equivalent to [^0-9]
* `\s`: Matches any whitespace character (space, tab, newline)
* `\S`: Matches any non-whitespace character
* `\w`: Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]
* `\W`: Matches any non-alphanumeric character; equivalent to [^a-zA-Z0-9_]
* `\b`: Matches any whitespace or non-alphanumeric character before or after character(s) (useful for identifying individual words in a string)
* `\B`: Negation of \b

In [253]:
test_string_1 = "123abc.4567XYZ$89abc123 def%GHIxyz .ABC#i."

test_string_2 = "123abc.4567\nXYZ$89abc123 def%GHI\nxyz .ABC#i."

test_string_3 = 'heyhello 123_ heyho hohey'

In [289]:
# Instead of writing [0-9] we write \d to find all the digits
print(test_string_1, end="\n\n")

pattern = re.compile(r'\d')

matches = re.finditer(pattern, test_string_1)

for match in matches:
    print(match)

123abc.4567XYZ$89abc123 def%GHIxyz .ABC#i.

<re.Match object; span=(0, 1), match='1'>
<re.Match object; span=(1, 2), match='2'>
<re.Match object; span=(2, 3), match='3'>
<re.Match object; span=(7, 8), match='4'>
<re.Match object; span=(8, 9), match='5'>
<re.Match object; span=(9, 10), match='6'>
<re.Match object; span=(10, 11), match='7'>
<re.Match object; span=(15, 16), match='8'>
<re.Match object; span=(16, 17), match='9'>
<re.Match object; span=(20, 21), match='1'>
<re.Match object; span=(21, 22), match='2'>
<re.Match object; span=(22, 23), match='3'>


In [291]:
# whitespaces
print(test_string_2, end="\n\n")

pattern = re.compile(r'\s')

matches = re.finditer(pattern, test_string_2)

for match in matches:
    print(match)

123abc.4567
XYZ$89abc123 def%GHI
xyz .ABC#i.

<re.Match object; span=(11, 12), match='\n'>
<re.Match object; span=(24, 25), match=' '>
<re.Match object; span=(32, 33), match='\n'>
<re.Match object; span=(36, 37), match=' '>


In [285]:
# alphanumeric 
print(test_string_2, end="\n\n")

pattern = re.compile(r'\w')

matches = re.finditer(pattern, test_string_2)

for match in matches:
    print(match)

123abc.4567
XYZ$89abc123 def%GHI
xyz .ABC#i.

<re.Match object; span=(0, 1), match='1'>
<re.Match object; span=(1, 2), match='2'>
<re.Match object; span=(2, 3), match='3'>
<re.Match object; span=(3, 4), match='a'>
<re.Match object; span=(4, 5), match='b'>
<re.Match object; span=(5, 6), match='c'>
<re.Match object; span=(7, 8), match='4'>
<re.Match object; span=(8, 9), match='5'>
<re.Match object; span=(9, 10), match='6'>
<re.Match object; span=(10, 11), match='7'>
<re.Match object; span=(12, 13), match='X'>
<re.Match object; span=(13, 14), match='Y'>
<re.Match object; span=(14, 15), match='Z'>
<re.Match object; span=(16, 17), match='8'>
<re.Match object; span=(17, 18), match='9'>
<re.Match object; span=(18, 19), match='a'>
<re.Match object; span=(19, 20), match='b'>
<re.Match object; span=(20, 21), match='c'>
<re.Match object; span=(21, 22), match='1'>
<re.Match object; span=(22, 23), match='2'>
<re.Match object; span=(23, 24), match='3'>
<re.Match object; span=(25, 26), match='d'>
<re

In [293]:
# word boundary. We need only 'hey' if it is at the start of any word. 
# This works bc if before "hey" there is a non-alphanumeric value it will return that word
# white space and "" nothing are non-alphanumeric chars
print(test_string_3, end="\n\n")

pattern = re.compile(r'\bhey')

matches = re.finditer(pattern, test_string_3)

for match in matches:
    print(match)

heyhello 123_ heyho hohey

<re.Match object; span=(0, 3), match='hey'>
<re.Match object; span=(14, 17), match='hey'>


# 6. Quantifiers

## 6.1 Greedy Quantifiers

#### It try to match as many characters as possible

* `*` : 0 or more occurrences of the preceding element
* `+` : 1 or more occurrences of the preceding element
* `?` : 0 or 1, used when a character can be optional
* `{m}` : exactly 'm' characters
* `{m, n}` : range of characters (m, n)

In [588]:
test_string_1 = "no no noo nooo noooothing noo"

test_string_2 = "The colour of the sky and the color of the ocean are different."

test_string_3 = "We can use either http or https to generate URLs."

test_string_4 = "aaXaaaXaaaaXaaaaaX"

test_string_5 = "<div>First div</div><div>Second div</div>"

#### 6.1.1 `*`

In [331]:
# It says it can be the same as string, less and beyond thats why it captured 'no' as well
print(test_string_1, end='\n\n')
pattern  = re.compile("noo*")

matches = re.finditer(pattern, test_string_1)

for match in matches:
    print(match)

no no noo nooo noooothing noo

<re.Match object; span=(0, 2), match='no'>
<re.Match object; span=(3, 5), match='no'>
<re.Match object; span=(6, 9), match='noo'>
<re.Match object; span=(10, 14), match='nooo'>
<re.Match object; span=(15, 20), match='noooo'>
<re.Match object; span=(26, 29), match='noo'>


#### 6.1.2 `+`

In [328]:
# + says it has to be same as string or beyond
print(test_string_1, end='\n\n')
pattern  = re.compile("noo+")

matches = re.finditer(pattern, test_string_1)

for match in matches:
    print(match)

no no noo nooo noooothing noo

<re.Match object; span=(6, 9), match='noo'>
<re.Match object; span=(10, 14), match='nooo'>
<re.Match object; span=(15, 20), match='noooo'>
<re.Match object; span=(26, 29), match='noo'>


#### 6.1.3 `?`

In [338]:
# We need to find both 'color' and 'colour'
# By adding "?" next to 'u' it means 'u' is either present or absent
print(test_string_2, end='\n\n')
pattern  = re.compile("colou?r")

matches = re.finditer(pattern, test_string_2)

for match in matches:
    print(match)

The colour of the sky and the color of the ocean are different.

<re.Match object; span=(4, 10), match='colour'>
<re.Match object; span=(30, 35), match='color'>


In [344]:
# We need to find both 'http' and 'https'
print(test_string_3, end='\n\n')
pattern  = re.compile("https?")

matches = re.finditer(pattern, test_string_3)

for match in matches:
    print(match)

We can use either http or https to generate URLs.

<re.Match object; span=(18, 22), match='http'>
<re.Match object; span=(26, 31), match='https'>


#### 6.1.4 {m}

In [349]:
# We want 'a' that is occurring at least 3 times
print(test_string_4, end='\n\n')
pattern  = re.compile("a{3}")

matches = re.finditer(pattern, test_string_4)

for match in matches:
    print(match)

aaXaaaXaaaaXaaaaaX

<re.Match object; span=(3, 6), match='aaa'>
<re.Match object; span=(7, 10), match='aaa'>
<re.Match object; span=(12, 15), match='aaa'>


#### 6.1.5 {m,n}

In [355]:
# We want 'a' that is occurring at least 2, 3, 4 and 5 times
print(test_string_4, end='\n\n')
pattern  = re.compile("a{2,5}")

matches = re.finditer(pattern, test_string_4)

for match in matches:
    print(match)

aaXaaaXaaaaXaaaaaX

<re.Match object; span=(0, 2), match='aa'>
<re.Match object; span=(3, 6), match='aaa'>
<re.Match object; span=(7, 11), match='aaaa'>
<re.Match object; span=(12, 17), match='aaaaa'>


## 6.2 Non-Greedy Quantifiers
### A.K.A Lazy Quantifiers: It will try to match as few characters as possible

* `*?` : 0 or more
* `+?` : 1 or more
* `??` : 0 or 1, used when a character can be optional (as few as possible)
* `{m}?` : exactly 'm' characters (as few as possible)
* `{m, n}?` : range of characters (m, n) (as few as possible)

#### 6.2.1 `*?`

In [367]:
# It will only capture any word with 'no' and exactly 'no'. Just add '?' after '*'
print(test_string_1, end='\n\n')
pattern  = re.compile("noo*?")

matches = re.finditer(pattern, test_string_1)

for match in matches:
    print(match)

no no noo nooo noooothing noo

<re.Match object; span=(0, 2), match='no'>
<re.Match object; span=(3, 5), match='no'>
<re.Match object; span=(6, 8), match='no'>
<re.Match object; span=(10, 12), match='no'>
<re.Match object; span=(15, 17), match='no'>
<re.Match object; span=(26, 28), match='no'>


#### 6.2.2 `+?`

In [373]:
# It will only capture words with 'noo' and exactly 'noo'. Just add '?' after '*'
print(test_string_1, end='\n\n')
pattern  = re.compile("noo+?")

matches = re.finditer(pattern, test_string_1)

for match in matches:
    print(match)

no no noo nooo noooothing noo

<re.Match object; span=(6, 9), match='noo'>
<re.Match object; span=(10, 13), match='noo'>
<re.Match object; span=(15, 18), match='noo'>
<re.Match object; span=(26, 29), match='noo'>


#### 6.2.3 `{m,n}`

In [389]:
# It will capture as few as possible. Since in our case 2 is the few and not 5 thats why it showed 2 'aa'
print(test_string_4, end='\n\n')
pattern  = re.compile("a{2,5}?")

matches = re.finditer(pattern, test_string_4)

for match in matches:
    print(match)

aaXaaaXaaaaXaaaaaX

<re.Match object; span=(0, 2), match='aa'>
<re.Match object; span=(3, 5), match='aa'>
<re.Match object; span=(7, 9), match='aa'>
<re.Match object; span=(9, 11), match='aa'>
<re.Match object; span=(12, 14), match='aa'>
<re.Match object; span=(14, 16), match='aa'>


In [696]:
# We want to extract the whole div tags and whatever is inside it
# \w will give all the alphanumeric chars, + will give exact match, 
# </\w+> means we want to capture this as well</div>
# .* means every char inside, '?' means as few as you can non-greedy '?' 
# Try to run it without "?" this. This will return a greedy quantifier of things
print(test_string_5, end='\n\n')
pattern  = re.compile(r"<\w+>.*?</\w+>")

matches = re.findall(pattern, test_string_5)

print(matches)

<div>First div</div><div>Second div</div>

['<div>First div</div>', '<div>Second div</div>']


# 7. Grouping

* Used for identifying group(s) of matching substrings within a larger string
* Grouping is useful for extracting specific parts of a string which could provide useful information
* Characters to form a group are mentioned within parentheses ()
* By default, the captured groups are stored for 'later use'
* Groups allow the usage of back-references?

In [5]:
test_string_1 = "Call me at (123) 456-7890 or (456) 7890123 or (789) 201 3465."

test_string_2 = """
<div>This is a div tag</div>
<p>This is a paragraph tag</p>
<div>This is another div tag</div>
<h2>This is header div tag</h2>
"""

test_string_3 = """
hello hello
world world
python python
"""

test_string_4 = "Visit our website at https://example.com or http://example.net"

test_string_5 = """
Mr. Wayne
Mr Kent
Ms Romanoff
Mrs. Stark
Mr. M
"""

#### 7.1 Capture Groups

* By default, all groups are 'Capture Groups' until explicitly altered
* By default, all identified groups are assigned integral names
* Syntax: `(pattern)`

In [30]:
# Extract all the phone numbers
# So in our string we have numbers in 3 groups 
# "3 digits" = \((d{3})\),
# space = \s,
# "3 digits" = (\d{3}),
# Either we select everything or nothing bc the hyphen is present or not present = .? 
# "4 digits" = (\d{4})
print(test_string_1, end='\n\n')

pattern = re.compile(r"\((\d{3})\)\s(\d{3}).?(\d{4})")

matches = re.finditer(pattern, test_string_1)

for i, match in enumerate(matches, start=1):
    print(f"Match {i}")
    print("--------")
    print(match)
    print(f"Group: {match.group()}")
    print(f"Groups: {match.groups()}")
    print(f"Group 1: {match.group(1)}")
    print(f"Group 2: {match.group(2)}")
    print(f"Group 3: {match.group(3)}") 
    print()

Call me at (123) 456-7890 or (456) 7890123 or (789) 201 3465.

Match 1
--------
<re.Match object; span=(11, 25), match='(123) 456-7890'>
Group: (123) 456-7890
Groups: ('123', '456', '7890')
Group 1: 123
Group 2: 456
Group 3: 7890

Match 2
--------
<re.Match object; span=(29, 42), match='(456) 7890123'>
Group: (456) 7890123
Groups: ('456', '789', '0123')
Group 1: 456
Group 2: 789
Group 3: 0123

Match 3
--------
<re.Match object; span=(46, 60), match='(789) 201 3465'>
Group: (789) 201 3465
Groups: ('789', '201', '3465')
Group 1: 789
Group 2: 201
Group 3: 3465



#### 7.2 Named Capture Groups
* Behaves similarly like Capture Groups
* Each captured group can be given a name explicitly
* This improves code readability and group access
* Syntax: `(?P<group_name>pattern)`

In [57]:
# Extract all the tags + content between them
# <(?P<tag>\w+)> = "\w" Captures all the tags, "+" one or many, I named it "tag" 
# (?P<content>.*?) = capture all the content ".*" (all chars as many possible) plus I named it "content"
# </\w+> = capture closing tag
print(test_string_2, end="\n\n")

pattern = re.compile(r"<(?P<tag>\w+)>(?P<content>.*?)</\w+>")

matches = re.finditer(pattern, test_string_2)

for i, match in enumerate(matches, start=1):
    print(f"Match {i}")
    print("--------")
    print(match)
    print(f"Group: {match.group()}")
    print(f"Groups: {match.groupdict()}")
    print(f"Group: {match.group('tag')}")
    print(f"Group: {match.group('content')}")
    
    print()


<div>This is a div tag</div>
<p>This is a paragraph tag</p>
<div>This is another div tag</div>
<h2>This is header div tag</h2>


Match 1
--------
<re.Match object; span=(1, 29), match='<div>This is a div tag</div>'>
Group: <div>This is a div tag</div>
Groups: {'tag': 'div', 'content': 'This is a div tag'}
Group: div
Group: This is a div tag

Match 2
--------
<re.Match object; span=(30, 60), match='<p>This is a paragraph tag</p>'>
Group: <p>This is a paragraph tag</p>
Groups: {'tag': 'p', 'content': 'This is a paragraph tag'}
Group: p
Group: This is a paragraph tag

Match 3
--------
<re.Match object; span=(61, 95), match='<div>This is another div tag</div>'>
Group: <div>This is another div tag</div>
Groups: {'tag': 'div', 'content': 'This is another div tag'}
Group: div
Group: This is another div tag

Match 4
--------
<re.Match object; span=(96, 127), match='<h2>This is header div tag</h2>'>
Group: <h2>This is header div tag</h2>
Groups: {'tag': 'h2', 'content': 'This is header div tag

#### 7.3 Back References 
* Used for referencing captured groups by short-hand notation
*  Mainly used when some parts of the pattern repeat
*  Syntax (default capture groups): `\group_index`
* Syntax (named capture groups): `(?P=group_name)`

In [83]:
# Extract every line 
# \b(\w+) = this means a group of alphanumeric words but boundary set means only non-alphanumeric is allowed before it
# \1 means the first group which means the first group is repeating
print(test_string_3, end="\n\n")

pattern = re.compile(r"\b(\w+) \1")

matches = re.finditer(pattern, test_string_3)

for i, match in enumerate(matches, start=1):
    print(f"Match {i}")
    print("--------")
    print(match)
    print(f"Group: {match.group()}")
    print(f"Groups: {match.groups()}")
    print()


hello hello
world world
python python


Match 1
--------
<re.Match object; span=(1, 12), match='hello hello'>
Group: hello hello
Groups: ('hello',)

Match 2
--------
<re.Match object; span=(13, 24), match='world world'>
Group: world world
Groups: ('world',)

Match 3
--------
<re.Match object; span=(25, 38), match='python python'>
Group: python python
Groups: ('python',)



In [98]:
# named groups
# Instead of writing this </\w+> we wrote this </(?P=tag)> using named groups already created
print(test_string_2, end="\n\n")

pattern = re.compile(r"<(?P<tag>\w+)>(?P<content>.*)</(?P=tag)>")

matches = re.finditer(pattern, test_string_2)

for i, match in enumerate(matches, start=1):
    print(f"Match {i}")
    print("--------")
    print(match)
    print(f"Group: {match.group()}")
    print(f"Groups: {match.groupdict()}")
    print(f"Group: {match.group('tag')}")
    print(f"Group: {match.group('content')}")
    print()


<div>This is a div tag</div>
<p>This is a paragraph tag</p>
<div>This is another div tag</div>
<h2>This is header div tag</h2>


Match 1
--------
<re.Match object; span=(1, 29), match='<div>This is a div tag</div>'>
Group: <div>This is a div tag</div>
Groups: {'tag': 'div', 'content': 'This is a div tag'}
Group: div
Group: This is a div tag

Match 2
--------
<re.Match object; span=(30, 60), match='<p>This is a paragraph tag</p>'>
Group: <p>This is a paragraph tag</p>
Groups: {'tag': 'p', 'content': 'This is a paragraph tag'}
Group: p
Group: This is a paragraph tag

Match 3
--------
<re.Match object; span=(61, 95), match='<div>This is another div tag</div>'>
Group: <div>This is another div tag</div>
Groups: {'tag': 'div', 'content': 'This is another div tag'}
Group: div
Group: This is another div tag

Match 4
--------
<re.Match object; span=(96, 127), match='<h2>This is header div tag</h2>'>
Group: <h2>This is header div tag</h2>
Groups: {'tag': 'h2', 'content': 'This is header div tag

#### 7.4 Non-capture Groups
* The 'groups' aren’t captured for later use
*  Syntax: `(?:pattern)`

In [107]:
# Extract all URLs in the string
# (?:https?://) = to capture group http or https. By adding "?:" this will not capture "https?" group
# (\w+) = To capture group next http
# \. = By adding '\' we want to capture literal '.' and not everything
# (\w+) = To capture whatever is next to "."
print(test_string_4, end="\n\n")

pattern = re.compile(r"(?:https?://)(\w+)\.(\w+)")

matches = re.finditer(pattern, test_string_4)

for i, match in enumerate(matches, start=1):
    print(f"Match {i}")
    print("--------")
    print(match)
    print(f"Group: {match.group()}")
    print(f"Groups: {match.groups()}") 
    print()

Visit our website at https://example.com or http://example.net

Match 1
--------
<re.Match object; span=(21, 40), match='https://example.com'>
Group: https://example.com
Groups: ('example', 'com')

Match 2
--------
<re.Match object; span=(44, 62), match='http://example.net'>
Group: http://example.net
Groups: ('example', 'net')



#### 7.5 Alteration
* Allows to match any pattern from listed alternatives
* Implemented by using the `|` (pipe) symbol

In [139]:
# capture all the names with their titles
# Mr? = to capture "Mr." and "Mr"
# \.? = to capture literal '.' either present or not present
# \s = to capture white space
# \w+ = to capture all the alphanumeric
print(test_string_5, end="\n\n")

# to capture all men
pattern = re.compile(r"Mr?\.?\s\w+") 

# to capture all women
pattern = re.compile(r"Mr?s\.?\s\w+")

# to capture both women and men
pattern = re.compile(r"Mr?s?\.?\s\w+")

matches = re.findall(pattern, test_string_5)

for name in matches:
    print(name)


Mr. Wayne
Mr Kent
Ms Romanoff
Mrs. Stark
Mr. M


Mr. Wayne
Mr Kent
Ms Romanoff
Mrs. Stark
Mr. M


In [145]:
# more readable pattern using "Alterations"
# (Mr|Mrs|Ms) = This will capture either Mr, Mrs, Ms
pattern = re.compile(r"(Mr|Mrs|Ms)\.?\s\w+")

matches = re.finditer(pattern, test_string_5)

for name in matches:
    print(name.group())

Mr. Wayne
Mr Kent
Ms Romanoff
Mrs. Stark
Mr. M


# 8. String Modifications
* Involves splitting a string or replacing parts of a string
*  The re module provides the split and sub functions to achieve this


### 8.1 Split
* Allows to split a string on any matched pattern
* Works similar to Python's str.split function, with the additional flexibility of 
regular expression
* Syntax: `re.split(pattern, string)`


In [154]:
test_string_1 = "This is the first sentence. This is the second sentence! This is the third sentence?"

test_string_2 = "Visit our website at https://website1.com or http://website2.net"

In [170]:
# Split and extract all the sentences
# We will split on '.', '?', '!' to get all the sentences
print(test_string_1, end="\n\n")

pattern = re.compile(r"[.!?]")
matches = re.split(pattern, test_string_1)
for match in matches[:-1]:
    print(match.strip())

This is the first sentence. This is the second sentence! This is the third sentence?

This is the first sentence
This is the second sentence
This is the third sentence


### 8.2 Substitution
* Allows to replace a part of a string with any matched pattern
* 
Useful for search and replace operations with the added flexibility of regular
expressions

*  Syntax: `re.sub(pattern, replacement, string)`

In [199]:
print(test_string_2, end="\n\n")

pattern = re.compile(r"(https?://)(\w+)\.(\w+)")

matches = re.sub(pattern, r"\2.\3.co.in", test_string_2)
matches

Visit our website at https://website1.com or http://website2.net



'Visit our website at website1.com.co.in or website2.net.co.in'

# 9. Lookahead and Lookbehind Assertions
* Lookahead and lookbehind assertions are powerful tools in regular expressions
that allow for complex pattern matching based on the context in which a pattern
appears
* They are used to match a pattern only if it is followed or preceded by another
specified pattern


### 9.1 Lookahead Assertion

#### 9.1.1 Positive lookahead assertion
* Asserts if a pattern to be matched (X) is immediately followed by another specified
pattern (Y)
* Syntax: `X(?=Y)`

In [204]:
test_string_1 = """
100 USD
150 USD
85 INR
120 AED
60 USD
70 INR
"""

test_string_2 = """
Chris Evans
Chris Hemsworth
Chris Pratt
Tom Holland
Tom Hiddleston
"""

In [212]:
# extract only if the amount is in USD right next to it
# Syntax: X(?=Y), here X==\d+; Y=(?=USD)
print(test_string_1, end="\n\n")

pattern = re.compile(r"\d+ (?=USD)")
matches = re.findall(pattern, test_string_1)
for match in matches:
    print(match)


100 USD
150 USD
85 INR
120 AED
60 USD
70 INR


100 
150 
60 


#### 9.1.1 Negative lookahead assertion
* Asserts if a pattern to be matched (X) is not immediately followed by another
specified pattern (Y)
* Syntax: X(?!Y)

In [217]:
# We want a currency that is not USD. Opposite for positive lookahead
print(test_string_1, end="\n\n")

pattern = re.compile(r"\d+ (?!USD)")
matches = re.findall(pattern, test_string_1)
for match in matches:
    print(match)


100 USD
150 USD
85 INR
120 AED
60 USD
70 INR


85 
120 
70 


### 9.3 Lookbehind Assertion

#### 9.1.1 Positive lookbehind assertion
* Asserts if a pattern to be matched (X) is immediately preceded by another specified
pattern (Y)
* Syntax: `(?<=Y)X`

In [242]:
# We want last-name that is only preceded by name 'Chris' 
print(test_string_2, end="\n\n")

pattern = re.compile(r"(?<=Chris) \w+")
matches = re.findall(pattern, test_string_2)
for match in matches:
    print(match.strip())


Chris Evans
Chris Hemsworth
Chris Pratt
Tom Holland
Tom Hiddleston


Evans
Hemsworth
Pratt


#### 9.1.1 Negative lookbehind assertion
* Asserts if a pattern to be matched (X) is not immediately preceded by another
specified pattern (Y)
* Syntax: `(?<!Y)X`

In [249]:
# We want last-name that are not preceded by the name 'Chris'. Opposite of negative lookbehind
print(test_string_2, end="\n\n")

pattern = re.compile(r"(?<!Chris) \w+")
matches = re.findall(pattern, test_string_2)
for match in matches:
    print(match.strip())


Chris Evans
Chris Hemsworth
Chris Pratt
Tom Holland
Tom Hiddleston


Holland
Hiddleston


# 10. Flags
* Flags provide additional control over pattern matching by altering the
behaviour of regular expressions
* Can be used for insensitive case matching, making the dot operator match
newline character, etc.
* Flags are usually passed as arguments to the functions of re module

### Common Flags:

#### • re.IGNORECASE (or re.I): Makes the pattern case-insensitive

In [255]:
test_string = """
Hello World!
This is a Hello test.
Hello again!
Python is fun.
Hello Python.
"""

In [265]:
# Extract Python from text
# By using "flags=re.IGNORECASE" we can override the case-sensitive behavior of the Python language
# You can use "re.I" to save time
pattern = re.compile(r"python", flags=re.IGNORECASE)

matches = re.finditer(pattern, test_string)

for match in matches:
    print(match)

<re.Match object; span=(49, 55), match='Python'>
<re.Match object; span=(70, 76), match='Python'>


#### re.MULTILINE (or re.M): Allows ^ and $ to match the start and end of each line

In [291]:
# Extract "Hello" from the text that is in the beginning of every line
# Hello is not at the start of the line bc "\n" is at the start of the line in every line
# We tried to add "^" this with a small-case "hello" but then it can't detect small-case
# We want both features small-case detection and beginning-of-line detection that's why we add more flags
# You can add more flags by using the "|" pipe symbol
pattern = re.compile(r"^hello", flags=re.I | re.M )

matches = re.finditer(pattern, test_string)

for match in matches:
    print(match)

<re.Match object; span=(1, 6), match='Hello'>
<re.Match object; span=(36, 41), match='Hello'>
<re.Match object; span=(64, 69), match='Hello'>


#### re.DOTALL (or re.S): Allows the . to match newline characters as well

In [295]:
# extact every character possible with '\n' as well 
pattern = re.compile(r".", flags=re.S)

matches = re.finditer(pattern, test_string)

for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='\n'>
<re.Match object; span=(1, 2), match='H'>
<re.Match object; span=(2, 3), match='e'>
<re.Match object; span=(3, 4), match='l'>
<re.Match object; span=(4, 5), match='l'>
<re.Match object; span=(5, 6), match='o'>
<re.Match object; span=(6, 7), match=' '>
<re.Match object; span=(7, 8), match='W'>
<re.Match object; span=(8, 9), match='o'>
<re.Match object; span=(9, 10), match='r'>
<re.Match object; span=(10, 11), match='l'>
<re.Match object; span=(11, 12), match='d'>
<re.Match object; span=(12, 13), match='!'>
<re.Match object; span=(13, 14), match='\n'>
<re.Match object; span=(14, 15), match='T'>
<re.Match object; span=(15, 16), match='h'>
<re.Match object; span=(16, 17), match='i'>
<re.Match object; span=(17, 18), match='s'>
<re.Match object; span=(18, 19), match=' '>
<re.Match object; span=(19, 20), match='i'>
<re.Match object; span=(20, 21), match='s'>
<re.Match object; span=(21, 22), match=' '>
<re.Match object; span=(22, 23), match='a'>
<re.M

### Extra flags:
* re.VERBOSE (or re.X): Allows you to write more readable regex by ignoring whitespace
and comments within the pattern
* re.ASCII (or re.A): Makes \w, \b, \d, and \s match only ASCII characters
* re.LOCALE (or re.L): Makes \w, \b, \d, and \s dependent on the current locale

# 11. Practice Exercises

### Q1. Extract all email addresses from given text

In [305]:
text = """
Hello John, please contact us at support@example.com for further assistance.
You can also reach out to our manager at manager@example.org or sales@example.net.
Thank you, support@example.com.
"""

In [315]:
pattern = re.compile(r"\w+@\w+.\w+")

matches = re.findall(pattern, text)

for match in matches:
    print(match)

support@example.com
manager@example.org
sales@example.net
support@example.com


### Q2. Validate phone numbers and callout invalid ones
* valid format: (XXX) XXX-XXXX

In [453]:
phone_numbers = [
    "(123) 456-7890",
    "(987) 654-3210",
    "123-456-7890",
    "(123)456-7890",
    "(123) 456-7890"
]

In [436]:
pattern = re.compile(r"\((\d{3})\).(\d{3})-(\d{4})")

for number in phone_numbers:
    if not re.match(pattern, number):
        print(number)

123-456-7890
(123)456-7890


### Q3. From Q2, represent all numbers in valid format

In [463]:
# print(phone_numbers)
pattern = re.compile(r"\(?(\d{3})\)?.?(\d{3})-(\d{4})")

for number in phone_numbers:
    print(re.sub(pattern, r"(\1) \2-\3", number).strip())

(123) 456-7890
(987) 654-3210
(123) 456-7890
(123) 456-7890
(123) 456-7890


### Q4. Identify dates & modify format to YYYY-MM-DD from below text

In [466]:
text = "We have meetings scheduled on 12/05/2023, 23/06/2024, and 07/07/2025."

In [488]:
pattern = re.compile(r"(\d{2})/(\d{2})/(\d{4})")
matches = re.finditer(pattern, text)
print(re.sub(pattern, r"\3-\2-\1", text))

We have meetings scheduled on 2023-05-12, 2024-06-23, and 2025-07-07.


### Q5. Extract URLs from given text

In [493]:
text = """
Visit our website at https://www.example.com for more information.
You can also check out our blog at http://blog.example.com or follow us on https://twitter.com/example.
"""

In [505]:
# pattern = re.compile(r"(https?://)(\w+)\.(\w+).?(\w+)")
patter = re.compile(r"https?://(www\.)?[\w.]+(/\w+)?")
matches = re.finditer(pattern, text)
for match in matches:
    print(match.group())

https://www.example.com
http://blog.example.com
https://twitter.com/example


### Q6. Extract all sentences from below text

In [602]:
text = "Hello world! How are you doing today? This is agreat day. Let's make the most of it."

In [610]:
pattern = re.compile(r".*?[!.?]")
matches = re.findall(pattern, text)
for match in matches:
    print(match.strip())

Hello world!
How are you doing today?
This is agreat day.
Let's make the most of it.


### Q7. Validate passwords based on following conditions:
* atleast 8 characters in length
* atleast one lowercase alphabet
* atleast one uppercase alphabet
* atleast one numeric digit

In [623]:
passwords = [
    "Password123",
    "pass123",
    "PASSWORD123",
    "Pass123",
    "ValidPass1"
]

In [667]:
# We are using look ahead assertion X(?=Y)
# X = "^"
# Y = ".*[a-z]", ".*[A-Z]", ".*[0-9]"
# We are actually satisfying 3 conditions: X(?=Y)(?=Z)(?=W)
# Now by remaining at the start (^) we are checking if all three conditions that are in the brackets ahead of X are satisfied or not
# [a-zA-Z0-9]{8,} = mean if all the characters are total 8 or above
pattern = re.compile(r"^(?=.*[a-z])(?=.*[A-Z])(?=.*[0-9])[a-zA-Z0-9]{8,}")
for password in passwords:
    if not re.match(pattern, password):
        print("Invalid: ",password)

Invalid:  pass123
Invalid:  PASSWORD123
Invalid:  Pass123


### Q8. Remove all HTML tags from given text

In [672]:
text = """
<p>
The <strong>advent of technology</strong> in the 21st century has revolutionized the way we live and interact with the world around us. From the <em>smartphones</em> we carry to the <a href="https://en.wikipedia.org/wiki/Internet_of_things" target="_blank">Internet of Things</a> (IoT) devices that connect our homes, technology has become an <u>integral part</u> of our daily lives. <br><br>
Consider the impact of <abbr title="Artificial Intelligence">AI</abbr> on various industries. AI-driven applications are transforming <span style="color: blue;">healthcare</span> by enabling predictive diagnostics and personalized treatment plans. In the realm of <span style="font-weight: bold;">education</span>, AI is facilitating adaptive learning experiences that cater to individual student needs. <br><br>
Moreover, the <mark>explosion of data</mark> has given rise to the field of data science, where professionals analyze vast amounts of information to uncover <q>insights</q> that drive decision-making across sectors. In <cite>finance</cite>, algorithms are being used to detect fraudulent activities and to automate trading processes, making the markets more efficient. <br><br>
The rise of <kbd>cloud computing</kbd> has also played a crucial role in this technological transformation. By offering scalable and flexible computing resources, the cloud has enabled businesses to innovate without the constraints of physical infrastructure. This shift to the cloud has made it possible for <ins>startups</ins> and large enterprises alike to <del>maintain</del> <ins>achieve</ins> high levels of agility and resilience. <br><br>
Looking ahead, the integration of <sup>5G</sup> networks promises to further accelerate the capabilities of connected devices, paving the way for advancements in areas such as <sub>autonomous vehicles</sub> and smart cities. As we continue to explore the potential of these emerging technologies, it is imperative that we also address the ethical and societal implications they bring. <br><br>
In conclusion, the <strong>rapid pace</strong> of technological advancement presents both opportunities and challenges. By embracing innovation while remaining mindful of its impact, we can <i>harness the power</i> of technology to <u>create a better future</u> for all.
</p>
"""

In [692]:
pattern = re.compile(r"(</?.+?>)")
matches = re.sub(pattern, "" ,text)
# matches = re.findall(pattern, text)
print(matches.strip())

The advent of technology in the 21st century has revolutionized the way we live and interact with the world around us. From the smartphones we carry to the Internet of Things (IoT) devices that connect our homes, technology has become an integral part of our daily lives. 
Consider the impact of AI on various industries. AI-driven applications are transforming healthcare by enabling predictive diagnostics and personalized treatment plans. In the realm of education, AI is facilitating adaptive learning experiences that cater to individual student needs. 
Moreover, the explosion of data has given rise to the field of data science, where professionals analyze vast amounts of information to uncover insights that drive decision-making across sectors. In finance, algorithms are being used to detect fraudulent activities and to automate trading processes, making the markets more efficient. 
The rise of cloud computing has also played a crucial role in this technological transformation. By offe