# A walkthrough on Regular Expression (re)
`re` is the regular-expression module in Python's standard library. It supports searching, matching, splitting, and replacing text in strings.

In [1]:
import re 

### We generate a sample string and look for particular matches in the sample string

In [2]:
# Sample text to search: digits, two lowercase "abc" runs and one uppercase "ABC".
test_string = "123abc456789abc123ABC"
# Compile the literal pattern so the same pattern object can be reused.
pattern = re.compile(r"abc")
# finditer() yields one Match object per occurrence of the pattern.
# Module-level equivalent: re.finditer(r"abc", test_string)
matches = pattern.finditer(test_string)
for occurrence in matches:
    # The printed span=(start, end) gives the indices of this occurrence in test_string.
    print(occurrence)

<re.Match object; span=(3, 6), match='abc'>
<re.Match object; span=(12, 15), match='abc'>


### Difference between a Raw String and a String

In [3]:
# Print the same characters twice: first as a normal literal, where "\t" is a tab,
# then as a raw (r-prefixed) literal, where the backslash is kept as written.
for a in ("\tHello", r"\tHello"):
    print(a)

	Hello
\tHello


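## Popular methods of compiled pattern objects (also available as module-level functions in re)
#### match(): Determine if the RE matches at the beginning of the string
#### findall(): Find all substrings where the RE matches and return them as a list (shown below)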

In [4]:
# Still using test_string = "123abc456789abc123ABC" and the compiled r"abc" pattern.
# Common lookup methods: match(), search(), findall() -- findall() is the one used here.
# It returns a list of the matched substrings rather than Match objects.
matches = pattern.findall(test_string)
for substring in matches:
    print(substring)

abc
abc


#### finditer(): Find all substrings where the RE matches and return them as an iterator.

In [5]:
# Module-level finditer(): takes the pattern string directly instead of a compiled pattern.
matches = re.finditer(r"abc",test_string)

# Each Match object exposes group(), start(), end() and span().
for occurrence in matches:
    begin, finish = occurrence.span()
    print((begin, finish), occurrence.start(), occurrence.end())
    print(occurrence.group())

(3, 6) 3 6
abc
(12, 15) 12 15
abc


## Meta Characters
### All meta characters: . ^ $ * + ? {} [] \ | ()


### . Any character (except newline character)

In [6]:
# "." matches any single character (except a newline), so every character
# of test_string is printed as its own match.
matches = re.finditer(r".",test_string)
for character in matches:
    print(character.group())

1
2
3
a
b
c
4
5
6
7
8
9
a
b
c
1
2
3
A
B
C


### \ : Special sequence (or escape characters) 

In [7]:
# A backslash removes the special meaning of a metacharacter:
# r"\." matches only a literal dot, not "any character".
test_string = "123abc456789abc123.ABC"
matches = re.finditer(r"\.",test_string)
for literal_dot in matches:
    print(literal_dot.group())

.


### \d : Matches any decimal digit; [0-9]

In [8]:
# \d matches one decimal digit at a time, so each digit of "123" is a separate match.
test_string = "hello 123_ heyho hohey"
pattern = re.compile(r'\d')
matches = pattern.finditer(test_string)
for digit in matches:
    print(digit)

<re.Match object; span=(6, 7), match='1'>
<re.Match object; span=(7, 8), match='2'>
<re.Match object; span=(8, 9), match='3'>


### \D : Matches any non-digit character

In [9]:
# \D is the complement of \d: every character that is not a decimal digit
# (letters, spaces and the underscore here) is matched one at a time.
test_string = "hello 123_ heyho hohey"
pattern = re.compile(r'\D')
matches = pattern.finditer(test_string)
for non_digit in matches:
    print(non_digit)

<re.Match object; span=(0, 1), match='h'>
<re.Match object; span=(1, 2), match='e'>
<re.Match object; span=(2, 3), match='l'>
<re.Match object; span=(3, 4), match='l'>
<re.Match object; span=(4, 5), match='o'>
<re.Match object; span=(5, 6), match=' '>
<re.Match object; span=(9, 10), match='_'>
<re.Match object; span=(10, 11), match=' '>
<re.Match object; span=(11, 12), match='h'>
<re.Match object; span=(12, 13), match='e'>
<re.Match object; span=(13, 14), match='y'>
<re.Match object; span=(14, 15), match='h'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(16, 17), match=' '>
<re.Match object; span=(17, 18), match='h'>
<re.Match object; span=(18, 19), match='o'>
<re.Match object; span=(19, 20), match='h'>
<re.Match object; span=(20, 21), match='e'>
<re.Match object; span=(21, 22), match='y'>


### \b : Matches where the specified characters are at the beginning or at the end of a word

In [10]:
# \b is a zero-width word boundary: "hello" is matched only where it starts a word.
test_string = "hello 123_ heyho hohey"
pattern = re.compile(r'\bhello')
matches = pattern.finditer(test_string)
for word_start in matches:
    print(word_start)

<re.Match object; span=(0, 5), match='hello'>


### \B : Matches where the specified characters are present, but NOT at the beginning or the end of a word

In [13]:
# \B is the opposite of \b: "ho" is matched only where it does NOT start a word,
# so the "ho" ending "heyho" is found while the one starting "hohey" is skipped.
test_string = "hello 123_ heyho hohey"
pattern = re.compile(r'\Bho')
matches = pattern.finditer(test_string)
for inner_ho in matches:
    print(inner_ho)

<re.Match object; span=(14, 16), match='ho'>


### [] :  A set of characters "[a-m]"

In [14]:
# [lo] is a character set: it matches a single character that is either "l" or "o".
test_string = "hello 123_ heyho hohey"
pattern = re.compile(r'[lo]')
matches = pattern.finditer(test_string)
for l_or_o in matches:
    print(l_or_o)

<re.Match object; span=(2, 3), match='l'>
<re.Match object; span=(3, 4), match='l'>
<re.Match object; span=(4, 5), match='o'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(18, 19), match='o'>


In [15]:
# [a-z] is a range inside a set: any single lowercase ASCII letter.
# The underscore and the digits are therefore not matched.
test_string = "hello_123"
pattern = re.compile(r'[a-z]')
matches = pattern.finditer(test_string)
for lowercase_letter in matches:
    print(lowercase_letter)

<re.Match object; span=(0, 1), match='h'>
<re.Match object; span=(1, 2), match='e'>
<re.Match object; span=(2, 3), match='l'>
<re.Match object; span=(3, 4), match='l'>
<re.Match object; span=(4, 5), match='o'>


In [16]:
# Same string as the [a-z] example, now with \d: only the three digits are matched.
test_string = "hello_123"
pattern = re.compile(r'\d')
matches = pattern.finditer(test_string)
for digit in matches:
    print(digit)

<re.Match object; span=(6, 7), match='1'>
<re.Match object; span=(7, 8), match='2'>
<re.Match object; span=(8, 9), match='3'>


## Quantifiers 
### * : 0 or more

In [17]:
# "*" means zero or more repetitions. Zero digits is a valid match, so an empty
# match is reported at each position outside the digit run (including the end of
# the string), and the whole run "123" is reported as one match.
test_string = "hello_123"
pattern = re.compile(r'\d*')
matches = pattern.finditer(test_string)
for maybe_digits in matches:
    print(maybe_digits)

<re.Match object; span=(0, 0), match=''>
<re.Match object; span=(1, 1), match=''>
<re.Match object; span=(2, 2), match=''>
<re.Match object; span=(3, 3), match=''>
<re.Match object; span=(4, 4), match=''>
<re.Match object; span=(5, 5), match=''>
<re.Match object; span=(6, 9), match='123'>
<re.Match object; span=(9, 9), match=''>


### + : 1 or more

In [18]:
# "+" means one or more repetitions: at least one digit is required, so there are
# no empty matches and the run "123" comes back as a single match.
test_string = "hello_123"
pattern = re.compile(r'\d+')
matches = pattern.finditer(test_string)
for digit_run in matches:
    print(digit_run)

<re.Match object; span=(6, 9), match='123'>


In [19]:
# No quantifier here: a literal underscore followed by exactly one digit.
# This is the baseline for the optional-underscore example that uses "?".
test_string = "hello_123"
pattern = re.compile(r'_\d')
matches = pattern.finditer(test_string)
for underscore_digit in matches:
    print(underscore_digit)

<re.Match object; span=(5, 7), match='_1'>


### ? : 0 or 1, -> optional character

In [21]:
# "?" makes the preceding element optional (zero or one). The string has no
# underscore this time, yet every digit still matches because "_" may be absent.
test_string = "hello123"
pattern = re.compile(r'_?\d')
matches = pattern.finditer(test_string)
for optional_underscore_digit in matches:
    print(optional_underscore_digit)

<re.Match object; span=(5, 6), match='1'>
<re.Match object; span=(6, 7), match='2'>
<re.Match object; span=(7, 8), match='3'>


### {4} : exact number
This specifies the exact number of repetitions of the preceding element; `{m,n}` allows between m and n repetitions.

In [22]:
# {1} asks for exactly one repetition, so \d{1} behaves like a plain \d:
# each digit is its own match.
test_string = "hello123"
pattern = re.compile(r'\d{1}')
matches = pattern.finditer(test_string)
for single_digit in matches:
    print(single_digit)

<re.Match object; span=(5, 6), match='1'>
<re.Match object; span=(6, 7), match='2'>
<re.Match object; span=(7, 8), match='3'>


In [4]:
# {2} asks for exactly two digits in a row. "123" yields the single match "12";
# the remaining "3" is only one digit and therefore does not match.
test_string = "hello123"
pattern = re.compile(r'\d{2}')
matches = pattern.finditer(test_string)
for digit_pair in matches:
    print(digit_pair)

<re.Match object; span=(5, 7), match='12'>


In [26]:
# {1,3} accepts between one and three digits. The quantifier is greedy,
# so it takes all of "123" as one match.
test_string = "hello123"
pattern = re.compile(r'\d{1,3}')
matches = pattern.finditer(test_string)
for up_to_three_digits in matches:
    print(up_to_three_digits)

<re.Match object; span=(5, 8), match='123'>


## Some more Application of Regular Expression Methods
Using the list of data with mixed formatting types we can use regular expression methods to extract whatever we need

In [6]:
# Sample dates written with four different separators: ".", "-", "_" and "/".
# The blank lines are part of the string, so they count toward the match spans.
dates = """
2020.03.01

2020.04.01

2020-04-01
2020-05-23
2020-06-11
2020-07-11
2020-08-11

2020_04_04
2020_04_04

2020/05/01
2020/06/01
2020/07/01
"""
# Dot-separated dates only: the "." is escaped so that it matches a literal dot.
pattern = re.compile(r'\d\d\d\d\.\d\d\.\d\d')
matches = pattern.finditer(dates)
for dotted_date in matches:
    print(dotted_date)

<re.Match object; span=(1, 11), match='2020.03.01'>
<re.Match object; span=(13, 23), match='2020.04.01'>


A simpler approach using the {n} quantifier, this time extracting dates with the - separator as opposed to .

In [41]:
# {n} repeats the preceding element exactly n times, so this is a shorter
# spelling of r'\d\d\d\d-\d\d-\d\d'.
pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
matches = pattern.finditer(dates)
for dashed_date in matches:
    print(dashed_date)

<re.Match object; span=(25, 35), match='2020-04-01'>
<re.Match object; span=(36, 46), match='2020-05-23'>
<re.Match object; span=(47, 57), match='2020-06-11'>
<re.Match object; span=(58, 68), match='2020-07-11'>
<re.Match object; span=(69, 79), match='2020-08-11'>


Extracting dates with the / separator

In [38]:
# [/] is a one-character set, so it matches just the "/" separator.
pattern = re.compile(r'\d\d\d\d[/]\d\d[/]\d\d')
matches = pattern.finditer(dates)
for slashed_date in matches:
    print(slashed_date)

<re.Match object; span=(104, 114), match='2020/05/01'>
<re.Match object; span=(115, 125), match='2020/06/01'>
<re.Match object; span=(126, 136), match='2020/07/01'>


Selecting dates with month starting with 0 then 5 or 6

In [40]:
# The month must be "0" followed by "5" or "6", i.e. May or June;
# the July date in the sample is therefore left out.
pattern = re.compile(r'\d\d\d\d[/]0[56][/]\d\d')
matches = pattern.finditer(dates)
for may_or_june in matches:
    print(may_or_june)

<re.Match object; span=(104, 114), match='2020/05/01'>
<re.Match object; span=(115, 125), match='2020/06/01'>


### Another example using special characters
#### \s : Matches any whitespace character; (space " " tab "\t" newline "\n")
#### \w : Matches any alphanumeric (word) character; [a-zA-Z0-9_]
#### + One or more occurrences "aix+"

In [42]:
# Sample text mixing plain lines with names that carry different titles.
my_string = """
hello world
1223
Mr Simpson
Mrs Simpson
Mr. Brown
Ms Smith
Mr. T
"""
# "Mr", then one whitespace character, then one or more word characters.
# "Mr. Brown" and "Mrs Simpson" do not match: after "Mr" they continue with
# "." or "s", neither of which is whitespace.
pattern = re.compile(r'Mr\s\w+')
matches = pattern.finditer(my_string)
for titled_name in matches:
    print(titled_name)

<re.Match object; span=(18, 28), match='Mr Simpson'>


In [43]:
# Requiring a literal dot after "Mr" now selects only the "Mr." entries.
pattern = re.compile(r'Mr\.\s\w+')
matches = pattern.finditer(my_string)
for titled_name in matches:
    print(titled_name)

<re.Match object; span=(41, 50), match='Mr. Brown'>
<re.Match object; span=(60, 65), match='Mr. T'>


#### \w : Matches any alphanumeric (word) character; [a-zA-Z0-9_]
#### ? : 0 or 1, -> optional character

In [46]:
# "\.?" makes the dot optional, so both "Mr Simpson" and the "Mr." entries match.
pattern = re.compile(r'Mr\.?\s\w+')
matches = pattern.finditer(my_string)
for titled_name in matches:
    print(titled_name)

<re.Match object; span=(18, 28), match='Mr Simpson'>
<re.Match object; span=(41, 50), match='Mr. Brown'>
<re.Match object; span=(60, 65), match='Mr. T'>


## Conditionals
#### | : Either or "buy|sell"

In [47]:
# "|" inside a group offers alternatives: the title may be "Mr", "Ms" or "Mrs",
# followed by an optional dot, one whitespace character and the name.
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s\w+')
matches = pattern.finditer(my_string)
for titled_name in matches:
    print(titled_name)

<re.Match object; span=(18, 28), match='Mr Simpson'>
<re.Match object; span=(29, 40), match='Mrs Simpson'>
<re.Match object; span=(41, 50), match='Mr. Brown'>
<re.Match object; span=(51, 59), match='Ms Smith'>
<re.Match object; span=(60, 65), match='Mr. T'>


### Another example for Email Extraction

In [8]:
# Sample text: a few unrelated lines followed by three email addresses.
emails = """
hello world
1223
Mr Simpson
Mrs Simpson
Mr. Brown
pythonprogrammer@gmail.com
serendipython12@hotmail.hot
serendi-yo@admin.yo-domain.org
"""
# user part (letters, digits, "-") + "@" + domain label (letters, "-") + "." + letters.
# Limitation: only one dot is allowed after "@" and the last part is letters only,
# so the third address is matched only up to "serendi-yo@admin.yo".
pattern = re.compile(r'[a-zA-Z0-9-]+@[a-zA-Z-]+\.[a-zA-Z]+')
matches = pattern.finditer(emails)
for address in matches:
    print(address)

<re.Match object; span=(51, 77), match='pythonprogrammer@gmail.com'>
<re.Match object; span=(78, 105), match='serendipython12@hotmail.hot'>
<re.Match object; span=(106, 125), match='serendi-yo@admin.yo'>


## Grouping Regular Expression to allow for Subsetting with index

In [55]:
# Observe brackets that partition the expression.
pattern = re.compile(r'([a-zA-z0-9-]+)@([a-zA-Z-]+)\.([a-zA-Z]+)')
matches = pattern.finditer(emails)

for match in matches:
    print(match.group(1))

# Output shows only email username

pythonprogrammer
serendipython12
serendi-yo


In [9]:
# Same grouped email pattern: group(1) = user name, group(2) = first domain label,
# group(3) = the part after the dot.
# The user-name set uses the range A-Z. The range A-z would also admit the ASCII
# punctuation that sits between "Z" and "a" ([, \, ], ^, _ and `).
pattern = re.compile(r'([a-zA-Z0-9-]+)@([a-zA-Z-]+)\.([a-zA-Z]+)')
matches = pattern.finditer(emails)

for match in matches:
    # group(2) is the label right after "@", i.e. only the email domain is printed.
    print(match.group(2))

gmail
hotmail
admin


# Modifications to String
### There are two methods; split and sub
Split - splits string on pattern match

In [58]:
# split() cuts the string at every match of the pattern and returns the pieces
# as a list; the matched "abc" separators are dropped. The uppercase "ABC" is
# not a match, so it stays inside the last piece.
pattern = re.compile(r"abc")
test_string = "123abc456789abc123ABC"
splitted = pattern.split(test_string)
print(splitted)

['123', '456789', '123ABC']


Sub - Substitute string on pattern match

In [61]:
# sub() returns a copy of the string in which every match of the pattern
# has been replaced; both occurrences of "world" become "earth".
pattern = re.compile(r"world")
test_string = "hello world, great time to be in the world"
subbed = pattern.sub("earth", test_string)
print(subbed)

hello earth, great time to be in the earth


## Dealing with URLs in string
First, matching only URLs that start with http://www.

In [62]:
# Sample text containing three URLs: one without "www.", one using https, one plain.
urls = """
hello
2020-05-20
http://python-programmer.com
https://www.serendi-programmer.com
http://www.pyman.net
"""
# Strict first attempt: the URL must start with exactly "http://www.", so the
# https URL and the URL without "www." are both missed.
pattern = re.compile(r"http://www\.([a-zA-Z-]+)\.[a-zA-Z]+")
matches = pattern.finditer(urls)
for url_match in matches:
    print(url_match)

<re.Match object; span=(82, 102), match='http://www.pyman.net'>


Using the optional quantifier (?) to accept https as well as http, and applying it to the whole `(www\.)` group so that the www. prefix is optional too

In [67]:
# "s?" makes the "s" of https optional and "(www\.)?" makes the whole "www."
# prefix optional, so all three URLs are matched.
pattern = re.compile(r"https?://(www\.)?([a-zA-Z-]+)\.[a-zA-Z]+")
matches = pattern.finditer(urls)
for url_match in matches:
    print(url_match)

<re.Match object; span=(18, 46), match='http://python-programmer.com'>
<re.Match object; span=(47, 81), match='https://www.serendi-programmer.com'>
<re.Match object; span=(82, 102), match='http://www.pyman.net'>


## Substituting URLs in urls String


In [68]:
# Replace every URL matched by the current pattern with a fixed placeholder.
placeholder = "link"
subbed_urls = pattern.sub(placeholder, urls)
print(subbed_urls)


hello
2020-05-20
link
link
link



## Grouping and Selecting only Domain Names

In [73]:
# Three capture groups: (1) the optional "www." prefix, (2) the domain name,
# (3) the dot plus the top-level domain.
pattern = re.compile(r"https?://(www\.)?([a-zA-Z-]+)(\.[a-zA-Z]+)")
matches = pattern.finditer(urls)
for url_match in matches:
    # group(0) would be the whole URL; group(2) is just the domain name.
    print(url_match.group(2))

python-programmer
serendi-programmer
pyman


## Substituting with Selected Domain Names

In [74]:
# \2 and \3 in the replacement refer back to capture groups 2 and 3 of the
# pattern, so each URL is reduced to its domain name plus top-level domain.
domain_with_tld = r"\2\3"
subbed_urls = pattern.sub(domain_with_tld, urls)
print(subbed_urls)


hello
2020-05-20
python-programmer.com
serendi-programmer.com
pyman.net



## Compilation Flags
IGNORECASE, I : Do case-insensitive matches

In [75]:
# re.I (short for re.IGNORECASE) makes matching case-insensitive:
# the lowercase pattern "world" still finds "World".
my_string = "Hello World"
pattern = re.compile(r"world",re.I)
matches = pattern.finditer(my_string)
for case_insensitive_hit in matches:
    print(case_insensitive_hit)

<re.Match object; span=(6, 11), match='World'>
