In [1]:
import re

In [2]:
text = 'This is a good day'

### Search Function

In [3]:
print(True) if re.search('good',text) else print(False)

True


### Split

In [197]:
text2 = 'Amy is a good-person.Amy,is a_good\women.'

In [201]:
re.split(r'\s|,|_|\.|-|\\',text2) # | Or 

['Amy', 'is', 'a', 'good', 'person', 'Amy', 'is', 'a', 'good', 'women', '']

### Sub

In [202]:
text_sub = "I love to play with data"
re.sub(r"\s",' => ',text_sub) # Replace characters

'I => love => to => play => with => data'

### Find All

In [6]:
re.findall('Amy',text2)

['Amy', 'Amy']

In [7]:
t = 'Iran23'
d = re.findall('[A-z]+',t)
" ".join(str(x) for x in d)

'Iran'

In [8]:
re.search('good',text2)

<re.Match object; span=(9, 13), match='good'>

### Patterns and Character Classes

In [107]:
text3 = "011 5425-223 Hello My Name Is Abdelrahman 123456789 012 5536-889 Abdelrahman_Alarqan #$ %% @ ^ & aa bb dd"
re.findall('[a-z]',text3) # return all small chars
re.findall('[A-Z]',text3) # return all capital chars
re.findall('[a-zA-Z]',text3) # return all capital and small chars
print(re.findall('[^a-zA-z]',text3)) # return all things with out Charaxters

['0', '1', '1', ' ', '5', '4', '2', '5', '-', '2', '2', '3', ' ', ' ', ' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', ' ', '0', '1', '2', ' ', '5', '5', '3', '6', '-', '8', '8', '9', ' ', ' ', '#', '$', ' ', '%', '%', ' ', '@', ' ', ' ', '&', ' ', ' ', ' ']


### Phone Numbers

In [106]:
phone_numbers = """
Office of Research Administration: (734) 647-6333 | 4325 North Quad
Office of Budget and Financial Administration: (734) 647-8044 | 309 Maynard, Suite 205
Health Informatics Program: (734) 763-2285 | 333 Maynard, Suite 500
Office of the Dean: (734) 647-3576 | 4322 North Quad
UMSI Engagement Center: (734) 763-1251 | 777 North University
Faculty Adminstrative Support Staff: (734) 764-9376 | 4322 North Quad
"""
re.findall('[(]\d{3}[)]\s\d{3}[-]\d{4}',phone_numbers)

['(734) 647-6333',
 '(734) 647-8044',
 '(734) 763-2285',
 '(734) 647-3576',
 '(734) 763-1251',
 '(734) 764-9376']

### Domain Extaraction

In [167]:
links = 'I refer to https://google.com and I never refer http://www.baidu.com if I have to search anything'
re.findall('(?<=[https]:\/\/)([A-z0-9.]*)',links)

['google.com', 'www.baidu.com']

In [104]:
string = 'bat, lat, mat, bet, let, met, bit, lit, mit, bot, lot, mot'
result = re.findall('b[ao]t', string)
print(result)

['bat', 'bot']


### Quantifiers

In [125]:
text = "A ABC 123 1235-456 ABCDE ABCDA AB B C D E F G @# $ . #$$"
pattern = "A\w\w\w" # [\w] return 1 single char or number
pattern = "\s\sD" # [\s] return 1 single space
pattern = "\w*" # [\*] return all char or numbers
pattern = "\w+" # [\+] return all char or numbers
pattern = "\w{3}" # [\+] return all char or numbers with 3 length
pattern = "\w{2,5}" # [\+] return all char or numbers with range 2 and 5 length [2,3,4,5]
pattern = "\w{3,}" # [\+] return all char or numbers with range 3 and all lengths [3=>]
pattern = "\w{,4}" # [\+] return all char or numbers with length Up to 4
pattern = "\d{3}\s\d{4}-\d{3}" # Phone Number
pattern = "\w-?\s?" # [\+] return all char or numbers with - or space
text= "Name:Abdelrahman Alarqan"

re.findall('[A-z]+[:][A-z]+\s[A-z]+',text)

['Name:Abdelrahman Alarqan']

### Assertions & Email Pattern

In [131]:
emails = '''
Ahmed@gmail.com
khaled.sami@hotmail.com
john@john.com
john@john.info
@#$%@%^#$$
ross_123@yahoo.com
'''
pattern = "[A-z0-9\.?]+@[A-z0-9]+\.[com|info]+" # will return emails with .com and .info
re.findall(pattern,emails)

['Ahmed@gmail.com',
 'khaled.sami@hotmail.com',
 'john@john.com',
 'john@john.info',
 'ross_123@yahoo.com']

### Logical Or & Escaping

In [163]:
escaping = '''

1- Pandas
2- Python
3- Numpy

1) Pandas
2) Python
3) Numpy

1> Pandas
2> Python
3> Numpy

'''
phone_numbers = '''
123 5466 525
123 5858 (566)
'''
pattern_list = "(\d-|\d\)|\d>)(\s\w+)" # Lists
pattern_phone = "(\d{3}) (\d{4}) (\d{3}|\(\d{3}\))" # Phone num
re.findall(pattern_phone,phone_numbers)

[('123', '5466', '525'), ('123', '5858', '(566)')]

### Links Extraction

In [182]:
links = '''
https://regex101.com
https://www.linkedin.com
https://www.fiverr.com
https://www.upwork.com
http://soundcloud.com
Abdelrahman alarqan
Age 23
I love data
'''
pattern_links = "https?://[A-z0-9.]+"
re.findall(pattern_links,links)

['https://regex101.com',
 'https://www.linkedin.com',
 'https://www.fiverr.com',
 'https://www.upwork.com',
 'http://soundcloud.com']

### Practice 

In [211]:
text = "https://www.elzero.org:8080/category.php?article=105?name=how-to-do"

search = re.search(r"(https?)://(www)?\.?(\w+)\.(\w+):?(\d+)?/?(.+)",text)
print(f"Protocol : {search.group(1)}")
print(f"Sub Domain : {search.group(2)}")
print(f"Domain Name : {search.group(3)}")
print(f"Top Level Domain : {search.group(4)}")
print(f"Port : {search.group(5)}")
print(f"Query String : {search.group(5)}")

Protocol : https
Sub Domain : www
Domain Name : elzero
Top Level Domain : org
Port : 8080
Query String : 8080
