#### re module
- `Regex 101` => https://regex101.com/
- This module offers a set of functions that allow us to search a string for a match
- https://docs.python.org/3/library/re.html

#### The main methods of 're' module:
- findall - returns a list containing all matches
- search - returns a match object if there is a match anywhere in the string

 - `[abc]` : a,b or c
 - `[^abc]` : any character except a,b,c
 - `[a-z]` : a to z
 - `[a-zA-Z]` : a to z, A to Z (no space inside the brackets — a space there would also match a literal space)
 - `[0-9]` : 0 to 9
 - `[ ]?` : occur 0 or 1 time
 - `[ ]+` : occur 1 or more times
 - `[ ]*` : occur 0 or more times
 - `[ ]{n}` : occur n times
 - `[ ]{n,}` : occur n or more times
 - `[ ]{y,z}` : occur at least y times and at most z times (both bounds inclusive)

#### Regex Metacharacters
- `\d` : [0-9]
- `\D` : [^0-9]
- `\w` : [a-zA-Z0-9_]
- `\W` : [^\w]
- `\s` : whitespace characters (which includes [ \t\n\r\f\v])
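- `\b` : word boundary — a zero-width position between a word character (`\w`) and a non-word character (or the start/end of the string); use it to match entire words, not just parts of them.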

In [3]:
import re

# Exactly two consecutive digits.
text = 'The score of the player was 70 runs from 95 balls'
pattern = r"\d{2}"

# findall -> every match as a list of strings.
print(re.compile(pattern).findall(text))
# search -> Match object for the first match only.
print(re.compile(pattern).search(text))

['70', '95']
<re.Match object; span=(28, 30), match='70'>


In [6]:
# Runs of one to three consecutive digits (greedy, so "1237" splits into "123" + "7").
text = 'The score of the player was 70 runs from 95 balls in 9 overs 1237'
pattern = r"\d{1,3}"
print(re.compile(pattern).findall(text))

['70', '95', '9', '123', '7']


In [7]:
# One to three consecutive digits that END at a word boundary (\b),
# so only the tail "237" of "1237" qualifies.
text = 'The score of the player was 70 runs from 95 balls in 9 overs 1237'
pattern = r"\d{1,3}\b"
print(re.compile(pattern).findall(text))

['70', '95', '9', '237']


In [108]:
# "bat" with a word boundary right after it, i.e. at the end of a word.
text = 'andubat'
pattern = r"bat\b"
print(re.compile(pattern).findall(text))

['bat']


In [109]:
# "bat" with a word boundary right before it; here "bat" sits inside the
# word, so nothing is found.
text = 'andubat'
pattern = r"\bbat"
print(re.compile(pattern).findall(text))

[]


In [110]:
# "bat" with a word boundary right before it; the word starts with "bat".
text = 'batsman'
pattern = r"\bbat"
print(re.compile(pattern).findall(text))

['bat']


In [8]:
# Whole numbers made of one to three digits: \b on both sides rejects "1237".
text = 'The score of the player was 70 runs from 95 balls in 9 overs 1237'
pattern = r"\b\d{1,3}\b"
print(re.compile(pattern).findall(text))

['70', '95', '9']


In [9]:
# Every single whitespace character.
text = 'The score of the player was 70 runs from 95 balls in 9 overs 1237'
pattern = r"\s"
print(re.compile(pattern).findall(text))

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


In [10]:
# Every single non-whitespace character (\S is the complement of \s).
text = 'The score of the player was 70 runs from 95 balls in 9 overs 1237'
pattern = r"\S"
print(re.compile(pattern).findall(text))

['T', 'h', 'e', 's', 'c', 'o', 'r', 'e', 'o', 'f', 't', 'h', 'e', 'p', 'l', 'a', 'y', 'e', 'r', 'w', 'a', 's', '7', '0', 'r', 'u', 'n', 's', 'f', 'r', 'o', 'm', '9', '5', 'b', 'a', 'l', 'l', 's', 'i', 'n', '9', 'o', 'v', 'e', 'r', 's', '1', '2', '3', '7']


In [12]:
# "." matches any single character, spaces included, except a newline
# (unless re.DOTALL is passed).
text = 'The score of the player was 70 runs from 95 balls in 9 overs 1237'
pattern = r"."
print(re.compile(pattern).findall(text))

['T', 'h', 'e', ' ', 's', 'c', 'o', 'r', 'e', ' ', 'o', 'f', ' ', 't', 'h', 'e', ' ', 'p', 'l', 'a', 'y', 'e', 'r', ' ', 'w', 'a', 's', ' ', '7', '0', ' ', 'r', 'u', 'n', 's', ' ', 'f', 'r', 'o', 'm', ' ', '9', '5', ' ', 'b', 'a', 'l', 'l', 's', ' ', 'i', 'n', ' ', '9', ' ', 'o', 'v', 'e', 'r', 's', ' ', '1', '2', '3', '7']


In [13]:
# Every single word character: letters, digits and underscore.
text = 'The score of the player was 70 runs from 95 balls in 9 overs 1237'
pattern = r"\w"
print(re.compile(pattern).findall(text))

['T', 'h', 'e', 's', 'c', 'o', 'r', 'e', 'o', 'f', 't', 'h', 'e', 'p', 'l', 'a', 'y', 'e', 'r', 'w', 'a', 's', '7', '0', 'r', 'u', 'n', 's', 'f', 'r', 'o', 'm', '9', '5', 'b', 'a', 'l', 'l', 's', 'i', 'n', '9', 'o', 'v', 'e', 'r', 's', '1', '2', '3', '7']


In [17]:
# Every single non-word character (\W is the complement of \w):
# here the spaces plus "-" and ":".
text = 'The score of the player was 70 runs - from 95 balls in 9 : overs 1237'
pattern = r"\W"
print(re.compile(pattern).findall(text))

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '-', ' ', ' ', ' ', ' ', ' ', ' ', ':', ' ', ' ']


In [18]:
# Any single lowercase letter in the range a..c.
text = 'The score of the player was 70 runs from 95 balls in 9 overs 1237'
pattern = r"[a-c]"
print(re.compile(pattern).findall(text))

['c', 'a', 'a', 'b', 'a']


In [20]:
# Any single lowercase letter a..z (the capital "T" and the digits are skipped).
text = 'The score of the player was 70 runs from 95 balls in 9 overs 1237'
pattern = r"[a-z]"
print(re.compile(pattern).findall(text))

['h', 'e', 's', 'c', 'o', 'r', 'e', 'o', 'f', 't', 'h', 'e', 'p', 'l', 'a', 'y', 'e', 'r', 'w', 'a', 's', 'r', 'u', 'n', 's', 'f', 'r', 'o', 'm', 'b', 'a', 'l', 'l', 's', 'i', 'n', 'o', 'v', 'e', 'r', 's']


In [22]:
# Any single digit in the range 0..5.
text = 'The score of the player was 70 runs from 95 balls in 9 overs 1237'
pattern = r"[0-5]"
print(re.compile(pattern).findall(text))

['0', '5', '1', '2', '3']


In [28]:
# Alternation: a single letter a..k OR a single digit 0..7.
text = 'The score of the player was 70 runs from 95 balls in 9 overs ac1237'
pattern = r"[a-k]|[0-7]"
print(re.compile(pattern).findall(text))

['h', 'e', 'c', 'e', 'f', 'h', 'e', 'a', 'e', 'a', '7', '0', 'f', '5', 'b', 'a', 'i', 'e', 'a', 'c', '1', '2', '3', '7']


In [34]:
# Sequence: a letter a..k immediately followed by a digit 0..7.
text = 'The score of the player was 70 runs from 95 balls in 9 overs 1cr19 c56'
pattern = r"[a-k][0-7]"
print(re.compile(pattern).findall(text))

['c5']


In [37]:
# Any single uppercase letter in the range A..K.
text = 'The score of the player was ANKIT 70 runs from 95 balls in 9 overs 1cr19 c56'
pattern = r"[A-K]"
print(re.compile(pattern).findall(text))

['A', 'K', 'I']


In [41]:
# Literal text "play" anywhere, even inside a longer word such as "player".
text = "The score of the player is not what we expected. He scored only 156 runs"
pattern = r"play"
re.compile(pattern).findall(text)

['play']

In [43]:
# "play" as a standalone word only; "player" does not qualify.
text = "The score of the player is not what we expected. He scored only 156 runs"
pattern = r"\bplay\b"
re.compile(pattern).findall(text)

[]

In [45]:
# "play" as a standalone word only; the two trailing "play" words match,
# "player" does not.
text = "The score of the player is not what we expected. He scored only 156 runs. Let's play play"
pattern = r"\bplay\b"
re.compile(pattern).findall(text)

['play', 'play']

In [49]:
# "play" at the END of a word: matches in "play" and "apkaplay", not in "player".
text = "The score of the player is not what we expected. He scored only 156 runs. Let's play apkaplay"
pattern = r"play\b"
re.compile(pattern).findall(text)

['play', 'play']

In [52]:
# "play" at the START of a word: matches in "player" and "play", not in "apkaplay".
text = "The score of the player is not what we expected. He scored only 156 runs. Let's play apkaplay"
pattern = r"\bplay"
re.compile(pattern).findall(text)

['play', 'play']

In [60]:
# Anchors: "^" pins the match to the start of the string.
# The text starts with "The", not "score", so nothing is found.
text = "The score of the player is not what we expected. He scored only 156 runs. Let's play apkaplay"
pattern = r"^score"
print(re.compile(pattern).findall(text))

[]


In [64]:
# Anchors: "^" pins the match to the start of the string.
# This text does begin with "Score", so it is found.
text = "Score of the player is not what we expected. He scored only 156 runs. let's play apkaplay"
pattern = r"^Score"
print(re.compile(pattern).findall(text))

['Score']


In [68]:
# Anchors: "$" pins the match to the end of the string.
# The text finishes with "runs", so it is found.
text = "Score of the player is not what we expected. He scored only 156 runs"
pattern = r"runs$"
print(re.compile(pattern).findall(text))

['runs']


In [79]:
# "th" followed by ZERO or more "e" (the * applies to the "e" only).
text = "Score of the player is not what themes we expected. their scored only 156 runs"
pattern = r"the*"
# Hits come from "the", "themes" and "their".
print(re.compile(pattern).findall(text))

['the', 'the', 'the']


#### re.IGNORECASE

In [80]:
# "th" followed by ONE or more "e" (the + applies to the "e" only),
# matched case-insensitively.
text = "Score of the player is not what th we expected. Their scored only 156 runs"
pattern = r"the+"
# Hits come from "the" and "Their"; the bare "th" has no "e" and is skipped.
print(re.compile(pattern, re.IGNORECASE).findall(text))

['the', 'The']


In [88]:
# "th" followed by zero or one "e", ending at a word boundary,
# matched case-insensitively.
text = "Score of the thee player is not what th we expected. Their scored only 156 runs"
pattern = r"the?\b"
print(re.compile(pattern, re.IGNORECASE).findall(text))

['the', 'th']


In [91]:
# "th" followed by exactly two "e", matched case-insensitively.
text = "Score of the thee player is not what th we expected. Their scored only 156 runs"
pattern = r"the{2}"
print(re.compile(pattern, re.IGNORECASE).findall(text))

['thee']


In [94]:
# Alternation: either "the" or "was", matched case-insensitively.
text = "Score of the thee player was not what th we expected. Their scored only 156 runs"
pattern = r"the|was"
print(re.compile(pattern, re.IGNORECASE).findall(text))

['the', 'the', 'was', 'The']


#### Question

In [134]:
# Mobile number -> starts with 8 or 9 and total digits = 10
#   [89]      first digit must be 8 or 9
#   [0-9]{9}  followed by exactly nine more digits
pattern = r"[89][0-9]{9}"
text = ["5807654367",'9034879067','8790675432']
# re.fullmatch requires the WHOLE candidate to fit the pattern, which enforces
# "total digits = 10". An unanchored re.findall would instead pull a
# valid-looking 10-digit fragment out of a longer digit string.
final = [number for number in text if re.fullmatch(pattern, number)]

In [135]:
# Display the list of mobile numbers collected in `final` by the cell above.
final

['9034879067', '8790675432']

In [143]:
# Email Id
# Accept a candidate only when the whole string has the shape
# <local part>@<domain>.<suffix>:
#   [a-zA-Z0-9_.]+  local part: letters, digits, underscore or dot
#                   (no spaces inside the class, so spaces are not accepted)
#   @               exactly one literal "@"; written plainly because "{1}+" is
#                   a possessive quantifier, supported by re only from
#                   Python 3.11
#   [a-z]+          lowercase domain name
#   \.              a single literal dot
#   [a-z]{2,3}      suffix of two or three lowercase letters
pattern = r"[a-zA-Z0-9_.]+@[a-z]+\.[a-z]{2,3}"
find_in = ["boppanaa.99saur@gmail.com","hagui.semis@@gg.in",'kingu_singh@piplu.us']
# re.fullmatch validates each candidate end to end, so a matching fragment
# inside an otherwise invalid string is not reported as an email id.
result = [address for address in find_in if re.fullmatch(pattern, address)]

In [144]:
# Display the list of email ids collected in `result` by the cell above.
result

['boppanaa.99saur@gmail.com', 'kingu_singh@piplu.us']