## Text processing with regular expressions
    - information extraction
    - PII masking
    - text cleaning

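    - [0-9] = any single digit from 0 to 9; \d is the shorthand for a digit
    - [a-z] = any single lowercase letter from a to z
    - [A-Z] = any single uppercase letter from A to Z
    - \w = any "word" character, i.e. [a-zA-Z0-9_]
    - ^ = negation when it is the first character inside [], e.g. [^0-9] matches any non-digit (outside brackets, ^ anchors the match to the start of the string)
    - \s = any whitespace character (space, tab, newline)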
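    - {} = repetition count for the preceding element, e.g. \d{10} is exactly ten digits and \d{1,2} is one or two
    - () = grouping; each group's text can be read back with match.group(n)
    - (?P<name>...) = named group; read it back with match.group("name")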

In [1]:
import re

In [2]:
# Sample sentence containing two 10-digit mobile numbers.
data = (
    "My mobile number is 9898787845 "
    "and your number is 7889788945 thank you"
)

In [4]:
# Exactly ten digits that are not part of a longer digit run.
# (?<![0-9]) and (?![0-9]) assert "no digit immediately before / after" without
# consuming any characters. A bare [0-9]{10} would also match a 10-digit window
# inside a longer number (e.g. a 12-digit ID), so only part of it would be
# extracted or masked.
pattern = r"(?<![0-9])[0-9]{10}(?![0-9])"
re.findall(pattern, data)

['9898787845', '7889788945']

In [5]:
# Mask every substring matched by `pattern` with a fixed run of asterisks.
re.sub(pattern, repl="*********", string=data)

'My mobile number is ********* and your number is ********* thank you'

In [6]:
# An empty replacement deletes each match; the spaces on either side are kept.
re.compile(pattern).sub("", data)

'My mobile number is  and your number is  thank you'

In [7]:
# Multi-line sample text: dates written with '-' and '/' separators (one of them
# with a single-digit month) followed by three e-mail addresses.
data = (
    " my birthday is 30-02-2000 and your birthday is 31-04-2002 and his birthday is 12-5-1990 and\n"
    "his friend's birthday is 15/05/1999 thank you for your email, please resply me back on \n"
    "the id anshu_pandey@abccompany.com and also keep john.weka@yourcompany.com in cc and you may wanna \n"
    "include cera@gmail.com as well."
)
print(data)

 my birthday is 30-02-2000 and your birthday is 31-04-2002 and his birthday is 12-5-1990 and
his friend's birthday is 15/05/1999 thank you for your email, please resply me back on 
the id anshu_pandey@abccompany.com and also keep john.weka@yourcompany.com in cc and you may wanna 
include cera@gmail.com as well.


In [10]:
# dd-mm-yyyy shape: two digits, two digits, four digits, joined by hyphens.
# Only the shape is checked, so calendar-impossible dates such as 30-02-2000
# still match.
pattern = "[0-9]{2}-[0-9]{2}-[0-9]{4}"
re.compile(pattern).findall(data)

['30-02-2000', '31-04-2002']

In [11]:
# dd-mm-yyyy shape written with the \d shorthand.
# The r"" prefix keeps each backslash literal: in a plain string "\d" is an
# invalid escape sequence, which recent Python versions warn about.
pattern = r"\d{2}-\d{2}-\d{4}"
re.findall(pattern, data)

['30-02-2000', '31-04-2002']

In [13]:
# {1,2} lets day and month be one or two digits, so 12-5-1990 is matched too.
# Raw string: "\d" in a plain string is an invalid escape sequence.
pattern = r"\d{1,2}-\d{1,2}-\d{4}"
re.findall(pattern, data)

['30-02-2000', '31-04-2002', '12-5-1990']

In [17]:
# Two alternatives joined by '|': the date with hyphens, or the same shape with
# slashes. Each alternative uses one separator throughout, so a mixed form such
# as 12-5/1990 does not match.
# Raw string: "\d" in a plain string is an invalid escape sequence.
pattern = r"\d{1,2}-\d{1,2}-\d{4}|\d{1,2}/\d{1,2}/\d{4}"
re.findall(pattern, data)

['30-02-2000', '31-04-2002', '12-5-1990', '15/05/1999']

In [15]:
# E-mail-like tokens: a local part of letters, digits, '.' or '_', then '@',
# then a domain made of dot-separated labels.
# Every '.' in the domain must be followed by another label, so a
# sentence-ending period right after an address ("... write to bob@x.com.") is
# left out of the match; a single class [0-9a-zA-Z._]+ would swallow it.
# (?:...) is a non-capturing group, so findall still returns whole matches.
pattern = r"[a-zA-Z0-9._]+@[0-9a-zA-Z_]+(?:\.[0-9a-zA-Z_]+)*"
re.findall(pattern, data)

['anshu_pandey@abccompany.com', 'john.weka@yourcompany.com', 'cera@gmail.com']

### grouping with regular expressions

In [18]:
# Short sample with a single e-mail address, used to demonstrate groups.
data = (
    "my email id is anshu_pandey@abccompany.com "
    "what is yours?"
)

In [19]:
# re.search returns a Match object for the first occurrence (or None).
# The domain is written as dot-separated labels so that a period directly after
# the address is not captured as part of it.
pattern = r"[a-zA-Z0-9._]+@[0-9a-zA-Z_]+(?:\.[0-9a-zA-Z_]+)*"
re.search(pattern, data)

<re.Match object; span=(15, 42), match='anshu_pandey@abccompany.com'>

In [20]:
# Parentheses create numbered groups: group 1 is the part before '@', group 2
# the part after it. The inner (?:...) is non-capturing, so it does not shift
# the numbering.
# The domain is written as dot-separated labels so that a period directly after
# the address is not captured as part of group 2.
pattern = r"([a-zA-Z0-9._]+)@([0-9a-zA-Z_]+(?:\.[0-9a-zA-Z_]+)*)"
match = re.search(pattern, data)

In [21]:
# Index 0 is the entire matched text.
print(match[0])

anshu_pandey@abccompany.com


In [22]:
# Index 1 is the first parenthesised group.
print(match[1])

anshu_pandey


In [24]:
# Index 2 is the second parenthesised group.
print(match[2])

abccompany.com


In [25]:
# (?P<name>...) gives a group a name, so it can be read back with
# match.group("name") instead of by position.
# The hostname is written as dot-separated labels so that a period directly
# after the address is not captured as part of it.
pattern = r"(?P<username>[a-zA-Z0-9._]+)@(?P<hostname>[0-9a-zA-Z_]+(?:\.[0-9a-zA-Z_]+)*)"
match = re.search(pattern, data)

In [26]:
# Whole match first, then the two named groups, one per line.
print(
    match.group(0),
    match.group("username"),
    match.group("hostname"),
    sep="\n",
)

anshu_pandey@abccompany.com
anshu_pandey
abccompany.com
