# RegEx

A regular expression is a special sequence of characters that helps you match or find other strings or sets of strings, using a specialized syntax held in a pattern.

In [1]:
import re    # re stands for regular expression

In [2]:
print("Hello\nWorld")        # \n Escape Character

Hello
World


In [4]:
print(r'Hello\nWorld')       #Raw String Literal

Hello\nWorld


In [8]:
print(r'Hello\bWorld')         # raw string: \b stays a literal backslash + b instead of becoming a backspace

Hello\bWorld


In [5]:
# Sample text for the pattern examples that follow: alphabets, e-mail
# addresses (including a bare "@gmail.com"), phone numbers and salutations.
x = """abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
some.one@gmail.com
anything123@gmail.com
@gmail.com
9354-328-885
9354.328.885
Mr. Someone
Ms Anything"""
print(x)

abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
some.one@gmail.com
anything123@gmail.com
@gmail.com
9354-328-885
9354.328.885
Mr. Someone
Ms Anything


In [74]:
# Phone numbers shaped like 9354-328-885 / 9354.328.885:
# 4 digits, a '-' or '.', 3 digits, a '-' or '.', 3 digits.
# The pattern is a raw string so that \d reaches the regex engine unchanged;
# in a normal string literal "\d" is an invalid escape sequence and recent
# Python versions emit a warning for it.
re.findall(r'\d{4}[-.]\d{3}[-.]\d{3}', x)

['9354-328-885', '9354.328.885']

In [11]:
# [] matches exactly one character out of the listed characters/ranges.
# The dot of ".com" is escaped (\.) because a bare . matches ANY character,
# so '@gmail.com' would also accept e.g. "@gmailXcom".
text = "Hi, my name is Something Anything and my email address is somE@gmail.com harshil.one@gmail.com @gmail.com"
xr = re.findall(r'[a-zA-Z]@gmail\.com', text)
print(xr)

['E@gmail.com', 'e@gmail.com']


In [12]:
# Two character classes in a row -> exactly two letters before the @.
# \. keeps the dot of ".com" literal (a bare . would match any character).
text = "Hi, my name is Something Anything and my email address is somE@gmail.com harshil.one@gmail.com @gmail.com"
xr = re.findall(r'[a-zA-Z][a-zA-Z]@gmail\.com', text)
print(xr)

['mE@gmail.com', 'ne@gmail.com']


In [13]:
# Quantifiers
#   +      1 or more
#   *      0 or more
#   ?      0 or 1
#   {n}    exactly n repetitions; {m,n} between m and n repetitions
#
# Here: one or more letters before the @. The dot of ".com" is escaped (\.)
# so it only matches a literal dot.
text = "Hi, my name is Something Anything and my email address is somE@gmail.com harshil.one@gmail.com @gmail.com"
xr = re.findall(r'[a-zA-Z]+@gmail\.com', text)
print(xr)

['somE@gmail.com', 'one@gmail.com']


In [15]:
# * allows zero characters before the @, so the bare "@gmail.com" matches too.
# Inside [] the dot is already literal; outside it must be escaped (\.).
text = "Hi, my name is Something Anything and my email address is somE@gmail.com harshil.one@gmail.com @gmail.com"
xr = re.findall(r'[a-zA-Z.]*@gmail\.com', text)
print(xr)

['somE@gmail.com', 'harshil.one@gmail.com', '@gmail.com']


In [44]:
# {3} demands exactly three letters/dots immediately before the @.
# \. keeps the dot of ".com" literal (a bare . would match any character).
text = "Hi, my name is Something Anything and my email address is somE@gmail.com harshil.one@gmail.com @gmail.com"
xr = re.findall(r'[a-zA-Z.]{3}@gmail\.com', text)
print(xr)

['omE@gmail.com', 'one@gmail.com']


In [77]:
# \w matches a word character: A-Z, a-z, 0-9 and _ (for str patterns it also
# matches other Unicode letters/digits unless the re.ASCII flag is given).
# {3,} means three or more. The pattern is a raw string so that \w and \. are
# passed to the regex engine instead of being treated as (invalid) string escapes.
text = "Hi, my name is Something Anything and my email address is somE@gmail.com har27_shil.one@gmail.com @gmail.com"
xr = re.findall(r'[\w.]{3,}@gmail\.com', text)
print(xr)

['somE@gmail.com', 'har27_shil.one@gmail.com']


In [31]:
x1 = '''9354-328-885
9354.328.885
Mr. Someone
Mr Anything'''
print(x1)

9354-328-885
9354.328.885
Mr. Someone
Mr Anything


In [33]:
re.findall('Mr[.]? [A-Z][a-z]+', x1)

['Mr. Someone', 'Mr Anything']

In [3]:
x2 = '''9354-328-885
9354.328.885
Mr. Someone
Ms Anything'''
# ( ) groups alternatives such as r|s. The (?: ) form groups WITHOUT capturing:
# with a capturing group, findall would return only the group's text
# ('r' / 's') instead of the whole match.
re.findall(r'M(?:r|s)[.]? [A-Z][a-z]+', x2)

['Mr. Someone', 'Ms Anything']

In [49]:
# Letters, digits and dots before the @; [.] pins the dot of ".com" to a literal '.'.
text = "Hi, my name is Something Anything and my email address is somE@gmail.com har27shil.one@gmail.com @gmail.com"
xr = [hit.group() for hit in re.finditer('[a-zA-Z0-9.]+@gmail[.]com', text)]
print(xr)

['somE@gmail.com', 'har27shil.one@gmail.com']


In [73]:
x3 = "ab..e ab7.e hello"
# \. matches a literal dot, so only "ab..e" qualifies ("ab7.e" does not).
# Raw string: "\." is not a valid escape sequence in a normal string literal.
re.findall(r'ab\.\.e', x3)

['ab..e']

In [56]:
print('HelloWorld')

HelloWorld


In [57]:
print('Hello\nWorld')

Hello
World


In [58]:
print('Hello\\nWorld')

Hello\nWorld


In [70]:
# ^ anchors the pattern at the start of the string (no re.MULTILINE flag is
# passed), so only the leading run of letters is returned.
x4 = """Hi Someone,
How are you?

Regards
Harshil"""
re.findall('^[A-Za-z]+', x4)

['Hi']

In [69]:
# $ anchors the pattern at the end of the string (no re.MULTILINE flag is
# passed), so only the trailing run of letters is returned.
# Note this sample starts with a digit ("3Hi").
x4 = """3Hi Someone,
How are you?

Regards
Harshil"""
re.findall('[A-Za-z]+$', x4)

['Harshil']

### Metacharacters
Metacharacters are characters with a special meaning:

In [None]:
# []	A single character which is a subset of characters	"[a-m]"	
# \	Signals a special sequence (can also be used to escape special characters)	"\d"	
# .	Any character (except newline character)	"he..o"	
# ^	Starts with	"^hello"	   caret symbol
# $	Ends with	"planet$"	
# |	Either or	"falls|stays"	
# ()	Capture and group

# Quantifiers
# *	Zero or more occurrences	"he.*o"	
# +	One or more occurrences	"he.+o"	
# ?	Zero or one occurrences	"he.?o"	
# {2}	Exactly the specified number of occurrences	"hel{2}o"	"hello"
# {1,34}	Specified Range of occurrences
#     {3,}	 3 or more occurrences (no upper limit)
#     {,10}	 Minimum 0 to max of 10 occurrences (no space inside the braces, otherwise it is matched literally)

### Sets
A set is a set of characters inside a pair of square brackets [] with a special meaning:

In [None]:
# [arn]	Returns a match where one of the specified characters ('a' or 'r' or 'n') are present	
# [a-n]	Returns a match for any lower case character, alphabetically between a and n	
# [^arn]	Returns a match for any character EXCEPT a, r, and n	
# [0123]	Returns a match where any of the specified digits (0, 1, 2, or 3) are present	
# [0-9]	Returns a match for any digit between 0 and 9	
# [0-5][0-9]	Returns a match for any two-digit number from 00 to 59	
# [a-zA-Z]	Returns a match for any character alphabetically between a and z, 
#           lower case OR upper case	
# [+]	In sets, +, *, ., |, (), $, {} have no special meaning,
#         so [+] means: return a match for any + character in the string

### Special Sequences
A special sequence is a \ followed by one of the characters in the list below, and has a special meaning:

In [None]:
# \A	Returns a match if the specified characters are at the beginning of the string	"\AThe"	
# \b	Returns a match where the specified characters are at the beginning or at the end of a word 	r"\bain" r"ain\b"	
# \B	Returns a match where the specified characters are present, but NOT at the beginning 
#      (or at the end) of a word (the "r" in the beginning is making sure that the string is being 
#      treated as a "raw string")	r"\Bain" r"ain\B"	
# \d	Returns a match where the string contains digits (numbers from 0-9)	"\d"	
# \D	Returns a match where the string DOES NOT contain digits	"\D"	
# \s	Returns a match where the string contains a white space character	"\s"	
# \S	Returns a match where the string DOES NOT contain a white space character	"\S"	
# \w	Returns a match where the string contains any word characters 
#        (characters from a to Z, digits from 0-9, and the underscore _ character)	"\w"	
# \W	Returns a match where the string DOES NOT contain any word characters	"\W"	
# \Z	Returns a match if the specified characters are at the end of the string	"Spain\Z"

### Functions/Methods in re
- re.match - returns a match object if the pattern is found at the beginning of the string, else returns None
- re.search - returns a match object for the first occurrence of the pattern anywhere in the string, else returns None
- re.findall - returns a list of all the matches of the given pattern, or an empty list if there are none
- re.split - returns a list of strings obtained by splitting the string at each match of the given pattern/separator
- re.sub - substitutes (replaces) the matches of the pattern in the string
- re.finditer - returns an iterator over the match objects

In [6]:
print(x)

abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
some.one@gmail.com
anything123@gmail.com
@gmail.com
9354-328-885
9354.328.885
Mr. Someone
Ms Anything


In [19]:
# re.match only tries the pattern at the very start of the string.
res = re.match('[a-z]+', x)
print(res)

# match() returns None when nothing matches; None is a singleton, so test it
# with `is not None` (identity) rather than `!=`.
if res is not None:
    print(res.span())      # (start, end) offsets of the match
    print(res.group(0))    # the matched text

<re.Match object; span=(0, 26), match='abcdefghijklmnopqrstuvwxyz'>
(0, 26)
abcdefghijklmnopqrstuvwxyz


In [125]:
# Succeeds only if the first character of x is a lowercase vowel.
res = re.match('[aeiou]', x)
print(res)
# None is a singleton: compare by identity, not with `!=`.
if res is not None:
    print(res.span())
    print(res.group(0))

<re.Match object; span=(0, 1), match='a'>
(0, 1)
a


In [60]:
# Rebinds x to a slightly different sample: compared with the first sample the
# bare "@gmail.com" line is replaced by "12345-67890-123" and the last line
# reads "Mr. Anything". Cells run after this one see this text, so match
# offsets differ from those obtained with the earlier sample.
x = """abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
some.one@gmail.com
anything123@gmail.com
12345-67890-123
9354-328-885
9354.328.885
Mr. Someone
Mr. Anything"""

In [28]:
# Raw string so that \d is handed to the regex engine as-is ("\d" in a normal
# string literal is an invalid escape sequence).
regex = r"\d{4}[.-]\d{3}[-.]\d{3}"
# finditer returns an iterator of Match objects, one per non-overlapping hit.
s = re.finditer(regex, x)
print(s)
for i in s:
    print(i.span())     # (start, end) offsets inside x
    print(i.group())    # the matched phone number
    print()

<callable_iterator object at 0x0000018A2CAC5AF0>
(106, 118)
9354-328-885

(119, 131)
9354.328.885



In [134]:
regex = r"[\d]+-[\d]+-[\d]+"
s = re.search(regex, x)
print(s)

<re.Match object; span=(95, 110), match='12345-67890-123'>


In [128]:
print('HelloWolrd')

HelloWolrd


In [129]:
print('Hello\bWolrd')               # \b means backspace

HelloWolrd


In [130]:
print(r'Hello\bWolrd')               # raw string: \b is printed literally, not interpreted as a backspace

Hello\bWolrd


In [None]:
# Exercise: pull the name, contact number, date of birth and city out of `strin`.
strin = "My name is Harshil. My contact number is 9354328885 and I was born on 12th November 2020 in Pune."

# One possible solution: a single pattern with one capturing group per field;
# the lazy .*? skips the filler text between two fields.
regex = (
    r"name is ([A-Z][a-z]+)"                            # name
    r".*?(\d{10})"                                      # 10-digit contact number
    r".*?(\d{1,2}(?:st|nd|rd|th) [A-Z][a-z]+ \d{4})"    # date such as "12th November 2020"
    r" in ([A-Z][a-z]+)"                                # city
)
found = re.search(regex, strin)
# groups() holds the four captured fields in pattern order; fall back to an
# empty list when the sentence does not have the expected shape.
Output = list(found.groups()) if found is not None else []
print(Output)    # ['Harshil', '9354328885', '12th November 2020', 'Pune']

In [None]:
print(x)

In [None]:
# A pattern without metacharacters matches itself; match() requires it at offset 0.
res = re.match('abcdefghi', x)
print(res)
# None is a singleton: compare by identity, not with `!=`.
if res is not None:
    print(res.span())
    print(res.group())

In [None]:
# search() scans the whole string and returns the first run of four digits.
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
res = re.search(r'\d{4}', x)
print(res)

In [None]:
print(x)

In [None]:
# match() needs the pattern at offset 0; when x does not start with "abce"
# this prints None and the span is skipped.
res = re.match(r'abce', x)
print(res)
# None is a singleton: compare by identity, not with `!=`.
if res is not None:
    print(res.span())

In [None]:
# First run of one or more characters taken from a, b, c, d anywhere in x.
res = re.search(r'[abcd]+', x)
print(res)
# None is a singleton: compare by identity, not with `!=`.
if res is not None:
    print(res.span())
    print(res.group())

In [None]:
s = "Hello"    # str object
s.find('l')    # str.find: index of the first 'l', or -1 when it is absent
r = list(s)    # list object: ['H', 'e', 'l', 'l', 'o']
r.index('l')   # list.index needs the value to look up; it raises ValueError when the value is absent

In [None]:
print(x)

In [None]:
res = re.findall(r'[A-Z][a-z]', x)
print(res)    # List Object

We use group() method to get all the matches and captured groups. 
The groups contain the matched values.
In particular:
- match.group(0) always returns the fully matched string
- match.group(1), match.group(2), ... return the captured groups, numbered from left to right by the position of their opening parenthesis in the pattern
- match.group() is equivalent to match.group(0)

In [None]:
# Three capturing groups: month name, 2-digit day, 4-digit year.
# (re is already imported in the first code cell of the notebook.)
regex = r"([a-zA-Z]+) (\d{2}) (\d{4})"

string = "My name is Harshil. I was born on January 24 2012"

s = re.search(regex, string)
print(s)

# group() / group(0) is the whole match: "January 24 2012"
print("Full match: %s" % (s.group()))

# group(1) is the first capturing group: "January"
print("Month: %s" % (s.group(1)))

# group(2) is the second capturing group: "24"
print("Day: %s" % (s.group(2)))

# group(3) is the third capturing group: "2012"
print("Year: %s" % (s.group(3)))

In [None]:
print(x)

In [None]:
res1 = re.search(r'Mr\.', x)
print(res1)

In [None]:
res1 = re.search(r'Mrs?\. [a-zA-Z]+', x)
print(res1)
res1.span()

In [None]:
res = re.findall(r'Mrs?\. [a-zA-Z]+', x)
print(res)

In [None]:
print(x)

In [None]:
t = range(2, 7)       # range: a lazy sequence that can be looped over repeatedly (an iterable, not an iterator)
print(type(t))
l = [2, 3, 4, 5, 6]   # list
d = {1: 2, 3: 4}      # dict
d.keys()              # dict_keys view: iterable and kept in sync with d, but not an iterator either


In [141]:
print(x)

abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
some.one@gmail.com
anything123@gmail.com
12345-67890-123
9354-328-885
9354.328.885
Mr. Someone
Mr. Anything


In [143]:
res = re.findall(r'Mrs?[.] [a-zA-Z]+', x)
print(res)

['Mr. Someone', 'Mr. Anything']


In [64]:
# Walk every "Mr."/"Mrs." salutation in x, showing where each one sits and
# what text it covers, followed by a blank separator line.
res = re.finditer(r'Mrs?[.] [a-zA-Z]+', x)
print(res)
for i in res:
    print(i.span(), i.group(), "", sep="\n")

<callable_iterator object at 0x00000141FA9F0760>
(137, 148)
Mr. Someone

(149, 161)
Mr. Anything



In [145]:
res = re.finditer(r'Mrs?\. [a-zA-Z]+', x)
print(res)
for i in res:
    print(i.span())
    print(i.group())
    print()

<callable_iterator object at 0x0000015623609CA0>
(137, 148)
Mr. Someone

(149, 161)
Mr. Anything



In [29]:
dob = input('Enter Date of Birth: ')
dl = dob.split("/")
print(dl)

Enter Date of Birth: 27/12/1990
['27', '12', '1990']


In [30]:
dob = input('Enter Date of Birth: ')
dl = dob.split("/")
print(dl)

Enter Date of Birth: 27/12:1990
['27', '12:1990']


In [38]:
dob = input('Enter Date of Birth: ')
dl = re.split("[/:]+", dob)
print(dl)

Enter Date of Birth: 27/12:::::::::::::::1990
['27', '12', '1990']


In [32]:
name = input('Enter name: ')
nl = name.split(" ")
print(nl)

Enter name: Harshil Bansal
['Harshil', 'Bansal']


In [36]:
name = input('Enter name: ')
nl = name.split(" ")
print(nl)

Enter name: Harshil              Bansal
['Harshil', '', '', '', '', '', '', '', '', '', '', '', '', '', 'Bansal']


In [34]:
name = input('Enter name: ')
nl = re.split("[ ]+", name)
print(nl)

Enter name: Harshil Bansal
['Harshil', 'Bansal']


In [44]:
# Show the documentation of re.split. help() is plain Python, so the cell also
# works outside IPython (the trailing-? shortcut is IPython-only syntax).
help(re.split)

In [35]:
name = input('Enter name: ')
nl = re.split("[ ]+", name)
print(nl)

Enter name: Harshil        Bansal
['Harshil', 'Bansal']


In [41]:
dob = input('Enter Date of Birth: ')
do = dob.replace("/", ':')
print(do)

Enter Date of Birth: 27/12-1990
27:12-1990


In [43]:
dob = input('Enter Date of Birth: ')
do = re.sub("[/-]", ':', dob)
print(do)

Enter Date of Birth: 27/12-1990
27:12:1990
