In [2]:
#examples and workflow is adopted from "Mastering Regular Expressions with Python"

In [3]:
import re

In [4]:
pattern = re.compile(r'\bfoo\b') # "r" is used for raw string notation to make \ non-special character (\n =>\n)

In [5]:
pattern.match("foo bar") # single match at a time starting from the beginning of the line

<_sre.SRE_Match object; span=(0, 3), match='foo'>

In [6]:
# multiline parameter inside compile
# search starts from any point of the string but still is a single
pattern = re.compile(r'^morning', re.MULTILINE) # checking the regex pattern at the beginning of each line
print(pattern.search("\n\nsalam\nmorning"))

<_sre.SRE_Match object; span=(8, 15), match='morning'>


In [7]:
# findall returns a list of all non-verlapping occurances of the pattern
pattern = re.compile(r"\d{3}[\s-]?\d{3}")
pattern.findall("My id is 333-345 and his id is 333999")

['333-345', '333999']

In [8]:
# finditer yeilds an iterator that each elements of it is one match object with groups and other methods.
pattern = re.compile(r"([A-Z]{1}[a-z]{,20})\s([A-Z]{1}[a-z]{,20}i{1})") #  Persian first name and last name
it = pattern.finditer("my name is Jun Sungi and his name is Bernardo Francisi\n she is Angela Berlostoni")
for match in it: # iterators can be iterated over similar to iterables
    print(match.groups()) # this is called capturing via group function
    print(match.span())

('Jun', 'Sungi')
(11, 20)
('Bernardo', 'Francisi')
(37, 54)
('Angela', 'Berlostoni')
(63, 80)


In [9]:
# split is used for tokenizing the sentences (strings), you can set the number of splits you like to see
pattern = re.compile(r"[^a-zA-Z]+")
pattern.split("I love working on - any- =+ @#@#@#@ -nlp related works. It will be the future of the world!!!!! ip=192.168.0.1")

['I',
 'love',
 'working',
 'on',
 'any',
 'nlp',
 'related',
 'works',
 'It',
 'will',
 'be',
 'the',
 'future',
 'of',
 'the',
 'world',
 'ip',
 '']

In [10]:
# you can use sub(repl, string, count) by replacing the matched pattern with a replacement
pattern = re.compile(r"\d+")
string = "Replace any number with -[num]- in this document.\n 123 days remained to end of yeayr 2122.\
by then you will weight close to 190 pounds and 12 oz! "
pattern.sub("[num]", string)

'Replace any number with -[num]- in this document.\n [num] days remained to end of yeayr [num].by then you will weight close to [num] pounds and [num] oz! '

In [11]:
# you can also substitute with a function instead of a string if you are trying to modify a specific group of the text
#let's assume you wan to replace A with B and - with A in a specific position:

def susbstituter(matchobject):
    try:
        if matchobject.group(1) == 'A': return 'B'
        elif matchobject.group(1) == '-': return 'A'
    except:
        pass

pattern =re.compile(r'([AB-])')
pattern.sub(susbstituter, "A4561-512")

'B4561A512'

In [12]:
# now let's try to substitude italic operation in markdown with bold operation in html
# we will use back reference 
text = "where **tetha** is the phase angle and **c** is the speed of light." # italic in markdown format
pattern = re.compile(r"\*\*(.*?)\*\*") # we made it none greedy using ?
pattern.sub(r"<it>\g<1><\\it>", text) #where \g<num> is the groups number of matched

'where <it>tetha<\\it> is the phase angle and <it>c<\\it> is the speed of light.'

In [13]:
# if you would like to know how many substitution has been made, use subn , it will return a tuple
# of the number of iterations and the new text
text = "where **tetha** is the phase angle and **c** is the speed of light." # italic in markdown format
pattern = re.compile(r"\*\*(.*?)\*\*") # we made it none greedy using ?
pattern.subn(r"<it>\g<1><\\it>", text) #where \g<num> is the groups number of matched


('where <it>tetha<\\it> is the phase angle and <it>c<\\it> is the speed of light.',
 2)

In [15]:
# you can name groups in your regex using ?P<name> for example
pattern = re.compile(r"(?P<id>\w+)-(?P<digits>\d+)")
it = pattern.finditer("AB-3438734 is the daughter of AJ-983834 and the mother of JJ-6483")
print([iter.groups() for iter in it]) # returns all the matches in a tuple
print([iter.group(1) for iter in it]) # return group one results
print([iter.group('digits') for iter in it]) # return group 2 results which is named 'digits'
print([iter.group("id", 2) for iter in it]) # print 

[('AB', '3438734'), ('AJ', '983834'), ('JJ', '6483')]
[]
[]
[]


In [16]:
# you can access the group match as a hash table 
pattern.search("AB-3438734 is the daughter of AJ-983834 and the mother of JJ-6483").groupdict() # only works when group's named

{'id': 'AB', 'digits': '3438734'}

In [17]:
## there are some compilation flags that define the behaviour of your pattern search
# some examples are:
# re.I the pattern will match both lowercase and uppercase
# re.M modifies ^$ behavior it does the search for each line
# re.S "." will match any character even newline
# rel.L flags become dependant on current location
# re.X makes writing the regex easier
# re.DEBUG gives information about compilation patter
# let's diveinto them

In [18]:
pattern = re.compile("[A-Z]+", re.I)
pattern.match("abcdef")

<_sre.SRE_Match object; span=(0, 6), match='abcdef'>

In [19]:
pattern = re.compile("([0-9]+)", re.M) # if you have something in groups format it will return group
pattern.findall("1234 78578 \n 98 343 454")

['1234', '78578', '98', '343', '454']

In [25]:
patterm = re.compile(r"\w+", re.U) # gets access to all unicode alphanumeric
pattern.findall(r"یار مرا غار مرا عشق جگر خار مرا یار تویی غار تویی خاجه نگه دار مرا  غزل 123")

[]

In [26]:
# Grouping
# we use paranthesis to change alteration
pattern = re.compile("Espana(a|ol)") # match espana or espanaol

In [27]:
# backreferenceing groups you can use \num to back reference to a previous group number
pattern = re.compile(r"(\w+)-\1") # whati it says is to find a word that is followed by an space by itself, recall group (0) is all
# you can use this pattern to remove duplicates
match = pattern.search(r"bye-bye birdy")
march.group() # this will return the first group that has matched "bye"

NameError: name 'march' is not defined

In [None]:
# you can also use backreferencing for swapping different groups assume you want to swap city and zipcode
pattern = re.compile(r"([A-Z]{1}[a-z]++), (\d+)")
text = "Hamedan, 126456 \n Tehran, 89374\n 34334 3434Mashad, 3437\n$ $$ $ 4 error Yazd, 232323"
pattern.sub("\2-\1")

In [None]:
#It is, however, a better practice to name the groups. this makes the code more scalable, here how it goes
# (?P<name>pattern) you access this in other regex using \g<name> for submission and (?P=name) for repition

pattern = re.compile(r"(?P<first_name>[A-Z]{1}[a-z]+) (?P<last_name>[A-Z]{1}[a-z])")
match = pattern.search("Jassem Karrar")
print(match.group("first_name"))
print(match.group("last_name"))
print("reverse is :" + pattern.sub(r"\g<last_name>-\g<first_name>", "Juliet Binoche 12135433"))

In [None]:
# if we want to check the duplicates names
pattern = re.compile(r"(?P<name>[A-Z]{1}[a-z]+) (?P=name)+")
pattern.search(r"Tom Tom Tom Cruise").groups()

In [28]:
## non-capturing grouping
# if we want to use grouping for alternation but we dont want to save the groups (saving some memory)
pattern = re.compile("Analy(?:s|z)e") # ?: makes the group not saving it
print(pattern.search(pattern, "Analyze is the same as analyse?").groups())

TypeError: 'str' object cannot be interpreted as an integer

In [None]:
## you caan use attomic matching to just check once and if it fails doesnt start with the next char pattern = "(?>\w+)"

In [None]:
## how about if statement? we would like to check if some group has happened before?
# Here is the syntanx (?(group1/group2)yes_pattern|no_pattern)
# for example you want to match if user input a username or an email
pattern = re.compile(r"(?P<name>[A-Za-z]{1}\w{,30}){1}(?P<domain>@[a-z]{,10})?(?(domain)(\.[a-z]{3}))")
print(pattern.findall("zibaei@yahoo.com shirini1365 54564646 mashang@ ghorban.com"))

In [None]:
# A character that is matched is discarded
# In order to be able to look ahead or look back (look around), we use metacharacters called zero-width methacharacters

# Look ahead uses (?=expression). at each location, it looks to see if it finds any match, however it does not consume
# (discard) what it has matched
# lets find all the objects of a sentence (anything comes before . or ,)

pattern = re.compile("\w+(?=,|\.)")
pattern.findall("He walked the dog, then ate his salad, and finally went to bed.")

In [None]:
# if you want to negate look ahaed (negative look ahead), you can use (?!expression). It matches expressions that are 
# not followed by "expression"
pattern = re.compile(r"کتاب(?!\sمثنوی)") # assume you want to find any کتاب that is not مثنوی

In [None]:
pattern = re.compile(r"کتاب(?!\sمثنوی) \w+") # assume you want to find any کتاب that is not مثنوی
it = pattern.findall("کتاب مثنوی و کتاب گلستان و کتاب حافظ بسیار مفید هستند.")

In [None]:
# now we are using sub function with the lookahead
# assume we want to replace numbers to seperate numbers to dash seperated pack of 4 numbers from right to left
pattern = re.compile(r"\d{1,4}(?=(\d{4})+(?!\d))")
print(pattern.sub("\g<0>-", '12334566764345345234234')) # since the look ahead is zero width, we can not use it for 
# findall and finditer, it would not be shown

In [29]:
## look behind uses this logic ?<=expression. the expression used should be a fixed-width,
## look behind logic for reverse is ?<!expression
## for example, we are looking inside a dated log and want to check all the failures that are not auth. failure
## here is an example 12:43:02 - errror number 64 auth. failure
text = "12:43:02 - errror number 64 auth. failure\n12:45:04 - errror number 65 syntax failure"
pat = r"\d{2}:\d{2}:\d{2}\s-\s[\w\.\s]*(?<!auth.\s)failure"
pattern = re.compile(pat, re.M) # make the compilation multiline
print(pattern.findall(text))

['12:45:04 - errror number 65 syntax failure']


In [30]:
## if we want to know how our program is doing, we can use cprofiler which is python built-in profiler
import cProfile
def phone_extractor(text):
    pattern = re.compile("\d{3}\d{3}\d{4}")
    return pattern.findall(text)


print(phone_extractor("1233453456 3453453456"))

['1233453456', '3453453456']


In [31]:
# some practices from regex practices on the hackerrank
pattern = re.compile(r"^[+-]?\d*\.\d+$") # detecting floating point number
pattern = re.compile(r"(?P<rep>[a-zA-Z0-9]{1})(?P=rep)") # detecting the reoccurance of alphanumerical values
pattern = re.compile(r"(?<=[^aeiou]{1})[aeiou]{2,}(?=[^aeiou]{1})") # detecting multiple vowels inside 2 consonants
# replacing "&&" and "||" with "and" and "or"
pattern = re.compile(r"(?<= )(?P<oper>&{2}|\|{2})(?= )")
pattern.sub(lambda x: "and" if x.group('oper')=='&&' else "or", text)
# matching a ten digit phone number starting by 7,8,9
pattern = re.compile(r"^[789][0-9]{9}$")
# matching email address with format <username@domain.extension>
pattern = re.compile(r"<(?P<user>[a-zA-Z][\w\-\._]+)@(?P<dom>[a-zA-Z]+)\.(?P<ext>[a-zA-Z]{1,3})>")
pattern = re.compile(r"(?<=\W)(#[a-fA-F0-9]{3}(?!\w)|#[a-fA-F0-9]{6})") # taking out HEX color codes 

In [28]:
# Nice problems: https://www.hackerrank.com/challenges/validating-uid/problem
#A valid UID must follow the rules below:

#It must contain at least  uppercase English alphabet characters.
#It must contain at least  digits ( - ).
#It should only contain alphanumeric characters ( - ,  -  &  - ).
#No character should repeat.
#There must be exactly  characters in a valid UID.


In [23]:
# in general I found this way of writing regex match expressions much more intuitive
def boolanizer(pattern, string):
    pattern = re.compile(pattern)
    return bool(pattern.search(string))

def UID_checker(string):
    length_10 = r"[a-zA-Z0-9]{10}"
    alpha_2 = r"[A-Z].*[A-Z]"
    numer_3 = r"\d.*\d.*\d"
    repeater = r"(.).*\1"

    if (
        boolanizer(length_10, string) and 
        boolanizer(alpha_2, string) and
        boolanizer(numer_3, string) and 
        not boolanizer(repeater, string)
    ):
        return "Valid"
    else:
        return "Invalid"

for _ in range(int(input())):
    print(UID_checker(input()))

3
5AW4F5SKRK
Invalid
W45A55K455
Invalid
BQWAILU471
Valid


In [30]:
# Another interesting example
# Match digits between 100,000 to 999,999
# There is not more than 1 alternating numbers e.g. 121 or 545
# https://www.hackerrank.com/challenges/validating-postalcode/problem
 
def postal_code_validator(s):
    egex_integer_in_range = r"^[1-9]\d{5}$" 
    regex_alternating_repetitive_digit_pair = r"(.)(?=.\1)"
    print (bool(re.match(egex_integer_in_range , s) and len(re.findall(regex_alternating_repetitive_digit_pair,s))<2 ))

postal_code_validator(input())

542361
True
