In [4]:
import re

In [82]:
# re.match() only looks for a match at the beginning of the string.

# \w (lowercase w) matches one "word" character: a letter, digit or underscore [a-zA-Z0-9_].
# Although "word" is the mnemonic, it matches a single word character, not a whole word.
# \W (uppercase W) matches any single non-word character.

# Named `text` rather than `str` so the builtin str() is not shadowed in later cells.
text = 'word:cat and example!!'
match = re.match(r'word:\w\w\w', text)

# Single-argument print(...) produces the same output on Python 2 and Python 3.
if match:
    print('found ' + match.group())
else:
    print('did not find')


found word:cat


In [6]:
# re.search() looks for a match anywhere in the string, not just at the start.
# When it succeeds, match.group() returns the text that matched.

match = re.search(pattern=r'iii', string='piiig')
match.group()

'iii'

In [7]:
# . (period) matches any single character except the newline \n.
# Here '...g' means: any three characters followed by a 'g'.

match = re.search(string='yzzgpiiig', pattern=r'...g')
match.group()

'yzzg'

In [8]:
# \d -- a decimal digit [0-9] (some older regex utilities do not support \d,
#       but they all support \w and \s).
# \s -- (lowercase s) a single whitespace character: space, newline, return,
#       tab or form feed [ \n\r\t\f].
# \S -- (uppercase S) any single non-whitespace character.

# Three digits in a row.
match = re.search(r'\d\d\d', 'a123x')
print(match.group())
# Three word characters in a row; the leading 'a' counts, so the match starts there.
match = re.search(r'\w\w\w', 'a123x')
print(match.group())
# Three whitespace characters: tab, space, newline.
match = re.search(r'\s\s\s', '\t \n')
print(match.group())

123
a12
	 



In [53]:
# + -- 1 or more occurrences of the pattern to its left, e.g. 'i+' = one or more i's
# * -- 0 or more occurrences of the pattern to its left
# ? -- 0 or 1 occurrences of the pattern to its left

# 'pi+' takes the 'p' and as many following i's as it can.
match = re.search(r'pi+', 'piiigpii')
print(match.group())
# 'pi*' still matches when no 'i' follows the 'p'.
match = re.search(r'pi*', 'pgii')
print(match.group())
# 'pi?' takes at most one 'i' after the 'p'.
match = re.search(r'pi?', 'piiigii')
print(match.group())

piii
p
pi


In [15]:
  ## \s* = zero or more whitespace chars
  ## Here look for 3 digits, possibly separated by whitespace.
# The same pattern is tried against three subjects, so it is written once.
digits_pattern = r'\d\s*\d\s*\d'

# Each pair is (subject string, text the pattern is expected to match).
for subject, expected in [
    ('xx1 2   3xx', "1 2   3"),
    ('xx12  3xx', "12  3"),
    ('xx123xx', "123"),
]:
    match = re.search(digits_pattern, subject)
    print(match.group() == expected)

True
True
True


In [38]:
# ^ matches at the start of the string, so the 'b' must be the first character.

match = re.search(r'^b\w+', 'bca')
print(match.group())

bca


In [83]:
# Email example: a first attempt that uses only \w on both sides of the '@'.

# Named `text` rather than `str` so the builtin str() is not shadowed.
text = 'anujkatara7@gmail.com'
email = re.search(r'\w+@\w+', text)
# Test the result of *this* search (`email`), not a variable left over from
# an earlier cell.
if email:
    print(email)
    print(email.group())
else:
    print("no match")

# Note: the search does not return the whole email address in this case,
# because \w does not match the '-' or '.' characters in the address.

<_sre.SRE_Match object at 0x7f0c6a183e68>
anujkatara7@gmail


In [84]:
# Square brackets define a set of characters: [\w.-] matches one word
# character, '.' or '-'. Inside a set the dot is a literal dot.

# Named `text` rather than `str` so the builtin str() is not shadowed.
text = 'bob-marley78@yopmail.com'
email = re.search(r'[\w.-]+@[\w.-]+', text)
print(email.group())

bob-marley78@yopmail.com


In [48]:
# A caret (^) at the start of a square-bracket set inverts it,
# so [^ab] means any character except 'a' or 'b'.

match = re.search(r'[^ba]+', 'hhz')
print(match.group())

hhz


In [52]:
# Group extraction
# The "group" feature of a regular expression lets you pick out parts of the matching text.
# For example, to separate the username and host in username@host, wrap each part in () parentheses.
# match.group(1) is the text matched by the 1st left parenthesis,
# and match.group(2) is the text matched by the 2nd left parenthesis.
# Both sets below include \s, so each group may span spaces: here group(2)
# also picks up the trailing " manana".

# Named `text` rather than `str` so the builtin str() is not shadowed.
text = 'bob marley-b@yopmail.com manana'
match = re.search(r'([\w\s.-]+)@([\w\s.-]+)', text)
if match:
    print(match.group())   # the whole match
    print(match.group(1))  # the part before the '@'
    print(match.group(2))  # the part after the '@'


bob marley-b@yopmail.com manana
bob marley-b
yopmail.com manana


In [57]:
# re.search() finds only the first match for a pattern.
# re.findall() finds *all* the matches and returns them as a list of strings,
# one string per match.

# Named `text` rather than `str` so the builtin str() is not shadowed.
text = 'bob marley-b@yopmail.com manana, bob marley-b@yopmail.com manana'
emails = re.findall(r'[\w\s.-]+@[\w\s.-]+', text)
print(emails)

['bob marley-b@yopmail.com manana', ' bob marley-b@yopmail.com manana']
