In [1]:
import re

In [2]:
# re.match() function checks for a match only at the beginning of the string (by default)

# \w(lowercase w) matches a "word" character: a letter or digit or underbar [a-zA-Z0-9_].
# Note that although "word" is the mnemonic for this, it only matches a single word char,
# not a whole word. \W (upper case W) matches any non-word character

str = 'word:cat and example!!'
match = re.match(r'word:\w\w\w', str)

if match:
  print 'found', match.group()
else:
  print 'did not find'


found word:cat


In [3]:
# re.search() function checks for a match anywhere in the string.
# On success, match.group() is matched text.

match = re.search(r'iii', 'piiig')
match.group()  

'iii'

In [4]:
# .(period) matches any single character except single line \n

match = re.search(r'...g', 'yzzgpiiig')
match.group()

'yzzg'

In [5]:
# \d -- decimal digit [0-9] (some older regex utilities do not support but \d, but they all support \w and \s)
# \s -- (lowercase s) matches a single whitespace character -- space, newline, return, tab, form [ \n\r\t\f].
# \S (upper case S) matches any non-whitespace character.

match = re.search(r'\d\d\d', 'a123x')
print match.group()
match = re.search(r'\w\w\w', 'a123x')
print match.group()
match = re.search(r'\s\s\s', '\t \n')
print match.group()

123
a12
	 



In [6]:
# + -- 1 or more occurrences of the pattern to its left, e.g. 'i+' = one or more i's
# * -- 0 or more occurrences of the pattern to its left
# ? -- match 0 or 1 occurrences of the pattern to its left

match = re.search(r'pi+', 'piiigpii')
print match.group()
match = re.search(r'pi*', 'pgii')
print match.group()
match = re.search(r'pi?', 'piiigii')
print match.group()

piii
p
pi


In [7]:
  ## \s* = zero or more whitespace chars
  ## Here look for 3 digits, possibly separated by whitespace.
match = re.search(r'\d\s*\d\s*\d', 'xx1 2   3xx') 
print match.group() == "1 2   3"
match = re.search(r'\d\s*\d\s*\d', 'xx12  3xx')  
print match.group() == "12  3"
match = re.search(r'\d\s*\d\s*\d', 'xx123xx')  
print match.group() == "123"

True
True
True


In [8]:
# ^ = matches the start of the string

match = re.search(r'^b\w+', 'bca')
print match.group()

# \t - Lowercase t. Matches tab.
# $ - Matches a pattern at the end of string.
match = re.search(r'ake$', 'Eatcake')
print match.group()  

bca
ake


In [9]:
# Email example

str = 'anujkatara7@gmail.com'
email = re.search(r'\w+@\w+', str)
if match:
    print email
    print email.group()
else:
    print "no match"
    
# Note : The search does not get the whole email address in this case
# because the \w does not match the '-' or '.' in the address.

<_sre.SRE_Match object at 0x7ff41bf95ac0>
anujkatara7@gmail


In [10]:
# Squares Bracket

str = 'bob-marley78@yopmail.com'
email = re.search(r'[\w.-]+@[\w.-]+', str)
print email.group()

bob-marley78@yopmail.com


In [11]:
# An up-hat/caret (^) at the start of a square-bracket set inverts it, so [^ab] means any char except 'a' or 'b'.

match = re.search(r'[^ba]+', 'hhz')
print match.group()

hhz


In [12]:
# Group Extraction
# The "group" feature of a regular expression allows you to pick out parts of the matching text.
# for an example username@host for sepration of username and host we use () paranthesis.
# match.group(1) is the match text corresponding to the 1st left parenthesis, 
# and match.group(2) is the text corresponding to the 2nd left parenthesis.

str = 'bob marley-b@yopmail.com manana'
match = re.search(r'([\w\s.-]+)@([\w\s.-]+)', str)
if match:
    print match.group() 
    print match.group(1)
    print match.group(2)


bob marley-b@yopmail.com manana
bob marley-b
yopmail.com manana


In [13]:
# re.search() to find the first match for a pattern.
# findall() finds *all* the matches and returns them as a list of strings, with each string representing one match.

str = 'bob marley-b@yopmail.com manana, bob marley-b@yopmail.com manana'
emails = re.findall(r'[\w\s.-]+@[\w\s.-]+', str)
for email in emails:
    print email

bob marley-b@yopmail.com manana
 bob marley-b@yopmail.com manana


In [14]:
#find all method with file
# Open file
f = open('test.txt', 'r')
# Feed the file text into findall(); it returns a list of all the found strings
strings = re.findall(r'^ab\w', f.read())
for string in strings:
    print string
        

abc


In [15]:
# findall() and groupd 
str = 'bob marley-b@yopmail.com manana, bob marley-b@yopmail.com manana'
tuples = re.findall(r'([\w\s.-]+)@([\w\s.-]+)', str)
for tuple in tuples:
    print tuple[0]
    print tuple[1]

bob marley-b
yopmail.com manana
 bob marley-b
yopmail.com manana


In [16]:
# \A - Uppercase a. Matches only at the start of the string. Works across multiple lines as well.
# \b - Lowercase b. Matches only the beginning or end of the word.
print re.search(r'\A[A-E]ookie', 'Cookie').group()
print re.search(r'\b[A-E]ookie', 'Cookie').group()


Cookie
Cookie


In [17]:
# This checks for '\' in the string instead of '\s' due to the '\' used 
print re.search(r'Back\\stail', 'Back\stail').group()
# This treats '\s' as an escape character because it lacks '\' at the start of '\s'
print re.search(r'Back\stail', 'Back tail').group()

Back\stail
Back tail


In [18]:
# {x} - Repeat exactly x number of times.

# {x,} - Repeat at least x times or more.

# {x, y} - Repeat at least x times but no more than y times.

print re.search(r'\d{9,10}', '0987654321').group()

0987654321


In [19]:
# The pattern <.*> matched the whole string, right up to the second occurrence of >
heading  = r'<h1>TITLE</h1>'
re.match(r'<.*>', heading).group()

'<h1>TITLE</h1>'

In [20]:
# There is an extension to regular expression where you add a ? at the end,
# such as .*? or .+?, changing them to be non-greedy.
heading  = r'<h1>TITLE</h1>'
re.match(r'[^>]*', heading).group()

'<h1'

In [21]:
# The re.sub(pat, replacement, str) function searches for all the instances of pattern in the given string, 
# and replaces them. 
str = 'radha shyam@google.com, bob marley@abc.com blah'
## \1 is group(1), \2 group(2) in the replacement
print re.sub(r'([\w\.-]+)@([\w\.-]+)', r'\1@yo-yo-dyne.com', str)
print re.sub(r'([\w\.-]+)@([\w\.-]+)', r'\2@yo-yo-dyne.com', str)

radha shyam@yo-yo-dyne.com, bob marley@yo-yo-dyne.com blah
radha google.com@yo-yo-dyne.com, bob abc.com@yo-yo-dyne.com blah


In [22]:
# When you need to use an expression several times in a single program, 
# using the compile(sequence, flags) function to save the resulting regular
# expression object for reuse is more efficient.

pattern  = re.compile(r"cookie")
sequence = "Cake and cookie"
print pattern.search(sequence).group()

cookie


In [39]:
import re
import requests
the_idiot_url = 'https://www.gutenberg.org/files/2638/2638-0.txt'

def get_book(url):
    # Sends a http request to get the text from project Gutenberg
    raw = requests.get(url).text
    # Discards the metadata from the beginning of the book
    start = re.search(r"\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*",raw ).end()
    # Discards the metadata from the end of the book
    stop = re.search(r"II", raw).start()
    # Keeps the relevant text
    text = raw[start:stop]
    return text

def preprocess(sentence): 
    return re.sub('[^A-Za-z0-9.\"]+' , ' ', sentence)
book = get_book(the_idiot_url)
processed_book = preprocess(book)
# print(processed_book)
print len(re.findall(r'the', processed_book))
print len(re.findall(r',', book))
print len(re.findall(r'\"', book))


280
373
0
