In [1]:
import re  # This is the regular expressions module. We must import this first.

## Find strings that contain digit characters
### list_of_matches = re.findall('[0-9]', input_string)


In [5]:
string_list = ['no numbers', 'That costs $2,000!', 'Beverly Hills 90210']

# Every print(...) call below receives exactly ONE pre-built string, so the
# output is identical under Python 2 (where the parentheses merely group the
# expression) and Python 3 (where print is a function).
print("We have the following string list:  " + str(string_list))
print("And we will use re.refindal to find string that contains numbers")
print("Result: ")
print("")

# Report, for each string, the individual digit characters it contains.
for a_string in string_list:
    # re.findall returns a list with every non-overlapping match of the
    # pattern; '[0-9]' matches one digit character at a time.
    matches = re.findall(r'[0-9]', a_string)

    # An empty list is falsy, so this is the "at least one match" test.
    if matches:
        print('    The matches in the string "' + a_string + '" are: ' + str(matches))
    else:
        print('    No matches in the string "' + a_string + '"')
    print("")

We have the following string list:  ['no numbers', 'That costs $2,000!', 'Beverly Hills 90210']
And we will use re.refindal to find string that contains numbers
Result: 

    No matches in the string "no numbers"

    The matches in the string "That costs $2,000!" are: ['2', '0', '0', '0']

    The matches in the string "Beverly Hills 90210" are: ['9', '0', '2', '1', '0']



##### The basic approach to regular expressions is that you give

* a searcher (here, _re.findall()_)
* a pattern (here, _'[0-9]'_)
* and a string in which to search (here, _a\_string_).

##### In this case:

* The pattern _'[0-9]'_ means: any character between '0' and '9', i.e., any one of '0', '1', '2', ..., '9'
* The searcher _re.findall()_ tries to find all matches of the pattern in the string (_a\_string_) and returns them as a list.

### The re module has many other specialized searchers.


##### Let's look a bit more at some commonly used patterns.

> **[...]**
* means match any character within the square brackets

> **[^...]** 
* means match anything _except_ the characters within the brackets

> **[abcd]**
* means match any **one** of 'a', 'b', 'c', or 'd'

> **[a-z]**
* means match any one lower-case letter

> **[a-zA-Z]**
* means match any one lower-case or upper-case letter

> **[0-9]**
* means match any one digit


> **.**  **(the "full stop" sign)**
* means match any one character, whatever it is (this is called a _wild card_)

> **\\.**
* means match the full-stop character

> **There are several useful shorthands as well:**
* **\\w** is shorthand for [a-zA-Z0-9\_]
* **\\d** is shorthand for [0-9]
* **\\s** is shorthand for any single whitespace character (space, tab, newline, ...), often used as a delimiter



In [6]:
def find_phone_number(a_string):
    """Print the list of phone numbers found in a_string.

    Only two forms are considered: 123.456.7890 and 123-456-7890.

    The pattern requires one whitespace character directly before and after
    the number, and those two characters are part of every reported match.
    As a consequence, a number at the very start or very end of a_string is
    not found.

    Args:
        a_string: the text to search.

    Returns:
        None. The list of matches is printed.
    """
    # Raw string: '\s' and '\d' reach the regex engine untouched instead of
    # being read as (invalid) string escape sequences. Inside a character
    # class the dot is already literal, so '[.-]' is the same as '[\.-]'.
    print(re.findall(r'\s\d\d\d[.-]\d\d\d[.-]\d\d\d\d\s', a_string))

find_phone_number(' Call me at 512-232-1234 or 888.291.2135 ASAP')

[' 512-232-1234 ', ' 888.291.2135 ']


In [7]:
# Sample text for the handle-matching cells: it mentions four @handles and
# one e-mail address (bigdaddy@utexas.edu).
blatantly_false_string = """
I'm gonna have the MOST followers! More tha @BarackObama, bigger
that @katyperry, gonna top @taylorswift13, snuff out @Harry_Styles,
and all you punks out there! That's right, contact me at 
bigdaddy@utexas.edu while I set up my account.
"""

In [8]:
def find_twitter_handles(a_string):
    """Print every '@' that is followed by letters, digits or underscores.

    Nothing is required in front of the '@', so the '@domain' part of an
    e-mail address is reported as well.

    Args:
        a_string: the text to search.

    Returns:
        None. The list of matches is printed.
    """
    # '+' repeats the preceding character class one or more times.
    print(re.findall(r'@[a-zA-Z0-9_]+', a_string))
    
# The bare '@...' pattern also reports '@utexas', taken from the e-mail
# address in the sample text.
find_twitter_handles(blatantly_false_string)

['@BarackObama', '@katyperry', '@taylorswift13', '@Harry_Styles', '@utexas']


In [11]:
def find_twitter_handles_2(a_string):
    """Print every '@handle' that is directly preceded by whitespace.

    The leading whitespace character is part of each reported match, and a
    handle at the very start of a_string is not found because nothing
    precedes it.

    Args:
        a_string: the text to search.

    Returns:
        None. The list of matches is printed.
    """
    # '\s' matches any single whitespace character (space, tab, newline, ...).
    # The raw string keeps the backslash intact for the regex engine.
    print(re.findall(r'\s@[a-zA-Z0-9_]+', a_string))

# Requiring whitespace before '@' drops the e-mail false positive, but every
# match now starts with that whitespace character.
find_twitter_handles_2(blatantly_false_string)

[' @BarackObama', ' @katyperry', ' @taylorswift13', ' @Harry_Styles']


In [12]:
def find_emails(a_string):
    """Print every run of text of the form <something>@<something>.

    On each side of the '@', "something" is one or more characters that are
    neither '@' nor whitespace. Punctuation that touches an address (for
    example '<', '>' or ';') is therefore kept as part of the match.

    Args:
        a_string: the text to search.

    Returns:
        None. The list of matches is printed.
    """
    # [^@\s]+ : one or more characters other than '@' and whitespace.
    # The raw string keeps '\s' intact for the regex engine.
    print(re.findall(r'[^@\s]+@[^@\s]+', a_string))

find_emails('Hello from csev@umich.edu to cwen@iupui.edu about the meeting @2PM')

['csev@umich.edu', 'cwen@iupui.edu']


In [13]:
# More complicated setup: extract emails from email headers.
header_string = """
From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008
Return-Path: <postmaster@collab.sakaiproject.org>
          for <source@collab.sakaiproject.org>;
Received: (from apache@localhost)
Author: stephen.marquard@uct.ac.za
"""

# find_emails only excludes '@' and whitespace, so adjacent punctuation
# ('<', '>', ';', ')') is kept as part of the reported matches.
find_emails(header_string)

['stephen.marquard@uct.ac.za', '<postmaster@collab.sakaiproject.org>', '<source@collab.sakaiproject.org>;', 'apache@localhost)', 'stephen.marquard@uct.ac.za']


In [14]:
def find_emails_2(a_string):
    """Print e-mail-like matches made up only of letters and dots.

    A match starts with a letter, continues with letters or dots up to the
    '@', and after the '@' consists of letters or dots ending in a letter.
    Surrounding punctuation is left out of the match. Digits, underscores
    and hyphens are not accepted by this pattern.

    Args:
        a_string: the text to search.

    Returns:
        None. The list of matches is printed.
    """
    # '*' repeats the preceding character class zero or more times. Inside a
    # character class the dot is already literal, so '[a-zA-Z.]' is the same
    # as '[a-zA-Z\.]'.
    print(re.findall(r'[a-zA-Z][a-zA-Z.]*@[a-zA-Z.]*[a-zA-Z]', a_string))
    
# Same two inputs that were given to find_emails; with this pattern the
# surrounding punctuation is no longer part of the matches.
find_emails_2('Hello from csev@umich.edu to cwen@iupui.edu about the meeting @2PM')
find_emails_2(header_string)

['csev@umich.edu', 'cwen@iupui.edu']
['stephen.marquard@uct.ac.za', 'postmaster@collab.sakaiproject.org', 'source@collab.sakaiproject.org', 'apache@localhost', 'stephen.marquard@uct.ac.za']


In [None]:
# Two HTML snippets, one with an http link and one with an https link, used
# as inputs for find_URLs and find_URLs_2.
test_string_1 = '...<a href="http://mccombs.utexas.edu">McCombs</a>...'
test_string_2 = '...<a href="https://gmail.com">Gmail</a>...'

In [15]:
def find_URLs(a_string):
    """Print every opening <a href="..."> tag whose URL is http or https.

    The whole tag, from '<a href="' to the closing '">', is reported, not
    just the URL.

    Args:
        a_string: the text to search.

    Returns:
        None. The list of matches is printed.
    """
    # 's?' makes the 's' optional (http or https); '[^"]+' takes everything
    # up to the closing double quote.
    print(re.findall(r'<a href="https?://[^"]+">', a_string))

# Each result is the whole opening <a href="..."> tag, not just the URL.
find_URLs(test_string_1)
find_URLs(test_string_2)

['<a href="http://mccombs.utexas.edu">']
['<a href="https://gmail.com">']


In [17]:
def find_URLs_2(a_string):
    """Print the http/https URL of every <a href="..."> tag in a_string.

    The pattern still has to match the whole opening tag, but only the
    parenthesised part (the URL) is reported.

    Args:
        a_string: the text to search.

    Returns:
        None. The list of URLs is printed.
    """
    # When the pattern contains one capturing group, re.findall returns the
    # text of that group instead of the whole match.
    print(re.findall(r'<a href="(https?://[^"]+)">', a_string))

# With the capturing group, only the URL between the quotes is reported.
find_URLs_2(test_string_1)
find_URLs_2(test_string_2)

['http://mccombs.utexas.edu']
['https://gmail.com']
