In [1]:
import re

# Finding patterns in text - Regular Expression (RegEx)

In [62]:
# Sample sentence searched by the next few cells.
text = 'That person wears marvelous trousers.'

In [63]:
# Literal search: list every occurrence of the single character "a".
pattern = "a"
re.compile(pattern).findall(text)

['a', 'a', 'a']

## Finding the pattern `er`

In [64]:
# Literal search: the two-character sequence "er", in that order.
pattern = "er"
re.findall(pattern, string=text)

['er', 'er']

## Finding the pattern `e` or the pattern `r`

In [65]:
# Character set: matches a single "e" OR a single "r" at each position.
pattern = "[er]"
re.compile(pattern).findall(text)

['e', 'r', 'e', 'r', 'r', 'e', 'r', 'e', 'r']

## Use case

In [60]:
# Both spellings appear in the sentence.
text = 'Is it spelled gray or grey?'

# "gr", then either "a" or "e", then "y": matches both spellings.
pattern = "gr[ae]y"
re.compile(pattern).findall(text)

['gray', 'grey']

> So brackets `[ ]` define a `set` in RegEx: a set of characters, any one of which may match at that position.

## Since it is a set, you can make it a complete set

For example: The set of upper-case letters from A to C.

In [69]:
# Sentence containing several isolated upper-case letters.
text = 'This is an A and B conversation, so C your way out of it, or Even F.'

# Range inside a set: any single upper-case letter from A through C.
pattern = "[A-C]"
re.findall(pattern, string=text)

['A', 'B', 'C']

In [70]:
# Sentence with a numbered list embedded in it.
text = (
    "I'm not going to the party because 1) Karen is going, "
    "2) I don't like her, and 3) I already have a headache."
)

# Range inside a set: any single digit from 1 through 3.
pattern = "[1-3]"
re.compile(pattern).findall(text)

['1', '2', '3']

Some useful sets: 

* [a-z]: Any lowercase letter between a and z.
* [A-Z]: Any uppercase letter between A and Z.
* [0-9]: Any numeric character between 0 and 9.

# Meta characters - They mean something different than the character they represent.

* `.` : Match **any character** except newline (\n)
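* `^` : If used within a `set`, negates the condition (similar to `not`, or to `~` on boolean masks, in Python)
    * Careful, this pattern also represents another thing: If used outside a `set`, it represents `match if at the beginning of the string` (or at the beginning of each line when the `re.MULTILINE` flag is used)
* `$` : Match if at the end of the string (or at the end of each line when the `re.MULTILINE` flag is used)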
* `|` : "OR" operator

## Match any character

In [71]:
# Two-line sample text. The first line ends with a space before the newline.
text = (
    "My boss asked me to turn in my TPS reports. \n"
    "I told him they were done, but they are not."
)

# "." matches any single character except the newline.
pattern = "."
print(re.compile(pattern).findall(text))

['M', 'y', ' ', 'b', 'o', 's', 's', ' ', 'a', 's', 'k', 'e', 'd', ' ', 'm', 'e', ' ', 't', 'o', ' ', 't', 'u', 'r', 'n', ' ', 'i', 'n', ' ', 'm', 'y', ' ', 'T', 'P', 'S', ' ', 'r', 'e', 'p', 'o', 'r', 't', 's', '.', ' ', 'I', ' ', 't', 'o', 'l', 'd', ' ', 'h', 'i', 'm', ' ', 't', 'h', 'e', 'y', ' ', 'w', 'e', 'r', 'e', ' ', 'd', 'o', 'n', 'e', ',', ' ', 'b', 'u', 't', ' ', 't', 'h', 'e', 'y', ' ', 'a', 'r', 'e', ' ', 'n', 'o', 't', '.']


## Match everything not in specific set

In [72]:
# "^" as the first character of a set negates it: match every single
# character that is NOT a lowercase letter from a through m.
pattern = "[^a-m]"
print(re.findall(pattern, string=text))

['M', 'y', ' ', 'o', 's', 's', ' ', 's', ' ', ' ', 't', 'o', ' ', 't', 'u', 'r', 'n', ' ', 'n', ' ', 'y', ' ', 'T', 'P', 'S', ' ', 'r', 'p', 'o', 'r', 't', 's', '.', ' ', '\n', 'I', ' ', 't', 'o', ' ', ' ', 't', 'y', ' ', 'w', 'r', ' ', 'o', 'n', ',', ' ', 'u', 't', ' ', 't', 'y', ' ', 'r', ' ', 'n', 'o', 't', '.']


## Match sentences beginning with `pattern`

In [73]:
# Outside a set, "^" anchors the match to the beginning of the text.
pattern = "^My boss"
print(re.compile(pattern).findall(text))

['My boss']


In [74]:
# "$" anchors at the end. With no re.MULTILINE flag that means the end of
# the whole string, so "reports." on the first line is not found.
pattern = "reports.$"
print(re.findall(pattern, string=text))

[]


In [34]:
# Same "$" anchor, but this phrase really does end the text. The unescaped
# "." matches any character, here the final period.
pattern = "are not.$"
print(re.compile(pattern).findall(text))

['are not.']


In [39]:
# "|" is alternation: match any one of the three words.
pattern = "boss|TPS|reports"
print(re.findall(pattern, string=text))

['boss', 'TPS', 'reports']


# Quantifiers 

* `*`: Matches the previous character 0 or more times
* `+`: Matches the previous character 1 or more times
* `?`: Matches the previous character 0 or 1 times (optional)
* `{}`: Matches the previous character the number of times specified within the braces:
    * `{n}` : Exactly n times
    * `{n,}` : At least n times
    * `{n,m}` : Between n and m times

## Application of previous example of `$` using one of the most useful quantifiers `*`

In [80]:
# Starting point: only the closing phrase, anchored at the end of the text.
pattern = "are not.$"
print(re.findall(pattern, string=text))

['are not.']


In [81]:
# A leading "." adds exactly one arbitrary character before the phrase.
pattern = ".are not.$"
print(re.compile(pattern).findall(text))

[' are not.']


In [82]:
# ".*" is zero or more of any character. "." never matches a newline, so
# the match grows back only to the start of the last line.
pattern = ".*are not.$"
print(re.findall(pattern, string=text))

['I told him they were done, but they are not.']


In [83]:
# Require a literal comma first: the match now starts at the comma.
pattern = ",.*are not.$"
print(re.compile(pattern).findall(text))

[', but they are not.']


## Capturing group!

What if I wanted to capture only the text that follows the comma (`,`), without including the comma itself?

I would have to use a capturing group to specify what specifically I want to capture.

In [84]:
# Without a capturing group, findall returns the whole match, comma included.
pattern = ",.*are not.$"
print(re.findall(pattern, string=text))

[', but they are not.']


In [90]:
# With one capturing group, findall returns only what the parentheses
# matched. The comma must still be present but is left out of the result.
pattern = ",(.*are not.$)"
print(re.compile(pattern).findall(text))

[' but they are not.']


### How would I remove first space?

## So getting back to quantifiers

> `*` matches **0 or more** times

In [93]:
# Sentence with both "cat" and several bare "ct" sequences.
text = 'The complicit cat interacted with the other cats exactly as we expected.'

# "a*" allows zero or more "a", so both "ct" and "cat" match.
pattern = 'ca*t'
print(re.compile(pattern).findall(text))

['cat', 'ct', 'cat', 'ct', 'ct']


In [94]:
# "a+" requires at least one "a", so the bare "ct" sequences no longer match.
pattern = 'ca+t'
print(re.findall(pattern, string=text))

['cat', 'cat']


> `?` matches **0 or 1** times

In [98]:
# "colonel" starts like "color" but must not match.
text = 'The colonel likes the color blue'

# "u?" makes the "u" optional: "color" or "colour".
pattern = 'colou?r'
print(re.findall(pattern, string=text))

['color']


How does the Regex engine work?

In [99]:
# Three candidate spellings. Only the first two contain the required "o"
# before the optional "u".
text = 'Is the correct spelling color, colour, or colr?'

pattern = 'colou?r'
print(re.compile(pattern).findall(text))

['color', 'colour']


# Important Regex Concept: Greediness


What will this match?

In [114]:
text = "You are yelling! So I will yell too! Let me yell!!!"

# Intended as "anything up to an exclamation point". ".*" is greedy, so it
# runs to the LAST "!" and the whole sentence comes back as one match.
pattern = '.*!'
print(re.compile(pattern).findall(text))

['You are yelling! So I will yell too! Let me yell!!!']


In [43]:
text = 'Let\'s see how we can match the following: aw, aww, awww, awwww, awwwww'

# "w{2}" means exactly two "w": every longer word yields just its "aww" prefix.
pattern = 'aw{2}'
print(re.findall(pattern, string=text))


['aww', 'aww', 'aww', 'aww']


In [44]:
text = 'Let\'s see how we can match the following: aw, aww, awww, awwww, awwwww'

# "w{2,}" means two or more "w", taking as many as are available.
pattern = 'aw{2,}'
print(re.compile(pattern).findall(text))

['aww', 'awww', 'awwww', 'awwwww']


In [45]:
text = 'Let\'s see how we can match the following: aw, aww, awww, awwww, awwwww'

# "w{2,3}" means two or three "w": longer runs are cut off after three.
pattern = 'aw{2,3}'
print(re.findall(pattern, string=text))

['aww', 'awww', 'awww', 'awww']


# Character Patterns

* `\w`: Any alphanumeric character (letters, digits, and the underscore `_`).
* `\W`: Any non-alphanumeric character (anything `\w` does not match).
* `\d`: Any numeric character.
* `\D`: Any non-numeric character.
* `\s`: Any whitespace character.
* `\S`: Any non-whitespace character.

In [46]:
# Demo sentence mixing letters, digits, an underscore, spaces and symbols.
text = "Th1s is going to_be a weird sentence with @ bunch-of-$tuff in it <3."

# Raw string: in a normal literal "\w" is an invalid escape sequence
# (a warning today, slated to become an error). r'\w' hands the backslash
# to the regex engine unchanged.
# \w matches one "word" character: a letter, a digit or the underscore.
pattern = r'\w'
print(re.findall(pattern, text))


['T', 'h', '1', 's', 'i', 's', 'g', 'o', 'i', 'n', 'g', 't', 'o', '_', 'b', 'e', 'a', 'w', 'e', 'i', 'r', 'd', 's', 'e', 'n', 't', 'e', 'n', 'c', 'e', 'w', 'i', 't', 'h', 'b', 'u', 'n', 'c', 'h', 'o', 'f', 't', 'u', 'f', 'f', 'i', 'n', 'i', 't', '3']


In [47]:
# Demo sentence mixing letters, digits, an underscore, spaces and symbols.
text = "Th1s is going to_be a weird sentence with @ bunch-of-$tuff in it <3."

# Raw string so the backslash reaches the regex engine without triggering
# Python's invalid-escape-sequence warning.
# \W is the complement of \w: spaces, punctuation and symbols.
pattern = r'\W'
print(re.findall(pattern, text))


[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '@', ' ', '-', '-', '$', ' ', ' ', ' ', '<', '.']


In [48]:
# Demo sentence mixing letters, digits, an underscore, spaces and symbols.
text = "Th1s is going to_be a weird sentence with @ bunch-of-$tuff in it <3."
# Raw string avoids the invalid-escape-sequence warning for "\d".
# \d matches a single digit.
pattern = r'\d'
print(re.findall(pattern, text))

['1', '3']


In [49]:
# Demo sentence mixing letters, digits, an underscore, spaces and symbols.
text = "Th1s is going to_be a weird sentence with @ bunch-of-$tuff in it <3."
# Raw string avoids the invalid-escape-sequence warning for "\D".
# \D matches any single character that is not a digit.
pattern = r'\D'
print(re.findall(pattern, text))

['T', 'h', 's', ' ', 'i', 's', ' ', 'g', 'o', 'i', 'n', 'g', ' ', 't', 'o', '_', 'b', 'e', ' ', 'a', ' ', 'w', 'e', 'i', 'r', 'd', ' ', 's', 'e', 'n', 't', 'e', 'n', 'c', 'e', ' ', 'w', 'i', 't', 'h', ' ', '@', ' ', 'b', 'u', 'n', 'c', 'h', '-', 'o', 'f', '-', '$', 't', 'u', 'f', 'f', ' ', 'i', 'n', ' ', 'i', 't', ' ', '<', '.']


In [51]:
text = "If you tell the truth 1 time, you don't have to remember anything 2 times."

# Raw string avoids the invalid-escape-sequence warning for "\w".
# \w+ matches a run of one or more word characters. The apostrophe is not
# a word character, so "don't" is split into "don" and "t".
pattern = r'\w+'
print(re.findall(pattern, text))

['If', 'you', 'tell', 'the', 'truth', '1', 'time', 'you', 'don', 't', 'have', 'to', 'remember', 'anything', '2', 'times']


In [52]:
# Filter by word length: runs of four or more word characters.
# Raw string avoids the invalid-escape-sequence warning for "\w".
pattern = r'\w{4,}'
print(re.findall(pattern, text))

['tell', 'truth', 'time', 'have', 'remember', 'anything', 'times']


In [53]:
# News sentence containing several proper nouns.
text = (
    "TerraPower, a nuclear-energy company founded by Bill Gates, "
    "is unlikely to follow through on building a demonstration reactor "
    "in China, due largely to the Trump administration’s crackdown "
    "on the country."
)

# One upper-case letter followed by one or more lower-case letters.
# "TerraPower" therefore comes back as two separate pieces.
pattern = "[A-Z][a-z]+"
print(re.compile(pattern).findall(text))

['Terra', 'Power', 'Bill', 'Gates', 'China', 'Trump']


# Capturing Groups!

In [54]:
# Two alternatives, each in its own capturing group:
#   group 1: two capitalized pieces with an optional space between them
#   group 2: a single capitalized word
# With more than one group, findall returns one tuple per match and leaves
# the group that did not take part as an empty string.
pattern = "([A-Z][a-z]+ ?[A-Z][a-z]+)|([A-Z][a-z]+)"
print(re.compile(pattern).findall(text))


[('TerraPower', ''), ('Bill Gates', ''), ('', 'China'), ('', 'Trump')]


In [55]:
# Flatten the per-match tuples returned by findall, dropping the empty
# strings left by the alternative that did not match.
results = [
    name
    for groups in re.findall(pattern, text)
    for name in groups
    if name != ''
]
results

['TerraPower', 'Bill Gates', 'China', 'Trump']

In [56]:
## quotes
text = """
For eight young men the AP tracked down in Seattle, tech obsession has become something much darker, getting in the way of their normal lives.

"We’re talking flunk-your-classes, can’t-find-a-job, live-in-a-dark-hole kinds of problems, with depression, anxiety and sometimes suicidal thoughts part of the mix," the AP's Martha Irvine reports.
"""

pattern = '".*"'
re.findall(pattern, text)

['"We’re talking flunk-your-classes, can’t-find-a-job, live-in-a-dark-hole kinds of problems, with depression, anxiety and sometimes suicidal thoughts part of the mix,"']

In [58]:
# Airline directory: one "name phone[, phone]" entry per line. The text
# starts with a newline and has none after the last entry.
text = "\n" + "\n".join([
    "Aeromexico 800-237-6639",
    "Air Canada 888-247-2262",
    "Air Canada Rouge 888-247-2262",
    "Air Creebec 800-567-6567",
    "Air Inuit 800-361-2965",
    "Air North 800-661-0407",
    "Air Tindi 888-545-6794",
    "Air Transat 866-847-1112",
    "Alaska Airlines 800-426-0333, 866-516-1685",
])

In [59]:
# Phone numbers: three runs of digits separated by hyphens.
# Raw string so each "\d" reaches the regex engine unchanged. In a normal
# literal "\d" is an invalid escape sequence and triggers a warning.
pattern = r'\d+-\d+-\d+'
re.findall(pattern, text)

['800-237-6639',
 '888-247-2262',
 '888-247-2262',
 '800-567-6567',
 '800-361-2965',
 '800-661-0407',
 '888-545-6794',
 '866-847-1112',
 '800-426-0333',
 '866-516-1685']