In [None]:
import re

In [None]:
# short sample sentence used to try out a first regex search
text = "This is a good day."

In [None]:
# re.search() returns a Match object when the pattern occurs anywhere in the
# string and None otherwise; comparing against None turns that into a boolean
re.search(pattern="good", string=text) is not None

In [None]:
# Separating a string into substrings based on patterns is called tokenizing,
# a core activity in natural language processing (NLP).
# The findall() and split() functions parse a string and return the chunks;
# this sample text is what the following cells run them on.
text = (
    "Amy works diligently. "
    "Amy gets good grades. "
    "Our student Amy is successful."
)

In [None]:
# cut the text at every occurrence of "Amy" and return the pieces in between as a list
re.split(pattern="Amy", string=text)

In [None]:
# findall() returns a list with every non-overlapping match,
# so the length of that list is how many times we have talked about Amy
re.compile("Amy").findall(text)

In [None]:
# search() looks for a pattern and returns a Match object, or None when nothing matches;
# comparing the result with None is what gives us a boolean
# the caret "^" is an anchor: the text the regex processor retrieves must start with the string you specify
# so this checks whether the text begins with Amy
starts_with_amy = re.search("^Amy", text)
starts_with_amy is not None

**patterns and character classes**

In [None]:
# a student's grade list, written as one letter per grade
grades = "ACAAAABCBCBAA"

In [None]:
# if we want to answer the question "how many B's were in the grade list?"
# we match the literal character B; every occurrence becomes one entry in the returned list
re.findall("B", grades)

In [None]:
# to count the number of A's or B's in the list we can't use the pattern "AB",
# since that matches an A followed immediately by a B
# instead we put the characters A and B inside square brackets:
# [AB] matches a single character that is either A or B
re.compile("[AB]").findall(grades)

In [None]:
# parse out all instances where this student received an A followed by a B or a C
# (the alternation "AB|AC" would express the same thing)
re.findall(pattern="A[BC]", string=grades)

In [None]:
# to parse out only the grades which were not A's we negate the set with a caret
# note - outside of a set the caret "^" anchors the match to the beginning of the string,
# but as the first character inside the set operator "[]" it means "not";
# the other special characters also lose their usual meaning inside a set
# this can be a bit confusing
re.compile("[^A]").findall(grades)

In [None]:
# the first caret anchors the match to the beginning of the string and the second negates the set,
# so the regex asks for a first character which is not an A
# our string, though, starts with an A, so no match is found and the result is an empty list
# remember that the set operator does character-based matching:
# it matches one individual character, with its members combined in an OR fashion
re.findall(pattern="^[^A]", string=grades)

**quantifier**

In [None]:
# how many times has this student been on a streak of back-to-back A's?
# the quantifier {m,n} repeats the preceding item at least m and at most n times;
# we use 2 as our min and 10 as our max
# the quantifier syntax does not allow you to deviate from {m,n}:
# written with a space, as in "A{2, 10}", it is no longer treated as a quantifier
re.compile("A{2,10}").findall(grades)

In [None]:
# a single number inside the braces means "exactly this many": each match is exactly two A's
re.findall(pattern="A{2}", string=grades)

In [None]:
# quantified items can be chained: one to ten A's, then one to ten B's, then one to ten C's
re.compile("A{1,10}B{1,10}C{1,10}").findall(grades)

In [None]:
# read the whole of ferpa.txt into a single string
# the encoding is given explicitly so decoding does not depend on the platform default
# NOTE(review): assumes the file is UTF-8 (or plain ASCII) - confirm
with open("ferpa.txt", "r", encoding="utf-8") as f:
    wiki = f.read()

In [None]:
# display the text currently held in wiki
wiki

In [None]:
# match 1 to 100 ASCII letters immediately followed by the literal text "[edit]"
# the brackets are escaped because unescaped they would start a character set
# the pattern is a raw string so the backslashes reach the regex engine unchanged
re.findall(r"[A-Za-z]{1,100}\[edit\]", wiki)

In [None]:
# \w is shorthand for any word character (letters, digits and the underscore)
# raw string: keeps the backslashes intact for the regex engine
re.findall(r"\w{1,100}\[edit\]", wiki)

In [None]:
# "+" means one or more repetitions, so no explicit {1,100} range is needed
# raw string: keeps the backslashes intact for the regex engine
re.findall(r"\w+\[edit\]", wiki)

In [None]:
# adding a space to the set lets a single match span several words
# raw string: keeps the backslashes intact for the regex engine
re.findall(r"[\w ]+\[edit\]", wiki)

In [None]:
# print each match without its trailing "[edit]":
# split on the literal "[" and keep the part that comes before it
# both patterns are raw strings so the backslashes reach the regex engine unchanged
for title in re.findall(r"[\w ]+\[edit\]", wiki):
    print(re.split(r"\[", title)[0])

**groups**

In [None]:
# above we were talking about a regex as a single pattern which is matched
# but you can actually match different patterns, called groups, at the same time, and refer to the group you want
# to group patterns together you use parentheses
# when the pattern has several groups, findall() returns a list of tuples with one string per group
# the pattern is a raw string so the backslashes reach the regex engine unchanged
re.findall(r"(([\w ]+)(\[edit\]))", wiki)

In [None]:
# finditer() yields one Match object per match; groups() returns a tuple with the text of every group
# the pattern is a raw string so the backslashes reach the regex engine unchanged
for title in re.finditer(r"(([\w ]+)(\[edit\]))", wiki):
    print(title.groups())

In [None]:
# index 1 of groups() is the second group: the title text without "[edit]"
# the pattern is a raw string so the backslashes reach the regex engine unchanged
for title in re.finditer(r"(([\w ]+)(\[edit\]))", wiki):
    print(title.groups()[1])

In [None]:
# a good idea is to label (name) the groups
# we use the syntax (?P<name>...), where the parenthesis starts the group,
# the ?P indicates that this is an extension to basic regex, and <name> is the dictionary key we want to use, wrapped in <>
# groupdict() then returns the named groups of a match as a dictionary
# the pattern is a raw string so the backslashes reach the regex engine unchanged
for title in re.finditer(r"((?P<title>[\w ]+)(?P<edit_link>\[edit\]))", wiki):
    print(title.groupdict())

In [None]:
# look up a single named group through its dictionary key
# the pattern is a raw string so the backslashes reach the regex engine unchanged
for title in re.finditer(r"((?P<title>[\w ]+)(?P<edit_link>\[edit\]))", wiki):
    print(title.groupdict()["title"])

In [None]:
# here's the dictionary kept for the last match
# `title` is the loop variable left over from the preceding finditer() loop,
# so this cell only works after that loop has run and produced at least one match
print(title.groupdict())

**look-ahead and look-behind**

In [None]:
# in our headers we want to isolate the text which comes before the [edit] rendering,
# but we don't actually care about the [edit] text itself
# (?=...) is the look-ahead syntax: the title must be followed by "[edit]",
# but "[edit]" is not consumed, so group() returns the title text only
# the pattern is a raw string so the backslashes reach the regex engine unchanged
for title in re.finditer(r"(?P<title>[\w ]+)(?=\[edit\])", wiki):
    print(title.group())

**example**

In [None]:
# read the whole of buddhist.txt into one string, decoding it as UTF-8
with open("buddhist.txt", mode="r", encoding="utf-8") as f:
    wiki = f.read()

In [None]:
# display the text currently held in wiki
wiki

In [None]:
# the pattern is assembled from small named pieces
# literal spaces are written as "\ " so that they survive re.VERBOSE,
# which ignores unescaped whitespace outside of character sets
# every piece is a raw string so the backslashes reach the regex engine unchanged
university = r"(?P<university>[\w ]+)"  # one or more word characters or spaces
u_c = r"(\ (–|is)\ located\ in\ )"      # " – located in " or " is located in "
city = r"(?P<city>[\w ]+)"              # one or more word characters or spaces
c_t = r"(,\ )"                          # comma and space between city and state
state = r"(?P<state>\w+)"               # a single word

# join the pieces in the order they appear in the text
pattern = university + u_c + city + c_t + state
pattern

In [None]:
# collect the named groups of every match as a dictionary, printing each one as we go
# re.VERBOSE makes the regex engine ignore unescaped whitespace in the pattern
arr = []
for i in re.finditer(pattern, wiki, flags=re.VERBOSE):
    found = i.groupdict()
    print(found)
    arr.append(found)

**example**

In [None]:
# read the whole of nytimeshealth.txt into one string, decoding it as UTF-8
with open("nytimeshealth.txt", mode="r", encoding="utf-8") as f:
    health = f.read()

In [None]:
# display the text currently held in health
health

In [None]:
# a hashtag: "#" followed by one or more word characters
# the look-ahead (?=\ ) requires a space right after the tag without making it part of the match,
# so a tag that is not followed by a space is not matched
# raw string, so the backslashes reach the regex engine unchanged
pattern = r"#[\w]+(?=\ )"

In [None]:
# list every substring of the health text that matches the hashtag pattern
re.compile(pattern).findall(health)