## String Manipulation and Regular Expressions

#### Capitalising letters

In [2]:
my_string = "tHis Is a niCe StriNg"

my_string.capitalize()

'This is a nice string'

#### Splitting

In [3]:
my_string = "This string will be split"

my_string.split(sep=" ", maxsplit=2)

['This', 'string', 'will be split']

#### Right splitting

In [4]:
my_string.rsplit(sep=" ", maxsplit=2)

['This string will', 'be', 'split']

#### Splitting lines

In [5]:
my_string = "This string will be split\nin two"

my_string.splitlines()

['This string will be split', 'in two']

#### Joining

In [6]:
my_list = ["this", "would", "be", "a", "string"]
print(" ".join(my_list))

this would be a string


#### Stripping characters

In [7]:
my_string = " This string will be stripped\n"
my_string.strip()

'This string will be stripped'

#### Remove characters from the right end

In [8]:
my_string.rstrip()

' This string will be stripped'

#### Remove characters from the left end

In [9]:
my_string.lstrip()

'This string will be stripped\n'

#### Example

In [10]:
movie = '$I supposed that coming from MTV Films I should expect no less$'

# Lower-case the whole review first
movie_lower = movie.lower()

# Drop the "$" markers wrapping the text (strip only touches the two ends)
movie_no_sign = movie_lower.strip("$")

# Break the sentence into words on runs of whitespace
movie_split = movie_no_sign.split()

# Root of the second word: drop its last character ("supposed" -> "suppose")
word_root = movie_split[1][:-1]

# Show every intermediate result, one per line
print(movie_lower, movie_no_sign, movie_split, word_root, sep="\n")

$i supposed that coming from mtv films i should expect no less$
i supposed that coming from mtv films i should expect no less
['i', 'supposed', 'that', 'coming', 'from', 'mtv', 'films', 'i', 'should', 'expect', 'no', 'less']
suppose


In [11]:
movie = 'the film,however,is all good<\\i>'

# Remove the tag at the end and print the result.
# rstrip() takes a *set of characters* to strip, not a suffix. The backslash is
# escaped so the literal does not rely on the invalid "\i" escape sequence.
movie_tag = movie.rstrip("<\\i>")
print(movie_tag)

# Split the string using commas and print results
movie_no_comma = movie_tag.split(",")
print(movie_no_comma)

# Join back together and print results
movie_join = " ".join(movie_no_comma)
print(movie_join)

the film,however,is all good
['the film', 'however', 'is all good']
the film however is all good


#### Finding substrings with .find(), .index()

In [1]:
my_string = "Where's Waldo?"
my_string.find("Waldo")

8

In [2]:
my_string.find("Wenda")

-1

In [3]:
my_string.find("Waldo", 0, 6)

-1

In [5]:
my_string.index("Waldo")

8

In [7]:
try:
    # Unlike .find(), which returns -1, .index() raises ValueError
    # when the substring is absent
    my_string.index("Wenda")
except ValueError:
    print("Not found")

Not found


#### Counting occurrences

In [8]:
my_string = "How many fruits do you have in your fruit basket?"
my_string.count("fruit")

2

In [9]:
my_string.count("fruit", 0, 16)

1

#### Replacing sub-strings

In [10]:
my_string = "The red house is between the blue house and the old house"
print(my_string.replace("house", "car"))

The red car is between the blue car and the old car


In [11]:
print(my_string.replace("house", "car", 2))

The red car is between the blue car and the old house


#### Positional formatting

In [2]:
wikipedia_article = 'In computer science, artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals.'

# Slice out the two phrases of interest and lower-case them
first_pos = wikipedia_article[3:19].lower()
second_pos = wikipedia_article[21:44].lower()

# Two templates: automatic (in-order) placeholders first, then explicit
# indices that swap the order of the arguments
my_list = [
    "The tool {} is used in {}",
    "The tool {1} is used in {0}",
]

# Fill both templates with the same positional arguments
for my_string in my_list:
    print(my_string.format(first_pos, second_pos))

The tool computer science is used in artificial intelligence
The tool artificial intelligence is used in computer science


#### Reordering values

In [3]:
print("{2} has a friend called {0} and a sister called {1}".format("Betty", "Linda", "Daisy"))

Daisy has a friend called Betty and a sister called Linda


#### Named placeholders

In [4]:
tool="Unsupervised algorithms"
goal="patterns"
print("{title} try to find {aim} in the dataset".format(title=tool, aim=goal))

Unsupervised algorithms try to find patterns in the dataset


In [6]:
my_methods = {"tool": "Unsupervised algorithms", "goal": "patterns"}
print('{data[tool]} try to find {data[goal]} in the dataset'.format(data=my_methods))

Unsupervised algorithms try to find patterns in the dataset


In [10]:
courses = ['artificial intelligence', 'neural networks']

# Pair each key with the course at the same position
plan = dict(zip(("field", "tool"), courses))

# The template looks its values up by key in the dictionary passed as `data`
my_message = "If you are interested in {data[field]}, you can take the course related to {data[tool]}"

# Substitute the plan dictionary into the placeholders
print(my_message.format(data=plan))

If you are interested in artificial intelligence, you can take the course related to neural networks


#### Formatting strings

In [7]:
print("Only {0:.2f}% of the {1} produced worldwide is {2}!".format(0.5155675, "data", "analyzed"))

Only 0.52% of the data produced worldwide is analyzed!


#### Date strings

In [9]:
from datetime import datetime
print(datetime.now())

print("Today's date is {:%Y-%m-%d %H:%M}".format(datetime.now()))

2022-02-25 14:09:26.569583
Today's date is 2022-02-25 14:09


In [11]:
# Import datetime 
from datetime import datetime

# Assign date to get_date
get_date = datetime.now()

# Add named placeholders with format specifiers
message = "Good morning. Today is {today:%B %d, %Y}. It's {today:%H:%M} ... time to work!"

# Use the format method replacing the placeholder with get_date
print(message.format(today=get_date))

Good morning. Today is February 25, 2022. It's 14:14 ... time to work!


#### Formatted literal strings

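 - !s converts the value with str() (a string is shown without quotes)
 - !r converts the value with repr() (a string is shown with quotes)
 - !a converts the value with ascii() (like repr(), but non-ASCII characters are escaped)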

In [19]:
name = "Python"
print(f"Python is called {name!s} due to a comedy series")

print(f"Python is called {name!r} due to a comedy series")

Python is called Python due to a comedy series
Python is called 'Python' due to a comedy series


In [24]:
number = 90.41890417471841

print(f"In the last 2 years, {number:.2f}% of the data was produced worldwide!")

In the last 2 years, 90.42% of the data was produced worldwide!


In [25]:
from datetime import datetime

my_today = datetime.now()

print(f"Today's date is {my_today:%B %d, %Y}")

Today's date is February 25, 2022


In [26]:
family = {"dad": "John", "siblings": "Peter"}

print(f"Is your dad called {family['dad']}?")

Is your dad called John?


In [27]:
def my_function(a, b):
    """Return ``a + b`` for any two operands that support ``+``."""
    total = a + b
    return total

print(f"If you sum up 10 and 20 the result is {my_function(10, 20)}")

If you sum up 10 and 20 the result is 30


In [32]:
# Divide the length of list by 120 rounded to two decimals
list_links = ['url1','url2']
print(f"Only {len(list_links)*100/120:.2f}% of the posts contain links")

Only 1.67% of the posts contain links


In [34]:
# Access values of date and price in east dictionary
# NOTE(review): `import datetime` binds the name `datetime` to the *module*.
# Earlier cells in this notebook bind the same name to the class via
# `from datetime import datetime`, so their `datetime.now()` calls would fail
# if re-run after this cell.
import datetime
east = {'date': datetime.datetime(2007, 4, 20, 0, 0), 'price': 1232443}
# "$" is literal text in the f-string; the part after ":" in the second
# replacement field is a strftime-style format spec applied to the datetime
print(f"The price for a house in the east neighborhood was ${east['price']} in {east['date']:%m-%d-%Y}")

The price for a house in the east neighborhood was $1232443 in 04-20-2007


In [35]:
# Access values of date and price in west dictionary
west = {'date': datetime.datetime(2006, 5, 26, 0, 0), 'price': 1432673}
print(f"The price for a house in the west neighborhood was ${west['price']} in {west['date']:%m-%d-%Y}.")

The price for a house in the west neighborhood was $1432673 in 05-26-2006.


#### Templating

In [38]:
from string import Template
job = "Data science"
name = "sexiest job of the 21st century"

my_string = Template('$title has been called $description')
my_string.substitute(title=job, description=name)

'Data science has been called sexiest job of the 21st century'

In [39]:
my_string = Template('I find Python very ${noun}ing but my sister has lost $noun')
my_string.substitute(noun="interest")

'I find Python very interesting but my sister has lost interest'

In [45]:
my_string = Template('I paid for the Python course only $$ $price, amazing!')
my_string.substitute(price="12.50")

'I paid for the Python course only $ 12.50, amazing!'

#### Safe substitution

In [54]:
favorite = dict(flavor="chocolate")
my_string = Template('I love $flavor c$ake very much')
my_string.safe_substitute(favorite)

'I love chocolate c$ake very much'

In [55]:
favorite = dict(flavor="chocolate")
my_string = Template('I love $flavor $cake very much')
my_string.substitute(favorite)

KeyError: 'cake'

#### Regular expressions

 - A regular expression is a string that contains normal characters and special metacharacters which describe patterns to find text or positions within a text
 - A pattern is a sequence of characters that maps to words or punctuation
 - Regular expressions make it possible to find patterns that would be very difficult to find otherwise
 - They are fast
 
 - You write down some helpful metacharacters to help you later:

    - \d: digit
    - \w: word character
    - \W: non-word character
    - \s: whitespace

In [56]:
# Import the re module
import re

sentiment_analysis = '@robot9! @robot4& I have a good feeling that the show isgoing to be amazing! @robot9$ @robot7%'
# Write the regex
regex = r"@robot\d\W"

# Find all matches of regex
print(re.findall(regex, sentiment_analysis))

['@robot9!', '@robot4&', '@robot9$', '@robot7%']


#### Find the numbers
You pull a list of metacharacters:
 - \d digit
 - \w word character
 - \s whitespace

In [57]:
sentiment_analysis = "Unfortunately one of those moments wasn't a giant squid monster. User_mentions:2, likes: 9, number of retweets: 7"

# Write a regex to obtain user mentions
print(re.findall(r"User_mentions:\d", sentiment_analysis))

# Write a regex to obtain number of likes
print(re.findall(r"likes:\s\d", sentiment_analysis))

# Write a regex to obtain number of retweets
print(re.findall(r"number\sof\sretweets:\s\d", sentiment_analysis))

['User_mentions:2']
['likes: 9']
['number of retweets: 7']


#### Match and split

In [58]:
sentiment_analysis = 'He#newHis%newTin love with$newPscrappy. #8break%He is&newYmissing him@newLalready'

# Sentence separator, e.g. &4break! :
# a non-word character, one digit, "break", a non-word character
regex_sentence = r"\W\dbreak\W"

# Word separator, e.g. #newH :
# a non-word character, "new", one word character
regex_words = r"\Wnew\w"

# First turn every sentence separator into a space ...
sentiment_sub = re.compile(regex_sentence).sub(" ", sentiment_analysis)

# ... then do the same for the word separators and show the cleaned text
sentiment_final = re.compile(regex_words).sub(" ", sentiment_sub)
print(sentiment_final)

He is in love with scrappy.  He is missing him already


#### Repetitions
 - Quantifier: a metacharacter that tells the regex engine how many times to match the character immediately to its left
 - Once or more (+)
 - Zero times or more (*)
 - Zero times or once (?)
 - n times at least, m times at most {n,m}

In [59]:
text = "Date of start: 4-3. Date of registration: 10-04."
re.findall(r"\d+-\d+", text)

['4-3', '10-04']

In [60]:
my_string = "The concert was amazing! @ameli!a @joh&&n @mary90"
re.findall(r"@\w+\W*\w+", my_string)

['@ameli!a', '@joh&&n', '@mary90']

In [61]:
text = "The color of this image is amazing. However, the colour blue could be brighter."
re.findall(r"colou?r", text)

['color', 'colour']

In [62]:
phone_number = "John: 1-966-847-3131 Michelle: 54-908-42-42424"
re.findall(r"\d{1,2}-\d{3}-\d{2,3}-\d{4,}", phone_number)

['1-966-847-3131', '54-908-42-42424']

In [64]:
# Import re module
import re

sentiment_analysis = [
    "Boredd. Colddd @blueKnight39 Internet keeps stuffing up. Save me! https://www.tellyourstory.com",
    "I had a horrible nightmare last night @anitaLopez98 @MyredHat31 which affected my sleep, now I'm really tired",
    "im lonely  keep me company @YourBestCompany! @foxRadio https://radio.foxnews.com 22 female, new york",
]

for tweet in sentiment_analysis:
    # Links: "http" followed by every non-whitespace character after it
    print(re.compile(r"http\S+").findall(tweet))

    # User mentions: "@" followed by one or more word characters
    print(re.compile(r"@\w+").findall(tweet))

['https://www.tellyourstory.com']
['@blueKnight39']
[]
['@anitaLopez98', '@MyredHat31']
['https://radio.foxnews.com']
['@YourBestCompany', '@foxRadio']


#### More Repetitions

In [65]:
sentiment_analysis = [
    "I would like to apologize for the repeated Video Games Live related tweets. 32 minutes ago",
    "@zaydia but i cant figure out how to get there / back / pay for a hotel 1st May 2019",
    "FML: So much for seniority, bc of technological ineptness 23rd June 2018 17:54",
]

# Try three date shapes on every tweet and print one result list per shape:
#   1. relative time            e.g. "32 minutes ago"
#   2. day + month + year       e.g. "1st May 2019"
#   3. the same followed by a time, e.g. "23rd June 2018 17:54"
for date in sentiment_analysis:
    print(
        re.findall(r"\d{1,2}\s\w+\sago", date),
        re.findall(r"\d{1,2}\w+\s\w+\s\d{4}", date),
        re.findall(r"\d{1,2}\w+\s\w+\s\d{4}\s\d{2}:\d{1,2}", date),
        sep="\n",
    )

['32 minutes ago']
[]
[]
[]
['1st May 2019']
[]
[]
['23rd June 2018']
['23rd June 2018 17:54']


#### Getting tokens

In [66]:
sentiment_analysis = 'ITS NOT ENOUGH TO SAY THAT IMISS U #MissYou #SoMuch #Friendship #Forever'

# A hashtag is "#" followed by one or more word characters
regex = r"#\w+"

# Delete every hashtag (the spaces that separated them stay behind)
no_hashtag = re.compile(regex).sub("", sentiment_analysis)

# Tokenise on runs of whitespace; the spaces left behind by the removed
# hashtags produce one empty token at the end of the list
print(re.compile(r"\s+").split(no_hashtag))

['ITS', 'NOT', 'ENOUGH', 'TO', 'SAY', 'THAT', 'IMISS', 'U', '']


#### Search vs match

In [70]:
re.search(r"\d{4}", "4506 people attend the show")

<re.Match object; span=(0, 4), match='4506'>

In [71]:
re.match(r"\d{4}", "4506 people attend the show")

<re.Match object; span=(0, 4), match='4506'>

In [77]:
re.search(r"\d+", "Yesterday, I saw 3 shows")

<re.Match object; span=(17, 18), match='3'>

In [79]:
re.match(r"\d+","Yesterday, I saw 3 shows") # results in None

#### Match with characters

In [80]:
my_links = "Just check out this link: www.amazingpics.com. It has amazing photos!"
re.findall(r"www.+com", my_links)

['www.amazingpics.com']

In [84]:
my_string = "the 80s music was much better that the 90s"
print(re.findall(r"the\s\d+s", my_string))
print(re.findall(r"^the\s\d+s", my_string))
print(re.findall(r"the\s\d+s$", my_string))

['the 80s', 'the 90s']
['the 80s']
['the 90s']


In [85]:
my_string = "Elephants are the world's largest land animal! I would love to see an elephant one day"
re.findall(r"Elephant|elephant", my_string)

['Elephant', 'elephant']

#### Regex metacharacters

 - They appear at the start of the string
 - They always start with a sequence of 2 or 3 upper or lowercase vowels (a e i o u)
 - They always finish with the txt ending

In [67]:
sentiment_analysis = [
    "AIshadowhunters.txt aaaaand back to my literature review. At least i have a friendly cup of coffee to keep me company",
    "ouMYTAXES.txt I am worried that I won't get my $900 even though I paid tax last year",
]

# File name at the very start of the string: 2-3 vowels (either case),
# then one or more of any character, then "txt"
regex = r"^[aeiouAEIOU]{2,3}.+txt"

for text in sentiment_analysis:
    # First line: the file names found; second line: the text with them removed
    print(re.findall(regex, text), re.sub(regex, "", text), sep="\n")

['AIshadowhunters.txt']
 aaaaand back to my literature review. At least i have a friendly cup of coffee to keep me company
['ouMYTAXES.txt']
 I am worried that I won't get my $900 even though I paid tax last year


#### Regex metacharacters examples

In [68]:
emails = ['n.john.smith@gmail.com', '87victory@hotmail.com', '!#mary-=@msca.net']

# Regex for a valid email address, anchored at BOTH ends so the whole string
# has to match:
#   ^[a-zA-Z0-9!#%&*$.]+   one or more allowed characters before the @
#   @\w+\.com$             a word-character domain ending exactly in ".com"
# Without the trailing $, re.match would also accept e.g. "john@gmail.community".
regex = r"^[a-zA-Z0-9!#%&*$.]+@\w+\.com$"

for example in emails:
    # Match the regex to the string
    if re.match(regex, example):
        # Complete the format method to print out the result
        print("The email {email_example} is a valid email".format(email_example=example))
    else:
        print("The email {email_example} is invalid".format(email_example=example))

The email n.john.smith@gmail.com is a valid email
The email 87victory@hotmail.com is a valid email
The email !#mary-=@msca.net is invalid


In [69]:
passwords = ['Apple34!rose', 'My87hou#4$', 'abc123']

# Regex for a valid password: 8 to 20 characters, all taken from the allowed
# set. The ^ and $ anchors make the WHOLE password obey the rule; without them
# re.search accepts any string that merely contains a valid run of 8
# characters (too-long passwords, or ones with spaces or other illegal
# characters).
regex = r"^[a-zA-Z0-9*#$%!&.]{8,20}$"

for example in passwords:
    # Scan the strings to find a match
    if re.search(regex, example):
        # Complete the format method to print out the result
        print("The password {pass_example} is a valid password".format(pass_example=example))
    else:
        print("The password {pass_example} is invalid".format(pass_example=example))

The password Apple34!rose is a valid password
The password My87hou#4$ is a valid password
The password abc123 is invalid


#### Greedy and lazy matching

 - Greedy
     - match as many characters as possible
     - return longest match
     - backtracks when it finds too many characters
 - Lazy
     - match as few characters as possible
     - return shortest match
     - backtracks when too few characters are matched

In [86]:
string = 'I want to see that <strong>amazing show</strong> again!.'

# Write a regex to eliminate tags
string_notags = re.sub(r"<.+?>", "", string)

# Print out the result
print(string_notags)

I want to see that amazing show again!.


In [87]:
sentiment_analysis = 'Was intending to finish editing my 536-page novel manuscript tonight, but that will probably not happen. And only 12 pages are left '

# Write a lazy regex expression 
numbers_found_lazy = re.findall(r"\d+?", sentiment_analysis)

# Print out the result
print(numbers_found_lazy)


# Write a greedy regex expression 
numbers_found_greedy = re.findall(r"\d+", sentiment_analysis)

# Print out the result
print(numbers_found_greedy)

['5', '3', '6', '1', '2']
['536', '12']


In [88]:
sentiment_analysis = "Put vacation photos online (They were so cute) a few yrs ago. PC crashed, and now I forget the name of the site (I'm crying). "

# Write a greedy regex expression to match text that appears within parentheses in the variable
sentences_found_greedy = re.findall(r"\(.*\)", sentiment_analysis)

# Print out the result
print(sentences_found_greedy)

# Write a lazy regex expression to match text that appears within parentheses in the variable 
sentences_found_lazy = re.findall(r"\(.*?\)", sentiment_analysis)

# Print out the results
print(sentences_found_lazy)

["(They were so cute) a few yrs ago. PC crashed, and now I forget the name of the site (I'm crying)"]
['(They were so cute)', "(I'm crying)"]


#### Capturing groups

In [89]:
re.findall(r'([A-Za-z]+)\s\w+\s(\d+)\s(\w+)', 
           "Clary has 2 dogs but John has 3 cats")


[('Clary', '2', 'dogs'), ('John', '3', 'cats')]

In [90]:
re.search(r"(\d[A-Za-z])+", "My user name is 3e4r5fg")

<re.Match object; span=(16, 22), match='3e4r5f'>

Capturing repeated groups vs repeating a capturing group

In [95]:
my_string = "My lucky numbers are 8755 and 33"
re.findall(r"(\d)+", my_string)

['5', '3']

In [96]:
re.findall(r"(\d+)", my_string)

['8755', '33']

 - Complete the regex to match the email capturing only the name part. The name part appears before the @.
 - Find all matches of the regex in each element of sentiment_analysis. Assign it to the variable email_matched.
 - Complete the .format() method to print the results captured in each element of sentiment_analysis.

In [97]:
sentiment_analysis = [
    'Just got ur newsletter, those fares really are unbelievable. Write to statravelAU@gmail.com or statravelpo@hotmail.com. They have amazing prices',
    'I should have paid more attention when we covered photoshop in my webpage design class in undergrad. Contact me Hollywoodheat34@msn.net.',
    'hey missed ya at the meeting. Read your email! msdrama098@hotmail.com',
]

# Capture only the name part (letters and digits before the "@");
# the rest of the address is matched but not captured
regex_email = r"([A-Za-z0-9]+)@\S+"

for tweet in sentiment_analysis:
    # With a single capturing group, findall returns just the captured names
    email_matched = re.compile(regex_email).findall(tweet)

    # Report the names found in this tweet
    print("Lists of users found in this tweet: {}".format(email_matched))

Lists of users found in this tweet: ['statravelAU', 'statravelpo']
Lists of users found in this tweet: ['Hollywoodheat34']
Lists of users found in this tweet: ['msdrama098']


<<Here you have your boarding pass LA4214 AER-CDB 06NOV.>>

You need to extract the information about the flight:

 - The two letters indicate the airline (e.g LA),
 - The 4 numbers are the flight number (e.g. 4214).
 - The three letters correspond to the departure (e.g AER),
 - The destination (CDB),
 - The date (06NOV) of the flight.

In [98]:
flight = 'Subject: You are now ready to fly. Here you have your boarding pass IB3723 AMS-MAD 06OCT'

# Five capturing groups, in order:
#   airline (2 letters), flight number (4 digits),
#   departure and destination (3 letters each, joined by "-"),
#   date (2 digits followed by 3 letters)
regex = r"([A-Z]{2})(\d{4})\s([A-Z]{3})-([A-Z]{3})\s(\d{2}[A-Z]{3})"

# findall returns one tuple of captured groups per match
flight_matches = re.findall(regex, flight)

# Print the matches, unpacking slices of the first match into each template
print("Airline: {} Flight number: {}".format(*flight_matches[0][:2]))
print("Departure: {} Destination: {}".format(*flight_matches[0][2:4]))
print("Date: {}".format(flight_matches[0][4]))

Airline: IB Flight number: 3723
Departure: AMS Destination: MAD
Date: 06OCT


#### Pipe

In [101]:
my_string = "I want to have a pet. But I don't know if I want 2 cats, 1 dog or a bird."
re.findall(r"\d+\scat|dog|bird", my_string)

['2 cat', 'dog', 'bird']

#### Alternation

 - Use groups to choose between optional patterns

In [99]:
my_string = "I want to have a pet. But I don't know if I want 2 cats, 1 dog or a bird."
# The quantifier goes INSIDE the group: (\d+) captures the whole number,
# whereas (\d)+ repeats the group and keeps only its last digit
# (e.g. "12 cats" would yield '2').
re.findall(r"(\d+)\s(cat|dog|bird)", my_string)

[('2', 'cat'), ('1', 'dog')]

#### Match but not capture a group

In [100]:
my_string = "John Smith: 34-34-34-042-980, Rebeca Smith: 10-10-10-434-425"
re.findall(r"(?:\d{2}-){3}(\d{3}-\d{3})", my_string)

['042-980', '434-425']

 - Use | between the optional words you want to capture inside the parentheses ()
 - Use the . metacharacter together with the plus quantifier if you want to match any type of character
 - Consider using a non-greedy quantifier adding ?
 
 
 - Complete the regular expression to capture the words love or like or enjoy. Match and capture the words movie or concert. Match and capture anything appearing until the .
 - Find all matches of the regex in each element of sentiment_analysis. Assign them to positive_matches
 - Complete the .format() method to print out the results contained in positive_matches for each element in sentiment_analysis

In [102]:
sentiment_analysis = [
    'I totally love the concert The Book of Souls World Tour. It kinda amazing!',
    'I enjoy the movie Wreck-It Ralph. I watched with my boyfriend.',
    "I still like the movie Wish Upon a Star. Too bad Disney doesn't show it anymore.",
]

# Three capturing groups: the positive verb, the kind of event, and the title
# (everything, lazily, up to the next ".")
regex_positive = r"(love|like|enjoy).+?(movie|concert)\s(.+?)\."

for tweet in sentiment_analysis:
    # One (verb, kind, title) tuple per match in this tweet
    positive_matches = re.compile(regex_positive).findall(tweet)

    # Report what was found
    print("Positive comments found {}".format(positive_matches))

Positive comments found [('love', 'concert', 'The Book of Souls World Tour')]
Positive comments found [('enjoy', 'movie', 'Wreck-It Ralph')]
Positive comments found [('like', 'movie', 'Wish Upon a Star')]


 - Complete the regular expression to capture the words hate or dislike or disapprove. Match but don't capture the words movie or concert. Match and capture anything appearing until the .

In [104]:
sentiment_analysis = [
    'That was horrible! I really dislike the movie The cabin and the ant. So boring.',
    "I disapprove the movie Honest with you. It's full of cliches.",
    'I dislike very much the concert After twelve Tour. The sound was horrible.',
]

# Capture the negative verb and the title; (?:...) matches "movie"/"concert"
# without capturing it, so each result holds only two items
regex_negative = r"(hate|dislike|disapprove).+?(?:movie|concert)\s(.+?)\."

for tweet in sentiment_analysis:
    # One (verb, title) tuple per match in this tweet
    negative_matches = re.compile(regex_negative).findall(tweet)

    # Report what was found
    print("Negative comments found {}".format(negative_matches))

Negative comments found [('dislike', 'The cabin and the ant')]
Negative comments found [('disapprove', 'Honest with you')]
Negative comments found [('dislike', 'After twelve Tour')]


#### Numbered groups

In [105]:
text = "Python 3.0 was released on 12-03-2008."
# Raw string, so that \d reaches the regex engine untouched instead of being
# read as an (invalid) string escape sequence
information = re.search(r'(\d{1,2})-(\d{2})-(\d{4})', text)
# Groups are numbered from 1, left to right; group 3 is the four-digit year
information.group(3)

'2008'

#### Named groups

In [106]:
text = "Austin, 78701"
cities = re.search(r"(?P<city>[A-Za-z]+).*?(?P<zipcode>\d{5})", text)
cities.group("city")

'Austin'

In [107]:
cities.group("zipcode")

'78701'

#### Backreferences
 - Capture sequences of characters that were previously captured

In [108]:
 sentence = "I wish you a happy happy birthday!"
re.findall(r"(\w+)\s\1", sentence)

['happy']

In [111]:
# Replace the entire expression match with the first group
re.sub(r"(\w+)\s\1", r"\1", sentence)

'I wish you a happy birthday!'

In [112]:
sentence = "Your new code number is 23434. Please, enter 23434 to open the door."
re.findall(r"(?P<code>\d{5}).*?(?P=code)", sentence)

['23434']

In [113]:
sentence = "This app is not working! It's repeating the last word word."
re.sub(r"(?P<word>\w+)\s(?P=word)", r"\g<word>", sentence)

"This app is not working! It's repeating the last word."

In [116]:
# Adjacent string literals are concatenated; each piece ends with an explicit
# space so that words are not glued together where the text is wrapped
contract = (
    'Provider will invoice Client for Services performed within 30 days of performance. '
    'Client will pay Provider as set forth in each Statement of Work within 30 days of receipt and acceptance of '
    'such invoice. It is understood that payments to Provider for services rendered shall be made in full as agreed, '
    'without any deductions for taxes of any kind whatsoever, in conformity with Provider’s status as an independent '
    'contractor. Signed on 03/25/2001.'
)

# Write regex and scan contract to capture the dates described.
# The date is written month/day/year, so group 1 is the month and group 2 the day.
regex_dates = r"Signed\son\s(\d{2})/(\d{2})/(\d{4})"
dates = re.search(regex_dates, contract)

# Assign to each key the corresponding match
signature = {
    "day": dates.group(2),
    "month": dates.group(1),
    "year": dates.group(3)
}
# Complete the format method to print-out
print("Our first contract is dated back to {data[year]}. Particularly, the day {data[day]} of the month {data[month]}.".format(data=signature))

Our first contract is dated back to 2001. Particularly, the day 25 of the month 03.


In [117]:
html_tags = [
    '<body>Welcome to our course! It would be an awesome experience</body>',
    '<article>To be a data scientist, you need to have knowledge in statistics and mathematics</article>',
    '<nav>About me Links Contact me!',
]

for string in html_tags:

    # A closed tag: <name>, any content (lazily), then </name>, where the
    # backreference \1 forces the closing name to equal the opening one
    match_tag = re.match(r"<(\w+)>.*?</\1>", string)

    if not match_tag:
        # No matching closing tag: capture just the opening tag's name
        notmatch_tag = re.match(r"<(\w+)>", string)
        print("Close your {} tag!".format(notmatch_tag.group(1)))
    else:
        # Closed properly: report the captured tag name
        print("Your tag {} is closed".format(match_tag.group(1)))

Your tag body is closed
Your tag article is closed
Close your nav tag!


#### Reeepeated characters

 - Back to your sentiment analysis! Your next task is to replace elongated words that appear in the tweets. We define an elongated word as a word that contains a character repeated two or more times in a row, e.g. "Awesoooome".

 - Replacing those words is very important since a classifier will treat them as a different term from the source words lowering their frequency.

 - To find them, you will use capturing groups and reference them back using numbers. E.g \4.

 - If you want to find a match for Awesoooome. You first need to capture Awes. Then, match o and reference the same character back, and then, me.

In [118]:
sentiment_analysis = [
    '@marykatherine_q i know! I heard it this morning and wondered the same thing. Moscooooooow is so behind the times',
    'Staying at a friends house...neighborrrrrrrs are so loud-having a party',
    'Just woke up an already have read some e-mail',
]

# One or more word characters, then a captured character immediately followed
# by itself (\1), then the rest of the word
regex_elongated = r"\w+(\w)\1\w*"

for tweet in sentiment_analysis:
    # Look for the first elongated word anywhere in the tweet
    match_elongated = re.search(regex_elongated, tweet)

    if not match_elongated:
        print("No elongated word found")
    else:
        # Group 0 is the whole match, i.e. the complete word
        elongated_word = match_elongated.group(0)

        print("Elongated word found: {word}".format(word=elongated_word))

Elongated word found: Moscooooooow
Elongated word found: neighborrrrrrrs
No elongated word found


#### Lookaround

In [119]:
# Positive look ahead

my_text = "tweets.txt transferred, mypass.txt transferred, keywords.txt error"
re.findall(r"\w+\.txt(?=\stransferred)", my_text)

['tweets.txt', 'mypass.txt']

In [121]:
# Negative look ahead

my_text = "tweets.txt transferred, mypass.txt transferred, keywords.txt error"
re.findall(r"\w+\.txt(?!\stransferred)", my_text)


['keywords.txt']

In [122]:
# Positive look behind

my_text = "Member: Angus Young, Member: Chris Slade, Past: Malcolm Young, Past: Cliff Williams."
re.findall(r"(?<=Member:\s)\w+\s\w+", my_text)

['Angus Young', 'Chris Slade']

In [123]:
# Negative look behind

my_text = "My white cat sat at the table. However, my brown dog was lying on the couch."
re.findall(r"(?<!brown\s)(cat|dog)", my_text)

['cat']

Positive lookahead (?=) makes sure that the first part of the expression is followed by the lookahead expression

Positive lookbehind (?<=) returns all matches that are preceded by the specified pattern

In [125]:
sentiment_analysis = 'You need excellent python skills to be a data scientist. Must be! Excellent python'

# Positive lookahead
look_ahead = re.findall(r"\w+(?=\spython)", sentiment_analysis)

# Print out
print(look_ahead)

['excellent', 'Excellent']


In [126]:
# Positive lookbehind
look_behind = re.findall(r"(?<=[Pp]ython\s)\w+", sentiment_analysis)

# Print out
print(look_behind)

['skills']
