## String Manipulation and Regular Expressions

#### Capitalising letters

In [2]:
# Mixed-case sample text
my_string = "tHis Is a niCe StriNg"

# capitalize() upper-cases the first character and lower-cases all the others
my_string.capitalize()

'This is a nice string'

#### Splitting

In [3]:
my_string = "This string will be split"

# Split on single spaces starting from the left, performing at most two splits
my_string.split(sep=" ", maxsplit=2)

['This', 'string', 'will be split']

#### Right splitting

In [4]:
# Same separator and limit, but the two splits are taken from the right end
my_string.rsplit(sep=" ", maxsplit=2)

['This string will', 'be', 'split']

#### Splitting lines

In [5]:
# The sample contains a "\n", so there is a line boundary to split on
my_string = "This string will be split\nin two"

# One list element per line; the line break itself is not kept
my_string.splitlines()

['This string will be split', 'in two']

#### Joining

In [6]:
my_list = ["this", "would", "be", "a", "string"]
# Concatenate the items, inserting a single space between neighbours
print(" ".join(my_list))

this would be a string


#### Stripping characters

In [7]:
# Sample with a leading space and a trailing newline
my_string = " This string will be stripped\n"
# Called without arguments, strip() removes whitespace from both ends
my_string.strip()

'This string will be stripped'

#### Remove characters from the right end

In [8]:
# Trim the right end only: the trailing newline goes, the leading space stays
my_string.rstrip()

' This string will be stripped'

#### Remove characters from the left end

In [9]:
# Trim the left end only: the leading space goes, the trailing newline stays
my_string.lstrip()

'This string will be stripped\n'

#### Example

In [10]:
movie = '$I supposed that coming from MTV Films I should expect no less$'

# Normalise the review to lowercase
movie_lower = movie.lower()
# Trim the "$" markers wrapped around the text
movie_no_sign = movie_lower.strip("$")
# Break the cleaned text into whitespace-separated words
movie_split = movie_no_sign.split()
# Second word without its final letter: "supposed" -> "suppose"
word_root = movie_split[1][:-1]

# Show every intermediate result on its own line, in processing order
print(movie_lower, movie_no_sign, movie_split, word_root, sep="\n")

$i supposed that coming from mtv films i should expect no less$
i supposed that coming from mtv films i should expect no less
['i', 'supposed', 'that', 'coming', 'from', 'mtv', 'films', 'i', 'should', 'expect', 'no', 'less']
suppose


In [11]:
movie = 'the film,however,is all good<\\i>'
# Remove tags happening at the end and print results.
# The backslash is escaped because "\i" is not a valid escape sequence and
# makes Python emit a warning. rstrip() treats its argument as a set of
# characters to drop from the right end (here "<", "\", "i", ">"), not as a
# literal suffix.
movie_tag = movie.rstrip("<\\i>")
print(movie_tag)

# Split the string using commas and print results
movie_no_comma = movie_tag.split(",")
print(movie_no_comma)

# Join back together and print results
movie_join = " ".join(movie_no_comma)
print(movie_join)

the film,however,is all good
['the film', 'however', 'is all good']
the film however is all good


#### Finding substrings with .find(), .index()

In [1]:
my_string = "Where's Waldo?"
# find() returns the index at which the substring starts
my_string.find("Waldo")

8

In [2]:
# A substring that does not occur makes find() return -1
my_string.find("Wenda")

-1

In [3]:
# Restrict the search to my_string[0:6]; "Waldo" lies outside that range, so the result is -1
my_string.find("Waldo", 0, 6)

-1

In [5]:
# index() reports the same position as find() when the substring is present
my_string.index("Waldo")

8

In [7]:
# index() raises ValueError when the substring is absent (find() returns -1
# instead), so the failure is caught explicitly and reported.
try:
    my_string.index("Wenda")
except ValueError:
    print("Not found")

Not found


#### Counting occurrences

In [8]:
my_string = "How many fruits do you have in your fruit basket?"
# Number of occurrences of "fruit"; the word "fruits" contains it as well
my_string.count("fruit")

2

In [9]:
# Count only within my_string[0:16], which covers just the first occurrence
my_string.count("fruit", 0, 16)

1

#### Replacing sub-strings

In [10]:
my_string = "The red house is between the blue house and the old house"
# Replace every occurrence of "house" with "car"
print(my_string.replace("house", "car"))

The red car is between the blue car and the old car


In [11]:
# The third argument caps the number of replacements: only the first two occurrences change
print(my_string.replace("house", "car", 2))

The red car is between the blue car and the old house


#### Positional formatting

In [2]:
wikipedia_article = 'In computer science, artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals.'

# Slice out the two phrases of interest and lowercase them
first_pos = wikipedia_article[3:19].lower()
second_pos = wikipedia_article[21:44].lower()

# Two templates: automatic placeholder numbering, then explicitly swapped indices
my_list = [
    "The tool {} is used in {}",
    "The tool {1} is used in {0}",
]

# Fill every template with the same two positional arguments
for my_string in my_list:
    print(my_string.format(first_pos, second_pos))

The tool computer science is used in artificial intelligence
The tool artificial intelligence is used in computer science


#### Reordering values

In [3]:
# Numbered placeholders pick arguments by position: {0} is "Betty", {1} is "Linda", {2} is "Daisy"
print("{2} has a friend called {0} and a sister called {1}".format("Betty", "Linda", "Daisy"))

Daisy has a friend called Betty and a sister called Linda


#### Named placeholders

In [4]:
# Values to plug into the sentence
tool = "Unsupervised algorithms"
goal = "patterns"

# Placeholder names (title, aim) are independent of the variables bound to them
print("{title} try to find {aim} in the dataset".format(aim=goal, title=tool))

Unsupervised algorithms try to find patterns in the dataset


In [6]:
my_methods = {"tool": "Unsupervised algorithms", "goal": "patterns"}
# Inside a placeholder, data[key] looks the key up in the dictionary passed as `data` (no quotes around the key)
print('{data[tool]} try to find {data[goal]} in the dataset'.format(data=my_methods))

Unsupervised algorithms try to find patterns in the dataset


In [10]:
courses = ['artificial intelligence', 'neural networks']

# Map each role to its course: the first entry is the field, the second the tool
plan = dict(field=courses[0], tool=courses[1])

# Template whose placeholders index into the dictionary passed as `data`
my_message = "If you are interested in {data[field]}, you can take the course related to {data[tool]}"

# Render the template with the plan dictionary
print(my_message.format(data=plan))

If you are interested in artificial intelligence, you can take the course related to neural networks


#### Formatting strings

In [7]:
# {0:.2f} renders the first argument as a float with two decimals; {1} and {2} are inserted unchanged
print("Only {0:.2f}% of the {1} produced worldwide is {2}!".format(0.5155675, "data", "analyzed"))

Only 0.52% of the data produced worldwide is analyzed!


#### Date strings

In [9]:
from datetime import datetime
# Default string form of the current timestamp
print(datetime.now())

# The text after ":" in the placeholder is a date/time pattern applied to the datetime value
print("Today's date is {:%Y-%m-%d %H:%M}".format(datetime.now()))

2022-02-25 14:09:26.569583
Today's date is 2022-02-25 14:09


In [11]:
# Import the datetime class
from datetime import datetime

# Capture the current moment once so both placeholders below show the same instant
get_date = datetime.now()

# The named placeholder `today` is used twice, each time with its own format spec
message = "Good morning. Today is {today:%B %d, %Y}. It's {today:%H:%M} ... time to work!"

# Fill the `today` placeholder with the captured datetime
print(message.format(today=get_date))

Good morning. Today is February 25, 2022. It's 14:14 ... time to work!


#### Formatted literal strings

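 - `!s` calls `str()` on the value (a string is shown without quotes)
 - `!r` calls `repr()` on the value (a string is shown with quotes)
 - `!a` calls `ascii()` on the value (like `repr()`, but non-ASCII characters are escaped)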

In [19]:
name = "Python"
# !s inserts the string as-is (no quotes in the output)
print(f"Python is called {name!s} due to a comedy series")

# !r inserts the quoted representation of the string
print(f"Python is called {name!r} due to a comedy series")

Python is called Python due to a comedy series
Python is called 'Python' due to a comedy series


In [24]:
number = 90.41890417471841

# :.2f formats the float with two decimal places inside the f-string
print(f"In the last 2 years, {number:.2f}% of the data was produced worldwide!")

In the last 2 years, 90.42% of the data was produced worldwide!


In [25]:
from datetime import datetime

# Current timestamp; the printed date therefore depends on when the cell runs
my_today = datetime.now()

# Date patterns work in f-strings the same way as in str.format()
print(f"Today's date is {my_today:%B %d, %Y}")

Today's date is February 25, 2022


In [26]:
family = {"dad": "John", "siblings": "Peter"}

# Dictionary keys inside an f-string need quotes; single quotes are used because the f-string itself uses double quotes
print(f"Is your dad called {family['dad']}?")

Is your dad called John?


In [27]:
def my_function(a, b):
    """Return the result of ``a + b``."""
    total = a + b
    return total

# Any expression, including a function call, may appear inside the braces
print(f"If you sum up 10 and 20 the result is {my_function(10, 20)}")

If you sum up 10 and 20 the result is 30


In [32]:
# Percentage of posts containing links: number of links times 100, divided by 120,
# shown with two decimals (120 is presumably the total number of posts - confirm)
list_links = ['url1','url2']
print(f"Only {len(list_links)*100/120:.2f}% of the posts contain links")

Only 1.67% of the posts contain links


In [34]:
# Access values of date and price in east dictionary
# NOTE(review): earlier cells use `from datetime import datetime`; this import
# rebinds the name `datetime` to the module, hence `datetime.datetime(...)` below.
import datetime
east = {'date': datetime.datetime(2007, 4, 20, 0, 0), 'price': 1232443}
# The "$" before the first placeholder is a literal character in the output
print(f"The price for a house in the east neighborhood was ${east['price']} in {east['date']:%m-%d-%Y}")

The price for a house in the east neighborhood was $1232443 in 04-20-2007


In [35]:
# Access values of date and price in west dictionary
# Relies on `datetime` referring to the module (bound by `import datetime` in an earlier cell)
west = {'date': datetime.datetime(2006, 5, 26, 0, 0), 'price': 1432673}
print(f"The price for a house in the west neighborhood was ${west['price']} in {west['date']:%m-%d-%Y}.")

The price for a house in the west neighborhood was $1432673 in 05-26-2006.


#### Templating

In [38]:
from string import Template
job = "Data science"
name = "sexiest job of the 21st century"

# $title and $description are placeholders; substitute() fills them in by keyword
my_string = Template('$title has been called $description')
my_string.substitute(title=job, description=name)

'Data science has been called sexiest job of the 21st century'

In [39]:
# Braces delimit the placeholder name when it is directly followed by more letters ("${noun}ing")
my_string = Template('I find Python very ${noun}ing but my sister has lost $noun')
my_string.substitute(noun="interest")

'I find Python very interesting but my sister has lost interest'

In [45]:
# "$$" produces a literal dollar sign; "$price" is a regular placeholder
my_string = Template('I paid for the Python course only $$ $price, amazing!')
my_string.substitute(price="12.50")

'I paid for the Python course only $ 12.50, amazing!'

#### Safe substitution

In [54]:
favorite = dict(flavor="chocolate")
# "$ake" is read as a placeholder, but `favorite` holds no value for it
my_string = Template('I love $flavor c$ake very much')
# safe_substitute() leaves placeholders without a value untouched instead of raising
my_string.safe_substitute(favorite)

'I love chocolate c$ake very much'

In [55]:
favorite = dict(flavor="chocolate")
my_string = Template('I love $flavor $cake very much')

# substitute() is strict: $cake has no value in `favorite`, so it raises
# KeyError. The error is caught and shown so that the notebook still runs from
# top to bottom instead of stopping on an unhandled exception.
try:
    my_string.substitute(favorite)
except KeyError as missing_key:
    print(f"KeyError: {missing_key}")

KeyError: 'cake'

#### Regular expressions

 - A regular expression is a string that contains normal characters and special metacharacters which describe patterns to find text or positions within a text
 - A pattern is a sequence of characters that maps to words or punctuation
 - Regexes make it possible to find patterns that would be very difficult to find otherwise
 - They are fast
 
 - You write down some helpful metacharacters to help you later:

    - \d: digit
    - \w: word character
    - \W: non-word character
    - \s: whitespace

In [56]:
# Import the re module
import re

sentiment_analysis = '@robot9! @robot4& I have a good feeling that the show isgoing to be amazing! @robot9$ @robot7%'
# Write the regex: the literal "@robot", then one digit (\d), then one non-word character (\W)
regex = r"@robot\d\W"

# Find all matches of regex
print(re.findall(regex, sentiment_analysis))

['@robot9!', '@robot4&', '@robot9$', '@robot7%']


#### Find the numbers
You pull a list of metacharacters:
 - \d digit
 - \w word character
 - \s whitespace

In [57]:
sentiment_analysis = "Unfortunately one of those moments wasn't a giant squid monster. User_mentions:2, likes: 9, number of retweets: 7"

# One pattern per statistic, each result printed on its own line:
#   1. user mentions - a digit directly after the colon
#   2. likes         - whitespace between the colon and the digit
#   3. retweets      - whitespace between every word and before the digit
print(
    re.findall(r"User_mentions:\d", sentiment_analysis),
    re.findall(r"likes:\s\d", sentiment_analysis),
    re.findall(r"number\sof\sretweets:\s\d", sentiment_analysis),
    sep="\n",
)

['User_mentions:2']
['likes: 9']
['number of retweets: 7']


#### Match and split

In [58]:
sentiment_analysis = 'He#newHis%newTin love with$newPscrappy. #8break%He is&newYmissing him@newLalready'

# Separator between sentences, e.g. "&4break!": non-word char, digit, "break", non-word char
regex_sentence = r"\W\dbreak\W"
# Separator between words, e.g. "#newH": non-word char, "new", one word char
regex_words = r"\Wnew\w"

# Turn the sentence separators into spaces first, then the word separators
sentiment_sub = re.sub(regex_sentence, " ", sentiment_analysis)
sentiment_final = re.sub(regex_words, " ", sentiment_sub)
print(sentiment_final)

He is in love with scrappy.  He is missing him already


#### Repetitions
 - Quantifiers are metacharacters that tell the regex engine how many times to match the character immediately to their left
 - Once or more (+)
 - Zero times or more (*)
 - Zero times or once (?)
 - n times at least, m times at most {n,m}

In [59]:
text = "Date of start: 4-3. Date of registration: 10-04."
# One or more digits, a hyphen, then one or more digits
re.findall(r"\d+-\d+", text)

['4-3', '10-04']

In [60]:
my_string = "The concert was amazing! @ameli!a @joh&&n @mary90"
# "@", word characters, then zero or more non-word characters, then word characters again
re.findall(r"@\w+\W*\w+", my_string)

['@ameli!a', '@joh&&n', '@mary90']

In [61]:
text = "The color of this image is amazing. However, the colour blue could be brighter."
# "u?" makes the letter u optional, so both spellings match
re.findall(r"colou?r", text)

['color', 'colour']

In [62]:
phone_number = "John: 1-966-847-3131 Michelle: 54-908-42-42424"
# Hyphen-separated groups of 1-2 digits, exactly 3 digits, 2-3 digits, and 4 or more digits
re.findall(r"\d{1,2}-\d{3}-\d{2,3}-\d{4,}", phone_number)

['1-966-847-3131', '54-908-42-42424']

In [64]:
sentiment_analysis = [
    "Boredd. Colddd @blueKnight39 Internet keeps stuffing up. Save me! https://www.tellyourstory.com",
    "I had a horrible nightmare last night @anitaLopez98 @MyredHat31 which affected my sleep, now I'm really tired",
    "im lonely  keep me company @YourBestCompany! @foxRadio https://radio.foxnews.com 22 female, new york",
]

# Import re module
import re

for tweet in sentiment_analysis:
    # First line: links - "http" plus everything up to the next whitespace.
    # Second line: user mentions - "@" followed by one or more word characters.
    print(
        re.findall(r"http\S+", tweet),
        re.findall(r"@\w+", tweet),
        sep="\n",
    )

['https://www.tellyourstory.com']
['@blueKnight39']
[]
['@anitaLopez98', '@MyredHat31']
['https://radio.foxnews.com']
['@YourBestCompany', '@foxRadio']


#### More Repetitions

In [65]:
sentiment_analysis = [
    "I would like to apologize for the repeated Video Games Live related tweets. 32 minutes ago",
    "@zaydia but i cant figure out how to get there / back / pay for a hotel 1st May 2019",
    "FML: So much for seniority, bc of technological ineptness 23rd June 2018 17:54"

]

# Complete the for loop with a regex to find dates
# Note: the loop variable `date` holds a whole tweet; three patterns are tried on each one
for date in sentiment_analysis:
    # Relative time: one or two digits, a word, then "ago" (e.g. "32 minutes ago")
    print(re.findall(r"\d{1,2}\s\w+\sago", date))

    # Day with suffix, month word, four-digit year (e.g. "1st May 2019")
    print(re.findall(r"\d{1,2}\w+\s\w+\s\d{4}", date))

    # Same date pattern followed by a time (e.g. "23rd June 2018 17:54")
    print(re.findall(r"\d{1,2}\w+\s\w+\s\d{4}\s\d{2}:\d{1,2}", date))

['32 minutes ago']
[]
[]
[]
['1st May 2019']
[]
[]
['23rd June 2018']
['23rd June 2018 17:54']


#### Getting tokens

In [66]:
sentiment_analysis = 'ITS NOT ENOUGH TO SAY THAT IMISS U #MissYou #SoMuch #Friendship #Forever'

# Write a regex matching the hashtag pattern: "#" followed by one or more word characters
regex = r"#\w+"

# Replace the regex by an empty string
no_hashtag = re.sub(regex, "", sentiment_analysis)

# Get tokens by splitting text on runs of whitespace.
# The spaces that surrounded the removed hashtags remain at the end of the
# string, so the resulting list ends with an empty string.
print(re.split(r"\s+", no_hashtag))

['ITS', 'NOT', 'ENOUGH', 'TO', 'SAY', 'THAT', 'IMISS', 'U', '']


#### Regex metacharacters