# Exercise: Regular Expressions

In [2]:
import re

In [None]:
#1 Question: Match all email addresses (SOLVED)

inputs = ["My email is john@email.com and mary@otherplace.net",
          "Visit us at support@ourcompany.co.uk for help",
          "No emails here"]

pattern = r"(\w+)@(\w+)\.(\w+)"

matches = []
for input in inputs:
    matches += re.findall(pattern, input)

for m in matches:
    username = m[0]
    mailserver = m[1]
    domain = m[2]
    print(f"Username: {username}")
    print(f"Mailserver: {mailserver}")
    print(f"Domain: {domain}")
    print('')

Username: john
Mailserver: email
Domain: com

Username: mary
Mailserver: otherplace
Domain: net

Username: support
Mailserver: ourcompany
Domain: co



In [7]:
#2 Question: Extract domain from email

inputs = ["john@email.com",
          "mary+newsletter@gmail.com",
          "support@ourcompany.co.uk"]

for email in inputs:
    match = re.search(r"@([\w]+)", email)
    if match:
        domain = match.group(1)
        print(domain)

email
gmail
ourcompany


In [8]:
#3 Question: Validate phone number

inputs = ["555-123-4567",
          "1 (234) 567-8910",
          "notaphonenumber"]

for phone_number in inputs:
    match = re.match(r"^\d{3}-\d{3}-\d{4}$|^\d{1} \(\d{3}\) \d{3}-\d{4}", phone_number)

    if match:
        print(f"{phone_number} is a valid phone number.")
    else:
        print(f"{phone_number} is not a valid phone number.")

555-123-4567 is a valid phone number.
1 (234) 567-8910 is a valid phone number.
notaphonenumber is not a valid phone number.


In [None]:
#4 Question: Extract area code

inputs = ["(555) 123-4567",
          "1 (234) 567-8910",
          "5551234567"]

for phone_number in inputs:
    match = re.search(r"\((\d{3})\)|^\d{3}", phone_number)
    if match:
        area_code = match.group(1)
        print(f"The area code in {phone_number} is {area_code}.")
    else:
        print(f"No area code found in {phone_number}.")

The area code in (555) 123-4567 is 555.
The area code in 1 (234) 567-8910 is 234.
The area code in 5551234567 is None.


In [None]:
#5 Question: Match URLs and extract host

inputs = ["Visit https://www.example.com for more info",
          "Our website is example.com",
          "No URLs here"]

for text in inputs:
    matches = re.findall(r"https?://([^\s/?\.]+\.?)+(/[^\s]*)?", text)
    if matches:
        for match in matches:
            host = match[0]
            print(f"The host in \"{text}\" is {host}.")
    else:
        print(f"No URLs found in \"{text}\".")

The host in "Visit https://www.example.com for more info" is com.
No URLs found in "Our website is example.com".
No URLs found in "No URLs here".


In [None]:
#6 Question: Remove non-alphabetic characters

inputs = ["Hello world!",
          "123 Main St.",
          "greetings&more"]

for text in inputs:
    cleaned_text = re.sub(r"[^a-zA-Z]+", "", text)
    print(f"The cleaned text of \"{text}\" is \"{cleaned_text}\".")

The cleaned text of "Hello world!" is "Helloworld".
The cleaned text of "123 Main St." is "MainSt".
The cleaned text of "greetings&more" is "greetingsmore".


In [None]:
#7 Question: Find words containing "tion"

inputs = ["This is a test sentence with the word station in it.",
          "No words containing tion here",
          "motion activation vacation"]

for text in inputs:
    matches = re.findall(r"\b\w*tion\w*\b", text)
    if matches:
        print(f"The words containing 'tion' in \"{text}\" are: {', '.join(matches)}.")
    else:
        print(f"No words containing 'tion' found in \"{text}\".")

The words containing 'tion' in "This is a test sentence with the word station in it." are: station.
The words containing 'tion' in "No words containing tion here" are: tion.
The words containing 'tion' in "motion activation vacation" are: motion, activation, vacation.


In [3]:
#8 Question: Replace all occurrences of "hello" with "goodbye"

inputs = ["hello world",
          "hello there",
          "no match"]

replacement_word = "goodbye"

for index, text in enumerate(inputs):
    # Use regular expressions to replace "hello" with "goodbye"
    inputs[index] = re.sub(r'hello', replacement_word, text)

print(inputs)

['goodbye world', 'goodbye there', 'no match']


In [None]:
#9 Question: Extract date strings in ISO8601 format

inputs = ["Log from 2023-01-15",
          "Meeting on 2023-02-01T13:00:00Z",
          "No dates"]

for text in inputs:
    matches = re.findall(r"\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}Z)?\b", text)
    if matches:
        print(f"The ISO8601 date strings in \"{text}\" are: {', '.join(matches)}.")
    else:
        print(f"No ISO8601 date strings found in \"{text}\".")


The ISO8601 date strings in "Log from 2023-01-15" are: 2023-01-15.
The ISO8601 date strings in "Meeting on 2023-02-01T13:00:00Z" are: 2023-02-01T13:00:00Z.
No ISO8601 date strings found in "No dates".


In [None]:
#10 Question: Validate correctly formatted date
from datetime import datetime


def validate_date(date_string):
    try:
        datetime.strptime(date_string, "%Y-%m-%d")
        return True
    except ValueError:
        return False

inputs = ["2023-01-15", "02/01/2023", "invalid date"]

for date in inputs:
    if validate_date(date):
        print(f"The date \"{date}\" is correctly formatted.")
    else:
        print(f"The date \"{date}\" is not correctly formatted.")

The date "2023-01-15" is correctly formatted.
The date "02/01/2023" is not correctly formatted.
The date "invalid date" is not correctly formatted.


In [11]:
#11 Question: Remove punctuation except hyphens

inputs = ["Hello! World?",
          "123-Main_St.",
          "Hi there."]
def remove_punctuation(text):

    pattern = r"[^\w\s-]"

    cleaned_text = re.sub(pattern, "", text)

    return cleaned_text

for text in inputs:
    cleaned_text = remove_punctuation(text)
    print(f"Input: {text} | Cleaned Text: {cleaned_text}")

Input: Hello! World? | Cleaned Text: Hello World
Input: 123-Main_St. | Cleaned Text: 123-Main_St
Input: Hi there. | Cleaned Text: Hi there


In [None]:
#12 Question: Count occurrences of a word

inputs = ["Hello world. Hello!",
          "Hello hello world",
          "no match"]

# your code here ...:

In [None]:
#13 Question: Extract IP addresses from log

inputs = ["127.0.0.1 - GET /",
          "User logged in from 192.168.1.1",
          "No IPs"]

for text in inputs:
    ip_addresses = re.findall(r"\b(?:\d{1,3}\.){3}\d{1,3}\b", text)
    if ip_addresses:
        print(f"The IP addresses in \"{text}\" are: {', '.join(ip_addresses)}.")
    else:
        print(f"No IP addresses found in \"{text}\".")

The IP addresses in "127.0.0.1 - GET /" are: 127.0.0.1.
The IP addresses in "User logged in from 192.168.1.1" are: 192.168.1.1.
No IP addresses found in "No IPs".


In [None]:
#14 Question: Redact credit card and SSN numbers

inputs = ["Visa: 4111-1111-1111-1111",
          "My SSN is 111-11-1111",
          "No numbers"]

redacted_inputs = []

for text in inputs:
    redacted_text = re.sub(r"\b(?:\d{4}-){3}\d{4}\b|\b\d{3}-\d{2}-\d{4}\b", "[REDACTED]", text)
    redacted_inputs.append(redacted_text)

for i in range(len(inputs)):
    print(f"Original: {inputs[i]}")
    print(f"Redacted: {redacted_inputs[i]}")
    print()

Original: Visa: 4111-1111-1111-1111
Redacted: Visa: [REDACTED]

Original: My SSN is 111-11-1111
Redacted: My SSN is [REDACTED]

Original: No numbers
Redacted: No numbers

