# Exercise: Regular Expressions

In [2]:
import re

In [None]:
#1 Question: Match all email addresses (SOLVED)

inputs = ["My email is john@email.com and mary@otherplace.net",
          "Visit us at support@ourcompany.co.uk for help",
          "No emails here"]

pattern = r"(\w+)@(\w+)\.(\w+)"

matches = []
for input in inputs:
    matches += re.findall(pattern, input)

for m in matches:
    username = m[0]
    mailserver = m[1]
    domain = m[2]
    print(f"Username: {username}")
    print(f"Mailserver: {mailserver}")
    print(f"Domain: {domain}")
    print('')

Username: john
Mailserver: email
Domain: com

Username: mary
Mailserver: otherplace
Domain: net

Username: support
Mailserver: ourcompany
Domain: co



In [3]:
#2 Question: Extract domain from email

inputs = ["john@email.com",
          "mary+newsletter@gmail.com",
          "support@ourcompany.co.uk"]


pattern = r'@(.+)$'
domains = []
for email in inputs:
    match = re.search(pattern, email)
    if match:
        domain = match.group(1)
        domains.append(domain)
for domain in domains:
    print(domain)


email.com
gmail.com
ourcompany.co.uk


In [4]:
#3 Question: Validate phone number

inputs = ["555-123-4567",
          "1 (234) 567-8910",
          "notaphonenumber"]

pattern = r'\b(\d{3}[-.\s]?\d{3}[-.\s]?\d{4}|\(\d{3}\)\s?\d{3}[-.\s]?\d{4})\b'

phone_numbers = []
for text in inputs:
    matches = re.findall(pattern, text)
    if matches:
        phone_numbers.extend(matches)
for phone_number in phone_numbers:
    print(phone_number)


555-123-4567


In [5]:
#4 Question: Extract area code

inputs = ["(555) 123-4567",
          "1 (234) 567-8910",
          "5551234567"]

# your code here ...:

# Define a regular expression pattern to match the area code
pattern = r'\((\d{3})\)|(\d{3})\s|-|\s'

def extract_area_code(phone_number):
    matches = re.findall(pattern, phone_number)
    for match in matches:
        for group in match:
            if group:
                return group

for phone_number in inputs:
    area_code = extract_area_code(phone_number)
    print(area_code)


555
234
None


In [6]:
pattern = r'https?://([A-Za-z0-9.-]+)'

# Function to extract the host (domain) from a URL
def extract_host_from_url(text):
    matches = re.findall(pattern, text)
    if matches:
        return matches[0]
inputs = ["Visit https://www.example.com for more info",
          "Our website is example.com",
          "No URLs here"]
for text in inputs:
    host = extract_host_from_url(text)
    if host:
        print(host)

www.example.com


In [7]:
#6 Question: Remove non-alphabetic characters

def remove_non_alphabetic(text):
    cleaned_text = re.sub(r'[^a-zA-Z ]', '', text)
    return cleaned_text

# List of input strings
inputs = ["Hello world!", "123 Main St.", "greetings&more"]

for text in inputs:
    cleaned_text = remove_non_alphabetic(text)
    print(cleaned_text)
# your code here ...:

Hello world
 Main St
greetingsmore


In [8]:
#7 Question: Find words containing "tion"

def find_words_with_tion(text):
    matches = re.findall(r'\b\w*tion\w*\b', text)
    return matches

inputs = ["This is a test sentence with the word station in it.",
          "No words containing tion here",
          "motion activation vacation"]
for text in inputs:
    tion_words = find_words_with_tion(text)
    if tion_words:
        print(tion_words)

['station']
['tion']
['motion', 'activation', 'vacation']


In [None]:
def replace_hello_with_goodbye(text):
    modified_text = text.replace("hello", "goodbye")
    return modified_text
input_string = "hello, world! This is a simple hello example."
result = replace_hello_with_goodbye(input_string)
print(result)


In [9]:
#9 Question: Extract date strings in ISO8601 format


inputs = ["Log from 2023-01-15",
          "Meeting on 2023-02-01T13:00:00Z",
          "No dates"]
def extract_iso8601_dates(text):
    iso8601_dates = re.findall(r'\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}Z)?', text)
    return iso8601_dates
for text in inputs:
    iso8601_dates = extract_iso8601_dates(text)
    if iso8601_dates:
        print(iso8601_dates)

['2023-01-15']
['2023-02-01T13:00:00Z']


In [15]:
#10 Question: Validate correctly formatted date


def validate_dates(text):
    if re.match(r'\d{4}-\d{2}-\d{2}', text):
        return True
    else:
        return False

inputs = ["2023-01-15", "02/01/2023", "invalid date"]

for date in inputs:
    is_valid = validate_dates(date)
    print(f'"{date}" is valid: {is_valid}')



"2023-01-15" is valid: True
"02/01/2023" is valid: False
"invalid date" is valid: False


In [14]:
#11 Question: Remove punctuation except hyphens

inputs = ["Hello! World?",
          "123-Main_St.",
          "Hi there."]

def remove_punctuation_except_hyphens(text):
    cleaned_text = re.sub(r'[^\w\s-]', '', text)
    return cleaned_text
for text in inputs:
    cleaned_text = remove_punctuation_except_hyphens(text)
    print(cleaned_text)


Hello World
123-Main_St
Hi there


In [13]:
#12 Question: Count occurrences of a word

inputs = ["Hello world. Hello!",
          "Hello hello world",
          "no match"]
import re

def count_word_occurrences(word, text_list):
    word = word.lower()
    count = 0

    pattern = r'\b' + re.escape(word) + r'\b'

    for text in text_list:
        matches = re.findall(pattern, text.lower())
        count += len(matches)

    return count

inputs = ["Hello world. Hello!", "Hello hello world", "no match"]
word_to_count = "hello"
occurrences = count_word_occurrences(word_to_count, inputs)
print(f'The word "{word_to_count}" appears {occurrences} times.')


The word "hello" appears 4 times.


In [11]:
#13 Question: Extract IP addresses from log

def extract_ip_addresses(text):
    # Use a regular expression to find IP addresses
    ip_addresses = re.findall(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', text)
    return ip_addresses

inputs = ["127.0.0.1 - GET /",
          "User logged in from 192.168.1.1",
          "No IPs"]

for text in inputs:
    ip_addresses = extract_ip_addresses(text)
    if ip_addresses:
        print(ip_addresses)
# your code here ...:

['127.0.0.1']
['192.168.1.1']


In [10]:
#14 Question: Redact credit card and SSN numbers
def redact_numbers(text):
    text = re.sub(r'\b\d{4}-\d{4}-\d{4}-\d{4}\b', 'XXXX-XXXX-XXXX-XXXX', text)

    text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', 'XXX-XX-XXXX', text)

    return text

inputs = ["Visa: 4111-1111-1111-1111",
          "My SSN is 111-11-1111",
          "No numbers"]

for text in inputs:
    redacted_text = redact_numbers(text)
    print(redacted_text)

Visa: XXXX-XXXX-XXXX-XXXX
My SSN is XXX-XX-XXXX
No numbers
