In [1]:
import pandas as pd
import re

# Regular Expressions Exercises

## 1

Write a function named is_vowel. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicity mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.

In [2]:
def is_vowel(character):
    if type(character) != str or len(character) > 1:
        return False
    
    regexp = r'[aeiouAEIOU]'
    return bool(re.search(regexp, character))

In [3]:
assert is_vowel('a')
assert is_vowel('O')
assert is_vowel('u')
assert not is_vowel('t')
assert not is_vowel('N')
assert not is_vowel('p')
print('Tests passed')

Tests passed


## 2

Write a function named is_valid_username that accepts a string as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. It should also be no longer than 32 characters. The function should return either True or False depending on whether the passed string is a valid username.

In [20]:
def is_valid_username(username):
    regexp = r'^[a-z][a-z0-9_]{0,31}$'
    return bool(re.search(regexp, username))

In [22]:
assert not is_valid_username('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')
assert is_valid_username('codeup')
assert not is_valid_username('Codeup')
assert is_valid_username('codeup123')
assert not is_valid_username('1codeup')
assert is_valid_username('codeup_123')
print('Tests passed')

Tests passed


## 3

Write a regular expression to capture phone numbers. It should match all of the following:

- (210) 867 5309
- +1 210.867.5309
- 867-5309
- 210-867-5309

In [43]:
def is_phone_number(phone_number):
    regexp = r'^(\+?1\s)?(\(?\d{3}\)?[-\.\s])?\d{3}[-\.\s]\d{4}$'
    return bool(re.search(regexp, phone_number))

In [44]:
assert is_phone_number('(210) 867 5309')
assert is_phone_number('+1 210.867.5309')
assert is_phone_number('867-5309')
assert is_phone_number('210-867-5309')
assert is_phone_number('+1 210-867-5309')
assert is_phone_number('+1 (210) 867-5309')
assert is_phone_number('1 (210) 867-5309')
assert is_phone_number('(210) 867.5309')
assert is_phone_number('867 5309')
assert not is_phone_number('-1 210-867-539')
assert not is_phone_number('210-867-539')
assert not is_phone_number('210+867+5309')
assert not is_phone_number('210-8673-5309')
print('Tests passed')

Tests passed


## 4

Use regular expressions to convert the dates below to the standardized year-month-day format.
- 02/04/19
- 02/05/19
- 02/06/19
- 02/07/19
- 02/08/19
- 02/09/19
- 02/10/19

In [45]:
dates = pd.Series([
    '02/04/19',
    '02/05/19',
    '02/06/19',
    '02/07/19',
    '02/08/19',
    '02/09/19',
    '02/10/19'
])

match = r'(\d{2})/(\d{2})/(\d{2})'
target = r'\3-\1-\2'

dates.str.replace(match, target, regex = True)

0    19-02-04
1    19-02-05
2    19-02-06
3    19-02-07
4    19-02-08
5    19-02-09
6    19-02-10
dtype: object

## 5

Write a regex to extract the various parts of these logfile lines:
- GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
- POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
- GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58

In [126]:
logs = pd.Series([
    'GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58',
    'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58',
    'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58'
])

regexp = r'^(?P<request_type>[A-Z]{3,4})\s+(?P<page>.*?)\s+\[(?P<timestamp>.*?)\]\s+(?P<protocol>.*?)\s+\{(?P<status_code>\d{3})\}\s+(?P<size>\d+?)\s+"(?P<agent>.*?)"\s+(?P<ip_address>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})$'
url_parts = logs.str.extract(regexp)
url_parts

Unnamed: 0,request_type,page,timestamp,protocol,status_code,size,agent,ip_address
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,python-requests/2.21.0,97.105.19.58
1,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,python-requests/2.21.0,97.105.19.58


## Bonus Exercise

You can find a list of words on your mac at /usr/share/dict/words. Use this file to answer the following questions:

In [116]:
words = pd.read_csv('/usr/share/dict/words')
words.columns = ['words']
words.dropna(inplace = True)

- How many words have at least 3 vowels?

In [117]:
regexp = r'(\w*[aeiouAEIOU]\w*){3,}'
words['result'] = words.words.str.contains(regexp, regex = True)
words[words.result]

  words['result'] = words.words.str.contains(regexp, regex = True)


Unnamed: 0,words,result
3,aalii,True
5,Aani,True
6,aardvark,True
7,aardwolf,True
8,Aaron,True
...,...,...
235873,zymotically,True
235874,zymotize,True
235875,zymotoxic,True
235877,Zyrenian,True


- How many words have at least 3 vowels in a row?

In [118]:
regexp = r'[aeiouAEIOU]{3,}'
words['result'] = words.words.str.contains(regexp, regex = True)
words[words.result]

Unnamed: 0,words,result
233,Abietineae,True
234,abietineous,True
300,ablatitious,True
433,abranchious,True
506,absenteeism,True
...,...,...
235799,Zygophyceae,True
235800,zygophyceous,True
235801,Zygophyllaceae,True
235802,zygophyllaceous,True


- How many words have at least 4 consonants in a row?

In [119]:
regexp = r'[^aeiouAEIOU]{4,}'
words['result'] = words.words.str.contains(regexp, regex = True)
words[words.result]

Unnamed: 0,words,result
56,abarthrosis,True
72,abashlessly,True
143,abdominocystic,True
145,abdominohysterectomy,True
146,abdominohysterotomy,True
...,...,...
235879,Zyryan,True
235880,zythem,True
235881,Zythia,True
235882,zythum,True


- How many words start and end with the same letter?

In [120]:
regexp = r'^(\w)\w*\1$'
words['result'] = words.words.str.contains(regexp, regex = True)
words[words.result]

  words['result'] = words.words.str.contains(regexp, regex = True)


Unnamed: 0,words,result
1,aa,True
15,aba,True
19,abaca,True
74,abasia,True
182,abepithymia,True
...,...,...
234849,yowley,True
234853,yoy,True
234892,yucky,True
234912,yummy,True


- How many words start and end with a vowel?

In [121]:
regexp = r'^[aeiouAEIOU]\w*[aeiouAEIOU]$'
words['result'] = words.words.str.contains(regexp, regex = True)
words[words.result]

Unnamed: 0,words,result
1,aa,True
3,aalii,True
5,Aani,True
11,Aaronite,True
13,Aaru,True
...,...,...
226472,uvulae,True
226474,Uvularia,True
226478,uvulotome,True
226485,uxoricide,True


- How many words contain the same letter 3 times in a row?

In [122]:
regexp = r'(\w)\1\1'
words['result'] = words.words.str.contains(regexp, regex = True)
words[words.result]

  words['result'] = words.words.str.contains(regexp, regex = True)


Unnamed: 0,words,result
24987,bossship,True
50635,demigoddessship,True
78497,goddessship,True
82996,headmistressship,True
140480,patronessship,True
230261,wallless,True
231687,whenceeer,True


In [123]:
words.result.sum()

7

- What other interesting patterns in words can you find?

In [124]:
# words that contain the same three letter sequence of letters twice or more

regexp = r'(\w{3})\1+'
words['result'] = words.words.str.contains(regexp, regex = True)
words[words.result]

  words['result'] = words.words.str.contains(regexp, regex = True)


Unnamed: 0,words,result
825,acatastasia,True
1098,aceacenaphthene,True
4302,ahuehuete,True
4515,akeake,True
4854,alchochoden,True
...,...,...
230090,wagwag,True
230603,warwards,True
233961,xanthopurpurin,True
235336,Zirianian,True
