In [1]:
import pandas as pd
import re

import warnings
warnings.filterwarnings("ignore")

#### 1. Write a function named `is_vowel`. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicity mentioned in the lesson, you can treat the result of `re.search` as a boolean value that indicates whether or not the regular expression matches the given string.

In [2]:
def is_vowel(string):
    '''
    This function takes in a string as input
    and uses a regular expression to determine if the passed string is a vowel
    '''

    regexp = r'[aeiouAEIOU]'
    subject = string
    
    if re.search(regexp, subject):
        print('Found a vowel.')
    else: 
        print('No vowels here.')

In [3]:
is_vowel("A")

Found a vowel.


In [4]:
is_vowel("a")

Found a vowel.


In [5]:
is_vowel("X")

No vowels here.


In [6]:
is_vowel("123")

No vowels here.


In [7]:
is_vowel("aeiou")

Found a vowel.


In [8]:
#solution from walkthrough
def is_vowel(string):
    """
    returns a boolean value assessing if the passed string is a single vowel
    """
    regex = r'^[aeiou]$'
    return bool(re.search(regex, string.lower()))


assert is_vowel("A") == True
assert is_vowel("e") == True
assert is_vowel("b") == False
assert is_vowel("ee") == False
assert is_vowel("aie") == False
print("code works")

code works


#### 2. Write a function named `is_valid_username` that accepts a string as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the `_` character. It should also be no longer than 32 characters. The function should return either `True` or `False` depending on whether the passed string is a valid username.


`>>> is_valid_username('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')`

`False`

`>>> is_valid_username('codeup')`

`True`

`>>> is_valid_username('Codeup')`

`False`

`>>> is_valid_username('codeup123')`

`True`

`>>> is_valid_username('1codeup')`

`False`

In [9]:
def is_valid_username(string):
    '''
    This function accepts a string as input
    and returns a boolean depending on whether the passed string is a valid username.
    
    A valid username starts with a lowercase letter, 
    and only consists of lowercase letters, numbers, or the _ character
    and no longer than 32 characters.
    '''
    regexp = r'^[a-z][a-z0-9_]{,31}$'
    subject = string
    
    if re.search(regexp, subject):
        return True
    else:
        return False

In [10]:
is_valid_username('alicia123_92__jeannette_aka.bunny_gonzalez')

False

In [11]:
is_valid_username('ALICIA123_92')

False

In [12]:
is_valid_username('alicia123_92')

True

In [13]:
is_valid_username('aliCia123_92')

False

In [14]:
assert is_valid_username('codeup') == True
assert is_valid_username('codeup123') == True
assert is_valid_username("Robert') DROP TABLE Students;--") == False
assert is_valid_username("'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'") == False
print("code works")

code works


#### 3. Write a regular expression to capture phone numbers. It should match all of the following:


`(210) 867 5309
+1 210.867.5309
867-5309
210-867-5309`

In [15]:
#regexp that matches all subjects 
regexp = r'[()0-9\s.+-]+'

subject1 = '(210) 867 5309'
subject2 = '+1 210.867.5309'
subject3 = '867-5309'
subject4 = '210-867-5309'

In [16]:
re.search(regexp, subject1)

<re.Match object; span=(0, 14), match='(210) 867 5309'>

In [17]:
re.search(regexp, subject2)

<re.Match object; span=(0, 15), match='+1 210.867.5309'>

In [18]:
re.search(regexp, subject3)

<re.Match object; span=(0, 8), match='867-5309'>

In [19]:
re.search(regexp, subject4)

<re.Match object; span=(0, 12), match='210-867-5309'>

In [20]:
regexp = r'[()0-9\s.+-]+'

#putting all subjects together
subject = '(210) 867 5309 +1 210.867.5309 867-5309 210-867-5309'

re.findall(regexp, subject)

['(210) 867 5309 +1 210.867.5309 867-5309 210-867-5309']

In [21]:
#solution from walkthrough: putting all phone numbers into a single col dataframe
df = pd.DataFrame()
df['number'] = ['(210) 867 5309',
                '+1 210.867.5309',
                '867-5309',
                '210-867-5309',
                '2108675309']

In [22]:
#define each individual group of phone numbers
#country code, area code, exchange code, and line_number

phone_regex = re.compile( 
'''^
(?P<country_code>\+\d+)? 
\D*?
(?P<area_code>\d{3})?
\D*?
(?P<exchange_code>\d{3})
\D*?
(?P<line_number>\d{4})
$''', re.VERBOSE)

#starts w/ and ends w/ everything in the middle 
#'\D*?' in btwn, anything that is not a number for 0 or more (optional)
#'?P< >' to map out the groups and name them for the dataframe
#wrapped in triple quotes, in re.compile (to compile the full regex statement)
#re.VERBOSE ignores the whitespace in the reg expression (makes it more readable)
#another way can be (re.X)

- Country codes: have plus sign followed by a digit(s) (U.S is usualy +1).
    - `\+` (the backslash before the plus sign is used as an escape key to say that we're looking for the literal '+' sign. 
    - the country code will not be in every phone number, hence the '?' (may or may not be seen-optional).
- Area codes: always 3 digits
    - '?' is added at the end b/c not all phone numbers in list have the area code
- Exchange codes: always 3 digits
- Line number: always 4 digits

In [23]:
#look at extracted df (will be concatenated to original df)
df['number'].str.extract(phone_regex)

Unnamed: 0,country_code,area_code,exchange_code,line_number
0,,210.0,867,5309
1,1.0,210.0,867,5309
2,,,867,5309
3,,210.0,867,5309
4,,210.0,867,5309


In [24]:
#concat back to original df
#.extract to pull out name groups from phone_regex
pd.concat([df, df['number'].str.extract(phone_regex)], axis=1)

Unnamed: 0,number,country_code,area_code,exchange_code,line_number
0,(210) 867 5309,,210.0,867,5309
1,+1 210.867.5309,1.0,210.0,867,5309
2,867-5309,,,867,5309
3,210-867-5309,,210.0,867,5309
4,2108675309,,210.0,867,5309


#### 4. Use regular expressions to convert the dates below to the standardized year-month-day format.


`02/04/19
02/05/19
02/06/19
02/07/19
02/08/19
02/09/19
02/10/19`

In [25]:
#this matches the dates from above
regexp = r'\d+[/]\d+[/]\d+'
subject = '02/04/19 02/05/19 02/06/19 02/07/19 02/08/19 02/09/19 02/10/19'

re.findall(regexp, subject)

['02/04/19',
 '02/05/19',
 '02/06/19',
 '02/07/19',
 '02/08/19',
 '02/09/19',
 '02/10/19']

In [26]:
#need to convert date_list to standardized year-month-day
date_list = [ '02/04/19', 
              '02/05/19', 
              '02/06/19', 
              '02/07/19', 
              '02/08/19', 
              '02/09/19', 
              '02/10/19']

#reg expression that matches the date_list from above
#adds capture groups to be able to convert it w/ re.sub
date_reg = r'(\d+)/(\d+)/(\d+)'

#for loop to create a new list w/ re.sub
new_list = []
for date in date_list:
    new_list.append(re.sub(date_reg, r'20\3-\1-\2', date))

new_list

['2019-02-04',
 '2019-02-05',
 '2019-02-06',
 '2019-02-07',
 '2019-02-08',
 '2019-02-09',
 '2019-02-10']

In [27]:
#another way: list comprehension
#re.sub date_reg w/ r'20\3(rd capture group)-\1(st capture group)-\2(nd capture group)
#date is the iterable for date in date_list
[re.sub(date_reg, r'20\3-\1-\2', date) for date in date_list]

['2019-02-04',
 '2019-02-05',
 '2019-02-06',
 '2019-02-07',
 '2019-02-08',
 '2019-02-09',
 '2019-02-10']

#### 5. Write a regex to extract the various parts of these logfile lines:

`GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58`

In [28]:
#get each of the 3 lines and save to a variable
lines = """
GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58
"""

In [29]:
# extract the parts and use these parts for the regexp name groups:

# method: GET or POST
# path (whatever page it is going to): /api/v1/sales?page=86 or /users_accounts...
# timestamp: [16/Apr/2019:193452+0000]
# http version: HTTP/1.1
# status code: {200}
# bytes: 510348
# user agent: "python-requests/2.21.0" or "User-Agent: ..."
# ip: 97.105.19.58

In [42]:
#write regexp to match each of the parts from above
regexp = r'''
^
(?P<method>GET|POST) 
\s
(?P<path>[/\w\-\?=]+)
\s
\[(?P<timestamp>.+)\]
\s
(?P<http_version>HTTP/\d+\.\d+)
\s
\{(?P<status_code>\d+)\}
\s
(?P<bytes_out>\d+)
\s
"(?P<user_agent>.+)"
\s
(?P<ip>\d+\.\d+\.\d+\.\d+)
$
'''
#method is either 'GET' or 'POST'
#path: a forward slash, and then any word character, a hyphen, a question mark, and/or equal sign
    #within the scope of the brackets, and one or more characters of them.
#timestamp: search for brackets (escape key for them, since looking for the literal brackets)
    #and one or more of anything within the brackets
#http_version: search for HTTP/, one or more digits, escape for literal '.', and one or more digits
#status_code: escape curly braces out and search for one or more digits within the '{ }'
#bytes_out: search for any one or more digits
#user_agent: search for one or more of anything btwn double quotes " " (no escape '\' needed b/c they are literals)
#ip: digits separated by periods(need to escape '\' period '.' b/c need to search for the literal '.')
#\s for whitespace in between each part

In [43]:
#list of values
lines.strip().split('\n')

['GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58',
 'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58',
 'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58']

In [31]:
#searching for all the groups within the line in lines
#lines.split on line breaks '\n'
#.groupdict associates each of the named groups into its own dictionary
[re.search(regexp, line, re.VERBOSE).groupdict() for line in lines.strip().split('\n')]

[{'method': 'GET',
  'path': '/api/v1/sales?page=86',
  'timestamp': '16/Apr/2019:193452+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '200',
  'bytes_out': '510348',
  'user_agent': 'python-requests/2.21.0',
  'ip': '97.105.19.58'},
 {'method': 'POST',
  'path': '/users_accounts/file-upload',
  'timestamp': '16/Apr/2019:193452+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '201',
  'bytes_out': '42',
  'user_agent': 'User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
  'ip': '97.105.19.58'},
 {'method': 'GET',
  'path': '/api/v1/items?page=3',
  'timestamp': '16/Apr/2019:193453+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '429',
  'bytes_out': '3561',
  'user_agent': 'python-requests/2.21.0',
  'ip': '97.105.19.58'}]

In [32]:
#compile the regexp and whitespace
regex = re.compile(regexp, re.VERBOSE)

#make an empty dataframe
df = pd.DataFrame()

#putting all lines into a single col dataframe and strip/split on line breaks '\n'
df['line'] = lines.strip().split('\n')

#concat back to original df
#.extract to pull out groups
df = pd.concat([df, df.line.str.extract(regex)], axis=1)
df

Unnamed: 0,line,method,path,timestamp,http_version,status_code,bytes_out,user_agent,ip
0,GET /api/v1/sales?page=86 [16/Apr/2019:193452+...,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,python-requests/2.21.0,97.105.19.58
1,POST /users_accounts/file-upload [16/Apr/2019:...,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET /api/v1/items?page=3 [16/Apr/2019:193453+0...,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,python-requests/2.21.0,97.105.19.58


#### 6. Bonus: You can find a list of words on your mac at `/usr/share/dict/words`. Use this file to answer the following questions:

- How many words have at least 3 vowels?
- How many words have at least 3 vowels in a row?
- How many words have at least 4 consonants in a row?
- How many words start and end with the same letter?
- How many words start and end with a vowel?
- How many words contain the same letter 3 times in a row?
- What other interesting patterns in words can you find?

In [33]:
#read csv of list of words on my mac
words = pd.read_csv('/usr/share/dict/words', header=None, squeeze=True).dropna()
words = words.str.lower()

In [34]:
# How many words have at least 3 vowels?
at_least_3_vowels = (words.str.count(r"[aeiou]")  >= 3)
words[at_least_3_vowels].head()
at_least_3_vowels.sum()

191365

In [35]:
# How many words have at least 3 vowels in a row?
words.str.count(r"[aeiou]{3}").sum()

6251

In [36]:
# How many words have at least 4 consonants in a row?
words.str.count(r'[^aeiou]{4,}').value_counts()

0    216643
1     18881
2       360
Name: 0, dtype: int64

In [37]:
# Words with at least 6 consonants in a row
words[words.str.contains(r"[^aeiouy]{6}")]

12492     archchronicler
21118        bergschrund
64716      eschscholtzia
73886     fruchtschiefer
104560       latchstring
105552        lengthsman
122114        nachschlag
151826      postphthisic
227486       veldtschoen
Name: 0, dtype: object

In [38]:
# How many words start and end with the same letter?
# ^ starts with thing in the first capture group, which is ., anything
# anything, ., for zero or more times,
# finishing with the thing in capture group 1: \1$
words.str.contains(r'^(.).*\1$').sum()

11452

In [39]:
# How many words start and end with a vowel?
words.str.contains(r'^[aeiou].*[aeiou]$').sum()

14657

In [40]:
# How many words contain the same letter 3 times in a row?
words[words.str.contains(r'(.)\1\1')]

24988             bossship
50636      demigoddessship
78498          goddessship
82997     headmistressship
140481       patronessship
230262            wallless
231688           whenceeer
Name: 0, dtype: object

In [41]:
# Find the words that contain "q" but not "qu"
words[words.str.contains(r'q([^u]|$)')]

97907         iraq
97908        iraqi
97909      iraqian
108449      louiqa
116731       miqra
122607    nastaliq
150881     pontacq
161159           q
161160           q
161161      qasida
161162        qere
161163        qeri
161164      qintar
161165    qoheleth
161166        qoph
173530       saqib
180565        shoq
198373       tareq
235046      zaqqum
Name: 0, dtype: object