In [1]:
import re
import pandas as pd
import numpy as np

**1. Is it a vowel?**

In [2]:
def is_vowel(x:str):
    """ Returns match object if inputted string x is a vowel """
    return re.search(r'^[aeiouAEIOU]$',x)

In [37]:
is_vowel('you')

In [45]:
is_vowel('a')

<re.Match object; span=(0, 1), match='a'>

### Alternate Method to get true and false

In [47]:
def is_a_vowel(var):
    '''
    Takes in a string and returns true or false if it is a vowel.  
    Must by a single character to return True - ignores case.
    Prints error to screen if a string was not passed.
    '''
    if not isinstance(var,str): 
        print(f'Error: expected a string')
        return None
    #if returns match object (aka not None) & only one character 
    if re.search(r'[aeiou]',var,re.IGNORECASE) and len(var) == 1: return True
    else: return False

In [43]:
is_a_vowel('Cap')

False

In [44]:
is_a_vowel('A')

False

**2. User Names**

In [3]:
def is_valid_username(x:str):
    """ Checks whether inputted string is a valid username:
    -Starts with lowercase letter
    -Only lowercase letters, numbers or _
    -No longer than 32 characters"""
    if re.search(r'^[a-z][_a-z0-9]{,31}$',x):
        return True
    else:
        return False

In [4]:
is_valid_username('c_2odeup')

True

In [8]:
is_valid_username('Codeup')

False

In [6]:
is_valid_username('codeup123')

True

In [7]:
is_valid_username('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')

False

### Alternate Method to give error messages

In [50]:
def is_valid_username2(username):
    '''
    Takes in a string and checks if it is a valid username.
    Prints an error if a string is not passed.
    Returns True if a valid username, returns False if not valid and prints error message to screen
    
    A username is valid if:
        It starts with a lowercase letter
        It only contains lowercase letters, numbers and an underscore
        It is 32 characters or less.
    '''
    #error out if not passed a string
    if not isinstance(username,str): 
        print(f'Error: expected a string')
        return None
    #intitialize error message string
    err_msg = ''
    #if string doesn't start with lowercase letter
    if not username[0].islower():
        err_msg += "Username must start with a lowercase letter.\n"
    
    #If contains anything other than lower case letters, numbers or the _ character
    if re.search(r'[^0-9a-z_]',username):
        err_msg += "Username must only contain lowercase letters, numbers and the _ character (underscore).\n"
        
    #less than 32 characters
    if len(username) > 32: err_msg += "Username must be 32 characters or less.\n"
    
    #return true if we didn't find any errors
    if err_msg == '': return True
    #else print the error message and return false
    else: 
        print(err_msg)
        return False

In [52]:
is_valid_username2('Codeup')

Username must start with a lowercase letter.
Username must only contain lowercase letters, numbers and the _ character (underscore).



False

In [53]:
is_valid_username2('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')

Username must be 32 characters or less.



False

In [54]:
is_valid_username2('codeup123')

True

**3. Phone Numbers**

In [9]:
subjects = ['(210) 867 5309',
            '+1 210.867.5309', 
            '867-5309',
            '210-867-5309']
regexp = r'\d{0,3}\D?\d{3}\D\d{4}'
for subject in subjects:
    if re.search(regexp, subject):
        print(subject, "matches")
    else:
        print(subject, "does not match")

(210) 867 5309 matches
+1 210.867.5309 matches
867-5309 matches
210-867-5309 matches


In [57]:
phone_regex = re.compile(
"""
^
(?P<country_code>\+\d+)?
\D*?
(?P<area_code>\d{3})?
\D*?
(?P<prefix>\d{3})
\D*?
(?P<line>\d{4})
""", re.VERBOSE)


In [59]:
pd.concat([pd.DataFrame({'num':subjects}),pd.DataFrame({'num':subjects}).num.str.extract(phone_regex)],axis=1)

Unnamed: 0,num,country_code,area_code,prefix,line
0,(210) 867 5309,,210.0,867,5309
1,+1 210.867.5309,1.0,210.0,867,5309
2,867-5309,,,867,5309
3,210-867-5309,,210.0,867,5309


**4. convert to year - month - day**

In [11]:
df = pd.DataFrame({'original_dates':['02/04/19','02/05/19','02/06/19','02/07/19','02/08/19','02/09/19','02/10/19']})
regexp=r'(\d{2})/(\d{2})/(\d{2})'

In [12]:
df['converted'] = df.original_dates.str.replace(pat = regexp,repl = r'20\3-\1-\2', regex=True)

In [13]:
df

Unnamed: 0,original_dates,converted
0,02/04/19,2019-02-04
1,02/05/19,2019-02-05
2,02/06/19,2019-02-06
3,02/07/19,2019-02-07
4,02/08/19,2019-02-08
5,02/09/19,2019-02-09
6,02/10/19,2019-02-10


5. Logs

In [63]:
#define regex
logfile_re = r'''
^(?P<method>GET|POST)
\s+
(?P<path>.*?)
\s+
\[(?P<timestamp>.*?)\]
\s+
(?P<http_version>.*?)
\s+
\{(?P<status>\d+)\}
\s+
(?P<size>\d+)
\s+
"(?P<user_agent>.*)"
\s+
(?P<ip_address>\S*$)
'''

#create pd Series with log code 
lines = pd.Series([
    'GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58',
    'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58',
    'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58',
])

#use .str.extract to extract lines of code
lines.str.extract(logfile_re, re.VERBOSE)

Unnamed: 0,method,path,timestamp,http_version,status,size,user_agent,ip_address
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,python-requests/2.21.0,97.105.19.58
1,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,python-requests/2.21.0,97.105.19.58


In [64]:
df = pd.DataFrame({'original_logs':['GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58',
        'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58',
        'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58']})


df = pd.concat([df, 
                df.original_logs.str.extract(r'(?P<method>^[A-Z]{3,})\s*(?P<path>/\S*\b)\s*\[(?P<timestamp>[^]]*).*(?P<http_version>[H]\S*).\{(?P<status_code>\d{3,})\}\s*(?P<size>\s\d{1,}\s)\s*"(?P<user_agent>.*)"\s(?P<ip_address>\S*$)')],axis=1)

In [65]:
df

Unnamed: 0,original_logs,method,path,timestamp,http_version,status_code,size,user_agent,ip_address
0,GET /api/v1/sales?page=86 [16/Apr/2019:193452+...,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,python-requests/2.21.0,97.105.19.58
1,POST /users_accounts/file-upload [16/Apr/2019:...,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET /api/v1/items?page=3 [16/Apr/2019:193453+0...,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,python-requests/2.21.0,97.105.19.58


Bonus

In [17]:
words = pd.read_table('/usr/share/dict/words')
words = words.rename(columns = {'A':'word'}).dropna()

In [18]:
words

Unnamed: 0,word
0,a
1,aa
2,aal
3,aalii
4,aam
...,...
235880,zythem
235881,Zythia
235882,zythum
235883,Zyzomys


In [19]:
def count_vowels(x:str):
    num_vowels = 0
    for letter in str(x):
        if re.search(r'[aeiouAEIOU]',letter):
            num_vowels += 1
        else:
            continue
    return num_vowels

In [20]:
words["num_vowels"] = words.apply(lambda row: count_vowels(row.word), axis=1)

In [21]:
# Words with at least 3 vowles
(words.num_vowels>=3).sum()

191365

In [22]:
def count_consec_vowels(x:str):
    if re.search(r'[aeiouAEIOU]{3,}',x):
        return True
    else:
        return False

In [23]:
words["consec_vowels_3"] = words.apply(lambda row: count_consec_vowels(row.word), axis=1)

In [24]:
print(f"{words.consec_vowels_3.sum()} words with at least 3 vowels in a row")


6182 words with at least 3 vowels in a row


In [25]:
words["consec_cons_4"] = words.word.str.contains(r'[^aeiouAEIOU]{4,}')

In [26]:
print(f"{words.consec_cons_4.sum()} words contain at least 4 consonants in a row")

19241 words contain at least 4 consonants in a row


In [27]:
'a'[-1]

'a'

In [28]:
words["start_end_same"] = words.word.apply(lambda row: row[0]==row[-1])

print(f"{words.start_end_same.sum()} words start and end with the same letter")

9967 words start and end with the same letter


In [29]:
def start_and_ends_with_vowel(x:str):
    if is_vowel(x[0]):
        if is_vowel(x[-1]):
            return True
        else:
            return False
    else:
        return False

In [30]:
words["start_end_vowel"] = words.word.apply(lambda row: start_and_ends_with_vowel(row))

print(f"{words.start_end_vowel.sum()} words start and end with a vowel")

14666 words start and end with a vowel
