In [1]:
import re
import pandas as pd

In [2]:
# Unformatted input: six names run together with phone numbers written in
# several different formats, with no separator between records.
original_str = "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555 -6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# 1
Extract the names of each individual from the unformatted text string and store them in a vector of some sort. When complete, your vector should contain the following entries:
    - "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
    - "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"

In [3]:
# A name is: one capital letter, then one or more letters / whitespace /
# commas / periods, and finally a lowercase letter. Digits and parentheses
# are not in the set, so the surrounding phone numbers are never captured.
pattern = r'[A-Z][a-zA-Z\s\,\.]+[a-z]'
regex = re.compile(pattern)

# Collect every non-overlapping match, scanning left to right
original_names = [match.group(0) for match in regex.finditer(original_str)]
original_names

['Moe Szyslak',
 'Burns, C. Montgomery',
 'Rev. Timothy Lovejoy',
 'Ned Flanders',
 'Simpson, Homer',
 'Dr. Julius Hibbert']

# 2

Using your new vector containing only the names of the six individuals, complete the following tasks:
    - a. (4 Points) Use your regex skills to rearrange the vector so that all elements conform to the standard “firstname lastname”, preserving any titles (e.g., “Rev.”, “Dr.”, etc) or middle/second names.
    - b. (4 Points) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
    - c. (4 Points) Construct a logical vector indicating whether a character has a middle/second name

#### a. (4 Points) Use your regex skills to rearrange the vector so that all elements conform to the standard “firstname lastname”, preserving any titles (e.g., “Rev.”, “Dr.”, etc) or middle/second names.

In [4]:
# Re-display the names extracted in question 1; they are the input for 2a
original_names

['Moe Szyslak',
 'Burns, C. Montgomery',
 'Rev. Timothy Lovejoy',
 'Ned Flanders',
 'Simpson, Homer',
 'Dr. Julius Hibbert']

In [5]:
# "Last, Second Third": three captured tokens (the second may be an initial such as "C.")
pattern_middle_name = r'([\w]+),\s([\w.]+)\s([\w]+)'
regex_middle_name = re.compile(pattern_middle_name)

# "Last, First": two captured tokens
pattern_comma_name = r'([\w]+),\s([\w]+)'
regex_comma_name = re.compile(pattern_comma_name)


def normalize_name(name):
    """Rearrange a "Last, ..." style name so that the last name comes last.

    Parameters
    ----------
    name : str
        A single name, e.g. "Simpson, Homer" or "Rev. Timothy Lovejoy".

    Returns
    -------
    str
        * "Last, First"        -> "First Last"
        * "Last, Second Third" -> "Third Second Last"
        * anything else        -> `name` unchanged

    A name that contains ", " but matches neither pattern is also returned
    unchanged rather than dropped, so the result always has exactly one
    entry per input name.
    """
    # str.find returns -1 when ", " is absent; only a separator that follows
    # at least one character marks a "Last, ..." name.
    if name.find(", ") <= 0:
        return name

    # More than two whitespace-separated tokens means a second given name
    # (or initial) is present.
    if len(re.split(r'\s+', name)) > 2:
        match = regex_middle_name.search(name)
        if match:
            last, second, third = match.groups()
            return third + ' ' + second + ' ' + last
    else:
        match = regex_comma_name.search(name)
        if match:
            last, first = match.groups()
            return first + ' ' + last

    # Contains ", " but fits neither layout: keep the original text
    return name


# One normalized entry per original name, in the same order
normal_name_list = [normalize_name(name) for name in original_names]
normal_name_list

['Moe Szyslak',
 'Montgomery C. Burns',
 'Rev. Timothy Lovejoy',
 'Ned Flanders',
 'Homer Simpson',
 'Dr. Julius Hibbert']

#### b. (4 Points) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

In [6]:
# Wrap the normalized names in a pandas Series (a Series, not a DataFrame,
# despite the `_df` suffix) so the vectorized `.str` accessor can be used below
name_df = pd.Series(normal_name_list)
name_df

0             Moe Szyslak
1     Montgomery C. Burns
2    Rev. Timothy Lovejoy
3            Ned Flanders
4           Homer Simpson
5      Dr. Julius Hibbert
dtype: object

In [7]:
# Flag the names that carry a title ("Dr." or "Rev.").
# The dot is escaped so only a literal period matches; an unescaped "." is a
# wildcard and would also accept e.g. "Drew". \b keeps the title from
# matching in the middle of a longer word.
name_df.str.contains(r'\b(?:Dr|Rev)\.')

0    False
1    False
2     True
3    False
4    False
5     True
dtype: bool

#### c. (4 Points) Construct a logical vector indicating whether a character has a middle/second name

In [8]:
# A middle/second name is detected as an abbreviated initial:
# a space, exactly one ASCII letter, a literal dot, then a space (e.g. " C. ").
# [A-Za-z] is spelled out because the range A-z also spans the punctuation
# characters [ \ ] ^ _ ` that sit between 'Z' and 'a' in ASCII.
pattern_m = r' [A-Za-z]\. '
name_df.str.contains(pattern_m)

0    False
1     True
2    False
3    False
4    False
5    False
dtype: bool

# 3  
```python
Consider the HTML string <title>+++BREAKING NEWS+++<title>. We would like to extract the first HTML tag
(i.e., “<title>”). To do so we write the regular expression “<.+>”. Explain why this fails and correct the expression.
```

In [9]:
# Demonstration of the failing expression from the question: list every
# match of '<.+>' in the sample string.
pattern3 = r'<.+>'
html3 = '<title>+++BREAKING NEWS+++<title>'

regex = re.compile(pattern3, re.IGNORECASE)
[match.group(0) for match in regex.finditer(html3)]


['<title>+++BREAKING NEWS+++<title>']

- <.+>
- 1st character is <
- . matches any character except a newline.
- '+' to match 1 or more repetitions of the preceding
- last character is >
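- so <.+> can match any string that starts with '<' and ends with '>'. Because '+' is greedy, '.+' consumes as many characters as possible, so the match runs from the first '<' all the way to the last '>' and returns the whole string instead of just the first tag.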

#### The correct pattern is as follows
- <[A-z]+>
- 1st character is <
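- [A-z] is intended to mean "any letter", i.e. [A-Za-z]. Strictly speaking the ASCII range A-z also includes the six punctuation characters that sit between Z and a (square brackets, backslash, caret, underscore and backtick), so [A-Za-z] is the precise way to write it.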
- [A-z]+ means 1 or more A-z character
- last character is >

In [10]:
# Matches one tag: '<', one or more ASCII letters, then '>'.
# Restricting the inside of the tag to letters stops the match at the first
# '>' instead of running on to the last one.
# [A-Za-z] is used rather than [A-z], whose range also covers [ \ ] ^ _ `.
pattern_correct3 = r'<[A-Za-z]+>'

# compile pattern
regex = re.compile(pattern_correct3, flags=re.IGNORECASE)

# every tag in the string, in order of appearance
result = regex.findall(html3)

# display
result

['<title>', '<title>']

In [11]:
# Show only the first tag, provided at least one tag was found
if result:
    print(result[0])

<title>


# 4
```python
(6 Points) Consider the string “(5-3)^2=5^2-2*5*3+3^2” conforms to the binomial theorem. We would like to extract
the formula in the string. To do so we write the regular expression “[^0-9=+*()]+”. Explain why this fails and correct the
expression.
```

In [12]:
# Demonstration of the failing expression from the question: the leading '^'
# negates the set, so only characters NOT listed in it are returned.
pattern4 = r'[^0-9=+*()]+'
formular4 = '(5-3)^2=5^2-2*5*3+3^2'

regex = re.compile(pattern4, re.IGNORECASE)
[match.group(0) for match in regex.finditer(formular4)]

['-', '^', '^', '-', '^']

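- [] is a character set. If the first character of the set is '^', the set is negated: it matches every character that is NOT listed. So [^0-9=+*()]+ matches runs of characters that are not digits, '=', '+', '*', '(' or ')' — in this string only '-' and '^' — which is the opposite of what we want.
- Inside a set, '^' is a literal caret when it is not the first character (it may also be escaped as \^).
- To extract the whole formula, drop the negation and list every character the formula can contain, including '-' and '^'.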

In [13]:
s = '(5-3)^2=5^2-2*5*3+3^2'

# Positive set of every symbol the formula may contain: digits, '^', '(', ')',
# '*', '+', '=' and '-'.
# The hyphen is placed last so it is a literal '-'. Written between '+' and
# '=' it would define the range '+'..'=' and wrongly admit , . / : ; < too.
# '^' is not first in the set, so it is a literal caret and needs no escape.
pattern4 = r'[0-9^()*+=-]+'
regex = re.compile(pattern4, flags=re.IGNORECASE)
regex.findall(s)

['(5-3)^2=5^2-2*5*3+3^2']