8.2 Formatting Strings

In [2]:
f'{17.489:.2f}'

'17.49'

In [3]:
#d presentation type formats integers as strings
f'{10:d}'

'10'

In [4]:
#c presentation type formats an integer character code as the corresponding character
f'{65:c} {97:c}'

'A a'

In [5]:
#s presentation type for strings
f'{"hello":s} {7}'

'hello 7'

In [7]:
from decimal import Decimal

In [8]:
#floating point and decimal values
f'{Decimal("1000000000000.0"):.3f}'

'1000000000000.000'

In [9]:
#exponential (scientific) notation
f'{Decimal("10000000000000.0"):.3e}'

'1.000e+13'

8.2.1 Self Check 3

In [10]:
print(f'{58:c} {45:c} {41:c}')

: - )


8.2.2 Field Widths and Alignment

In [11]:
f'[{27:10d}]'

'[        27]'

In [12]:
f'[{2.5:10f}]'

'[  2.500000]'

In [13]:
f'[{"hello":10}]'

'[hello     ]'

In [14]:
#left alignment with <
f'[{27:<15d}]'

'[27             ]'

In [15]:
f'[{3.5:<15f}]'

'[3.500000       ]'

In [16]:
#right alignment with >
f'[{"hello":>15}]'

'[          hello]'

In [18]:
#center values with ^
f'[{27:^7d}]'

'[  27   ]'

In [20]:
f'[{3.5:^7.1f}]'

'[  3.5  ]'

In [21]:
f'[{"hello":^7}]'

'[ hello ]'

8.2.2 Self Check 2

In [22]:
print(f'[{"Amanda":>10}]\n[{"Amanda":^10}]\n[{"Amanda":<10}]')

[    Amanda]
[  Amanda  ]
[Amanda    ]


8.2.3 Numeric Formatting

In [23]:
#add sign to positive number
f'[{27:+10d}]'

'[       +27]'

In [24]:
#fill remaining characters with 0s instead of spaces by placing a 0 before the field width and after + if there is one
f'[{27:+010d}]'

'[+000000027]'

In [25]:
#space indicates that positive numbers should show a space character in the sign position
print(f'{27:d}\n{27: d}\n{-27: d}')

27
 27
-27


In [26]:
#format numbers with thousands separators by using a comma
f'{12345678:,d}'

'12,345,678'

In [27]:
f'{123456.78:,.2f}'

'123,456.78'

8.2.3 Self Check 2

In [28]:
print(f'{10240.473:+10,.2f}\n{-3210.9521:+10,.2f}')

+10,240.47
 -3,210.95


8.2.4 String's format Method

In [29]:
#the format method predates f-strings (added in Python 3.6); still useful to know in case you run into older code

In [30]:
'{:.2f}'.format(17.489)

'17.49'

In [31]:
'{} {}'.format('Amanda', 'Cyan')

'Amanda Cyan'

In [32]:
'{0} {0} {1}'.format('Happy', 'Birthday')

'Happy Happy Birthday'

In [33]:
'{first} {last}'.format(first='Amanda', last='Gray')

'Amanda Gray'

In [34]:
'{last} {first}'.format(first='Amanda', last='Gray')

'Gray Amanda'

8.2.4 Self Check 1

In [35]:
print('{:c}{:c}{:c}'.format(58, 45, 41))

:-)


In [36]:
print('[{0:>10}]\n[{0:^10}]\n[{0:<10}]'.format('Amanda'))

[    Amanda]
[  Amanda  ]
[Amanda    ]


In [37]:
print('{:+10,.2f}\n{:+10.2f}'.format(10240.473, -3210.9521))

+10,240.47
  -3210.95


8.3 Concatenating and Repeating Strings

In [38]:
# + to concatenate, * to repeat strings
s1 = 'happy'

In [39]:
s2 = 'birthday'

In [40]:
s1 += ' ' + s2

In [41]:
s1

'happy birthday'

In [42]:
symbol = '>'

In [44]:
symbol *= 5

In [45]:
symbol

'>>>>>'

8.3 Self Check 1

In [46]:
name = 'Zach'

In [48]:
name += ' Fuller'

In [49]:
bar = '*'

In [50]:
bar *= len(name)

In [51]:
print(f'{bar}\n{name}\n{bar}')

***********
Zach Fuller
***********


8.4 Stripping Whitespace from Strings

In [52]:
#removing leading and trailing whitespace
sentence = '\t  \n  This is a test string. \t\t \n'

In [53]:
sentence.strip()

'This is a test string.'

In [54]:
#removing only leading white space with lstrip
sentence.lstrip()

'This is a test string. \t\t \n'

In [55]:
#remove only trailing white space with rstrip
sentence.rstrip()

'\t  \n  This is a test string.'

8.4 Self Check 1

In [56]:
name = '     Margo Magenta     '

In [57]:
name.strip()

'Margo Magenta'

In [58]:
name.lstrip()

'Margo Magenta     '

In [59]:
name.rstrip()

'     Margo Magenta'

8.5 Changing Character Case

In [60]:
#method capitalize to capitalize only first letter
'happy birthday'.capitalize()

'Happy birthday'

In [61]:
#method title capitalizes the first letter of every word in a string
'strings: a deeper look'.title()

'Strings: A Deeper Look'

8.5 Self Check 1

In [63]:
test_string = 'happy new year'

In [64]:
test_string.capitalize()

'Happy new year'

In [65]:
test_string.title()

'Happy New Year'

8.6 Comparison Operators for Strings

In [66]:
print(f'A: {ord("A")}; a: {ord("a")}')

A: 65; a: 97


In [67]:
'Orange' == 'orange'

False

In [69]:
'Orange' != 'orange'

True

In [70]:
'Orange' < 'orange'

True

In [71]:
'Orange' <= 'orange'

True

In [72]:
'Orange' > 'orange'

False

In [73]:
'Orange' >= 'orange'

False

8.7 Searching for Substrings

In [74]:
sentence = 'to be or not to be that is the question'

In [75]:
sentence.count('to')

2

In [76]:
#specifying a start_index as the second argument searches only the slice string[start_index:]
sentence.count('to', 12)

1

In [77]:
#specifying end index as third argument restricts search up to but not including the end index
sentence.count('that', 12, 25)

1

In [78]:
#method index searches for a substring and returns the first index at which it is found
sentence.index('be')

3

In [79]:
#method rindex is similar to index, but searches from end of string and returns last index at which substring is found
sentence.rindex('be')

16

In [80]:
# use in or not in to see if string contains substring
'that' in sentence

True

In [81]:
'THAT' in sentence

False

In [82]:
'THAT' not in sentence

True

In [83]:
#startswith and endswith return True if string starts or ends with the substring
sentence.startswith('to')

True

In [84]:
sentence.startswith('be')

False

In [85]:
sentence.endswith('question')

True

In [86]:
sentence.endswith('quest')

False

8.7 Self Check 3

In [87]:
#print every whitespace-separated word that begins with 't', each followed by a space
for word in (w for w in 'to be or not to be that is the question'.split() if w.startswith('t')):
    print(word, end=' ')

to to that the 

8.8 Replacing Substrings

In [88]:
#method replace looks for substring in first argument, replaces it with substring in second argument
values = '1\t2\t3\t4\t5'

In [89]:
values.replace('\t', ',')

'1,2,3,4,5'

In [90]:
'1 2 3 4 5'.replace(' ', ' --> ')

'1 --> 2 --> 3 --> 4 --> 5'

8.9 Splitting and Joining Strings

In [92]:
#to tokenize a string at a custom delimiter, specify the delimiter string (such as ', ') that split uses to tokenize the string
letters = 'A, B, C, D'

In [93]:
letters.split(', ')

['A', 'B', 'C', 'D']

In [94]:
#if you provide an integer as the 2nd argument, it specifies the max number of splits
#the last token is the remainder of the string after the max number of splits
letters.split(', ', 2)

['A', 'B', 'C, D']

In [95]:
#method join concatenates the strings in its argument; call it on the separator to place between the concatenated items
letters_list = ['A', 'B', 'C', 'D']

In [96]:
','.join(letters_list)

'A,B,C,D'

In [97]:
#convert each integer 0-9 to a string, then join them with commas
','.join(map(str, range(10)))

'0,1,2,3,4,5,6,7,8,9'

In [99]:
#partition splits into tuple of 3 strings based on method's separator argument
'Amanda: 89, 97, 92'.partition(': ')

('Amanda', ': ', '89, 97, 92')

In [100]:
#rpartition to split from the end of the string instead
url = 'http://www.deitel.com/books/PyCDS/table_of_contents.html'

In [101]:
rest_of_url, separator, document = url.rpartition('/')

In [102]:
document

'table_of_contents.html'

In [103]:
rest_of_url

'http://www.deitel.com/books/PyCDS'

In [104]:
lines = """This is line 1
This is line2
This is line3"""

In [105]:
lines

'This is line 1\nThis is line2\nThis is line3'

In [106]:
lines.splitlines()

['This is line 1', 'This is line2', 'This is line3']

In [107]:
lines.splitlines(True)

['This is line 1\n', 'This is line2\n', 'This is line3']

8.9 Self Check 2

In [109]:
', '.join(reversed('Pamela WHite'.split()))

'WHite, Pamela'

8.9 Self Check 3

In [111]:
url = 'http://www.deitel.com/books/PyCDS/table_of_contents.html'

In [114]:
protocol, separator, rest_of_url = url.partition('://')

In [115]:
host, separator, document_with_path = rest_of_url.partition('/')

In [116]:
host

'www.deitel.com'

In [117]:
path, separator, document = document_with_path.rpartition('/')

In [118]:
path

'books/PyCDS'

8.10 Characters and Character-Testing Methods

In [119]:
#isdigit returns True if the string contains only digit characters (0-9)

In [120]:
'-27'.isdigit()

False

In [121]:
'27'.isdigit()

True

In [122]:
#isalnum returns True if string is alphanumeric (contains only digits and letters)

In [123]:
'A9876'.isalnum()

True

In [124]:
'123 Main Street'.isalnum()

False

8.11 Raw Strings

In [125]:
#recall that \ begins an escape sequence, like \n for newline or \t for tab
#to include a literal backslash in a regular string, write \\ instead
#this gets hard to read for Windows paths, which contain many backslashes
#without a raw string, such a path has to be written like this:
file_path = 'C:\\MyFolder\\MySubFolder\\MyFile.txt'

In [126]:
#for such cases as above, r treats each backslash as a regular character rather than beginning of an escape sequence
file_path = r'C:\MyFolder\MySubFolder\MyFile.txt'

In [127]:
file_path

'C:\\MyFolder\\MySubFolder\\MyFile.txt'

8.12 Intro to Regular Expressions

8.12.1 re Module and Function fullmatch

In [128]:
import re

In [129]:
#fullmatch - check whether the entire string in 2nd argument matches the pattern in its 1st argument

In [130]:
pattern = '02215'

In [131]:
'Match' if re.fullmatch(pattern, '02215') else 'No match'

'Match'

In [132]:
'Match' if re.fullmatch(pattern, '51220') else 'No match'

'No match'

In [134]:
'Valid' if re.fullmatch(r'\d{5}', '02215') else 'Invalid'

'Valid'

In [135]:
'Valid' if re.fullmatch(r'\d{5}', '9876') else 'Invalid'

'Invalid'

In [136]:
'Valid' if re.fullmatch('[A-Z][a-z]*', 'Wally') else 'Invalid'

'Valid'

In [138]:
'Valid' if re.fullmatch('[A-Z][a-z]*', 'eva') else 'Invalid'
#won't match: the first character must be an uppercase letter ([A-Z]); the * quantifier applies only to [a-z]

'Invalid'

In [139]:
#when a custom character class starts with a caret (^), the class matches any character that's not specified
'Match' if re.fullmatch('[^a-z]', 'A') else 'No match'

'Match'

In [140]:
'Match' if re.fullmatch('[^a-z]', 'a') else 'No match'

'No match'

In [141]:
#metacharacters in a custom character class are treated as literal characters - the characters themselves
'Match' if re.fullmatch('[*+$]', '*') else 'No match'

'Match'

In [142]:
'Match' if re.fullmatch('[*+$]', '!') else 'No match'

'No match'

In [143]:
# + quantifier matches at least one occurrence of a subexpression
'Valid' if re.fullmatch('[A-Z][a-z]+', 'Wally') else 'Invalid'

'Valid'

In [145]:
'Valid' if re.fullmatch('[A-Z][a-z]+', 'E') else 'Invalid'

'Invalid'

In [146]:
# ? matches 0 or 1 occurrences of a subexpression
'Match' if re.fullmatch('labell?ed', 'labelled') else 'No match'

'Match'

In [147]:
'Match' if re.fullmatch('labell?ed', 'labeled') else 'No match'

'Match'

In [150]:
'Match' if re.fullmatch('labell?ed', 'labellled') else 'No match'

'No match'

In [155]:
# can match at least n occurrences of a subexpression with {n,} quantifier
#so the following 3 snippets look for strings with at least 3 digits
'Match' if re.fullmatch(r'\d{3,}', '123') else 'No match'

'Match'

In [153]:
'Match' if re.fullmatch(r'\d{3,}', '1234567890') else 'No match'

'Match'

In [154]:
'Match' if re.fullmatch(r'\d{3,}', '12') else 'No match'

'No match'

In [156]:
#can match between n and m (inclusive) occurrences of a subexpression with the {n,m} quantifier
#following regular expressions match strings containing 3 to 6 digits
'Match' if re.fullmatch(r'\d{3,6}', '123') else 'No match'

'Match'

In [157]:
'Match' if re.fullmatch(r'\d{3,6}', '123456') else 'No match'

'Match'

In [158]:
'Match' if re.fullmatch(r'\d{3,6}', '1234567') else 'No match'

'No match'

In [159]:
'Match' if re.fullmatch(r'\d{3,6}', '12') else 'No match'

'No match'

8.12.1 Self Check 4

In [160]:
street = r'\d+ [A-Z][a-z]* [A-Z][a-z]*'

In [161]:
'Match' if re.fullmatch(street, '123 Main Street') else 'No match'

'Match'

In [162]:
'Match' if re.fullmatch(street, 'Main Street') else 'No match'

'No match'

8.12.2 Replacing Substrings and Splitting Strings

In [163]:
#use sub to replace all occurrences of a pattern with the replacement
re.sub(r'\t', ', ', '1\t2\t3\t4')

'1, 2, 3, 4'

In [164]:
#can use keyword argument count to specify max number of replacements
re.sub(r'\t', ', ', '1\t2\t3\t4', count=2)

'1, 2, 3\t4'

In [166]:
#\s is the white space character class, * indicates 0 or more occurrences of the preceding subexpression
re.split(r',\s*', '1,  2,  3,4,   5,6,7,8')

['1', '2', '3', '4', '5', '6', '7', '8']

In [167]:
#can use keyword argument maxsplit to specify max number of splits
re.split(r',\s*', '1, 2,   3,4,   5,6,7,8', maxsplit=3)

['1', '2', '3', '4,   5,6,7,8']

8.12.2 Self Check 1

In [168]:
re.sub(r'\t+', ', ', 'A\tB\tC\t\t\tD')

'A, B, C, D'

8.12.2 Self Check 2

In [169]:
#raw string so that \$ reaches the regex engine as an escaped (literal) dollar sign
re.split(r'\$+', '123$Main$$Street')

['123', 'Main', 'Street']

8.12.3 Other Search Functions; Accessing Matches

In [171]:
#search looks for first occurrence of a substring
#group returns the substring
result = re.search('Python', 'Python is fun')

In [172]:
result.group() if result else 'not found'

'Python'

In [174]:
result2 = re.search ('fun!', 'Python is fun')

In [175]:
#result2 is None because 'fun!' does not occur in 'Python is fun'
result2.group() if result2 else 'not found'

'not found'

In [176]:
#many re module functions accept an optional flags keyword argument
#can use re module's IGNORECASE constant to perform case-insensitive search
result3 = re.search('Sam', 'SAM WHITE', flags=re.IGNORECASE)

In [177]:
result3.group() if result3 else 'not found'

'SAM'

In [178]:
# ^ metacharacter at beginning of regular expression (not inside []) indicates that the expression matches
#only the beginning of a string
result = re.search('^Python', 'Python is fun')

In [179]:
result.group() if result else 'not found'

'Python'

In [180]:
result = re.search('^fun', 'Python is fun')

In [181]:
result.group() if result else 'not found'

'not found'

In [182]:
# $ metacharacter at end of RE indicates the expression matches only the end of a string
result = re.search('Python$', 'Python is fun')

In [183]:
result.group() if result else 'not found'

'not found'

In [185]:
result = re.search('fun$', 'Python is fun')

In [186]:
result.group() if result else 'not found'

'fun'

In [187]:
#findall finds every matching substring in a string and returns a list of the matching substrings
contact = 'Wally White, Home: 555-555-1234, Work: 555-555-4321'

In [189]:
re.findall(r'\d{3}-\d{3}-\d{4}', contact)

['555-555-1234', '555-555-4321']

In [190]:
#finditer returns one match at a time, whereas findall returns all matches at once
for phone in re.finditer(r'\d{3}-\d{3}-\d{4}', contact):
    print(phone.group())

555-555-1234
555-555-4321


In [191]:
# can use ( ) to capture substring in a match
text = 'Charlie Cyan, e-mail: demo1@deitel.com'

In [192]:
pattern = r'([A-Z][a-z]+ [A-Z][a-z]+), e-mail: (\w+@\w+\.\w{3})'

In [193]:
result = re.search(pattern, text)

In [194]:
result.group()

'Charlie Cyan, e-mail: demo1@deitel.com'

In [196]:
#the captured substrings are numbered from 1, unlike list indices
result.group(1)

'Charlie Cyan'

In [197]:
result.group(2)

'demo1@deitel.com'

8.12.3 Self Check 2

In [198]:
result = re.search(r'(\d+) ([-+*/]) (\d+)', '10 + 5')

In [199]:
result.groups()

('10', '+', '5')

In [200]:
result.group(1)

'10'

In [201]:
result.group(2)

'+'

In [202]:
result.group(3)

'5'

8.13 Intro to Data Science: Pandas, Regular Expressions and Data Munging

In [203]:
#data validation
import pandas as pd

In [204]:
zips = pd.Series({'Boston': '02215', 'Miami': '3310'})

In [205]:
zips

Boston    02215
Miami      3310
dtype: object

In [206]:
zips.str.match(r'\d{5}')

Boston     True
Miami     False
dtype: bool

In [207]:
#Zach Fuller has completed the zip code portion of 8.13

In [211]:
cities = pd.Series(['Boston, MA 02215', 'Miami, FL 33101'])

In [212]:
cities

0    Boston, MA 02215
1     Miami, FL 33101
dtype: object

In [213]:
cities.str.contains(r' [A-Z]{2} ')

0    True
1    True
dtype: bool

In [214]:
cities.str.match(r' [A-Z]{2} ')

0    False
1    False
dtype: bool

In [215]:
#Zach Fuller has completed the city portion of 8.13

In [216]:
contacts = [['MIke Green', 'demo1@deitel.com', '5555555555'], ['Sue Brown', 'demo2@deitel.com', '5555551234']]

In [217]:
contactsdf = pd.DataFrame(contacts, columns =['Name', 'Email', 'Phone'])

In [218]:
contactsdf

Unnamed: 0,Name,Email,Phone
0,MIke Green,demo1@deitel.com,5555555555
1,Sue Brown,demo2@deitel.com,5555551234


In [219]:
def get_formatted_phone(value):
    """Return a 10-digit phone string formatted as '###-###-####'.

    value must consist of exactly ten digits to be reformatted;
    any other string is returned unchanged.
    """
    match = re.fullmatch(r'(\d{3})(\d{3})(\d{4})', value)
    if not match:
        return value
    # groups() yields the area code, exchange and line number in order
    return '-'.join(match.groups())

In [220]:
formatted_phone = contactsdf['Phone'].map(get_formatted_phone)

In [221]:
formatted_phone

0    555-555-5555
1    555-555-1234
Name: Phone, dtype: object

In [222]:
contactsdf['Phone'] = formatted_phone

In [223]:
contactsdf

Unnamed: 0,Name,Email,Phone
0,MIke Green,demo1@deitel.com,555-555-5555
1,Sue Brown,demo2@deitel.com,555-555-1234


In [224]:
#Zach Fuller has completed the contact portion of 8.13

8.13 Self Check 2

In [225]:
contacts = [['Mike Green', 'demo1@deitel.com', '5555555555'], ['Sue Brown', 'demo2@deitel.com', '5555551234']]

In [228]:
contactdf = pd.DataFrame(contacts, columns=['Name', 'Email', 'Phone'])

In [229]:
def get_formatted_phone(value):
    result = re.fullmatch(r'(\d{3}) (\d{3}) (\d{4})', value)
    if result:
        part1, part2, part3 = result.groups()
        return '(' + part1 + ') ' + part2 + '-' + part3
    else:
        return value

In [230]:
contactsdf['Phone'] = contactsdf['Phone'].map(get_formatted_phone)

In [231]:
contactsdf

Unnamed: 0,Name,Email,Phone
0,MIke Green,demo1@deitel.com,555-555-5555
1,Sue Brown,demo2@deitel.com,555-555-1234
