---
# Chapter 4
## Text versus Bytes
---

## Text versus Bytes

---
### Example 4-1: Encoding and decoding

In [1]:
# A str holding 4 Unicode characters.
s = 'café'
print(s, len(s), sep='\n')

# Encoding to UTF-8 yields 5 bytes: 'é' is encoded as two bytes.
b = s.encode('utf-8')
print(b, len(b), sep='\n')

# Decoding with the same codec gives back the original text.
b.decode('utf-8')

café
4
b'caf\xc3\xa9'
5


'café'

---
### Example 4-2: Five-byte sequence as bytes and as bytearray

In [2]:
# bytes can be built from a str when an encoding is given.
cafe = bytes('café', encoding='utf_8')
print(cafe)

# Indexing a bytes object gives an int; slicing gives bytes, even for a 1-byte slice.
print(cafe[0], cafe[:1], sep='\n')

b'caf\xc3\xa9'
99
b'c'


In [3]:
# Build a bytearray from the bytes object `cafe` (defined in the previous cell).
cafe_arr = bytearray(cafe)
print(cafe_arr)

# A slice of a bytearray is itself a bytearray.
cafe_arr[-1:]

bytearray(b'caf\xc3\xa9')


bytearray(b'\xa9')

In [4]:
# Build bytes from pairs of hex digits; the four space-separated pairs give four bytes.
bytes.fromhex('31 4B CE A9')

b'1K\xce\xa9'

---
### Example 4-3: Initializing bytes from the raw data of an array

In [5]:
import array

# Typecode 'h' stores short signed integers.
numbers = array.array('h', range(-2, 3))
# bytes() copies the raw bytes backing the array (10 bytes for the 5 items here).
octets = bytes(numbers)
print(octets)

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'


---
### Example 4-4: Using memoryview and struct to inspect a GIF image header

In [6]:
import struct

# struct format: '<' little-endian; two 3-byte sequences (type and version,
# per the output below); two unsigned 16-bit integers
# (presumably the image width and height -- confirm against the GIF spec).
fmt = '<3s3sHH'

# Read the whole file into memory and wrap it in a memoryview.
with open('sample.gif', 'rb') as fp:
    img = memoryview(fp.read())

# Slicing a memoryview gives another memoryview.
header = img[:10]
header_bytes = bytes(header)  # converted to bytes for display only
print(header_bytes)

# struct.unpack accepts the memoryview slice directly.
unpacked = struct.unpack(fmt, header)
print(unpacked)

# Drop every reference created by this cell.
del header, header_bytes, unpacked, img

b'GIF89a\xf2\x01\x18\x01'
(b'GIF', b'89a', 498, 280)


---
### Example 4-5: The string "El Niño" encoded with three codecs producing very different byte sequences

In [7]:

# Encode the same text with three codecs and compare the resulting bytes.
s = 'El Niño'
for codec in ('latin_1', 'utf_8', 'utf_16'):
    encoded = s.encode(codec)
    print(codec, encoded, sep='\t')

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


---
### Example 4-6: Encoding to bytes: success and error handling

In [8]:
# Sample text containing one non-ASCII character ('ã'), reused by the next cells.
city = 'São Paulo'
# In UTF-8 the 'ã' becomes the two bytes \xc3\xa3.
city.encode('utf-8')

b'S\xc3\xa3o Paulo'

In [9]:
# UTF-16: the result starts with \xff\xfe (a byte-order mark), then two bytes per character.
city.encode('utf-16')

b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'

In [10]:
# iso8859_1 encodes 'ã' as the single byte \xe3.
city.encode('iso8859_1')

b'S\xe3o Paulo'

In [11]:
# Intentional failure: cp437 cannot encode 'ã', so this raises UnicodeEncodeError.
city.encode('cp437')

UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

In [12]:
# errors='ignore' silently drops the unencodable character (data loss).
city.encode('cp437', errors='ignore')

b'So Paulo'

In [13]:
# errors='replace' substitutes '?' for the unencodable character.
city.encode('cp437', errors='replace')

b'S?o Paulo'

In [14]:
# errors='xmlcharrefreplace' substitutes an XML character reference (&#227; for 'ã').
city.encode('cp437', errors='xmlcharrefreplace')

b'S&#227;o Paulo'

---
### Example 4-7: Decoding from bytes to str: success and error handling

In [15]:
# Bytes containing one non-ASCII byte (\xe9), reused by the next cells.
octets = b'Mont\xe9al'
# cp1252 decodes \xe9 as 'é'.
octets.decode('cp1252')

'Montéal'

In [16]:
# No error, but the wrong text: iso8859_7 decodes \xe9 as the Greek letter 'ι'.
octets.decode('iso8859_7')

'Montιal'

In [17]:
# No error, but the wrong text: koi8_r decodes \xe9 as the Cyrillic letter 'И'.
octets.decode('koi8_r')

'MontИal'

In [18]:
# Intentional failure: the bytes are not valid UTF-8, so this raises UnicodeDecodeError.
octets.decode('utf-8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 4: invalid continuation byte

In [19]:
# errors='replace' substitutes the replacement character '�' for the undecodable byte.
octets.decode('utf-8', errors='replace')

'Mont�al'

---
### Example 4-9: A platform encoding issue (if you try this on your machine, you may or may not see the problem)

In [20]:
# Write the text as UTF-8. The with block guarantees the file is flushed and
# closed before it is read back, instead of relying on garbage collection.
with open('cafe.txt', 'w', encoding='utf-8') as fp:
    fp.write('café')

# The encoding argument is omitted ON PURPOSE: the platform default encoding is
# used for reading, which is the bug this example demonstrates.
with open('cafe.txt') as fp:
    text = fp.read()

text

'cafÃ©'

---
### Example 4-10: Closer inspection of Example 4-9 running on Windows reveals the bug and how to fix it.

In [21]:
fp = open('cafe.txt', 'w', encoding='utf-8')
fp

<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf-8'>

In [22]:
fp.write('café')

4

In [23]:
fp.close()

In [24]:
import os
os.stat('cafe.txt').st_size

5

In [25]:
# No encoding argument: the file is opened with the platform default
# ('cp1252' in the repr below), not the UTF-8 used when writing.
# NOTE(review): fp2 is never closed in the visible cells.
fp2 = open('cafe.txt')
fp2

<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='cp1252'>

---
### Example 4-11: Exploring encoding defaults

In [26]:
import sys, locale

# Expressions to evaluate, one per line; each is printed next to its value.
expressions = """
    locale.getpreferredencoding()
    type(myfile)
    myfile.encoding
    sys.stdout.isatty()
    sys.stdout.encoding
    sys.stdin.isatty()
    sys.stdin.encoding
    sys.stderr.isatty()
    sys.stderr.encoding
    sys.getdefaultencoding()
    sys.getfilesystemencoding()
"""

# Open the dummy file without an explicit encoding so that `myfile.encoding`
# reports the default; the with block closes the file once the report is done.
with open('dummy', 'w') as myfile:
    # Split per line (not on arbitrary whitespace) so that an expression
    # containing spaces would still be evaluated as a whole.
    for expression in (line.strip() for line in expressions.splitlines()):
        if not expression:
            continue
        # eval is acceptable here only because the input is the fixed literal
        # above; never pass external input to eval.
        value = eval(expression)
        print(expression.rjust(30), '->', repr(value))

 locale.getpreferredencoding() -> 'cp1252'
                  type(myfile) -> <class '_io.TextIOWrapper'>
               myfile.encoding -> 'cp1252'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'utf-8'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


## Normalizing Unicode for Saner Comparisons

In [27]:
# s1 ends with a single precomposed character ('é').
s1 = 'café'
# s2 ends with 'e' followed by U+0301, a combining mark; both render the same.
s2 = 'cafe\u0301'

s1, s2

('café', 'café')

In [28]:
len(s1), len(s2)

(4, 5)

In [29]:
s1 == s2

False

In [34]:
from unicodedata import normalize

# Same text in two forms: precomposed 'é' versus 'e' + combining acute (U+0301).
s1 = 'café'
s2 = 'cafe\u0301'

print(len(s1), len(s2))

# NFC composes: both strings become 4 code points and compare equal.
ns1 = normalize('NFC', s1)
ns2 = normalize('NFC', s2)
print(len(ns1), len(ns2))
print(ns1 == ns2)

# NFD decomposes: both strings become 5 code points and compare equal.
n2s1 = normalize('NFD', s1)
n2s2 = normalize('NFD', s2)
print(len(n2s1), len(n2s2))
# Compare the NFD results (the original compared the NFC pair a second time).
print(n2s1 == n2s2)


4 5
4 4
True
5 5
True


In [39]:
from unicodedata import normalize, name

# U+2126 (OHM SIGN) and its NFC form, which is a different code point.
ohm = '\u2126'
ohm_c = normalize('NFC', ohm)

print(name(ohm))
print(name(ohm_c))

# The raw characters differ...
print(ohm == ohm_c)

# ...but they compare equal once both are normalized.
normalize('NFC', ohm) == normalize('NFC', ohm_c)

OHM SIGN
GREEK CAPITAL LETTER OMEGA
False


True

---
### Example 4-14: Function to remove all combining marks.

In [55]:
import unicodedata
import string

def shave_marks(txt):
    """Return txt with every combining mark removed.

    The text is decomposed (NFD) so that marks become separate characters,
    the combining characters are filtered out, and the remainder is
    recomposed (NFC).
    """
    decomposed = unicodedata.normalize('NFD', txt)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))

---
### Example 4-15: Two examples using shave_marks from Example 4-14

In [47]:
# Sample text mixing accented Latin letters with symbols; reused by later cells.
order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
# Only 'è', 'ç' and 'í' change; the quotes, 'ß', '½', 'Œ' and '™' are left as they are.
shave_marks(order)

'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'

In [56]:
# Mixed Greek and Latin text; reused by later cells.
greek = 'Ζέφυρος, Zéfiro'
# shave_marks strips the accent from the Greek 'έ' as well as from the Latin 'é'.
shave_marks(greek)

'Ζεφυρος, Zefiro'

---
### Example 4-16: Function to remove combining marks from Latin characters.

In [58]:
def shave_marks_latin(txt):
    """Return txt with combining marks removed from ASCII letters only.

    The text is decomposed (NFD); a combining mark is dropped when the most
    recent base character is an ASCII letter and kept otherwise. The result
    is recomposed (NFC).
    """
    decomposed = unicodedata.normalize('NFD', txt)
    kept = []
    after_latin = False  # is the last base character seen an ASCII letter?
    for ch in decomposed:
        if unicodedata.combining(ch):
            # A mark: keep it only when it does not follow a Latin base.
            if not after_latin:
                kept.append(ch)
        else:
            # A new base character: always kept, and it resets the flag.
            kept.append(ch)
            after_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))

In [59]:
shave_marks_latin(order)

'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'

In [60]:
shave_marks(order) == shave_marks_latin(order)

True

---
### Example 4-17: Transform some Western typographical symbols into ASCII

In [67]:
# One-to-one replacements: each symbol in the first string maps to the
# character at the same position in the second string.
single_map = str.maketrans(
    """‚ƒ„†ˆ‹‘’“”•–—˜›""",
    """'f"*^<''""---~>""",
)

# One-to-many replacements, merged with the one-to-one table above so that a
# single translate() call applies both.
multi_map = {
    **str.maketrans({
        '€': '<euro>',
        '…': '...',
        'Œ': 'OE',
        '™': '(TM)',
        'œ': 'oe',
        '‰': '<per mille>',
        '‡': '**',
    }),
    **single_map,
}


def dewinize(txt):
    """Replace the typographical symbols listed in multi_map with ASCII text."""
    return txt.translate(multi_map)


def asciize(txt):
    """Apply dewinize, strip marks from Latin letters, and expand 'ß' to 'ss'.

    The result is NFC-normalized; NFC leaves characters such as '½' unchanged.
    """
    dewinized = dewinize(txt)
    shaved = shave_marks_latin(dewinized)
    expanded = shaved.replace('ß', 'ss')
    return unicodedata.normalize('NFC', expanded)

In [68]:
dewinize(order)

'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'

In [72]:
dewinize(greek)

'Ζέφυρος, Zéfiro'

---
### Example 4-18: Two examples using asciize from Example 4-17

In [73]:
asciize(order)

'"Herr Voss: - ½ cup of OEtker(TM) caffe latte - bowl of acai."'

In [74]:
asciize(greek)

'Ζέφυρος, Zefiro'

## Sorting Unicode Text

In [75]:
fruits = ['caju', 'atemoia', 'cajá', 'acai', 'acerola']

# The default sort puts 'cajá' after 'caju' (see the output below).
sorted(fruits)

['acai', 'acerola', 'atemoia', 'caju', 'cajá']

---
### Example 4-19: Using the locale.strxfrm function as sort key

In [79]:
import locale

# Select Brazilian Portuguese collation rules. This changes process-wide state
# and needs the 'pt_BR.UTF-8' locale to be available on the system.
locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')

fruits = ['caju', 'atemoia', 'cajá', 'acai', 'acerola']

# locale.strxfrm turns each string into a key that sorts by the current locale's rules.
sorted_fruits = sorted(fruits, key=locale.strxfrm)
print(sorted_fruits)

['acai', 'acerola', 'atemoia', 'cajá', 'caju']


## Sorting with the Unicode Collation Algorithm

---
### Example 4-20: Using the pyuca.Collator.sort_key method

In [84]:
import pyuca

# Collator created with no arguments, i.e. with pyuca's defaults.
coll = pyuca.Collator()
fruits = ['caju', 'atemoia', 'cajá', 'acai', 'acerola']
# Sort using the collator's sort_key; no locale setup is done in this cell.
sorted_fruits = sorted(fruits, key=coll.sort_key)
print(sorted_fruits)

['acai', 'acerola', 'atemoia', 'cajá', 'caju']


## The Unicode Database

---
### Example 4-21: Demo of Unicode database numerical character metadata (callouts describe each column in the output)

In [85]:
import unicodedata
import re

# Raw string: '\d' in a plain string literal is an invalid escape sequence,
# which newer Python versions warn about.
re_digit = re.compile(r'\d')

# One character per numeric flavour: ASCII digit, fraction, superscript,
# Devanagari, Ethiopic, Roman numeral, circled, parenthesized, ideograph.
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

for char in sample:
    print('U+%04x' % ord(char),                               # code point
          char.center(6),                                     # the character itself
          're_dig' if re_digit.match(char) else '-',          # matches r'\d'?
          'isdigit' if char.isdigit() else '-',               # str.isdigit()
          'isnum' if char.isnumeric() else '-',               # str.isnumeric()
          format(unicodedata.numeric(char), '5.2f'),          # numeric value
          unicodedata.name(char),                             # Unicode name
          sep='\t')
    

U+0031	  1   	re_dig	isdigit	isnum	 1.00	DIGIT ONE
U+00bc	  ¼   	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
U+00b2	  ²   	-	isdigit	isnum	 2.00	SUPERSCRIPT TWO
U+0969	  ३   	re_dig	isdigit	isnum	 3.00	DEVANAGARI DIGIT THREE
U+136b	  ፫   	-	isdigit	isnum	 3.00	ETHIOPIC DIGIT THREE
U+216b	  Ⅻ   	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	isdigit	isnum	 7.00	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX


---
### Example 4-22: Compare behavior of simple str and bytes regular expressions

In [3]:
import re

# The same two patterns compiled once as str and once as bytes.
re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

# Sample text: Tamil digits, ASCII digits and superscript characters.
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
            " as 1729 = 1³ + 12³ = 9³ + 10³.")

# The bytes patterns are run against the UTF-8 encoding of the same text.
text_bytes = text_str.encode('utf-8')
print('Text', repr(text_str), sep='\n')

# For each pattern, show the str matches and then the bytes matches.
for title, str_re, bytes_re in (
    ('Numbers', re_numbers_str, re_numbers_bytes),
    ('Words', re_words_str, re_words_bytes),
):
    print(title)
    print('  str  :', str_re.findall(text_str))
    print('  bytes  :', bytes_re.findall(text_bytes))

Text
'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'
Numbers
  str  : ['௧௭௨௯', '1729', '1', '12', '9', '10']
  bytes  : [b'1729', b'1', b'12', b'9', b'10']
Words
  str  : ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']
  bytes  : [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']
