# 第 4 章 Unicode 文本和字节序列

> 文本给人类阅读，字节序列供计算机处理。

## 字符问题 
- 字符的标识：码点，是 `0~1114111` 范围内的数（十进制），在 Unicode 标准中以 4~6 个十六进制数表示，前面加“U+”，取值范围是 `U+0000~U+10FFFF`。
  
- 字符的具体描述取决于所用的编码。编码是在码点和字节序列之间转换时使用的算法。

把码点转换成字节序列的过程叫编码，把字节序列转换成码点的过程叫解码。

In [1]:
# 编码和解码
s = 'café'
len(s)

4

In [3]:
b = s.encode('utf8')

print(b)

len(b)

b'caf\xc3\xa9'


5

In [4]:
b.decode('utf8')

'café'

In [12]:
b[0]

99

In [13]:
ord('c')

99

In [19]:
b[0:3]

b'caf'

## 字节概要

In [6]:
cafe = bytes('café', encoding='utf_8') 
cafe

b'caf\xc3\xa9'

In [7]:
cafe[0]

99

In [8]:
cafe[:1]

b'c'

In [9]:
cafe_arr = bytearray(cafe)

In [10]:
cafe_arr

bytearray(b'caf\xc3\xa9')

In [11]:
cafe_arr[-1:]

bytearray(b'\xa9')

In [None]:
## 基本的编码解码器

Python 自带超过 100 种编码解码器（codec，encoder/decoder）

In [20]:
for codec in ['latin_1', 'utf_8', 'utf_16']:
    print(codec, 'El Niño'.encode(codec), sep='\t')

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


## 处理编码和解码问题

处理UnicodeEncodeError

In [21]:
# 把str编码成字节序列，有些成功，有些需要处理错误
city = 'Sāo Paulo'
city.encode('utf_8')


b'S\xc4\x81o Paulo'

In [22]:
city.encode('utf_16')

b'\xff\xfeS\x00\x01\x01o\x00 \x00P\x00a\x00u\x00l\x00o\x00'

In [23]:
city.encode('iso8859_1', errors='ignore')

b'So Paulo'

In [24]:
city.encode('cp437', errors='replace')


b'S?o Paulo'

处理UnicodeDecodeError

In [25]:
# 把字节序列解码成str，有些成功，有些需要处理错误
octets = b'Montr\xe9al'
octets.decode('cp1252')

'Montréal'

In [26]:
octets.decode('iso8859_7')


'Montrιal'

In [27]:
octets.decode('koi8_r')


'MontrИal'

In [28]:
octets.decode('utf_8', errors='replace')


'Montr�al'

默认编码


实用工具：chardetect

```
chardetect 04-text-byte.asciidoc
```

In [32]:
import locale
import sys

# Whitespace-separated expressions; each one is evaluated and printed below
# to show which encoding Python picked for the various I/O channels.
expressions = """
        locale.getpreferredencoding()
        type(my_file)
        my_file.encoding
        sys.stdout.isatty()
        sys.stdout.encoding
        sys.stdin.isatty()
        sys.stdin.encoding
        sys.stderr.isatty()
        sys.stderr.encoding
        sys.getdefaultencoding()
        sys.getfilesystemencoding()
    """

# The file is opened WITHOUT an explicit encoding on purpose, so that
# my_file.encoding reports the platform default. The `with` block makes
# sure the handle is closed once the report has been printed.
with open('dummy', 'w') as my_file:
    for expression in expressions.split():
        # eval() is acceptable here only because every expression comes from
        # the hard-coded literal above; never eval untrusted input.
        value = eval(expression)
        print(f'{expression:>30} -> {value!r}')


 locale.getpreferredencoding() -> 'cp936'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'cp936'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'gbk'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


In [None]:
BOM：有用的鬼符

In [33]:
u16 = 'El Niño'.encode('utf_16')
u16

b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'

In [34]:
list(u16)

[255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]

## 处理文本文件

In [35]:
open('cafe.txt', 'w', encoding='utf_8').write('café')

4

In [36]:
open('cafe.txt').read()

'caf茅'

In [38]:
fp = open('cafe.txt', 'w', encoding='utf_8')
fp

<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf_8'>

In [39]:
fp.write('café')

4

In [40]:
fp.close()

In [41]:
import os
os.stat('cafe.txt').st_size

5

In [42]:
fp2 = open('cafe.txt')
fp2

<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='cp936'>

In [43]:
fp2.encoding 

'cp936'

In [44]:
fp2.read()

'caf茅'

In [45]:
fp3 = open('cafe.txt', encoding='utf_8')

In [46]:
fp3

<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='utf_8'>

In [47]:
fp3.read()

'café'

In [48]:
fp4 = open('cafe.txt', 'rb')
fp4

<_io.BufferedReader name='cafe.txt'>

In [49]:
fp4.read()

b'caf\xc3\xa9'

## 为了正确比较而规范化 Unicode 字符串

In [50]:
s1 = 'café'
s2 = 'cafe\N{COMBINING ACUTE ACCENT}'  
s1,s2

('café', 'café')

In [51]:
len(s1), len(s2) # 但是它们的长度不同

(4, 5)

按 Unicode 标准规定，'é' 和 'e\u0301' 是标准等价物 （canonical equivalent），可以通过 unicodedata.normalize() 解决：

In [54]:
from unicodedata import normalize
len(normalize('NFC', s1)), len(normalize('NFC', s2))


(4, 4)

In [None]:
len(normalize('NFD', s1)), len(normalize('NFD', s2))  

参数共有四种：'NFC'、'NFD'、'NFKC' 和 'NFKD'，可自行试一试其他两个。

In [55]:
from unicodedata import normalize, name
half = '\N{VULGAR FRACTION ONE HALF}'
print(half)

½


In [56]:
normalize('NFKC', half)

'1⁄2'

In [57]:
for char in normalize('NFKC', half):
    print(char, name(char), sep='\t')

1	DIGIT ONE
⁄	FRACTION SLASH
2	DIGIT TWO


In [58]:
four_squared = '4²'
normalize('NFKC', four_squared)

'42'

In [59]:
micro = 'μ'
micro_kc = normalize('NFKC', micro)
micro, micro_kc

('μ', 'μ')

In [60]:
ord(micro), ord(micro_kc)

(956, 956)

In [61]:
name(micro), name(micro_kc)

('GREEK SMALL LETTER MU', 'GREEK SMALL LETTER MU')

In [62]:
import unicodedata


def shave_marks(txt):
    """Return *txt* with every combining mark (diacritic) removed."""
    # NFD splits each accented character into a base character followed by
    # its combining marks.
    decomposed = unicodedata.normalize('NFD', txt)
    # combining() returns 0 for anything that is not a combining mark.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    # Put the surviving characters back into composed (NFC) form.
    return unicodedata.normalize('NFC', ''.join(base_chars))

order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
shave_marks(order)



'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'

In [63]:
greek = 'Ζέφυρος, Zéfiro'
shave_marks(greek)


'Ζεφυρος, Zefiro'

In [64]:
import string


def shave_marks_latin(txt):
    """Return *txt* with combining marks removed from ASCII-letter bases only.

    Marks attached to any other base character (e.g. Greek letters) are kept.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    kept = []
    after_latin_base = False  # True while the latest base char is an ASCII letter
    for ch in decomposed:
        if unicodedata.combining(ch):
            # A combining mark: keep it only when its base is not Latin.
            if not after_latin_base:
                kept.append(ch)
        else:
            # Not a combining mark, so this starts a new base character.
            kept.append(ch)
            after_latin_base = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))


In [65]:
# One-to-one replacements: each cp1252 symbol in the first string maps to the
# ASCII character at the same position in the second string.
single_map = str.maketrans(
    """‚ƒ„ˆ‹‘’“”•–—˜›""",
    """'f"^<''""---~>""",
)

# One-to-many replacements (a symbol expanded to an ASCII string), merged with
# the one-to-one table so a single translate() call applies both.
multi_map = {
    **str.maketrans({
        '€': 'EUR',
        '…': '...',
        'Æ': 'AE',
        'æ': 'ae',
        'Œ': 'OE',
        'œ': 'oe',
        '™': '(TM)',
        '‰': '<per mille>',
        '†': '**',
        '‡': '***',
    }),
    **single_map,
}


def dewinize(txt):
    """Replace cp1252 symbols in *txt* with ASCII characters or sequences."""
    translated = txt.translate(multi_map)
    return translated

def asciize(txt):
    """Return *txt* with cp1252 symbols, Latin diacritics and 'ß' replaced.

    The result is finally normalized with NFKC.
    """
    # Map cp1252 symbols first, then strip diacritics from Latin letters.
    without_marks = shave_marks_latin(dewinize(txt))
    # Spell out the German eszett explicitly.
    eszett_expanded = without_marks.replace('ß', 'ss')
    # NFKC applies compatibility decomposition and then recomposes.
    return unicodedata.normalize('NFKC', eszett_expanded)


In [67]:
order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
dewinize(order)



'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'

In [68]:
asciize(order)


'"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'

## Unicode文本排序

In [70]:
# 巴西产水果的列表排序
import locale
my_locale = locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')

fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted(fruits, key=locale.strxfrm)


['açaí', 'acerola', 'atemoia', 'cajá', 'caju']

使用 Unicode 排序算法排序

!pip install pyuca

Miro 建议使用 PyICU 排序 Unicode 文本

In [72]:
!pip install pyuca

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting pyuca
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/98/88/aeeee34d88f841aca712a8c18fbd62a33eaad8f2dbe535e87f3c829b02f9/pyuca-1.2-py2.py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 4.1 MB/s eta 0:00:00
Installing collected packages: pyuca
Successfully installed pyuca-1.2


In [73]:
import pyuca
coll = pyuca.Collator()
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=coll.sort_key)
sorted_fruits

['açaí', 'acerola', 'atemoia', 'cajá', 'caju']

## Unicode 数据库  


Unicode标准提供了一个完整的数据库，不仅包括码点与字符名称之间的映射表，还包括各个字符的元数据，以及字符之间的关系。
- unicodedata.name()：返回一个字符在标准中的官方名称
- unicodedata.numeric()：返回一个字符在标准中的数值

In [77]:
from unicodedata import name

name('A')

'LATIN CAPITAL LETTER A'

In [78]:
import sys
import unicodedata

# Default search range: from the space character up to (and including) the
# highest code point reported by sys.maxunicode.
START = ord(' ')
END = sys.maxunicode + 1


def find(*query_words, start=START, end=END):
    """Print every character in range(start, end) whose name has all the words.

    Query words are upper-cased and compared against the whitespace-separated
    words of each character's Unicode name. Code points without a name are
    skipped. Each hit is printed as ``U+XXXX<TAB>char<TAB>NAME``.
    """
    wanted = frozenset(word.upper() for word in query_words)
    for code in range(start, end):
        char = chr(code)
        # The second argument is returned when the code point has no name.
        name = unicodedata.name(char, None)
        if not name:
            continue
        if wanted.issubset(name.split()):
            print(f'U+{code:04X}\t{char}\t{name}')

def main(words):
    """Search for characters matching *words*, or print a hint if none given."""
    if not words:
        print('Please provide words to find.')
        return
    find(*words)

# Script entry point: pass the command-line arguments (without the program
# name) to main() as the words to search for.
if __name__ == '__main__':
    main(sys.argv[1:])


In [None]:

支持str和bytes的双模式API

1. 通过正则表达式

In [76]:
import re

# Patterns built from str
re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
# Patterns built from bytes
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

# Sample text mixing Tamil digits, ASCII digits and superscripts.
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
            " as 1729 = 1³ + 12³ = 9³ + 10³.")

# A bytes pattern can only search bytes, so encode the sample text.
text_bytes = text_str.encode('utf_8')

print(f'Text\n  {text_str!r}')
# Numbers: the str pattern r'\d+' matches both Tamil and ASCII digits, while
#          the bytes pattern rb'\d+' matches ASCII digit bytes only.
# Words:   the str pattern r'\w+' matches letters, superscripts, Tamil digits
#          and ASCII digits, while the bytes pattern rb'\w+' matches ASCII
#          letter and digit bytes only.
for heading, str_pattern, bytes_pattern in (
        ('Numbers', re_numbers_str, re_numbers_bytes),
        ('Words', re_words_str, re_words_bytes)):
    print(heading)
    print('  str  :', str_pattern.findall(text_str))
    print('  bytes:', bytes_pattern.findall(text_bytes))


Text
  'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'
Numbers
  str  : ['௧௭௨௯', '1729', '1', '12', '9', '10']
  bytes: [b'1729', b'1', b'12', b'9', b'10']
Words
  str  : ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']
  bytes: [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']


In [None]:
2. os函数中的str和bytes


- os模块中所有接收文件名或路径名的函数，既可以传入str参数，也可以传入bytes参数。
- 传入str参数时，使用sys.getfilesystemencoding()获得的编码解码器自动转换参数，操作系统回显时也使用编码解码器进行解码。
- os模块提供了特殊的编码解码函数os.fsencode(name_or_path)和os.fsdecode(name_or_path)。