In [4]:
s = 'café'
len(s)

4

In [5]:
b = s.encode('utf8')
b

b'caf\xc3\xa9'

In [6]:
len(b)

5

In [7]:
b.decode('utf8')

'café'

In [8]:
import array
numbers = array.array('h', [-2, -1, 0, 1, 2])
octsts = bytes(numbers)
octsts

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

In [17]:
import struct
fmt = '<3s3sHH'
with open('sr.png', 'rb') as fp:
    img = memoryview(fp.read())
header = img[:10]
bytes(header)

b'\x89PNG\r\n\x1a\n\x00\x00'

In [18]:
struct.unpack(fmt, header)


(b'\x89PN', b'G\r\n', 2586, 0)

In [19]:
del header
del img

In [22]:
for codec in ['latin_1', 'utf_8', 'utf_16']:
    print(codec, 'El Niño'.encode(codec), sep='\t')

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [24]:
city = 'São Paulo'
city.encode('utf_8')


b'S\xc3\xa3o Paulo'

In [25]:
city.encode('utf_16')

b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'

In [26]:
city.encode('iso8859_1')

b'S\xe3o Paulo'

In [27]:
city.encode('cp437')

UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

In [28]:
city.encode('cp437', errors='ignore')

b'So Paulo'

In [29]:
city.encode('cp437', errors='replace')

b'S?o Paulo'

In [30]:
city.encode('cp437', errors='xmlcharrefreplace')

b'S&#227;o Paulo'

In [31]:
octets = b'Montr\xe9al'
octets.decode('cp1252')

'Montréal'

In [32]:
octets.decode('iso8859_7')

'Montrιal'

In [34]:
octets.decode('koi8_R')

'MontrИal'

In [36]:
octets.decode('utf_8', errors='replace')

'Montr�al'

In [38]:
我 = '啊'
print(我)

啊


In [40]:
u16 = 'El Niño'.encode('utf_16')
list(u16)

[255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]

In [41]:
u16le = 'El Niño'.encode('utf_16le')
list(u16le)

[69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]

In [42]:
u16be = 'El Niño'.encode('utf_16be')
list(u16be)

[0, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111]

In [47]:
with open('cafe.txt', 'w', encoding='utf_8') as f:
    f.write('café')

In [48]:
with open('cafe.txt', encoding='utf_8') as f:
    t = f.read()
    print(t)

café


In [52]:
fp = open('cafe.txt', 'w', encoding='utf_8')
print(fp)
fp.write('café')
import os
os.stat('cafe.txt').st_size

<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf_8'>


5

In [53]:
fp2 = open('cafe.txt')
fp2

<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='cp936'>

In [54]:
fp2.encoding

'cp936'

In [56]:
fp2.read()

'caf茅'

In [57]:
fp3 = open('cafe.txt', encoding='utf_8')
fp3

<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='utf_8'>

In [58]:
fp3.read()

'café'

In [59]:
fp4 = open('cafe.txt', 'rb')
fp4

<_io.BufferedReader name='cafe.txt'>

In [60]:
fp4.read()

b'caf\xc3\xa9'

In [61]:
fp.close()
fp2.close()
fp3.close()
fp4.close()

In [62]:
import sys, locale

expressions = """
locale.getpreferredencoding() type(my_file) my_file.encoding sys.stdout.isatty() sys.stdout.encoding sys.stdin.isatty() sys.stdin.encoding sys.stderr.isatty() sys.stderr.encoding sys.getdefaultencoding() sys.getfilesystemencoding()
"""

In [75]:
my_file = open('dummy', 'w')
for expression in expressions.split():
    value = eval(expression)
    print(expression.rjust(30), '->', repr(value))
my_file.close()

 locale.getpreferredencoding() -> 'cp936'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'cp936'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'cp936'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


In [76]:
from unicodedata import normalize
s1 = 'café'
s2 = 'cafe\u0301'
len(s1), len(s2)

(4, 5)

In [77]:
len(normalize('NFC', s1)), len(normalize('NFC', s2))

(4, 4)

In [78]:
len(normalize('NFD', s1)), len(normalize('NFD', s2))

(5, 5)

In [80]:
normalize('NFC', s1) == normalize('NFC', s2)

True

In [81]:
from unicodedata import name

ohm = '\u2126'
name(ohm)

'OHM SIGN'

In [82]:
ohm_c = normalize('NFC', ohm)
name(ohm_c)

'GREEK CAPITAL LETTER OMEGA'

In [83]:
ohm == ohm_c

False

In [84]:
normalize('NFC', ohm) == normalize('NFC', ohm_c)

True

In [87]:
half = '½'
normalize('NFKC', half)

'1⁄2'

In [89]:
four_squared = '4²'
normalize('NFKC', four_squared)

'42'

In [95]:
micro = 'μ'
micro_kc = normalize('NFKC', micro)
micro, micro_kc

('μ', 'μ')

In [96]:
ord(micro), ord(micro_kc)

(956, 956)

In [97]:
name(micro), name(micro_kc)

('GREEK SMALL LETTER MU', 'GREEK SMALL LETTER MU')

In [99]:
def nfc_equal(str1, str2):
    return normalize('NFC', str1) == normalize('NFC', str2)

def fold_equal(str1, str2):
    return (normalize('NFC', str1).casefold() == normalize('NFC', str2).casefold())
s1 = 'café'
s2 = 'cafe\u0301'
s1 == s2
nfc_equal(s1, s2)

True

In [100]:
nfc_equal('A', 'a')

False

In [104]:
import unicodedata
import string

def shave_marks(txt):
    norm_txt = unicodedata.normalize('NFD', txt)
    shave = ''.join(c for c in norm_txt if not unicodedata.combining(c))
    return unicodedata.normalize('NFC', shave)

order = '“Herr Voß: • ½ cup of OEtker™ caffè latte • bowl of açaí.”'
shave_marks(order)
Greek = 'Zέφupoς, Zéfiro'
shave_marks(Greek)

'Zεφupoς, Zefiro'

In [105]:
def shave_marks_latin(txt):
    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = Flase
    keepers = []
    for c in norm_txt:
        if unicodedata.combining(c) and latin_base:
            continue
        keepers.append(c)
        if not unicodedata.combinbing(c):
            latin_base = c in string.ascii_letters
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)



In [107]:
import locale
locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')

'pt_BR.UTF-8'

In [111]:
import unicodedata 
import re

re_digit = re.compile(r'\d')

sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'
for char in sample:
    print('U+%04x' % ord(char),
          char.center(6),
          're_dig' if re_digit.match(char) else '-',
          'isdig' if char.isdigit() else '-',
          'isnum' if char.isnumeric() else '-',
          format(unicodedata.numeric(char), '5.2f'),
          unicodedata.name(char),
          sep='\t')
    

U+0031	  1   	re_dig	isdig	isnum	 1.00	DIGIT ONE
U+00bc	  ¼   	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
U+00b2	  ²   	-	isdig	isnum	 2.00	SUPERSCRIPT TWO
U+0969	  ३   	re_dig	isdig	isnum	 3.00	DEVANAGARI DIGIT THREE
U+136b	  ፫   	-	isdig	isnum	 3.00	ETHIOPIC DIGIT THREE
U+216b	  Ⅻ   	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	isdig	isnum	 7.00	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX


In [114]:
re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef as 1729 = 1³ + 12³ = 9³ + 10³.")
text_bytes = text_str.encode('utf_8')
print('Text', repr(text_str), sep='\n ')
print('Numbers')
print(' str :', re_numbers_str.findall(text_str))
print(' bytes:', re_numbers_bytes.findall(text_str))
print('Words')
print(' str :', re_words_str.findall(text_str))
print(' bytes:', re_words_bytes.findall(text_bytes))


Text
 'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'
Numbers
 str : ['௧௭௨௯', '1729', '1', '12', '9', '10']


TypeError: cannot use a bytes pattern on a string-like object

In [115]:
os.listdir('.')

['.ipynb_checkpoints',
 '04_chapter.ipynb',
 'cafe.txt',
 'debug.log',
 'dummy',
 'first_chapter.ipynb',
 'sr.png',
 'three_chapter.ipynb',
 'two_chapter.ipynb',
 '流畅的python2015@www.java1234.com.pdf']

In [116]:
os.listdir(b'.')

[b'.ipynb_checkpoints',
 b'04_chapter.ipynb',
 b'cafe.txt',
 b'debug.log',
 b'dummy',
 b'first_chapter.ipynb',
 b'sr.png',
 b'three_chapter.ipynb',
 b'two_chapter.ipynb',
 b'\xe6\xb5\x81\xe7\x95\x85\xe7\x9a\x84python2015@www.java1234.com.pdf']