In [16]:
import sys
# Short alias for sys.getsizeof; later cells call size(...) to compare the
# sizes of str and unicode objects.
size = sys.getsizeof

In [90]:
# The u prefix creates a unicode literal (here an empty one).
s1 = u''

In [91]:
# Confirm that the u-prefixed literal is a unicode object.
type(s1)

unicode

In [93]:
# An unprefixed literal is a plain str in this interpreter.
type('')

str

In [96]:
# Concatenating a str with a unicode produces a unicode result.
'ed' + u' henderson\u1ff0'

u'ed henderson\u1ff0'

In [92]:
# The b prefix makes no difference here: the literal is still a str.
type(b'')

str

In [5]:
# basestring is itself a class, so its type is `type`.
type(basestring)

type

In [8]:
# Direct base classes of the unicode type.
u''.__class__.__bases__

(basestring,)

In [14]:
# Direct base classes of str: the same single base as unicode.
str().__class__.__bases__

(basestring,)

In [19]:
# basestring is already a class, so ask it for its bases directly.
# (basestring.__class__ is `type`, so going through __class__ would report
# the bases of `type` rather than those of basestring.)
basestring.__bases__

(object,)

In [100]:
# Compare sys.getsizeof (via the `size` alias) for a 3- and a 4-character
# unicode object.
print("3 chars {} 4 chars{}".format(size(u'abc'), size(u'abcd')))

3 chars 56 4 chars58


In [27]:
# Same comparison for plain str objects of 3 and 4 characters.
print("3 chars {} 4 chars{}".format(size('abc'), size('abcd')))

3 chars 40 4 chars41


The `unicode` function converts its first argument to unicode. The default encoding is 'ascii', which cannot handle characters with codes > 127.

In [28]:
# Pure-ASCII bytes convert cleanly with the default codec.
unicode('abcdef')

u'abcdef'

In [101]:
# chr(255) is not ASCII; errors='ignore' drops the undecodable byte
# instead of raising.
unicode('abcdef'+chr(255), errors='ignore')

u'abcdef'

In [111]:
u = unichr(40960) + u'abc' + unichr(1972) + u'\u27f0'
print u
len(u), len(u.encode('utf-8'))

ꀀabc޴⟰


(6, 11)

In [106]:
# The UTF-8 encoding of u is a str of raw bytes.
u.encode('utf-8')

'\xea\x80\x80abc\xde\xb4\xe2\x9f\xb0'

In [107]:
# Expected to fail: u contains characters outside the ASCII range and no
# error handler is given.
u.encode('ascii')

UnicodeEncodeError: 'ascii' codec can't encode character u'\ua000' in position 0: ordinal not in range(128)

In [108]:
# 'ignore' silently drops every character that ASCII cannot represent.
u.encode('ascii', 'ignore')

'abc'

In [109]:
# 'xmlcharrefreplace' substitutes &#NNNN; references for characters that
# ASCII cannot represent.
u.encode('ascii', 'xmlcharrefreplace')

'&#40960;abc&#1972;&#10224;'

Let us take the unicode string, and encode it into utf-8

In [110]:
# Keep the UTF-8 encoded bytes so they can be compared with the original.
utf8_version = u.encode('utf-8')

Now, let's check the type of each of the strings.

In [46]:
# Show both types side by side, plus the raw encoded bytes.
type(u), type(utf8_version), utf8_version

(unicode, str, '\xea\x80\x80abc\xde\xb4')

Note that the encoded version is a str, not a unicode. This is a string of bytes, not a string of unicode characters. Let's look at the length of these strings.

In [50]:
# Characters in the unicode string vs. bytes in its UTF-8 encoding.
len(u), len(utf8_version)

(5, 8)

That should be interesting. There were 5 characters in the unicode string: 2 non-ascii chars and 3 ascii chars. When encoded, there are 8 bytes. We still have the 3 ascii chars, but the other two chars take up 5 bytes. One is encoded into 2 bytes, the other into 3 bytes. This should give you an idea why unicode isn't compatible with the str format. Sometimes they can be equivalent, though, if there are no characters with a code point > 127. (Note: the outputs above were produced with an earlier, five-character version of `u`. With the six-character `u` defined earlier in this notebook, which also ends in `\u27f0`, the lengths are 6 and 11.)

Next, let's decode the utf-8 version and compare it to the original. Of course, we expect them to be the same.

In [51]:
# Round trip: decoding the UTF-8 bytes should give back the original string.
u == utf8_version.decode('utf-8')

True

For testing, we might need to create unicode strings. It's quite easy. There are multiple ways to express a character in a string literal. You can use hex, octal, or two different types of unicode escape.

\xXX is a hex escape. \x followed by 2 hex characters can be used to represent any character in the 0-255 range.
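\OOO is an octal escape: a backslash followed by up to 3 octal digits (there is no letter 'o'). You can use that to represent characters from 0 to 0777 octal, which is 0x1ff hex or 511 decimal. Octal is hard to think in.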
\uXXXX is unicode and can be followed by 4 hex digits. So, you can represent any 2-byte unicode character.
\UXXXXXXXX is followed by 8 hex digits, and can represent a 4 byte character. 

4 bytes is all that is needed to represent any character in any language.


In [84]:
# Five Greek letters (see the printed output), written as 4-digit \u escapes.
vowels = u'\u1f00\u1f10\u1f30\u1f40\u1f50'
# Three mathematical symbols separated by spaces.
# NOTE(review): the name `math` would shadow the stdlib math module if that
# module were ever imported in this notebook.
math = u'\u2234 \u2260 \u2297'

In [85]:
# Print both strings to check that the escapes render as real characters.
print vowels
print math

ἀἐἰὀὐ
∴ ≠ ⊗


There we have stored the 'greek' representation of vowels as a string, sort of like 'aeiou'.
Want to find some more unicode characters? Here is a chart http://unicode.org/charts/

The following are some examples of code points above U+FFFF, which need the 8-digit \U escape (and take 4 bytes in UTF-8). I had to try a few to find ones that would print; these worked on my machine, but YMMV.

In [75]:
# These code points are above U+FFFF, so they need the 8-digit \U escape.
morebytes = u'\U0001f201 \U0001f250 \U0001f202'
print morebytes

🈁 🉐 🈂


You can encode all characters with these escape sequences, but that's not the best way. If you want to paste code and text that includes unicode code points, you need to make sure python will interpret the characters in the code file appropriately, that's why we use this:

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
```

We place that in our code file, and then we can use utf8 characters in our program file.

The Unicode specification includes a database of information about code points. For each code point that’s defined, the information includes the character’s name, its category, the numeric value if applicable (Unicode has characters representing the Roman numerals and fractions such as one-third and four-fifths). There are also properties related to the code point’s use in bidirectional text and other display-related properties.

The following program displays some information about several characters, and prints the numeric value of one particular character:

In [87]:
import unicodedata

u = unichr(233) + unichr(0x0bf2) + unichr(3972) + unichr(6000) + unichr(13231)

for i, c in enumerate(u):
    print i, '%04x' % ord(c), unicodedata.category(c),
    print unicodedata.name(c)

# Get numeric value of second character
print unicodedata.numeric(u[1])

0 00e9 Ll LATIN SMALL LETTER E WITH ACUTE
1 0bf2 No TAMIL NUMBER ONE THOUSAND
2 0f84 Mn TIBETAN MARK HALANTA
3 1770 Lo TAGBANWA LETTER SA
4 33af So SQUARE RAD OVER S SQUARED
1000.0


The category codes are abbreviations describing the nature of the character. These are grouped into categories such as “Letter”, “Number”, “Punctuation”, or “Symbol”, which in turn are broken up into subcategories. To take the codes from the above output, 'Ll' means ‘Letter, lowercase’, 'No' means “Number, other”, 'Mn' is “Mark, nonspacing”, and 'So' is “Symbol, other”. See <http://www.unicode.org/reports/tr44/#General_Category_Values> for a list of category codes.

# Reading and Writing Unicode Data

This is where we usually get into trouble: the normal file open function reads data as raw 8-bit bytes (a str) and does no decoding. If the file is actually in utf-8 format, then the bytes must be decoded. In many cases this isn't an issue, since a lot of strings are just ascii, but as soon as we get a non-ascii character in there, we are in trouble.


In [88]:
# Encoding used to convert unicode file names for the operating system.
sys.getfilesystemencoding()

'utf-8'

In [112]:
import os
# With a str path, os.listdir returns the names as str objects.
print os.listdir('.')


['.DS_Store', '.git', '.gitignore', '.gitignore~', '.idea', '.ipynb_checkpoints', 'BSU Scripts', 'configs', 'dstTools.iml', 'environments', 'gb', 'gitstats', 'hombin', 'jgraph', 'JiraAPI.ipynb', 'leadreports', 'liquidplanner_API.pdf', 'nodeinstall.sh', 'partsim', 'partsim_deploy', 'pngcrush-1.7.17', 'pngcrush-1.7.17.zip', 'projectFilesBackup', 'requirements_deffiles.txt', 'schematics_deploy', 'scripts', 'shell_functions.sh', 'shell_functions.sh~', 'symview', 'test', 'tmp', 'Unicode.ipynb', 'untitled folder', 'upgrade_pip.sh', 'www']


In [113]:
# With a unicode path, os.listdir returns the names as unicode objects.
print os.listdir(u'.')

[u'.DS_Store', u'.git', u'.gitignore', u'.gitignore~', u'.idea', u'.ipynb_checkpoints', u'BSU Scripts', u'configs', u'dstTools.iml', u'environments', u'gb', u'gitstats', u'hombin', u'jgraph', u'JiraAPI.ipynb', u'leadreports', u'liquidplanner_API.pdf', u'nodeinstall.sh', u'partsim', u'partsim_deploy', u'pngcrush-1.7.17', u'pngcrush-1.7.17.zip', u'projectFilesBackup', u'requirements_deffiles.txt', u'schematics_deploy', u'scripts', u'shell_functions.sh', u'shell_functions.sh~', u'symview', u'test', u'tmp', u'Unicode.ipynb', u'untitled folder', u'upgrade_pip.sh', u'www']


In [117]:
import codecs
import sys
UTF8Writer = codecs.getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
print 'café'

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 3: ordinal not in range(128)

In [119]:
import locale
# Encoding the user's locale settings prefer for text data.
locale.getpreferredencoding()

'UTF-8'

In [125]:
# Encoding to UTF-8 turns the single é character into the two bytes \xc3\xa9.
u'café'.encode('utf-8')

'caf\xc3\xa9'

In [127]:
# The text is encoded to UTF-8 explicitly and the resulting bytes are written.
# NOTE(review): the hard-coded /tmp path is not portable.
with open('/tmp/testfile.txt', 'w') as fp:
    fp.write(u'café'.encode('utf-8'))
    

In [5]:
import codecs
# NOTE(review): no encoding is passed to codecs.open, so nothing is decoded
# and each line comes back as a plain str (see the printed type). Passing
# encoding='utf-8' is presumably what was intended if unicode lines are
# wanted; confirm before changing.
with codecs.open(u'/tmp/testfile.txt', 'r') as fp:
    for line in fp:
        print type(line)
        print(line)

<type 'str'>
café


In [6]:
import locale
# NOTE(review): kitchen is a third-party package and is not installed in this
# environment (the cell fails with ImportError). to_bytes, to_unicode and
# get_translation_object are imported but not used in this cell.
from kitchen.text.converters import getwriter, to_bytes, to_unicode
from kitchen.i18n import get_translation_object

# Replace stdout with a writer for the locale's preferred encoding.
# NOTE(review): sys is not imported in this cell, and the original stdout is
# never restored.
encoding = locale.getpreferredencoding()
Writer = getwriter(encoding)
sys.stdout = Writer(sys.stdout)

ImportError: No module named kitchen.text.converters

In [3]:
# \xf1 is the code point for ñ (see the printed output).
s = u'La Pe\xf1a'
# Printing the unicode object directly lets Python pick the output encoding.
print s
# Latin-1 bytes: displayed with a replacement character in the output below,
# presumably because the display expects UTF-8.
print s.encode('latin-1')
# UTF-8 bytes: displayed correctly in the output below.
print s.encode('utf-8')

La Peña
La Pe�a
La Peña
