Skip to content

Commit

Permalink
Support multibyte codecs using a byte buffer. Support \ucN. Support negative numbers in \u
Browse files Browse the repository at this point in the history
  • Loading branch information
brendonh committed May 18, 2015
1 parent 800f92a commit 381a306
Show file tree
Hide file tree
Showing 8 changed files with 489 additions and 17,674 deletions.
8 changes: 6 additions & 2 deletions examples/reading/rtf15.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import sys
import os.path

from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter

import sys

if len(sys.argv) > 1:
filename = sys.argv[1]
else:
filename = "sample.rtf"
filename = os.path.normpath(os.path.join(
os.path.dirname(__file__),
'../../tests/rtfs/sample.rtf'))

doc = Rtf15Reader.read(open(filename, "rb"))

Expand Down
17,648 changes: 0 additions & 17,648 deletions examples/reading/sampleWithImage.rtf

This file was deleted.

57 changes: 33 additions & 24 deletions pyth/plugins/rtf15/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from pyth.encodings import symbol

_CONTROLCHARS = set(string.ascii_letters + string.digits + "-*")
_DIGITS = set(string.digits)
_DIGITS = set(string.digits + "-")


_CODEPAGES = {
Expand Down Expand Up @@ -56,17 +56,17 @@
255: "cp850", # OEM
}

# All the ones named by number in my 2.6 encodings dir
# All the ones named by number in my 2.6 encodings dir, and those listed above
_CODEPAGES_BY_NUMBER = dict(
(x, "cp%s" % x) for x in (37, 1006, 1026, 1140, 1250, 1251, 1252, 1253, 1254, 1255,
1256, 1257, 1258, 424, 437, 500, 737, 775, 850, 852, 855,
856, 857, 860, 861, 862, 863, 864, 865, 866, 869, 874,
875, 932, 949, 950))
(x, "cp%s" % x) for x in (37, 424, 437, 500, 737, 775, 850, 852, 855, 856,
857, 860, 861, 862, 863, 864, 865, 866, 869, 874,
875, 932, 936, 949, 950, 1006, 1026, 1140, 1250,
1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361))

# Miscellaneous, incomplete
_CODEPAGES_BY_NUMBER.update({
10000: "mac-roman",
10007: "mac-greek",
10000: "mac-roman",
10007: "mac-greek"
})


Expand Down Expand Up @@ -123,6 +123,7 @@ def parse(self):
subGroup = Group(self, self.group, self.charsetTable)
self.stack.append(subGroup)
subGroup.skip = self.group.skip
self.group.flushChars()
self.group = subGroup
elif next == '}':
subGroup = self.stack.pop()
Expand Down Expand Up @@ -380,9 +381,20 @@ def __init__(self, reader, parent=None, charsetTable=None):
self.charsetTable = charsetTable

self.content = []
self.charBuffer = []
self.skipCount = 0


# Decode the bytes collected in self.charBuffer and move the result into
# self.content.
#
# The buffered pieces are joined first and decoded in a single call, using
# self.charset and the reader's error mode (self.reader.errors), so the bytes
# of a multibyte sequence held in the buffer are decoded together rather than
# one at a time.
# NOTE(review): relies on Python 2 str.decode -- confirm before porting.
# NOTE(review): there is no empty-buffer guard here, so a call with nothing
# buffered still appends an (empty) decoded string to self.content.
def flushChars(self):
chars = "".join(self.charBuffer).decode(self.charset, self.reader.errors)
self.content.append(chars)
# Start a fresh buffer for whatever bytes arrive next.
self.charBuffer = []


def handle(self, control, digits):
if self.charBuffer and control != "ansi_escape":
self.flushChars()

if control == '*':
self.destination = True
return
Expand All @@ -405,8 +417,11 @@ def handle(self, control, digits):
handler()


def char(self, char):
self.content.append(char.decode(self.charset, self.reader.errors))
def char(self, byte):
if self.skipCount:
self.skipCount -= 1
else:
self.charBuffer.append(byte)


def _finalize(self):
Expand All @@ -420,17 +435,7 @@ def _finalize(self):
if self.skip:
return

stuff = []
i = 0
while i < len(self.content):
thing = self.content[i]
if isinstance(thing, Skip):
i += thing.count
else:
stuff.append(thing)
i += 1

self.content = stuff
self.flushChars()


# This is only the default,
Expand Down Expand Up @@ -529,7 +534,8 @@ def handle_ansi_escape(self, code):
else:
char = chr(code)
if not self.isPcData:
char = char.decode(self.charset, self.reader.errors)
self.char(char)
return

self.content.append(char)

Expand All @@ -543,15 +549,18 @@ def handle_control_symbol(self, symbol):
def handle_u(self, codepoint):
codepoint = int(codepoint)
try:
char = unichr(codepoint)
char = unichr(codepoint % 2**16)
except ValueError:
if self.reader.errors == 'replace':
char = '?'
else:
raise

self.content.append(char)
self.content.append(Skip(self.props.get('unicode_skip', 1)))
self.skipCount = self.props.get('unicode_skip', 1)

# Record the skip count carried by a "uc" control (presumably the RTF \ucN
# control word -- the dispatch that calls this is not visible here).
#
# skipBytes is converted with int() and stored under props['unicode_skip'].
# handle_u reads that key (defaulting to 1) to set self.skipCount, and char()
# drops one incoming byte per remaining count.
def handle_uc(self, skipBytes):
self.props['unicode_skip'] = int(skipBytes)


def handle_par(self):
Expand Down
19 changes: 19 additions & 0 deletions tests/rtfs/ansi.rtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{\rtf1\ansi\deff0\adeflang1025
{\fonttbl{\f0\froman\fprq2\fcharset128 Times New Roman;}{\f1\froman\fprq2\fcharset128 Times New Roman;}{\f2\fswiss\fprq2\fcharset128 Arial;}{\f3\fnil\fprq2\fcharset128 Arial Unicode MS;}}
{\colortbl;\red0\green0\blue0;\red128\green128\blue128;}
{\stylesheet{\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9\snext1 Normal;}
{\s2\sb240\sa120\keepn\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\afs28\lang1081\ltrch\dbch\langfe2052\hich\f2\fs28\lang9\loch\f2\fs28\lang9\sbasedon1\snext3 Heading;}
{\s3\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9\sbasedon1\snext3 Body Text;}
{\s4\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9\sbasedon3\snext4 List;}
{\s5\sb120\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ai\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\i\loch\f0\fs24\lang9\i\sbasedon1\snext5 caption;}
{\s6\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9\sbasedon1\snext6 Index;}
}
{\info{\author Boris Shemigon}{\creatim\yr2011\mo6\dy30\hr11\min11}{\revtim\yr0\mo0\dy0\hr0\min0}{\printim\yr0\mo0\dy0\hr0\min0}{\comment StarWriter}{\vern3300}}\deftab709
{\*\pgdsctbl
{\pgdsc0\pgdscuse195\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 Standard;}}
\paperh16838\paperw11906\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
\pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9 {\rtlch \ltrch\loch\f0\fs24\lang9\i0\b0 Apostrophe: `}
\par \pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9 {\rtlch \ltrch\loch\f0\fs24\lang9\i0\b0 Quotation mark: ' \'81\'67}
\par \pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9
\par \pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9 {\rtlch \ltrch\loch\f0\fs24\lang9\i0\b0 ` ~ ! @ # $ % ^ & * ( ) - _ = + [ \{ ] \} \\ | ; : ' \'81\'67 , < . > / ?}
\par }
Loading

0 comments on commit 381a306

Please sign in to comment.