Support multibyte codecs using a byte buffer. Support \ucN. Support negative numbers in \u
brendonh · May 18, 2015 · 381a306 · 381a306
1 parent 800f92a
commit 381a306
Show file tree

Hide file tree

Showing 8 changed files with 489 additions and 17,674 deletions.
diff --git a/examples/reading/rtf15.py b/examples/reading/rtf15.py
@@ -1,12 +1,16 @@
+import sys
+import os.path
+
 from pyth.plugins.rtf15.reader import Rtf15Reader
 from pyth.plugins.xhtml.writer import XHTMLWriter
 
-import sys
 
 if len(sys.argv) > 1:
     filename = sys.argv[1]
 else:
-    filename = "sample.rtf"
+    filename = os.path.normpath(os.path.join(
+        os.path.dirname(__file__), 
+        '../../tests/rtfs/sample.rtf'))
 
 doc = Rtf15Reader.read(open(filename, "rb"))
 

diff --git a/examples/reading/sampleWithImage.rtf b/examples/reading/sampleWithImage.rtf
diff --git a/pyth/plugins/rtf15/reader.py b/pyth/plugins/rtf15/reader.py
@@ -13,7 +13,7 @@
 from pyth.encodings import symbol
 
 _CONTROLCHARS = set(string.ascii_letters + string.digits + "-*")
-_DIGITS = set(string.digits)
+_DIGITS = set(string.digits + "-")
 
 
 _CODEPAGES = {
@@ -56,17 +56,17 @@
     255: "cp850",  # OEM
 }
 
-# All the ones named by number in my 2.6 encodings dir
+# All the ones named by number in my 2.6 encodings dir, and those listed above
 _CODEPAGES_BY_NUMBER = dict(
-    (x, "cp%s" % x) for x in (37, 1006, 1026, 1140, 1250, 1251, 1252, 1253, 1254, 1255,
-                              1256, 1257, 1258, 424, 437, 500, 737, 775, 850, 852, 855,
-                              856, 857, 860, 861, 862, 863, 864, 865, 866, 869, 874,
-                              875, 932, 949, 950))
+    (x, "cp%s" % x) for x in (37, 424, 437, 500, 737, 775, 850, 852, 855, 856, 
+                              857, 860, 861, 862, 863, 864, 865, 866, 869, 874,
+                              875, 932, 936, 949, 950, 1006, 1026, 1140, 1250, 
+                              1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361))
 
 # Miscellaneous, incomplete
 _CODEPAGES_BY_NUMBER.update({
-   10000: "mac-roman",
-   10007: "mac-greek",
+    10000: "mac-roman",
+    10007: "mac-greek"
 })
 
 
@@ -123,6 +123,7 @@ def parse(self):
                 subGroup = Group(self, self.group, self.charsetTable)
                 self.stack.append(subGroup)
                 subGroup.skip = self.group.skip
+                self.group.flushChars()
                 self.group = subGroup
             elif next == '}':
                 subGroup = self.stack.pop()
@@ -380,9 +381,20 @@ def __init__(self, reader, parent=None, charsetTable=None):
         self.charsetTable = charsetTable
 
         self.content = []
+        self.charBuffer = []
+        self.skipCount = 0
+
+
+    def flushChars(self):
+        chars = "".join(self.charBuffer).decode(self.charset, self.reader.errors)
+        self.content.append(chars)
+        self.charBuffer = []
 
 
     def handle(self, control, digits):
+        if self.charBuffer and control != "ansi_escape":
+            self.flushChars()
+
         if control == '*':
             self.destination = True
             return
@@ -405,8 +417,11 @@ def handle(self, control, digits):
             handler()
 
 
-    def char(self, char):
-        self.content.append(char.decode(self.charset, self.reader.errors))
+    def char(self, byte):
+        if self.skipCount:
+            self.skipCount -= 1
+        else:
+            self.charBuffer.append(byte)
 
 
     def _finalize(self):
@@ -420,17 +435,7 @@ def _finalize(self):
         if self.skip:
             return
 
-        stuff = []
-        i = 0
-        while i < len(self.content):
-            thing = self.content[i]
-            if isinstance(thing, Skip):
-                i += thing.count
-            else:
-                stuff.append(thing)
-            i += 1
-
-        self.content = stuff
+        self.flushChars()
 
 
     # This is only the default,
@@ -529,7 +534,8 @@ def handle_ansi_escape(self, code):
         else:
             char = chr(code)
             if not self.isPcData:
-                char = char.decode(self.charset, self.reader.errors)
+                self.char(char)
+                return
 
         self.content.append(char)
 
@@ -543,15 +549,18 @@ def handle_control_symbol(self, symbol):
     def handle_u(self, codepoint):
         codepoint = int(codepoint)
         try:
-            char = unichr(codepoint)
+            char = unichr(codepoint % 2**16)
         except ValueError:
             if self.reader.errors == 'replace':
                 char = '?'
             else:
                 raise
 
         self.content.append(char)
-        self.content.append(Skip(self.props.get('unicode_skip', 1)))
+        self.skipCount = self.props.get('unicode_skip', 1)
+
+    def handle_uc(self, skipBytes):
+        self.props['unicode_skip'] = int(skipBytes)
 
 
     def handle_par(self):

diff --git a/tests/rtfs/ansi.rtf b/tests/rtfs/ansi.rtf
@@ -0,0 +1,19 @@
+{\rtf1\ansi\deff0\adeflang1025
+{\fonttbl{\f0\froman\fprq2\fcharset128 Times New Roman;}{\f1\froman\fprq2\fcharset128 Times New Roman;}{\f2\fswiss\fprq2\fcharset128 Arial;}{\f3\fnil\fprq2\fcharset128 Arial Unicode MS;}}
+{\colortbl;\red0\green0\blue0;\red128\green128\blue128;}
+{\stylesheet{\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9\snext1 Normal;}
+{\s2\sb240\sa120\keepn\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\afs28\lang1081\ltrch\dbch\langfe2052\hich\f2\fs28\lang9\loch\f2\fs28\lang9\sbasedon1\snext3 Heading;}
+{\s3\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9\sbasedon1\snext3 Body Text;}
+{\s4\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9\sbasedon3\snext4 List;}
+{\s5\sb120\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ai\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\i\loch\f0\fs24\lang9\i\sbasedon1\snext5 caption;}
+{\s6\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9\sbasedon1\snext6 Index;}
+}
+{\info{\author Boris Shemigon}{\creatim\yr2011\mo6\dy30\hr11\min11}{\revtim\yr0\mo0\dy0\hr0\min0}{\printim\yr0\mo0\dy0\hr0\min0}{\comment StarWriter}{\vern3300}}\deftab709
+{\*\pgdsctbl
+{\pgdsc0\pgdscuse195\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 Standard;}}
+\paperh16838\paperw11906\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
+\pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9 {\rtlch \ltrch\loch\f0\fs24\lang9\i0\b0 Apostrophe: `}
+\par \pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9 {\rtlch \ltrch\loch\f0\fs24\lang9\i0\b0 Quotation mark: ' \'81\'67}
+\par \pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9 
+\par \pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af3\afs24\lang1081\ltrch\dbch\af3\langfe2052\hich\f0\fs24\lang9\loch\f0\fs24\lang9 {\rtlch \ltrch\loch\f0\fs24\lang9\i0\b0 ` ~ ! @ # $ % ^ & * ( ) - _ = + [ \{ ] \} \\ | ; : ' \'81\'67 , < . > / ?}
+\par }