Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

* pdf2line.

  • Loading branch information...
commit b72c232e328d64d78328536f1ac64ce051d77584 1 parent 02c5e54
@audreyt authored
Showing with 4,238 additions and 114 deletions.
  1. +4,103 −0 bin/TextOutputDev.cc
  2. +14 −113 bin/afp-page.hs
  3. +11 −1 bin/afp2line.hs
  4. +110 −0 bin/pdf2line.hs
View
4,103 bin/TextOutputDev.cc
4,103 additions, 0 deletions not shown
View
127 bin/afp-page.hs
@@ -2,9 +2,8 @@
module Main where
import OpenAFP
-import Data.ByteString as S (findIndex, take, drop)
-import Data.ByteString.Internal (fromForeignPtr)
import System.Exit
+import qualified Data.ByteString.Char8 as C
main :: IO ()
main = do
@@ -21,14 +20,13 @@ main = do
, _T_RLI ... io . writeIORef cnt . fromEnum . t_rli
])
])
- , _MCF1 === mcf1Handler
- , _PTX === (`filterChunks`
- [ _PTX_TRN === trnHandler
- , _PTX_SCFL ... io . writeIORef currentFont
- ])
]
writeAFP outputFile cs'
+{-# NOINLINE cnt #-}
+cnt :: IORef Int
+cnt = unsafePerformIO (newIORef 0)
+
cs !==>! list = iter cs
where
iter [] = return []
@@ -37,113 +35,16 @@ cs !==>! list = iter cs
rest <- unsafeInterleaveIO (iter cs)
return $ this ++ rest
-data TrnMode = Nil | SBCS | DBCS deriving (Show, Eq, Enum)
-
--- We now have a trn. We need to break it into DBCS and SBCS parts;
--- if we're in a SBCS part, any thing in 0x41-0x7F goes to DBCS.
--- if we're in a DBCS part, 0x40 needs one-char lookahead, and everything else goes to SBCS.
-trnHandler trn = scanTrn 0 0 Nil
- where
- bs@(PS pstr off len) = packBuf $ ptx_trn trn
- pstr' = castForeignPtr pstr
- cstr = unsafeForeignPtrToPtr pstr' `plusPtr` off
- scanTrn i prev mode
- | i == len = case mode of
- Nil -> push trn
- SBCS -> emit
- DBCS -> emit >> (io $ readIORef currentFont) >>= push
- | otherwise = io (peekElemOff cstr i :: IO Word8) >>= \ch -> case mode of
- Nil -> {-# SCC "Nil" #-} case isSBCS ch of
- DBCS -> case i `mod` 2 of
- 0 -> scanTrn (i+2) 0 DBCS
- _ -> emit >> scanTrn (i+2) i DBCS
- SBCS -> scanTrn (i+1) 0 (if ch == 0x40 then Nil else SBCS) -- len 0 Nil -- Begun with SBCS -- Skip to end
- Nil -> let i' = i+1 in if i' == len then scanTrn i' 0 SBCS else do
- ch' <- io (peekElemOff cstr i' :: IO Word8)
- if isAfterSBCS ch' || ch' == ch then scanTrn i' 0 SBCS else scanTrn (i+2) 0 DBCS
- SBCS -> {-# SCC "SBCS" #-} do
- let emitDBCS = emit >> scanTrn (i+2) i DBCS
- emitSBCS = scanTrn (i+1) prev SBCS
- case isSBCS ch of
- DBCS -> emitDBCS
- SBCS -> emitSBCS
- Nil -> let i' = i+1 in if i' == len then emitSBCS else do
- ch' <- io (peekElemOff cstr i' :: IO Word8)
- if isAfterSBCS ch' || ch' == ch then emitSBCS else emitDBCS
- DBCS -> {-# SCC "DBCS" #-} ($ isDBCS ch) . fix $ \redo mch -> case mch of
- Nil -> let i' = i+1 in if i' == len then redo SBCS else do
- ch' <- io (peekElemOff cstr i' :: IO Word8)
- redo (case ch' :: Word8 of
- 0x40 -> DBCS
- _ -> SBCS)
- DBCS -> scanTrn (i+2) prev DBCS
- _ -> emit >> scanTrn (i+1) i SBCS
- where
- emit = {-# SCC "emit" #-} do
- scfl <- io $ readIORef currentFont
- case mode of
- SBCS -> push scfl -- { ptx_scfl = 2 }
- DBCS | prev > 0 || i /= len || isJust (S.findIndex isUnambiguous bs) -> do
- c <- io $ readIORef cnt
- push scfl{ ptx_scfl = if c >= 35 then 36 else 1 }
- _ -> return ()
- push curTRN
- curTRN = {-# SCC "curTRN" #-} trn{ ptx_trn = mkBuf (S.take (i-prev) (S.drop prev bs)) }
-
-{-# INLINE isDBCS #-}
-isDBCS 0x40 = Nil
-isDBCS ch = if (ch >= 0x41 && ch <= 0x7F) then DBCS else SBCS
-
-{-# INLINE isSBCS #-}
-isSBCS ch = if ch < 0x41 || ch > 0x7F then SBCS else case ch of
- 0x4B -> Nil -- "."
- 0x5C -> Nil -- "*"
- 0x60 -> Nil -- "-"
- 0x61 -> Nil
- 0x6B -> Nil
- 0x6D -> Nil -- "_"
- 0x7A -> Nil -- ":"
- 0x7E -> Nil -- "="
- _ -> DBCS
-
-{-# INLINE isAfterSBCS #-}
-isAfterSBCS ch' = (
- (ch' >= 0x81) && (ch' <= 0xF9) &&
- ((ch' <= 0x89 || ch' >= 0xF0) ||
- (ch' >= 0x91 && ch' <= 0x99) ||
- (ch' >= 0xA2 && ch' <= 0xA9) ||
- (ch' >= 0xC1 && ch' <= 0xC9) ||
- (ch' >= 0xD1 && ch' <= 0xD9) ||
- (ch' >= 0xE2 && ch' <= 0xE9)) )
- || (ch' == 0x6D) -- "_"
- || (ch' == 0x40) -- " "
-
-{-# INLINE isUnambiguous #-}
-isUnambiguous 0x40 = False
-isUnambiguous ch = case isSBCS ch of
- Nil -> False
- _ -> True
-
-mcf1Handler mcf1 = do
- push $ writeData mcf1 (map fixMCF1 $ readData mcf1)
- where
- fixMCF1 :: Record MCF1_Data -> Record MCF1_Data
- fixMCF1 (Record mcf@MCF1_Data{ mcf1_CodedFontLocalId = 31 }) = Record mcf{ mcf1_CodedFontName = toA8 "T0XXXX" }
- fixMCF1 x = x
-
-{-# NOINLINE currentFont #-}
-currentFont :: IORef PTX_SCFL
-currentFont = unsafePerformIO (newIORef _PTX_SCFL)
-
-{-# NOINLINE cnt #-}
-cnt :: IORef Int
-cnt = unsafePerformIO (newIORef 0)
-
fqnHandler fqn = do
c <- io $ readIORef cnt
let fqn' = case c of
- 0 -> fqn{ t_fqn = toAStr "T0XXXX" }
- 35 -> fqn{ t_fqn = toAStr "T0XXXX" }
- _ -> fqn
+ 0 -> fqnDBCS
+ 35 -> fqnDBCS
+ _ | isM__T -> fqnDBCS
+ _ -> fqn
push fqn'
- return ()
+ where
+ a = packAStr (t_fqn fqn)
+ isM__T = (C.length a >= 6) && ((C.index a 2 == 'M') || (C.index a 5 == 'T'))
+ fqnDBCS = fqn{ t_fqn = toAStr "T0XXXX " }
+
View
12 bin/afp2line.hs
@@ -1,16 +1,26 @@
-{-# OPTIONS -O2 -fglasgow-exts #-}
+{-# OPTIONS_GHC -O2 -fglasgow-exts #-}
module Main where
import OpenAFP
import CP835
+import System.Posix.Resource
import qualified Data.IntMap as IM
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.ByteString.Internal as S
import qualified Data.ByteString.Char8 as C
+__1GB__ :: Integer
+__1GB__ = 1024 * 1024 * 1024
+
+mkLimit :: Integer -> ResourceLimits
+mkLimit x = ResourceLimits (ResourceLimit x) (ResourceLimit x)
+
main :: IO ()
main = do
+ setResourceLimit ResourceTotalMemory (mkLimit __1GB__)
+ setResourceLimit ResourceCoreFileSize (mkLimit 0)
+
hSetBinaryMode stdout True
args <- getArgs
when (null args) $ do
View
110 bin/pdf2line.hs
@@ -0,0 +1,110 @@
+{-# OPTIONS_GHC -O2 -fglasgow-exts #-}
+
+module Main where
+import Data.Binary
+import Data.Binary.Get
+import Control.Monad
+import Data.IORef
+import System.IO
+import System.IO.Unsafe
+import System.Environment (getArgs)
+import Data.ByteString (ByteString)
+import qualified Data.IntMap as IM
+import qualified Data.ByteString as S
+import qualified Data.ByteString.Lazy as L
+import qualified Data.ByteString.Char8 as C
+
+main = do
+ hSetBinaryMode stdin True
+ hSetBinaryMode stdout True
+ res <- L.getContents
+ mapM_ dumpPage (fromDoc $ decode res)
+
+dumpPage page
+ | IM.null pg = return ()
+ | otherwise = do
+ forM_ (IM.elems pg) $ \line -> do
+ _CurrentColumn <- newIORef 0
+ forM_ (IM.toAscList line) $ \(col, str) -> do
+ cur <- readIORef _CurrentColumn
+ S.putStr $ S.take (col - cur) _Spaces
+ S.putStr str
+ writeIORef _CurrentColumn (col + S.length str)
+ S.putStr _NewLine
+ S.putStr _NewPage
+ where
+ pg = fromPage page
+
+_Spaces, _NewLine, _NewPage :: ByteString
+_Spaces = S.replicate 4096 0x20
+_NewLine = C.pack "\r\n"
+_NewPage = C.pack "\r\n\x0C\r\n"
+
+-- A Page is a IntMap from line-number to a map from column-number to bytestring.
+newtype Doc = MkDoc { fromDoc :: [Page] } deriving Show
+newtype Page = MkPage { fromPage :: IM.IntMap Line } deriving Show
+type Line = IM.IntMap S.ByteString
+
+instance Binary Doc where
+ put = undefined
+ get = liftM MkDoc getList
+ where
+ getList = do
+ rv <- isEmpty
+ if rv then return [] else do
+ x <- get
+ xs <- getList
+ return (x:xs)
+
+data Chunk = MkChunk
+ { c_right :: !Int
+ , c_upper :: !Int
+ , c_str :: !ByteString
+ }
+
+instance Binary Page where
+ put = undefined
+ get = getChunk 0 []
+ where
+ getChunk minFont chunks = do
+ rv <- isEmpty
+ if rv then done else do
+ w8 <- getWord8
+ case w8 of
+ 0x6C -> do -- 'l'
+ skip 10
+ col' <- getInt 6
+ skip 21
+ ln <- getInt 6
+ skip 3
+ font <- getInt 6
+ skip 7
+ sz <- getInt 4
+ skip 1
+ str <- getByteString sz
+ w8' <- getWord8
+ case w8' of
+ 0x0D -> skip 1
+ 0x0A -> return ()
+ _ -> fail $ "Bad parse: " ++ show w8'
+ let font' = if minFont == 0 then font else min minFont (font `div` 2)
+ getChunk font' (MkChunk col' ln str:chunks)
+ 0x0D -> skip 1 >> done
+ 0x0A -> done
+ _ -> fail $ "Bad parse: " ++ show w8
+ where
+ done = return $ foldl (buildPage minFont) (MkPage IM.empty) chunks
+ getInt 0 = return 0
+ getInt n = do
+ digit <- getWord8
+ rest <- getInt (n-1)
+ return $ (fromEnum $ digit - 0x30) * (10 ^ (n-1)) + rest
+ buildPage minFont (MkPage pg) (MkChunk col' ln' str) = MkPage $ IM.insert ln entry pg
+ where
+ sz = S.length str
+ width = (col' `div` minFont) - sz
+ ln = ln' `div` minFont
+ entry = case IM.lookup ln pg of
+ Nothing -> IM.singleton width str
+ Just im -> IM.insert width str im
+
Please sign in to comment.
Something went wrong with that request. Please try again.