Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
tree: f13f763570
Fetching contributors…

Cannot retrieve contributors at this time

file 123 lines (103 sloc) 4.081 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
module Main where
import OpenAFP
import System.Exit
import System.FilePath
import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString.Lazy as L

-- Algorithm:
-- Gather everything up to first BPG
-- write out each BPG/EPG chunks
-- append ENG+EDT

data PageSize = PSmall | PLarge deriving (Show)

type MaybeHandleRef = IORef (Maybe Handle)

initFh :: [AFP_] -> (MaybeHandleRef, FilePath) -> IO Handle
initFh chunks (ref, fn) = do
    rv <- readIORef ref
    case rv of
        Just fh -> return fh
        _ -> do
            fh <- openBinaryFile fn WriteMode
            L.hPut fh $ encodeList chunks
            writeIORef ref (Just fh)
            return fh

finalizeFh :: [AFP_] -> MaybeHandleRef -> IO ()
finalizeFh chunks ref = do
    rv <- readIORef ref
    case rv of
        Just fh -> do
            L.hPut fh $ encodeList chunks
            hClose fh
        _ -> return ()

main :: IO ()
main = do
    args <- getArgs

    let (inFile, maxSmallPages) = case args of
            [] -> error "Usage: afp-split-tcb file.afp [max-small-pages (defaults to 3)]"
            [x] -> (x, 3)
            (x:y:_) -> (x, read y)
        (dir, fn) = splitFileName inFile

    let ?maxSmallPages = maxSmallPages

    cs <- readAFP inFile

    let (preamble:rest) = splitPages cs
        _edt = encodeChunk $ Record _EDT

    smallOpened <- newIORef Nothing
    largeOpened <- newIORef Nothing

    forM_ rest $ \page -> do
        fh <- initFh preamble $ case pageSizeOf page of
                PSmall -> (smallOpened, dir `combine` "small_" ++ fn)
                _ -> (largeOpened, dir `combine` "large_" ++ fn)
        L.hPut fh $ encodeList page

    finalizeFh [_edt] smallOpened
    finalizeFh [_edt] largeOpened

isBeginPage :: AFP_ -> Bool
isBeginPage t = (t ~~ _BPG) || (t ~~ _BNG)

-- Find the non-zero AMB with lowest number
pageSizeOf :: (?maxSmallPages :: Int) => [AFP_] -> PageSize
pageSizeOf [] = PSmall
pageSizeOf cs = case sortBy compareAMB rows of
    [] -> PSmall -- A page with no text?
    (row:_) -> case sortBy compareTRN [packNStr $ ptx_trn (decodeChunk c) | c <- row, c ~~ _PTX_TRN] of
        [] -> PSmall -- A column with no TRN?
        (col:_) -> case [ S.map fromDigitLike x | x <- S.splitWith (not . isDigitLike) col, not (S.null x) ] of
            [] -> PSmall -- No digits :-/
            xs -> case C.readInt (last xs) of
                Nothing -> trace (shows xs ": Non-parsable") PSmall -- A non-numeric token?
                Just (i, _) | i <= ?maxSmallPages -> trace (shows xs ": Small") PSmall
                            | otherwise -> trace (shows xs ": Large") PLarge
    where
    rows = case splitRecords _PTX_AMB $ concat [ptx_Chunks $ decodeChunk c | c <- cs, c ~~ _PTX ] of
        [] -> []
        (_:xs) -> xs
    compareTRN x y = compare (C.length y) (C.length x)
    compareAMB (x:_) (y:_) = compare (ambOf x) (ambOf y)
    compareAMB _ _ = EQ
    ambOf c = case ptx_amb $ decodeChunk c of
        0 -> maxBound -- We don't really care about rows with AMB 0.
        x -> x

fromDigitLike :: Word8 -> Word8
fromDigitLike n
    | n >= 0xF0 = n - 0xC0
    | otherwise = n

isDigitLike :: Word8 -> Bool
isDigitLike n = (n >= 0x30 && n <= 0x39) || (n >= 0xF0 && n <= 0xF9)

-- | Selects words corresponding to white-space characters in the Latin-1 range
-- ordered by frequency.
isSpaceWord8' :: Word8 -> Bool
isSpaceWord8' w =
    w == 0x20 ||
    w == 0x00 || -- This Case Is Specific To Us
    w == 0x0A || -- LF, \n
    w == 0x09 || -- HT, \t
    w == 0x0C || -- FF, \f
    w == 0x0D || -- CR, \r
    w == 0x0B || -- VT, \v
    w == 0xA0 -- spotted by QC..
{-# INLINE isSpaceWord8' #-}


splitPages :: [AFP_] -> [[AFP_]]
splitPages cs = if null rest then [this] else case splitPages rest' of
    [] -> [this, rest]
    (y:ys) -> (this:(begins ++ y):ys)
    where
    (this, rest) = break isBeginPage cs
    (begins, rest') = span isBeginPage rest
Something went wrong with that request. Please try again.