Skip to content
Browse files

Implement and document the library

  • Loading branch information...
1 parent 1e8afec commit efc87b4b154ffd4c2b886896d9528e1711f9118b @batterseapower committed Dec 23, 2010
Showing with 137 additions and 14 deletions.
  1. +8 −0 .gitignore
  2. +80 −4 Codec/Text/Detect.hs
  3. +26 −7 charsetdetect.cabal
  4. +1 −0 example/ASCII
  5. +18 −0 example/Main.hs
  6. +1 −0 example/UTF-8
  7. +3 −3 generate-cabal
View
8 .gitignore
@@ -0,0 +1,8 @@
+# Build artifacts
+dist/
+*.hi
+*.o
+
+# Operating system rubbish
+.DS_Store
+Thumbs.db
View
84 Codec/Text/Detect.hs
@@ -1,15 +1,91 @@
-module Codec.Text.Detect () where
+{-# LANGUAGE ForeignFunctionInterface #-}
+-- | Detect the likely character encoding for a stream of bytes using Mozilla's Universal Character Set Detector.
+module Codec.Text.Detect (detectEncodingName, detectEncoding) where
+
+import Control.Exception
+
+import qualified Data.ByteString.Internal as SI
+import qualified Data.ByteString.Lazy as L
+import Data.Traversable (traverse)
+
+import Foreign.C.Types
+import Foreign.C.String
+import Foreign.Ptr
+import Foreign.ForeignPtr
+
+import System.IO
+import System.IO.Unsafe
-- Raw FFI bindings to the C detector API. Each binding below is preceded by the C prototype it mirrors.
-- The new imports are marked @unsafe@; per the Haskell FFI rules such calls must not call back into Haskell.
--
-- typedef void* csd_t;
type Csd_t = Ptr ()
-- csd_t csd_open(void);
-foreign import "csd_open" c_csd_open :: IO Csd_t
+foreign import ccall unsafe "csd_open" c_csd_open :: IO Csd_t
-- int csd_consider(csd_t csd, const char *data, int length);
-foreign import "csd_consider" c_csd_consider :: Csd_t -> CString -> CInt -> IO CInt
+foreign import ccall unsafe "csd_consider" c_csd_consider :: Csd_t -> CString -> CInt -> IO CInt
-- const char *csd_close(csd_t csd);
-foreign import "csd_close" c_csd_close :: Csd_t -> IO CString
+foreign import ccall unsafe "csd_close" c_csd_close :: Csd_t -> IO CString
+
+
+-- | Detect the likely encoding used by a 'L.ByteString'. At the time of writing, the encoding
+-- returned will be drawn from this list:
+--
+-- > Big5
+-- > EUC-JP
+-- > EUC-KR
+-- > GB18030
+-- > gb18030
+-- > HZ-GB-2312
+-- > IBM855
+-- > IBM866
+-- > ISO-2022-CN
+-- > ISO-2022-JP
+-- > ISO-2022-KR
+-- > ISO-8859-2
+-- > ISO-8859-5
+-- > ISO-8859-7
+-- > ISO-8859-8
+-- > KOI8-R
+-- > Shift_JIS
+-- > TIS-620
+-- > UTF-8
+-- > UTF-16BE
+-- > UTF-16LE
+-- > UTF-32BE
+-- > UTF-32LE
+-- > windows-1250
+-- > windows-1251
+-- > windows-1252
+-- > windows-1253
+-- > windows-1255
+-- > x-euc-tw
+-- > X-ISO-10646-UCS-4-2143
+-- > X-ISO-10646-UCS-4-3412
+-- > x-mac-cyrillic
+--
+-- Note that @gb18030@ appears with two different capitalisations. For this reason (and to guard against similar
+-- inconsistencies in character sets added in the future) we recommend that you compare character set names case-insensitively.
+{-# NOINLINE detectEncodingName #-}
+detectEncodingName :: L.ByteString -> Maybe String
+detectEncodingName b = unsafePerformIO $ mask $ \unmasked -> do
+    -- Asynchronous exceptions stay masked while the detector handle is live, except during the
+    -- feeding loop itself, which is guarded so that the handle is still closed if it throws.
+    detector <- c_csd_open
+    unmasked (feedChunks detector (L.toChunks b)) `onException` c_csd_close detector
+    -- Closing the detector is also what hands back the detected name (or a null pointer).
+    name_ptr <- c_csd_close detector
+    if name_ptr /= nullPtr
+      then fmap Just (peekCString name_ptr)
+      else return Nothing
+  where
+    -- Hand the chunks of the lazy ByteString to the detector one at a time, stopping as soon as
+    -- it reports anything other than "need more data".
+    feedChunks _ [] = return ()
+    feedChunks detector (chunk : rest) = do
+        let (fptr, off, len) = SI.toForeignPtr chunk
+        verdict <- withForeignPtr fptr $ \base ->
+            c_csd_consider detector (base `plusPtr` off) (fromIntegral len)
+        if verdict == 0
+          then feedChunks detector rest -- Zero: no conclusion yet, so keep feeding
+          else return ()                -- Negative: some sort of error; positive: enough data seen
+-- | Guess the encoding name of a 'L.ByteString' with 'detectEncodingName' and, when one is found, pass it
+-- to 'mkTextEncoding' to obtain a 'TextEncoding' suitable for decoding the bytes. Yields 'Nothing' when no
+-- encoding name could be detected.
+detectEncoding :: L.ByteString -> IO (Maybe TextEncoding)
+detectEncoding bytes = case detectEncodingName bytes of
+    Nothing   -> return Nothing
+    Just name -> fmap Just (mkTextEncoding name)
View
33 charsetdetect.cabal
@@ -1,18 +1,31 @@
Name: charsetdetect
-Version: 0.1
+Version: 1.0
Cabal-Version: >= 1.2
Category: Text
Synopsis: Character set detection for Haskell using Mozilla's Universal Character Set Detector
+Description: Mozilla have developed a robust and efficient character set detection algorithm for
+ use in their web browsers. The algorithm is able to detect all of the most frequently
+ encountered character encodings totally automatically.
+ .
+ This library wraps up their library and exposes a very simple Haskell interface to it.
License: LGPL
Author: Max Bolingbroke <batterseapower@hotmail.com>
Maintainer: Max Bolingbroke <batterseapower@hotmail.com>
Homepage: http://www.github.com/batterseapower/charsetdetect
Build-Type: Simple
+Extra-Source-Files: libcharsetdetect/README.md
+ libcharsetdetect/nspr-emu/README.md
+
Extra-Source-Files: libcharsetdetect/charsetdetect.h
libcharsetdetect/charsetdetectPriv.h
libcharsetdetect/nscore.h
+ libcharsetdetect/mozilla/extensions/universalchardet/src/base/Big5Freq.tab
libcharsetdetect/mozilla/extensions/universalchardet/src/base/CharDistribution.h
+ libcharsetdetect/mozilla/extensions/universalchardet/src/base/EUCKRFreq.tab
+ libcharsetdetect/mozilla/extensions/universalchardet/src/base/EUCTWFreq.tab
+ libcharsetdetect/mozilla/extensions/universalchardet/src/base/GB2312Freq.tab
+ libcharsetdetect/mozilla/extensions/universalchardet/src/base/JISFreq.tab
libcharsetdetect/mozilla/extensions/universalchardet/src/base/JpCntx.h
libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsBig5Prober.h
libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsCharSetProber.h
@@ -42,16 +55,22 @@ Extra-Source-Files: libcharsetdetect/charsetdetect.h
libcharsetdetect/nspr-emu/obsolete/protypes.h
-
Library
Exposed-Modules: Codec.Text.Detect
- Cc-Options: -Ilibcharsetdetect
- -Ilibcharsetdetect/mozilla/extensions/universalchardet/src/base
- -Ilibcharsetdetect/nspr-emu
- -Ilibcharsetdetect/nspr-emu/obsolete
- Ghc-Options: -pgml g++
+ Build-Depends: base >= 4.3.1 && < 5, bytestring >= 0.9.1.8 && < 0.10
+
+ -- We really need to ensure that the *final program* links with g++.
+ -- It is useless to specify that the library links with it.
+ --Ghc-Options: -pgml g++
+
+ -- This is a bit dodgy since g++ might link in more stuff, but will probably work in practice:
+ Extra-Libraries: stdc++
+ Include-Dirs: libcharsetdetect
+ libcharsetdetect/mozilla/extensions/universalchardet/src/base
+ libcharsetdetect/nspr-emu
+ libcharsetdetect/nspr-emu/obsolete
C-Sources: libcharsetdetect/charsetdetect.cpp
libcharsetdetect/mozilla/extensions/universalchardet/src/base/CharDistribution.cpp
libcharsetdetect/mozilla/extensions/universalchardet/src/base/JpCntx.cpp
View
1 example/ASCII
@@ -0,0 +1 @@
+Hello World!
View
18 example/Main.hs
@@ -0,0 +1,18 @@
+import System.Environment
+import System.Exit
+import System.IO
+
+import Codec.Text.Detect
+
+import qualified Data.ByteString.Lazy as L
+
+-- Print the detected encoding name of the file named by the single command-line argument,
+-- or of standard input when no argument is given.
+--
+-- Exits with status 1 if no encoding could be detected, and with status 2 (after printing a
+-- usage line to stderr) if more than one argument is supplied.
+main :: IO ()
+main = do
+    args <- getArgs
+    bs <- case args of
+        []   -> L.getContents
+        [fp] -> L.readFile fp
+        -- Any other argument count used to die with a pattern-match failure; report usage instead.
+        _    -> do
+            prog <- getProgName
+            hPutStrLn stderr ("Usage: " ++ prog ++ " [FILE]")
+            exitWith (ExitFailure 2)
+
+    case detectEncodingName bs of
+        Nothing       -> hPutStrLn stderr "Could not detect encoding" >> exitWith (ExitFailure 1)
+        Just encoding -> putStrLn encoding
View
1 example/UTF-8
@@ -0,0 +1 @@
+我爱说中文
View
6 generate-cabal
@@ -23,7 +23,7 @@ main = do
else return Nothing
let buildable = filter (\fp -> takeExtension fp `elem` [".cpp", ".c"]) fps
- extras = filter (\fp -> takeExtension fp `elem` [".h"]) fps
+ extras = filter (\fp -> takeExtension fp `elem` [".h", ".tab"]) fps
include_dirs = nub $ map takeDirectory extras
showBlock header n xs = [padRight (if is_first then header else "") n ' ' ++ x | (is_first, x) <- (True : repeat False) `zip` xs]
@@ -33,9 +33,9 @@ main = do
extra_source_files = unlines $ showBlock "Extra-Source-Files:" (length "Extra-Source-Files: ") extras
c_sources = unlines $ map (" " ++) $ showBlock "C-Sources:" (length "C-Sources: ") buildable
- c_opts = unlines $ map (" " ++) $ showBlock "Cc-Options:" (length "Cc-Options: ") ["-I" ++ include_dir | include_dir <- include_dirs]
+ include_dirs_block = unlines $ map (" " ++) $ showBlock "Include-Dirs:" (length "Include-Dirs: ") include_dirs
- putStr c_opts
+ putStr include_dirs_block
putStr extra_source_files
putStr c_sources

0 comments on commit efc87b4

Please sign in to comment.
Something went wrong with that request. Please try again.