Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

experimental RFC4180 compliant (ish) quoting support #25

Closed
wants to merge 2 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
130 changes: 92 additions & 38 deletions src/Frames/CSV.hs
Expand Up @@ -46,51 +46,108 @@ import Control.Monad (void)

type Separator = T.Text

type QuoteChar = Char

data QuotingMode
-- | No quoting enabled. The separator may not appear in values
= NoQuoting
-- | Quoted values with the given quoting character. Quotes are escaped by doubling them.
-- Mostly RFC4180 compliant, except doesn't support newlines in values
| RFC4180Quoting QuoteChar
deriving (Eq, Show)

data ParserOptions = ParserOptions { headerOverride :: Maybe [T.Text]
, columnSeparator :: Separator }
deriving (Eq, Ord, Show)
, columnSeparator :: Separator
, quotingMode :: QuotingMode }
deriving (Eq, Show)

instance Lift QuotingMode where
lift NoQuoting = [|NoQuoting|]
lift (RFC4180Quoting char) = [|RFC4180Quoting $(litE . charL $ char)|]

instance Lift ParserOptions where
lift (ParserOptions Nothing sep) = [|ParserOptions Nothing $sep'|]
lift (ParserOptions Nothing sep quoting) = [|ParserOptions Nothing $sep' $quoting'|]
where sep' = [|T.pack $(stringE $ T.unpack sep)|]
lift (ParserOptions (Just hs) sep) = [|ParserOptions (Just $hs') $sep'|]
quoting' = lift quoting
lift (ParserOptions (Just hs) sep quoting) = [|ParserOptions (Just $hs') $sep' $quoting'|]
where sep' = [|T.pack $(stringE $ T.unpack sep)|]
hs' = [|map T.pack $(listE $ map (stringE . T.unpack) hs)|]
quoting' = lift quoting

-- | Default 'ParseOptions' get column names from a header line, and
-- use commas to separate columns.
defaultParser :: ParserOptions
defaultParser = ParserOptions Nothing (T.pack defaultSep)
defaultParser = ParserOptions Nothing defaultSep (RFC4180Quoting '\"')

-- | Default separator string.
defaultSep :: String
defaultSep = ","
defaultSep :: Separator
defaultSep = T.pack ","

-- * Parsing

-- | Helper to split a 'T.Text' on commas and strip leading and
-- trailing whitespace from each resulting chunk.
tokenizeRow :: Separator -> T.Text -> [T.Text]
tokenizeRow sep = map (unquote . T.strip) . T.splitOn sep
where unquote txt
| quoted txt = case T.dropEnd 1 (T.drop 1 txt) of
txt' | T.null txt' -> "Col"
| numish txt' -> txt
| otherwise -> txt'
| otherwise = txt
numish = T.all (`elem` ("-+.0123456789"::String))
quoted txt = case T.uncons txt of
Just ('"', rst)
| not (T.null rst) -> T.last rst == '"'
_ -> False
tokenizeRow :: ParserOptions -> T.Text -> [T.Text]
tokenizeRow options =
handleQuoting . T.splitOn sep
where sep = columnSeparator options
quoting = quotingMode options
handleQuoting = case quoting of
NoQuoting -> id
RFC4180Quoting quote -> reassembleRFC4180QuotedParts sep quote

-- | Post processing applied to a list of tokens split by the
-- separator which should have quoted sections reassembeld
reassembleRFC4180QuotedParts :: Separator -> QuoteChar -> [T.Text] -> [T.Text]
reassembleRFC4180QuotedParts sep quoteChar = finish . foldr f ([], Nothing)
where f :: T.Text -> ([T.Text], Maybe T.Text) -> ([T.Text], Maybe T.Text)
f part (rest, Just accum)
| prefixQuoted part = let token = unescape (T.drop 1 part) <> sep <> accum
in (token : rest, Nothing)
| otherwise = (rest, Just (unescape part <> sep <> accum))
f part (rest, Nothing)
| prefixQuoted part &&
suffixQuoted part = ((unescape . T.drop 1 . T.dropEnd 1 $ part) : rest, Nothing)
| suffixQuoted part = (rest, Just (unescape . T.dropEnd 1 $ part))
| otherwise = (T.strip part : rest, Nothing)

prefixQuoted t =
quoteText `T.isPrefixOf` t &&
(T.length t - (T.length . T.dropWhile (== quoteChar) $ t)) `mod` 2 == 1
suffixQuoted t =
quoteText `T.isSuffixOf` t &&
(T.length t - (T.length . T.dropWhileEnd (== quoteChar) $ t)) `mod` 2 == 1

quoteText = T.singleton quoteChar

unescape :: T.Text -> T.Text
unescape = T.replace (quoteText <> quoteText) quoteText

finish :: ([T.Text], Maybe T.Text) -> [T.Text]
finish (rest, Just dangling) = dangling : rest -- FIXME? just assumes the close quote if it's missing
finish (rest, Nothing ) = rest

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Preface to comments: this is a fantastic PR. I really appreciate how you've taken care to make useful, documented definitions.

Is there a problem here with not stripping the first pass at tokenization? Eg, if a line of input is "hey", "you, guys", the second comma-separated piece starts with a space. Of course, naively stripping those pieces is no good either as we need to preserve internal spaces, but the prefix/suffix predicates should probably be applied to stripped Text values. This makes using a space as a quote character impossible, but I think that's a pretty pathological corner case. We may also want to preserve the stripping behavior of the previous version to ease the life of other Readable instances, too. I think all we would need to do is have a where part' = T.strip part in f, and then use part' in the right places.

Another question is whether it makes sense to do this as a foldr rather than a a left-associated thing like a mapAccumL. It seems like finish is going to force the whole thing, so there's not an opportunity for incrementally computing the result.

This is also complicated enough that it deserves to have some test cases written for it. Once we're happy with this PR, we can hash out some basic tests.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Re Preface: Thanks!

yes, I missed the strip step. RFC4180 specifies no spaces between the comma and the quote, but I could imagine that people might want different. I'd think that stripping non-quoted tokens and preserving spaces inside quotes is noncontroversial though. We could also have some kind of ParserOption to turn on stripping inside quotes?

I used foldr just to avoid reversing the list at the end in finish, no particular laziness intent.

and yeah, definitely has several corner cases and should have test cases to go with them

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

alright I added the uncontroversial strip pending further discussion

--tokenizeRow :: Separator -> T.Text -> [T.Text]
--tokenizeRow sep = map (unquote . T.strip) . T.splitOn sep
-- where unquote txt
-- | quoted txt = case T.dropEnd 1 (T.drop 1 txt) of
-- txt' | T.null txt' -> "Col"
-- | numish txt' -> txt
-- | otherwise -> txt'
-- | otherwise = txt
-- numish = T.all (`elem` ("-+.0123456789"::String))
-- quoted txt = case T.uncons txt of
-- Just ('"', rst)
-- | not (T.null rst) -> T.last rst == '"'
-- _ -> False

-- | Infer column types from a prefix (up to 1000 lines) of a CSV
-- file.
prefixInference :: (ColumnTypeable a, Monoid a)
=> T.Text -> Handle -> IO [a]
prefixInference sep h = T.hGetLine h >>= go prefixSize . inferCols
=> ParserOptions -> Handle -> IO [a]
prefixInference opts h = T.hGetLine h >>= go prefixSize . inferCols
where prefixSize = 1000 :: Int
inferCols = map inferType . tokenizeRow sep
inferCols = map inferType . tokenizeRow opts
go 0 ts = return ts
go !n ts =
hIsEOF h >>= \case
Expand All @@ -101,11 +158,10 @@ prefixInference sep h = T.hGetLine h >>= go prefixSize . inferCols
readColHeaders :: (ColumnTypeable a, Monoid a)
=> ParserOptions -> FilePath -> IO [(T.Text, a)]
readColHeaders opts f = withFile f ReadMode $ \h ->
zip <$> maybe (tokenizeRow sep <$> T.hGetLine h)
zip <$> maybe (tokenizeRow opts <$> T.hGetLine h)
pure
(headerOverride opts)
<*> prefixInference sep h
where sep = columnSeparator opts
<*> prefixInference opts h

-- * Loading Data

Expand All @@ -122,7 +178,7 @@ instance (Readable t, ReadRec ts) => ReadRec (s :-> t ': ts) where
readRec (h:t) = frameCons (fromText h) (readRec t)

-- | Read a 'RecF' from one line of CSV.
readRow :: ReadRec rs => Separator -> T.Text -> Rec Maybe rs
readRow :: ReadRec rs => ParserOptions -> T.Text -> Rec Maybe rs
readRow = (readRec .) . tokenizeRow

-- | Produce rows where any given entry can fail to parse.
Expand All @@ -133,10 +189,9 @@ readTableMaybeOpt opts csvFile =
h <- openFile csvFile ReadMode
when (isNothing $ headerOverride opts) (void $ T.hGetLine h)
return h
let sep = columnSeparator opts
go = liftIO (hIsEOF h) >>= \case
let go = liftIO (hIsEOF h) >>= \case
True -> return ()
False -> liftIO (readRow sep <$> T.hGetLine h) >>= P.yield >> go
False -> liftIO (readRow opts <$> T.hGetLine h) >>= P.yield >> go
go
{-# INLINE readTableMaybeOpt #-}

Expand All @@ -156,10 +211,9 @@ readTableOpt' opts csvFile =
h <- openFile csvFile ReadMode
when (isNothing $ headerOverride opts) (void $ T.hGetLine h)
return h
let sep = columnSeparator opts
go = liftIO (hIsEOF h) >>= \case
let go = liftIO (hIsEOF h) >>= \case
True -> mzero
False -> let r = recMaybe . readRow sep <$> T.hGetLine h
False -> let r = recMaybe . readRow opts <$> T.hGetLine h
in liftIO r >>= maybe go (flip mplus go . return)
go
{-# INLINE readTableOpt' #-}
Expand Down Expand Up @@ -202,7 +256,7 @@ sanitizeTypeName :: T.Text -> T.Text
sanitizeTypeName = fixupStart . T.concat . T.split (not . valid) . toTitle'
where valid c = isAlphaNum c || c == '\'' || c == '_'
toTitle' = foldMap (onHead toUpper) . T.split (not . isAlphaNum)
onHead f = maybe mempty (uncurry T.cons) . fmap (first f) . T.uncons
onHead f = maybe mempty (uncurry T.cons) . fmap (first f) . T.uncons
fixupStart t = case T.uncons t of
Nothing -> "Col"
Just (c,_) | isAlpha c -> t
Expand Down Expand Up @@ -243,7 +297,7 @@ colDec :: ColumnTypeable a => T.Text -> T.Text -> a -> DecsQ
colDec prefix colName colTy = (:) <$> mkColTDec colTypeQ colTName'
<*> mkColPDec colTName' colTyQ colPName
where colTName = sanitizeTypeName (prefix <> colName)
colPName = fromMaybe "colDec impossible" $
colPName = fromMaybe "colDec impossible" $
fmap (\(c,t) -> T.cons (toLower c) t) (T.uncons colTName)
colTName' = mkName $ T.unpack colTName
colTyQ = colType colTy
Expand All @@ -259,7 +313,7 @@ data RowGen a = RowGen { columnNames :: [String]
, tablePrefix :: String
-- ^ A common prefix to use for every generated
-- declaration.
, separator :: String
, separator :: Separator
-- ^ The string that separates the columns on a
-- row.
, rowTypeName :: String
Expand Down Expand Up @@ -310,7 +364,7 @@ tableType' (RowGen {..}) csvFile =
where recDec' = recDec :: [(T.Text, a)] -> Q Type
colNames' | null columnNames = Nothing
| otherwise = Just (map T.pack columnNames)
opts = ParserOptions colNames' (T.pack separator)
opts = ParserOptions colNames' separator (RFC4180Quoting '\"')

-- | Like 'tableType'', but additionally generates a type synonym for
-- each column, and a proxy value of that type. If the CSV file has
Expand All @@ -334,4 +388,4 @@ tableTypes' (RowGen {..}) csvFile =
where recDec' = recDec :: [(T.Text, a)] -> Q Type
colNames' | null columnNames = Nothing
| otherwise = Just (map T.pack columnNames)
opts = ParserOptions colNames' (T.pack separator)
opts = ParserOptions colNames' separator (RFC4180Quoting '\"')