Skip to content

Commit

Permalink
Merge pull request #73 from malcolmstill/bad-encoding-replacement
Browse files Browse the repository at this point in the history
Allow replacement of bad encoding
  • Loading branch information
beatrichartz committed Dec 20, 2017
2 parents c3da6e1 + e32fc93 commit e936399
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 1 deletion.
2 changes: 2 additions & 0 deletions lib/csv/decoding/decoder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ defmodule CSV.Decoding.Decoder do
When set to `false` (default), will use no header values.
When set to anything but `false`, the resulting rows in the matrix will
be maps instead of lists.
* `:replacement` – The replacement string to use where lines have bad
encoding. Defaults to `nil`, which disables replacement.
## Examples
Expand Down
18 changes: 17 additions & 1 deletion lib/csv/decoding/lexer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,22 @@ defmodule CSV.Decoding.Lexer do
* `:separator` – The separator token to use, defaults to `?,`. Must be a
codepoint.
* `:replacement` – The string substituted for each invalid codepoint in lines
with bad encoding. Defaults to `nil`, which disables replacement, so such
lines produce an `EncodingError` instead.
"""

def lex({ line, index }, options \\ []) when is_list(options) do
  separator = options |> Keyword.get(:separator, @separator)
  replacement = options |> Keyword.get(:replacement, @replacement)

  # Exactly one branch handles an invalid line. A second, earlier
  # `false ->` clause would shadow the replacement branch and make the
  # `:replacement` option a no-op, so the decision is kept in a single cond.
  cond do
    # Well-formed UTF-8: tokenize the line as-is.
    String.valid?(line) ->
      lex(line, index, separator)

    # Bad encoding, but the caller supplied a (truthy) replacement: repair
    # the line first, then tokenize the repaired binary.
    replacement ->
      line
      |> replace_bad_encoding(replacement)
      |> lex(index, separator)

    # Bad encoding and replacement disabled (`nil`/`false`): report the
    # failing line index to the caller.
    true ->
      { :error, EncodingError, "Invalid encoding", index }
  end
end
Expand Down Expand Up @@ -68,4 +77,11 @@ defmodule CSV.Decoding.Lexer do
# Returns `tokens` with `token` appended as the new last element.
defp add_token(tokens, token), do: tokens ++ [token]

# Returns `line` with every piece that is not valid UTF-8 swapped for
# `replacement`; valid codepoints are kept unchanged.
defp replace_bad_encoding(line, replacement) do
  line
  # Split into codepoints; each element is then checked on its own, so only
  # the undecodable pieces are replaced.
  |> String.codepoints()
  # map_join builds the result in one pass instead of materialising an
  # intermediate list and joining it afterwards.
  |> Enum.map_join(fn codepoint ->
    if String.valid?(codepoint), do: codepoint, else: replacement
  end)
end
end
1 change: 1 addition & 0 deletions lib/csv/defaults.ex
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ defmodule CSV.Defaults do
@delimiter << @carriage_return :: utf8 >> <> << @newline :: utf8 >>
@double_quote ?"
@escape_max_lines 1000
@replacement nil
end
end

Expand Down
7 changes: 7 additions & 0 deletions test/decoding/baseline_exceptions_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ defmodule DecodingTests.BaselineExceptionsTest do
]
end

test "invalid encoding can be replaced" do
  # The byte 255 can never occur in valid UTF-8, so the first line is
  # badly encoded while the second one is clean.
  rows =
    [<<"a,", 255>>, "c,d"]
    |> to_stream
    |> Decoder.decode(replacement: "?")
    |> Enum.take(2)

  assert rows == [ok: ~w(a ?), ok: ~w(c d)]
end

test "discards any state in the current message queues when halted" do
stream = ["a,be", "c,d", "e,f", "g,h", "i,j", "k,l"] |> to_stream
result = Decoder.decode(stream) |> Enum.take(2)
Expand Down

0 comments on commit e936399

Please sign in to comment.