Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Encode Formulas to prevent CSV injection #104

Merged
merged 2 commits into from Sep 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,8 @@
# Changelog

## Unreleased
- Optional parameter `escape_formulas` to prevent CSV injection. [Fixes #103](https://github.com/beatrichartz/csv/issues/103) reported by [@maennchen](https://github.com/maennchen). Contributed by [@maennchen](https://github.com/maennchen) in [PR #104](https://github.com/beatrichartz/csv/pull/104).

## 2.4.1 (2020-09-12)

- Fix unnecessary escaping of delimiters when encoding [Fixes #70](https://github.com/beatrichartz/csv/issues/70)
Expand Down
6 changes: 3 additions & 3 deletions lib/csv.ex
Expand Up @@ -211,9 +211,9 @@ defmodule CSV do
end

# Raises a StrayQuoteError for a row whose quoting could not be parsed.
# The stream index is 0-based, so +1 converts it to a 1-based line number.
defp yield_or_raise!({:error, StrayQuoteError, field, index}, _) do
  raise StrayQuoteError,
    field: field,
    line: index + 1
end

defp yield_or_raise!({:error, mod, message, index}, _) do
Expand Down
17 changes: 11 additions & 6 deletions lib/csv/decoding/decoder.ex
Expand Up @@ -22,23 +22,25 @@ defmodule CSV.Decoding.Decoder do

These are the options:

* `:separator` – The separator token to use, defaults to `?,`.
Must be a codepoint (syntax: ? + (your separator)).
* `:strip_fields` – When set to true, will strip whitespace from fields.
Defaults to false.
* `:num_workers` – The number of parallel operations to run when producing
the stream.
* `:worker_work_ratio` – The available work per worker, defaults to 5.
Higher rates will mean more work sharing, but might also lead to work
fragmentation slowing down the queues.
* `:headers` – When set to `true`, will take the first row of the csv
and use it as header values.
When set to a list, will use the given list as header values.
When set to `false` (default), will use no header values.
When set to anything but `false`, the resulting rows in the matrix will
be maps instead of lists.
* `:replacement` – The replacement string to use where lines have bad
encoding. Defaults to `nil`, which disables replacement.
* `:escape_formulas` – Remove formula escaping inserted to prevent
[CSV Injection](https://owasp.org/www-community/attacks/CSV_Injection).

## Examples

Expand All @@ -50,7 +52,7 @@ defmodule CSV.Decoding.Decoder do
...> |> Enum.take(2)
[ok: [\"a\", \"b\"], ok: [\"c\", \"d\"]]

Map an existing stream of lines separated by a token to a stream of rows with
a header row:

iex> [\"a;b\",\"c;d\", \"e;f\"]
Expand All @@ -62,7 +64,7 @@ defmodule CSV.Decoding.Decoder do
ok: %{\"a\" => \"e\", \"b\" => \"f\"}
]

Map an existing stream of lines separated by a token to a stream of rows with
a header row with duplications:

iex> [\"a;b;b\",\"c;d;e\", \"f;g;h\"]
Expand Down Expand Up @@ -198,6 +200,7 @@ defmodule CSV.Decoding.Decoder do
# Row-length validation disabled: tag every row with a `false` expected
# length and keep the `{false, options}` state unchanged.
defp add_row_length({line, index, headers}, {false, options}) do
{[{line, index, headers, false}], {false, options}}
end

defp add_row_length({line, 0, false}, {true, options}) do
case parse_row({line, 0}, options) do
{:ok, row, _} ->
Expand All @@ -208,10 +211,12 @@ defmodule CSV.Decoding.Decoder do
{[{line, 0, false, false}], {false, options}}
end
end

# Headers supplied as a list: the expected row length is the header count;
# carry it forward as the new state for subsequent rows.
defp add_row_length({line, index, headers}, {true, options}) when is_list(headers) do
row_length = headers |> Enum.count()
{[{line, index, headers, row_length}], {row_length, options}}
end

# Expected row length already determined: tag the row and keep the state.
defp add_row_length({line, index, headers}, {row_length, options}) do
{[{line, index, headers, row_length}], {row_length, options}}
end
Expand Down
70 changes: 49 additions & 21 deletions lib/csv/decoding/lexer.ex
Expand Up @@ -25,69 +25,97 @@ defmodule CSV.Decoding.Lexer do
# Lexes a single line into `{:ok, tokens, index}` or an encoding error.
# Options: `:separator`, `:replacement` (substitute for invalid bytes; `nil`
# disables replacement), and `:escape_formulas` (strip CSV-injection escaping).
def lex({line, index}, options \\ []) when is_list(options) do
  separator = options |> Keyword.get(:separator, @separator)
  replacement = options |> Keyword.get(:replacement, @replacement)
  escape_formulas = options |> Keyword.get(:escape_formulas, @escape_formulas)

  case String.valid?(line) do
    false ->
      if replacement do
        replace_bad_encoding(line, replacement) |> lex(index, separator, escape_formulas)
      else
        {:error, EncodingError, "Invalid encoding", index}
      end

    true ->
      lex(line, index, separator, escape_formulas)
  end
end

# Entry point into the clause-based tokenizer: starts with an empty token
# list and no pending token, then attaches the line index to the result.
defp lex(line, index, separator, escape_formulas) do
  case lex([], nil, line, separator, escape_formulas) do
    {:ok, tokens} -> {:ok, tokens, index}
  end
end

# A newline directly after a delimiter extends the pending delimiter token,
# so "\r\n" is emitted as a single delimiter.
defp lex(tokens, {:delimiter, value}, <<@newline::utf8>> <> tail, separator, escape_formulas) do
  lex(tokens, {:delimiter, value <> <<@newline::utf8>>}, tail, separator, escape_formulas)
end

# A newline terminates the current token and opens a new delimiter token.
defp lex(tokens, current_token, <<@newline::utf8>> <> tail, separator, escape_formulas) do
  lex(
    tokens |> add_token(current_token),
    {:delimiter, <<@newline::utf8>>},
    tail,
    separator,
    escape_formulas
  )
end

# A carriage return terminates the current token and opens a delimiter token.
defp lex(tokens, current_token, <<@carriage_return::utf8>> <> tail, separator, escape_formulas) do
  lex(
    tokens |> add_token(current_token),
    {:delimiter, <<@carriage_return::utf8>>},
    tail,
    separator,
    escape_formulas
  )
end

# A double quote terminates the current token and emits a quote token.
defp lex(tokens, current_token, <<@double_quote::utf8>> <> tail, separator, escape_formulas) do
  lex(
    tokens |> add_token(current_token),
    {:double_quote, <<@double_quote::utf8>>},
    tail,
    separator,
    escape_formulas
  )
end

# The configured separator codepoint terminates the current token and emits
# a separator token.
defp lex(tokens, current_token, <<head::utf8>> <> tail, separator, escape_formulas)
     when head == separator do
  lex(
    tokens |> add_token(current_token),
    {:separator, <<separator::utf8>>},
    tail,
    separator,
    escape_formulas
  )
end

# When `:escape_formulas` is enabled (5th argument `true`), strip the leading
# apostrophe that the encoder inserted before formula-starting characters.
# One clause is generated at compile time per entry in @escape_formula_start.
for start <- @escape_formula_start do
  defp lex(tokens, current_token, "'#{unquote(start)}" <> tail, separator, true) do
    lex(tokens, current_token, unquote(start) <> tail, separator, true)
  end
end

# Append the next codepoint to an in-progress content token.
defp lex(tokens, {:content, value}, <<head::utf8>> <> tail, separator, escape_formulas) do
  lex(tokens, {:content, value <> <<head::utf8>>}, tail, separator, escape_formulas)
end

# No token pending: start a new content token with this codepoint.
defp lex(tokens, nil, <<head::utf8>> <> tail, separator, escape_formulas) do
  lex(tokens, {:content, <<head::utf8>>}, tail, separator, escape_formulas)
end

# Any other codepoint after a non-content token: flush the pending token and
# start a fresh content token.
defp lex(tokens, current_token, <<head::utf8>> <> tail, separator, escape_formulas) do
  lex(
    tokens |> add_token(current_token),
    {:content, <<head::utf8>>},
    tail,
    separator,
    escape_formulas
  )
end

# End of input: flush the pending token and return the finished token list.
defp lex(tokens, current_token, "", _, _) do
  {:ok, tokens |> add_token(current_token)}
end

Expand Down
2 changes: 2 additions & 0 deletions lib/csv/defaults.ex
Expand Up @@ -12,6 +12,8 @@ defmodule CSV.Defaults do
@double_quote ?"
@escape_max_lines 1000
@replacement nil
@escape_formulas false
@escape_formula_start ["=", "-", "+", "@"]
end
end

Expand Down
34 changes: 26 additions & 8 deletions lib/csv/encoding/encode.ex
Expand Up @@ -33,23 +33,41 @@ defimpl CSV.Encode, for: BitString do
# Encodes one string cell for CSV output.
#
# With `:escape_formulas` set, cells starting with a formula character
# ("=", "-", "+", "@") are prefixed with "'" and always quoted, preventing
# CSV injection in spreadsheet applications. Any cell containing the
# separator, delimiter, CR, LF or a double quote is wrapped in double quotes
# with embedded quotes doubled per RFC 4180; everything else passes through.
def encode(data, env \\ []) do
  separator = env |> Keyword.get(:separator, @separator)
  delimiter = env |> Keyword.get(:delimiter, @delimiter)
  escape_formulas = env |> Keyword.get(:escape_formulas, @escape_formulas)

  data =
    if escape_formulas and String.starts_with?(data, @escape_formula_start) do
      "'" <> data
    else
      data
    end

  patterns = [
    <<separator::utf8>>,
    delimiter,
    <<@carriage_return::utf8>>,
    <<@newline::utf8>>,
    <<@double_quote::utf8>>
  ]

  # Formula starters are added to the quoting triggers so escaped cells are
  # always quoted, keeping the "'" prefix intact through round-trips.
  patterns =
    if escape_formulas do
      patterns ++ @escape_formula_start
    else
      patterns
    end

  cond do
    String.contains?(data, patterns) ->
      <<@double_quote::utf8>> <>
        (data
         |> String.replace(
           <<@double_quote::utf8>>,
           <<@double_quote::utf8>> <> <<@double_quote::utf8>>
         )) <> <<@double_quote::utf8>>

    true ->
      data
  end
end
end
14 changes: 8 additions & 6 deletions lib/csv/encoding/encoder.ex
Expand Up @@ -14,16 +14,18 @@ defmodule CSV.Encoding.Encoder do

These are the options:

* `:separator` – The separator token to use, defaults to `?,`.
Must be a codepoint (syntax: ? + your separator token).
* `:delimiter` – The delimiter token to use, defaults to `\"\\r\\n\"`.
* `:headers` – When set to `true`, uses the keys of the first map as
the first element in the stream. All subsequent elements are the values
of the maps. When set to a list, will use the given list as the first
element in the stream and order all subsequent elements using that list.
When set to `false` (default), will use the raw inputs as elements.
When set to anything but `false`, all elements in the input stream are
assumed to be maps.
* `:escape_formulas` – Escape formulas to prevent
[CSV Injection](https://owasp.org/www-community/attacks/CSV_Injection).

## Examples

Expand Down Expand Up @@ -89,13 +91,13 @@ defmodule CSV.Encoding.Encoder do

encoded =
row
|> Enum.map(&encode_cell(&1, separator, delimiter))
|> Enum.map(&encode_cell(&1, options))
|> Enum.join(<<separator::utf8>>)

encoded <> delimiter
end

# Delegates cell encoding to the CSV.Encode protocol, forwarding the full
# option list so protocol implementations see :escape_formulas as well.
defp encode_cell(cell, options) do
  CSV.Encode.encode(cell, options)
end
end
5 changes: 3 additions & 2 deletions lib/csv/exceptions.ex
Expand Up @@ -45,8 +45,9 @@ defmodule CSV.StrayQuoteError do
line = options |> Keyword.fetch!(:line)
field = options |> Keyword.fetch!(:field)

message = "Stray quote on line " <>
Integer.to_string(line) <> " near \"" <> field <> "\""
message =
"Stray quote on line " <>
Integer.to_string(line) <> " near \"" <> field <> "\""

%__MODULE__{
line: line,
Expand Down
1 change: 0 additions & 1 deletion test/csv_exceptions_test.exs
Expand Up @@ -22,5 +22,4 @@ defmodule CSVExceptionsTest do
CSV.decode!(stream) |> Stream.run()
end
end

end
13 changes: 7 additions & 6 deletions test/csv_test.exs
Expand Up @@ -29,17 +29,18 @@ defmodule CSVTest do
# Strict decoding must not raise when row-length validation is turned off,
# even though the rows have differing lengths.
test "decodes in strict mode not raising validation errors on variable row length if row length validation is disabled" do
  stream = ~w(a,be a c,d) |> to_stream

  CSV.decode!(stream, validate_row_length: false) |> Stream.run()
end

# Normal-mode decoding yields all rows as :ok when row-length validation is
# disabled, despite the variable row lengths.
test "decodes in normal mode not not validating row length when row length validation is disabled" do
  stream = ~w(a,be a c,d) |> to_stream
  result = CSV.decode(stream, validate_row_length: false) |> Enum.to_list()

  assert result == [
           ok: ~w(a be),
           ok: ~w(a),
           ok: ~w(c d)
         ]
end

test "uses the :lines preprocessor by default" do
Expand Down
17 changes: 15 additions & 2 deletions test/decoding/baseline_exceptions_test.exs
Expand Up @@ -84,8 +84,8 @@ defmodule DecodingTests.BaselineExceptionsTest do
]
end

# Round-trips rows through encode and decode with the same options so
# encode-side escaping and decode-side unescaping are exercised together.
def encode_decode_loop(l, opts \\ []) do
  l |> CSV.encode(opts) |> Decoder.decode(opts) |> Enum.to_list()
end

test "does not get corrupted after an error" do
Expand All @@ -101,4 +101,17 @@ defmodule DecodingTests.BaselineExceptionsTest do
assert result_b == [ok: ~w(b)]
assert result_c == [ok: ~w(b)]
end

# Verifies the round-trip: formula escaping added on encode (a leading "'"
# before "=", "-", "+", "@") is stripped again on decode when
# :escape_formulas is enabled, so the original field values come back.
test "removes escaping for formula" do
input = [["=1+1", ~S(=1+2";=1+2), ~S(=1+2'" ;,=1+2)], ["-10+7"], ["+10+7"], ["@A1:A10"]]

assert [
ok: [
"=1+1=1+2\";=1+2=1+2'\" ;,=1+2",
"-10+7",
"+10+7",
"@A1:A10"
]
] = encode_decode_loop([input], escape_formulas: true)
end
end