Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Encode Formulas to prevent CSV injection #104

Merged
merged 2 commits into from Sep 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,8 @@
# Changelog

## Unreleased
- Optional parameter `escape_formulas` to prevent CSV injection. [Fixes #103](https://github.com/beatrichartz/csv/issues/103) reported by [@maennchen](https://github.com/maennchen). Contributed by [@maennchen](https://github.com/maennchen) in [PR #104](https://github.com/beatrichartz/csv/pull/104).

## 2.4.1 (2020-09-12)

- Fix unnecessary escaping of delimiters when encoding [Fixes #70](https://github.com/beatrichartz/csv/issues/70)
Expand Down
6 changes: 3 additions & 3 deletions lib/csv.ex
Expand Up @@ -211,9 +211,9 @@ defmodule CSV do
end

# Raises a StrayQuoteError for a row whose quoting could not be parsed.
# The stream index is 0-based, so +1 converts it to a 1-based line number.
defp yield_or_raise!({:error, StrayQuoteError, field, index}, _) do
  raise StrayQuoteError,
    field: field,
    line: index + 1
end

defp yield_or_raise!({:error, mod, message, index}, _) do
Expand Down
17 changes: 11 additions & 6 deletions lib/csv/decoding/decoder.ex
Expand Up @@ -22,23 +22,25 @@ defmodule CSV.Decoding.Decoder do

These are the options:

* `:separator` – The separator token to use, defaults to `?,`.
Must be a codepoint (syntax: ? + (your separator)).
* `:strip_fields` – When set to true, will strip whitespace from fields.
Defaults to false.
* `:num_workers` – The number of parallel operations to run when producing
the stream.
* `:worker_work_ratio` – The available work per worker, defaults to 5.
Higher rates will mean more work sharing, but might also lead to work
fragmentation slowing down the queues.
* `:headers` – When set to `true`, will take the first row of the csv
and use it as header values.
When set to a list, will use the given list as header values.
When set to `false` (default), will use no header values.
When set to anything but `false`, the resulting rows in the matrix will
be maps instead of lists.
* `:replacement` – The replacement string to use where lines have bad
encoding. Defaults to `nil`, which disables replacement.
* `:escape_formulas` – Remove formula escaping inserted to prevent
[CSV Injection](https://owasp.org/www-community/attacks/CSV_Injection).

## Examples

Expand All @@ -50,7 +52,7 @@ defmodule CSV.Decoding.Decoder do
...> |> Enum.take(2)
[ok: [\"a\", \"b\"], ok: [\"c\", \"d\"]]

Map an existing stream of lines separated by a token to a stream of rows with
a header row:

iex> [\"a;b\",\"c;d\", \"e;f\"]
Expand All @@ -62,7 +64,7 @@ defmodule CSV.Decoding.Decoder do
ok: %{\"a\" => \"e\", \"b\" => \"f\"}
]

Map an existing stream of lines separated by a token to a stream of rows with
a header row with duplications:

iex> [\"a;b;b\",\"c;d;e\", \"f;g;h\"]
Expand Down Expand Up @@ -198,6 +200,7 @@ defmodule CSV.Decoding.Decoder do
# Row-length validation disabled: tag every row with a `false` expected
# length and keep the `{false, options}` state unchanged.
defp add_row_length({line, index, headers}, {false, options}) do
{[{line, index, headers, false}], {false, options}}
end

defp add_row_length({line, 0, false}, {true, options}) do
case parse_row({line, 0}, options) do
{:ok, row, _} ->
Expand All @@ -208,10 +211,12 @@ defmodule CSV.Decoding.Decoder do
{[{line, 0, false, false}], {false, options}}
end
end

# Headers supplied as a list: the expected row length is the header count;
# carry it forward as the new state for subsequent rows.
defp add_row_length({line, index, headers}, {true, options}) when is_list(headers) do
row_length = headers |> Enum.count()
{[{line, index, headers, row_length}], {row_length, options}}
end

# Expected row length already determined: tag the row and keep the state.
defp add_row_length({line, index, headers}, {row_length, options}) do
{[{line, index, headers, row_length}], {row_length, options}}
end
Expand Down
70 changes: 49 additions & 21 deletions lib/csv/decoding/lexer.ex
Expand Up @@ -25,69 +25,97 @@ defmodule CSV.Decoding.Lexer do
# Lexes a single line into `{:ok, tokens, index}` or an encoding error.
# Options: `:separator`, `:replacement` (substitute for invalid bytes; `nil`
# disables replacement), and `:escape_formulas` (strip CSV-injection escaping).
def lex({line, index}, options \\ []) when is_list(options) do
  separator = options |> Keyword.get(:separator, @separator)
  replacement = options |> Keyword.get(:replacement, @replacement)
  escape_formulas = options |> Keyword.get(:escape_formulas, @escape_formulas)

  case String.valid?(line) do
    false ->
      if replacement do
        replace_bad_encoding(line, replacement) |> lex(index, separator, escape_formulas)
      else
        {:error, EncodingError, "Invalid encoding", index}
      end

    true ->
      lex(line, index, separator, escape_formulas)
  end
end

# Entry point into the clause-based tokenizer: starts with an empty token
# list and no pending token, then attaches the line index to the result.
defp lex(line, index, separator, escape_formulas) do
  case lex([], nil, line, separator, escape_formulas) do
    {:ok, tokens} -> {:ok, tokens, index}
  end
end

# A newline directly after a delimiter extends the pending delimiter token,
# so "\r\n" is emitted as a single delimiter.
defp lex(tokens, {:delimiter, value}, <<@newline::utf8>> <> tail, separator, escape_formulas) do
  lex(tokens, {:delimiter, value <> <<@newline::utf8>>}, tail, separator, escape_formulas)
end

# A newline terminates the current token and opens a new delimiter token.
defp lex(tokens, current_token, <<@newline::utf8>> <> tail, separator, escape_formulas) do
  lex(
    tokens |> add_token(current_token),
    {:delimiter, <<@newline::utf8>>},
    tail,
    separator,
    escape_formulas
  )
end

# A carriage return terminates the current token and opens a delimiter token.
defp lex(tokens, current_token, <<@carriage_return::utf8>> <> tail, separator, escape_formulas) do
  lex(
    tokens |> add_token(current_token),
    {:delimiter, <<@carriage_return::utf8>>},
    tail,
    separator,
    escape_formulas
  )
end

# A double quote terminates the current token and emits a quote token.
defp lex(tokens, current_token, <<@double_quote::utf8>> <> tail, separator, escape_formulas) do
  lex(
    tokens |> add_token(current_token),
    {:double_quote, <<@double_quote::utf8>>},
    tail,
    separator,
    escape_formulas
  )
end

# The configured separator codepoint terminates the current token and emits
# a separator token.
defp lex(tokens, current_token, <<head::utf8>> <> tail, separator, escape_formulas)
     when head == separator do
  lex(
    tokens |> add_token(current_token),
    {:separator, <<separator::utf8>>},
    tail,
    separator,
    escape_formulas
  )
end

# When `:escape_formulas` is enabled (5th argument `true`), strip the leading
# apostrophe that the encoder inserted before formula-starting characters.
# One clause is generated at compile time per entry in @escape_formula_start.
for start <- @escape_formula_start do
  defp lex(tokens, current_token, "'#{unquote(start)}" <> tail, separator, true) do
    lex(tokens, current_token, unquote(start) <> tail, separator, true)
  end
end

# Append the next codepoint to an in-progress content token.
defp lex(tokens, {:content, value}, <<head::utf8>> <> tail, separator, escape_formulas) do
  lex(tokens, {:content, value <> <<head::utf8>>}, tail, separator, escape_formulas)
end

# No token pending: start a new content token with this codepoint.
defp lex(tokens, nil, <<head::utf8>> <> tail, separator, escape_formulas) do
  lex(tokens, {:content, <<head::utf8>>}, tail, separator, escape_formulas)
end

# Any other codepoint after a non-content token: flush the pending token and
# start a fresh content token.
defp lex(tokens, current_token, <<head::utf8>> <> tail, separator, escape_formulas) do
  lex(
    tokens |> add_token(current_token),
    {:content, <<head::utf8>>},
    tail,
    separator,
    escape_formulas
  )
end

# End of input: flush the pending token and return the finished token list.
defp lex(tokens, current_token, "", _, _) do
  {:ok, tokens |> add_token(current_token)}
end

Expand Down
2 changes: 2 additions & 0 deletions lib/csv/defaults.ex
Expand Up @@ -12,6 +12,8 @@ defmodule CSV.Defaults do
@double_quote ?"
@escape_max_lines 1000
@replacement nil
@escape_formulas false
@escape_formula_start ["=", "-", "+", "@"]
end
end

Expand Down
34 changes: 26 additions & 8 deletions lib/csv/encoding/encode.ex
Expand Up @@ -33,23 +33,41 @@ defimpl CSV.Encode, for: BitString do
# Encodes one string cell for CSV output.
#
# With `:escape_formulas` set, cells starting with a formula character
# ("=", "-", "+", "@") are prefixed with "'" and always quoted, preventing
# CSV injection in spreadsheet applications. Any cell containing the
# separator, delimiter, CR, LF or a double quote is wrapped in double quotes
# with embedded quotes doubled per RFC 4180; everything else passes through.
def encode(data, env \\ []) do
  separator = env |> Keyword.get(:separator, @separator)
  delimiter = env |> Keyword.get(:delimiter, @delimiter)
  escape_formulas = env |> Keyword.get(:escape_formulas, @escape_formulas)

  data =
    if escape_formulas and String.starts_with?(data, @escape_formula_start) do
      "'" <> data
    else
      data
    end

  patterns = [
    <<separator::utf8>>,
    delimiter,
    <<@carriage_return::utf8>>,
    <<@newline::utf8>>,
    <<@double_quote::utf8>>
  ]

  # Formula starters are added to the quoting triggers so escaped cells are
  # always quoted, keeping the "'" prefix intact through round-trips.
  patterns =
    if escape_formulas do
      patterns ++ @escape_formula_start
    else
      patterns
    end

  cond do
    String.contains?(data, patterns) ->
      <<@double_quote::utf8>> <>
        (data
         |> String.replace(
           <<@double_quote::utf8>>,
           <<@double_quote::utf8>> <> <<@double_quote::utf8>>
         )) <> <<@double_quote::utf8>>

    true ->
      data
  end
end
end
14 changes: 8 additions & 6 deletions lib/csv/encoding/encoder.ex
Expand Up @@ -14,16 +14,18 @@ defmodule CSV.Encoding.Encoder do

These are the options:

* `:separator` – The separator token to use, defaults to `?,`.
Must be a codepoint (syntax: ? + your separator token).
* `:delimiter` – The delimiter token to use, defaults to `\"\\r\\n\"`.
* `:headers` – When set to `true`, uses the keys of the first map as
the first element in the stream. All subsequent elements are the values
of the maps. When set to a list, will use the given list as the first
element in the stream and order all subsequent elements using that list.
When set to `false` (default), will use the raw inputs as elements.
When set to anything but `false`, all elements in the input stream are
assumed to be maps.
* `:escape_formulas` – Escape formulas to prevent
[CSV Injection](https://owasp.org/www-community/attacks/CSV_Injection).

## Examples

Expand Down Expand Up @@ -89,13 +91,13 @@ defmodule CSV.Encoding.Encoder do

encoded =
row
|> Enum.map(&encode_cell(&1, separator, delimiter))
|> Enum.map(&encode_cell(&1, options))
|> Enum.join(<<separator::utf8>>)

encoded <> delimiter
end

# Delegates cell encoding to the CSV.Encode protocol, forwarding the full
# option list so protocol implementations see :escape_formulas as well.
defp encode_cell(cell, options) do
  CSV.Encode.encode(cell, options)
end
end
5 changes: 3 additions & 2 deletions lib/csv/exceptions.ex
Expand Up @@ -45,8 +45,9 @@ defmodule CSV.StrayQuoteError do
line = options |> Keyword.fetch!(:line)
field = options |> Keyword.fetch!(:field)

message = "Stray quote on line " <>
Integer.to_string(line) <> " near \"" <> field <> "\""
message =
"Stray quote on line " <>
Integer.to_string(line) <> " near \"" <> field <> "\""

%__MODULE__{
line: line,
Expand Down
1 change: 0 additions & 1 deletion test/csv_exceptions_test.exs
Expand Up @@ -22,5 +22,4 @@ defmodule CSVExceptionsTest do
CSV.decode!(stream) |> Stream.run()
end
end

end
13 changes: 7 additions & 6 deletions test/csv_test.exs
Expand Up @@ -29,17 +29,18 @@ defmodule CSVTest do
# Strict decoding must not raise when row-length validation is turned off,
# even though the rows have differing lengths.
test "decodes in strict mode not raising validation errors on variable row length if row length validation is disabled" do
  stream = ~w(a,be a c,d) |> to_stream

  CSV.decode!(stream, validate_row_length: false) |> Stream.run()
end

# Normal-mode decoding yields all rows as :ok when row-length validation is
# disabled, despite the variable row lengths.
test "decodes in normal mode not not validating row length when row length validation is disabled" do
  stream = ~w(a,be a c,d) |> to_stream
  result = CSV.decode(stream, validate_row_length: false) |> Enum.to_list()

  assert result == [
           ok: ~w(a be),
           ok: ~w(a),
           ok: ~w(c d)
         ]
end

test "uses the :lines preprocessor by default" do
Expand Down
17 changes: 15 additions & 2 deletions test/decoding/baseline_exceptions_test.exs
Expand Up @@ -84,8 +84,8 @@ defmodule DecodingTests.BaselineExceptionsTest do
]
end

# Round-trips rows through encode and decode with the same options so
# encode-side escaping and decode-side unescaping are exercised together.
def encode_decode_loop(l, opts \\ []) do
  l |> CSV.encode(opts) |> Decoder.decode(opts) |> Enum.to_list()
end

test "does not get corrupted after an error" do
Expand All @@ -101,4 +101,17 @@ defmodule DecodingTests.BaselineExceptionsTest do
assert result_b == [ok: ~w(b)]
assert result_c == [ok: ~w(b)]
end

# Verifies the round-trip: formula escaping added on encode (a leading "'"
# before "=", "-", "+", "@") is stripped again on decode when
# :escape_formulas is enabled, so the original field values come back.
test "removes escaping for formula" do
input = [["=1+1", ~S(=1+2";=1+2), ~S(=1+2'" ;,=1+2)], ["-10+7"], ["+10+7"], ["@A1:A10"]]

assert [
ok: [
"=1+1=1+2\";=1+2=1+2'\" ;,=1+2",
"-10+7",
"+10+7",
"@A1:A10"
]
] = encode_decode_loop([input], escape_formulas: true)
end
end