From 54d45f0552a1ce04c7bfb8ffc8da9d2c601d9bd1 Mon Sep 17 00:00:00 2001
From: Jim Kalafut <jim@kalafut.net>
Date: Sun, 16 Oct 2016 15:08:11 -0700
Subject: [PATCH] Add --incremental and --skip-dupes options

Closes #92
---
 README.md      | 11 +++++++++
 icsv2ledger.py | 65 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 0085788..696cbef 100644
--- a/README.md
+++ b/README.md
@@ -79,6 +79,7 @@ Options can either be used from command line or in configuration file.
     --desc STR            CSV column number matching description
     --effective-date INT  CSV column number matching effective date
     --encoding STR        text encoding of CSV input file
+    --incremental         append output as transactions are processed
     --ledger-date-format STR
                           date format for ledger output file
     --ledger-decimal-comma
@@ -89,7 +90,9 @@ Options can either be used from command line or in configuration file.
     --accounts-file FILE  file which holds a list of allowed accounts
     --quiet, -q           do not prompt if account can be deduced
     --reverse             reverse the order of entries in the CSV file
+    --skip-dupes          skip transactions that have already been imported
     --skip-lines INT      number of lines to skip from CSV file
+    --skip-older-than     skip entries more than X days old
     --tags, -t            prompt for transaction tags
     --template-file FILE  file which holds the template
     -h, --help            show this help message and exit
@@ -219,6 +222,10 @@ is the text encoding of the CSV input file. Default is `utf-8`. The encoding
 should be specified if the CSV file contains non-ASCII characters (typically in
 the transaction description) in an encoding other than UTF-8.
 
+**`--incremental`**
+
+appends output as transactions are processed. The default flow is to process all CSV input and then output the result. When `--incremental` is specified, output is written after every transaction. This lets you interrupt a run (Ctrl-C) and restart it later, working through a CSV file progressively (`--skip-dupes` is a useful companion option). This option cannot be used with `--reverse`.
+
 **`--ledger-date-format STR`**
 
 describes the date format to be used when creating ledger entries. If
@@ -283,6 +290,10 @@ is `False`.
 
 will print ledger entries in reverse of their order in the CSV file.
 
+**`--skip-dupes`**
+
+will skip a transaction if its exact CSV line already appears as a `; CSV: ...` comment in the current ledger file (which means your output template needs to include this comment). This can help if you download statements without using a precise date range. A useful pattern is to include CSV comments for both "sides" of a transaction if you download from multiple sources that resolve to a single transaction (e.g. paying a credit card from checking).
+
 **`--skip-lines INT`**
 
 is the number of lines to skip from the beginning of the CSV file.
diff --git a/icsv2ledger.py b/icsv2ledger.py
index efc9d2c..1e9e689 100755
--- a/icsv2ledger.py
+++ b/icsv2ledger.py
@@ -100,6 +100,8 @@ def get_locale_currency_symbol():
     'quiet': False,
     'reverse': False,
     'skip_lines': str(1),
+    'skip_dupes': False,
+    'incremental': False,
     'tags': False,
     'delimiter': ',',
     'csv_decimal_comma': False,
@@ -233,7 +235,7 @@ def parse_args_and_config_file():
     parser.add_argument(
         'outfile',
         nargs='?',
-        type=FileType('w', encoding='utf-8'),
+        type=FileType('a', encoding='utf-8'),
         default=sys.stdout,
         help=('output filename or stdout in Ledger syntax'
               ' (default: {0})'.format('stdout')))
@@ -265,6 +267,16 @@ def parse_args_and_config_file():
         type=int,
         help=('number of lines to skip from CSV file'
               ' (default: {0})'.format(DEFAULTS.skip_lines)))
+    parser.add_argument(
+        '--skip-dupes',
+        action='store_true',
+        help=('skip transactions that have already been imported'
+              ' (default: {0})'.format(DEFAULTS.skip_dupes)))
+    parser.add_argument(
+        '--incremental',
+        action='store_true',
+        help=('append output as transactions are processed'
+              ' (default: {0})'.format(DEFAULTS.incremental)))
     parser.add_argument(
         '--reverse',
         action='store_true',
@@ -388,6 +400,10 @@ def parse_args_and_config_file():
               file=sys.stderr)
         sys.exit(1)
 
+    if args.incremental and args.reverse:
+        print('reverse cannot be used in incremental mode')
+        sys.exit(1)
+
     if args.encoding != args.infile.encoding:
         args.infile = io.TextIOWrapper(args.infile.detach(),
                                        encoding=args.encoding)
@@ -486,8 +502,8 @@ def journal_entry(self, transaction_index, payee, account, tags):
         uuid_regex = re.compile(r"UUID:", re.IGNORECASE)
         uuid = [v for v in tags if uuid_regex.match(v)]
         if uuid:
-          uuid = uuid[0]
-          tags.remove(uuid)
+            uuid = uuid[0]
+            tags.remove(uuid)
         format_data = {
             'date': self.date,
             'effective_date': self.effective_date,
@@ -509,7 +525,12 @@ def journal_entry(self, transaction_index, payee, account, tags):
             'md5sum': self.md5sum,
             'csv': self.raw_csv}
         format_data.update(self.addons)
-        return template.format(**format_data)
+
+        # render the template, then strip trailing whitespace from each line
+        output_lines = template.format(**format_data).split('\n')
+        output = '\n'.join([x.rstrip() for x in output_lines])
+
+        return output
 
 def get_field_at_index(fields, index, csv_decimal_comma, ledger_decimal_comma):
     """
@@ -551,6 +572,16 @@ def get_field_at_index(fields, index, csv_decimal_comma, ledger_decimal_comma):
     return value
 
 
+def csv_from_ledger(ledger_file):
+    pattern = re.compile(r"^\s*[;#]\s*CSV:\s*(.*?)\s*$")
+    csv_comments = set()
+    with open(ledger_file) as f:
+        for line in f:
+            m = pattern.match(line)
+            if m:
+                csv_comments.add(m.group(1))
+    return csv_comments
+
 
 def payees_from_ledger(ledger_file):
     return from_ledger(ledger_file, 'payees')
@@ -710,6 +741,7 @@ def main():
     if options.ledger_file:
         possible_accounts = accounts_from_ledger(options.ledger_file)
         possible_payees = payees_from_ledger(options.ledger_file)
+        csv_comments = csv_from_ledger(options.ledger_file)
 
     # Read mappings
     mappings = []
@@ -781,11 +813,15 @@ def process_input_output(in_file, out_file):
         Process them.
         Write Ledger lines either to filename or stdout.
         """
+        if not options.incremental:
+            out_file.truncate(0)
+
         csv_lines = in_file.readlines()
         if in_file.name == '<stdin>':
             reset_stdin()
-        ledger_lines = process_csv_lines(csv_lines)
-        print(*ledger_lines, sep='\n', file=out_file)
+        for line in  process_csv_lines(csv_lines):
+            print(line, sep='\n', file=out_file)
+            out_file.flush()
 
     def process_csv_lines(csv_lines):
         dialect = None
@@ -803,12 +839,23 @@ def process_csv_lines(csv_lines):
             if len(row) == 0:
                 continue
 
+            # Skip any lines already in the ledger file
+            if options.skip_dupes and csv_lines[options.skip_lines + i].strip() in csv_comments:
+                continue
+
             entry = Entry(row, csv_lines[options.skip_lines + i],
                           options)
             if (options.skip_older_than < 0) or (entry.days_old <= options.skip_older_than):
-                payee, account, tags = get_payee_and_account(entry)
-                ledger_lines.append(
-                    entry.journal_entry(i + 1, payee, account, tags))
+                try:
+                    payee, account, tags = get_payee_and_account(entry)
+                except KeyboardInterrupt:
+                    print()
+                    sys.exit(0)
+                line = entry.journal_entry(i + 1, payee, account, tags)
+                if options.incremental:
+                    yield line
+                else:
+                    ledger_lines.append(line)
 
         if options.reverse:
             ledger_lines.reverse()