In [25]:
from collections import OrderedDict
import datetime as dt
from pathlib import Path

import pandas as pd
import numpy as np
from highcharts import Highchart



In [26]:
# A data frame of *transactions* is one that contains at least these columns:
KEY_COLUMNS = {'date', 'credit', 'debit', 'balance'}

def find_key_columns(raw_transactions):
    """
    Given a data frame, try to find in it the columns that correspond to those in ``KEY_COLUMNS``.
    A column corresponds to a key column if the key column name occurs, ignoring case,
    as a substring of the column's name.
    If several columns match the same key column, then the last one in column order wins.
    Column names that are not strings are compared through their string representation.
    Build a dictionary of this correspondence of the form
    key column name -> column name in given data frame.
    Return the resulting dictionary, which might be incomplete.
    """
    col_dict = {}
    # Only the column labels are inspected, so the frame itself is not copied
    for c in raw_transactions.columns:
        # Labels need not be strings (e.g. integer labels), so convert before lowercasing
        name = str(c).lower()
        for key in KEY_COLUMNS:
            if key in name:
                col_dict[key] = c
    return col_dict

def read_transactions(path, date_format='%d-%m-%Y'):
    """
    Read the CSV file of bank transactions at the given path and return it as 
    a data frame of transactions, that is, a data frame containing at least 
    the columns in ``KEY_COLUMNS``.
    
    The columns matched by :func:`find_key_columns` are renamed to their key column names,
    and every other column is kept, with its name stripped, lowercased, and with 
    spaces replaced by underscores.
    The ``date`` column is parsed according to the format ``date_format``.
    
    Raise a ``ValueError`` if some key column cannot be found in the file.
    """
    raw = pd.read_csv(path)
    found = find_key_columns(raw)
    if set(found) != KEY_COLUMNS:
        message = 'Could not find columns resembling {!s} in file'.format(KEY_COLUMNS)
        raise ValueError(message)
    
    # Tidy names are computed from the original headers, before any renaming
    tidy_names = {}
    for name in raw.columns:
        tidy_names[name] = name.strip().lower().replace(' ', '_')
    
    # Original header -> key column name (the inverse of the lookup above)
    key_names = {original: key for key, original in found.items()}
    
    # Key columns are renamed first; the tidy-up then only affects the remaining columns
    result = raw.rename(columns=key_names)
    result = result.rename(columns=tidy_names)
    
    result['date'] = pd.to_datetime(result['date'], format=date_format)
    return result

def make_sample_transactions(start_date, end_date, freq='3D'):
    """
    Generate a data frame of transactions from the start date to the end date
    (date strings) with credits and debits at the given frequency (Pandas frequency string).
    
    Each credit and each debit is a random integer in the range [0, 100) drawn from 
    NumPy's global random generator, so call ``np.random.seed`` beforehand for 
    reproducible output.
    The balance is the running total of credit minus debit.
    Return the resulting data frame, which has the columns ``date``, ``credit``, 
    ``debit``, and ``balance``.
    """
    # Use the caller's frequency rather than a fixed one
    r = pd.date_range(start_date, end_date, freq=freq)
    f = pd.DataFrame(r, columns=['date'])
    n = len(r)
    f['credit'] = np.random.randint(0, 100, n)
    f['debit'] = np.random.randint(0, 100, n)
    f['balance'] = (f['credit'] - f['debit']).cumsum()
    return f

def summarize(transactions, freq='MS'):
    """
    Given a data frame of transactions summarize it at the given frequency (Pandas frequency string),
    summing credits, summing debits, and taking the last balance for each period.
    If ``freq is None``, then summarize all the transactions into a single row 
    dated with the earliest transaction date.
    
    A period that contains no transactions gets zero credit, zero debit, and 
    the balance carried forward from the most recent period that has one.
    "Last" means chronologically last; transactions sharing a date keep their given order.
    
    Return the resulting data frame, which has the columns ``date``, ``credit``, 
    ``debit``, and ``balance``, and which is empty if the given data frame is empty.
    """
    cols = ['date', 'credit', 'debit', 'balance'] 
    # Stable sort by date, so that the last row really is the latest transaction,
    # whatever order the rows arrived in. Missing dates go first so that they
    # never supply the final balance.
    f = transactions[cols].sort_values('date', kind='mergesort', 
      na_position='first')
    
    if f.empty:
        # Nothing to aggregate; the computations below cannot handle an empty frame
        return f.reset_index(drop=True)
    
    if freq is None:
        g = pd.DataFrame({
          'date': f['date'].min(),
          'credit': f['credit'].sum(),
          'debit': f['debit'].sum(),
          'balance': f['balance'].iat[-1],
          }, index=[0])
    else:
        g = f.set_index('date').resample(freq).agg({
          'credit': 'sum',
          'debit': 'sum',
          'balance': 'last', 
          })
        # A period without transactions has no balance of its own: 
        # the account still holds the previous period's closing balance, not zero
        g['balance'] = g['balance'].ffill()
        g = g.fillna(0).reset_index()
    
    return g[cols].copy()

def plot(summary, currency='NZD', width=700, height=None):
    """
    Given a transaction summary of the form output by :func:`summarize`, plot it using Python HighCharts.
    Include the given currency units (string; e.g. 'NZD') in the plot labels;
    if ``currency`` is ``None`` or empty, then omit the units.
    Set the chart width and height to the given values, skipping any that is ``None``.
    Plot credits and debits as columns and the balance as a line.
    Return the resulting ``Highchart`` object.
    """
    # Order the rows by date so that the series values line up with the date categories
    f = summary.sort_values('date', kind='mergesort')
    chart = Highchart()

    # HighCharts kludge: use categorical x-axis to display dates properly
    dates = f['date'].map(lambda x: x.strftime('%Y-%m-%d')).unique().tolist()

    if currency:
        units = str(currency)
        y_text = 'Money ({!s})'.format(currency)
    else:
        # No units given: leave them out of the axis title and the tooltip
        units = ''
        y_text = 'Money'
    
    options = {
        'lang': {
            'thousandsSep': ','
        },
        'chart' : {},
        'title': {
            'text': 'Account Summary'
        },
        'xAxis': {
            'type': 'category',
            'categories': dates,
        },
        'yAxis': {
            'title': {
                'text': y_text,
            }
        },
        'tooltip': {
            'headerFormat': '<b>{point.key}</b> ' +
              '(period start)<table>',
            'pointFormat': '''
              <tr>
              <td style="padding-right:1em">{series.name}</td>
              <td style="text-align:right">{point.y:,.0f} ''' + units +\
              '''
              </td>
              </tr>
              ''',    
            'useHTML': True,
            'shared': True,
        },
        'plotOptions': {
            'column': {
                'pointPadding': 0,
                'borderWidth': 1,
                'borderColor': '#333333',
            }
        },
        'credits': {
            'enabled': False,
        },
    }

    if width is not None:
        options['chart']['width'] = width

    if height is not None:
        options['chart']['height'] = height

    chart.set_dict_options(options)
    for (col, opts) in [
      ('credit', {'series_type': 'column', 'color': '#8da0cb'}),
      ('debit', {'series_type': 'column', 'color': '#fc8d62'}),
      ('balance', {'series_type': 'line', 'color': '#555'}),
      ]:
        chart.add_data_set(f[col].values.tolist(), name=col, **opts)

    return chart

In [27]:
# Load real data

DATA_DIR = Path('~')/'tumeke_cycle_space'/'finances'
paths = [
    DATA_DIR/name
    for name in (
        'transactions_20140701--20150630.csv',
        'transactions_20150701--20160630.csv',
        'transactions_20160701--20161209.csv',
    )
]

# One frame per CSV file, stacked in the order listed above
frames = list(map(read_transactions, paths))
transactions = pd.concat(frames)
transactions.tail().T


Unnamed: 0,42,43,44,45,46
account_number,38-9008-0749578-00,38-9008-0749578-00,38-9008-0749578-00,38-9008-0749578-00,38-9008-0749578-00
date,2016-12-02 00:00:00,2016-12-05 00:00:00,2016-12-05 00:00:00,2016-12-05 00:00:00,2016-12-09 00:00:00
memo/description,Bill Payment raffle LUNDBERG KENT ;Ref: raffle...,Direct Credit e party T aint no partylike a bi...,Direct Credit fundraiser timduhamel MR T L DUH...,CASH DEPOSIT ;KOHA,PAY A J MAZONOWICZ ;
source_code_(payment_type),BP,DC,DC,,
tp_ref,,e party T,fundraiser,,
tp_part,raffle,aint no part,timduhamel,,
tp_code,,y like a bik,,,
op_ref,,,,,
op_part,,,,,trade me
op_code,,,,,


In [29]:
# Or load mock data

# Seed NumPy's global random generator, which make_sample_transactions draws from,
# so that the mock data and everything derived from it is the same on every run
np.random.seed(42)
transactions = make_sample_transactions('2015-01-01', '2017-01-01')
transactions.tail()

Unnamed: 0,date,credit,debit,balance
239,2016-12-18,63,35,219
240,2016-12-21,94,76,237
241,2016-12-24,39,17,259
242,2016-12-27,70,77,252
243,2016-12-30,30,51,231


In [30]:
# Summarize the transactions per month ('MS' labels each period by its start date),
# print the summary table, and then chart it; the chart returned by the last line
# is what the notebook displays
f = summarize(transactions, freq='MS')
print(f)
plot(f)


         date  credit  debit  balance
0  2015-01-01     665    627       38
1  2015-02-01     499    583      -46
2  2015-03-01     546    507       -7
3  2015-04-01     418    551     -140
4  2015-05-01     562    504      -82
5  2015-06-01     418    489     -153
6  2015-07-01     472    632     -313
7  2015-08-01     634    471     -150
8  2015-09-01     458    619     -311
9  2015-10-01     519    494     -286
10 2015-11-01     552    414     -148
11 2015-12-01     471    375      -52
12 2016-01-01     293    528     -287
13 2016-02-01     738    694     -243
14 2016-03-01     541    447     -149
15 2016-04-01     523    565     -191
16 2016-05-01     584    560     -167
17 2016-06-01     553    527     -141
18 2016-07-01     502    399      -38
19 2016-08-01     590    451      101
20 2016-09-01     441    402      140
21 2016-10-01     542    502      180
22 2016-11-01     494    561      113
23 2016-12-01     652    534      231
