This repository was archived by the owner on Mar 24, 2021, and is now read-only.

Commit: Parse EVL volumetrics
phss committed Aug 27, 2013
1 parent b39d453 commit e60a8e3
Showing 8 changed files with 152 additions and 148 deletions.
13 changes: 6 additions & 7 deletions backdrop/contrib/evl_upload_filters.py
@@ -1,5 +1,6 @@
 from datetime import datetime
 import itertools
+from backdrop.contrib.evl_volumetrics import remove_summary_columns, extract_transaction_rows, create_transaction_data
 from backdrop.core.timeutils import parse_time_as_utc, as_utc


@@ -137,14 +138,12 @@ def date_or_none(string):


 def volumetrics(sheets):
-    rows = list(list(sheets)[2])
+    sheet = list(list(sheets)[2])

     yield ["_timestamp", "service", "channel", "transaction", "volume"]

-    first_date = as_utc(datetime.strptime(rows[3][3], "%b %Y"))
-    service = "tax-disc"
-    channel = "assisted-digital"
-    transaction = rows[4][2]
-    volume = rows[4][3]
+    header, rows = extract_transaction_rows(remove_summary_columns(sheet))

-    yield [first_date.isoformat(), service, channel, transaction, volume]
+    for row in rows:
+        for data in create_transaction_data(header, row):
+            yield data
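
For illustration, the refactored filter can be exercised end to end like this (a sketch, not part of the commit; `volumetrics_sheet` is a placeholder for the parsed volumetrics worksheet, which the filter expects to find as the third sheet in the workbook):

    from backdrop.contrib.evl_upload_filters import volumetrics

    # Only the third sheet is read, so the first two can be empty.
    records = list(volumetrics([[], [], volumetrics_sheet]))

    records[0]   # ["_timestamp", "service", "channel", "transaction", "volume"]
    records[1:]  # one normalised record per (transaction row, date column) pair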
85 changes: 85 additions & 0 deletions backdrop/contrib/evl_volumetrics.py
@@ -0,0 +1,85 @@
+from datetime import datetime
+import re
+from backdrop.core.timeutils import as_utc
+
+def extract_column_header(sheet):
+    HEADER_INDEX = 3
+    return sheet[HEADER_INDEX]
+
+
+def extract_transaction_rows(sheet):
+    TRANSACTION_INDEXES = {
+        4: ["Assisted Digital", "Relicensing"],
+        5: ["Assisted Digital", "Relicensing"],
+        7: ["Assisted Digital", "SORN"],
+        10: ["Fully Digital", "Relicensing"],
+        11: ["Fully Digital", "Relicensing"],
+        12: ["Fully Digital", "Relicensing"],
+        13: ["Fully Digital", "Relicensing"],
+        15: ["Fully Digital", "SORN"],
+        16: ["Fully Digital", "SORN"],
+        17: ["Fully Digital", "SORN"],
+        18: ["Fully Digital", "SORN"],
+        21: ["Manual", "Relicensing"],
+        22: ["Manual", "Relicensing"],
+        23: ["Manual", "Relicensing"],
+        25: ["Manual", "SORN"],
+        26: ["Manual", "SORN"],
+        27: ["Manual", "SORN"],
+        28: ["Manual", "SORN"],
+        29: ["Manual", "SORN"],
+        30: ["Manual", "SORN"],
+        31: ["Manual", "SORN"],
+    }
+
+    def transaction_row(index):
+        channel_service = TRANSACTION_INDEXES[index]
+        return channel_service + sheet[index][2:]
+
+    return extract_column_header(sheet), map(transaction_row, TRANSACTION_INDEXES.keys())
+
+
+def create_transaction_data(header, row):
+    CHANNEL_INDEX = 0
+    SERVICE_INDEX = 1
+    TRANSACTION_NAME_INDEX = 2
+    DATES_START_INDEX = 3
+    SERVICES = {
+        "Relicensing": "tax-disc",
+        "SORN": "sorn"
+    }
+
+    volumes = zip(header, row)[DATES_START_INDEX:]
+
+    def transaction_data(date_volume):
+        date, volume = date_volume
+        date = as_utc(datetime.strptime(date, "%b %Y"))
+        service = SERVICES[row[SERVICE_INDEX]]
+        channel = row[CHANNEL_INDEX].lower().replace(" ", "-")
+        transaction = row[TRANSACTION_NAME_INDEX]
+
+        return [date.isoformat(), service, channel, transaction, volume]
+
+    return map(transaction_data, volumes)
+
+
+def remove_summary_columns(sheet):
+    DATES_START_INDEX = 3
+    DATE_REGEXP = re.compile("[A-Z][a-z]{2}\s\d{4}")
+
+    header = extract_column_header(sheet)
+
+    def add_date_index(mem, i):
+        if bool(DATE_REGEXP.match(header[i])):
+            mem.append(i)
+            return mem
+        else:
+            return mem
+
+    date_indexes = reduce(add_date_index, range(DATES_START_INDEX, len(header)), [])
+
+    def remove_columns_from_row(row):
+        return row[:DATES_START_INDEX] + [row[i] for i in date_indexes]
+
+
+    return map(remove_columns_from_row, sheet)
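
Taken together: remove_summary_columns keeps the first three columns plus every column whose header matches a "Mon YYYY" date, dropping summary columns such as "2012/13 Total"; extract_transaction_rows picks out the fixed spreadsheet rows and tags each with its channel and service; create_transaction_data then fans one row out into one record per date column. A worked sketch of that last step with made-up values (Python 2, which the module assumes: zip returns a list and reduce is a builtin):

    from backdrop.contrib.evl_volumetrics import create_transaction_data

    header = ["Channel Descriptions", "", "Transaction", "Apr 2012", "May 2012"]
    row = ["Assisted Digital", "Relicensing",
           "V-V10 Licence Application Post Office", 1000, 1100]

    create_transaction_data(header, row)
    # [["2012-04-01T00:00:00+00:00", "tax-disc", "assisted-digital",
    #   "V-V10 Licence Application Post Office", 1000],
    #  ["2012-05-01T00:00:00+00:00", "tax-disc", "assisted-digital",
    #   "V-V10 Licence Application Post Office", 1100]]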
15 changes: 14 additions & 1 deletion backdrop/core/upload/parse_excel.py
@@ -4,6 +4,18 @@
 from backdrop.core.errors import ParseError
 from backdrop.core.timeutils import utc

+class ExcelError(object):
+    def __init__(self, description):
+        self.description = description
+
+    def __eq__(self, other):
+        return isinstance(other, self.__class__) and \
+            self.description == other.description
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+EXCEL_ERROR = ExcelError("error in cell")

 def parse_excel(incoming_data):
     book = xlrd.open_workbook(file_contents=incoming_data.read())
@@ -26,5 +38,6 @@ def _extract_cell_value(cell, book):
         time_tuple = xlrd.xldate_as_tuple(cell.value, book.datemode)
         return utc(datetime.datetime(*time_tuple)).isoformat()
     elif cell.ctype == xlrd.XL_CELL_ERROR:
-        raise ParseError("Error encountered in cell")
+        logging.warn("Encountered errors in cells when parsing excel file")
+        return EXCEL_ERROR
     return cell.value
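
The effect of this change: a broken cell no longer aborts the whole upload with a ParseError; the parser logs a warning and substitutes the EXCEL_ERROR sentinel for that cell. ExcelError defines __eq__/__ne__ so that expected rows containing the sentinel compare equal in tests. Roughly (a sketch):

    from backdrop.core.upload.parse_excel import ExcelError, EXCEL_ERROR

    EXCEL_ERROR == ExcelError("error in cell")  # True: equality compares descriptions
    EXCEL_ERROR == ExcelError("#DIV/0!")        # False
    EXCEL_ERROR != ExcelError("#DIV/0!")        # True: __ne__ delegates to __eq__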
3 changes: 2 additions & 1 deletion backdrop/write/config/test.py
@@ -10,7 +10,8 @@
 ALLOW_TEST_SIGNIN = True
 BUCKET_AUTO_ID_KEYS = {
     "bucket_with_auto_id": ["key", "start_at", "end_at"],
-    "bucket_with_timestamp_auto_id": ["_timestamp", "key"]
+    "bucket_with_timestamp_auto_id": ["_timestamp", "key"],
+    "evl_volumetrics": ["_timestamp", "service", "transaction"],
 }
 BUCKET_UPLOAD_FORMAT = {
     "bucket_with_timestamp_auto_id": "excel",
8 changes: 1 addition & 7 deletions features/contrib/evl_upload.feature
@@ -78,16 +78,10 @@ Feature: EVL Upload
{"_timestamp": "2007-07-01T00:00:00+00:00", "_id": "2007-07-01", "satisfaction_tax_disc": 1.1662755514934828, "satisfaction_sorn": 1.3581011781786714}
"""

@wip
Scenario: Upload evl volumetrics
Scenario: Upload evl volumetrics
Given a file named "evl-volumetrics.xls" with fixture "contrib/evl-volumetrics.xls"
and I am logged in
when I go to "/evl_volumetrics/upload"
and I enter "evl-volumetrics.xls" into the file upload field
and I click "Upload"
then the platform should have "336" items stored in "evl_volumetrics"
and the "evl_volumetrics" bucket should have items:
"""
{"_timestamp": "2012-08-01T00:00:00+00:00", "_id": "2013-08-01", "channel": "manual", "service": "tax-disc", "transaction": "V-V890 Another transaction", "volume" : 123456},
{"_timestamp": "2013-05-01T00:00:00+00:00", "_id": "2007-07-01", "channel": "assisted-digital", "service": "sorn", "transaction": "V-V11 Some transaction", "volume": 987654}
"""
73 changes: 36 additions & 37 deletions tests/contrib/test_evl_upload_filters.py
@@ -110,44 +110,43 @@ def test_converts_customer_satisfaction_raw_data_to_normalised_data(self):
["2013-06-01T00:00:00+00:00", "2013-06-01", 0.3, 0.4],
["2013-07-01T00:00:00+00:00", "2013-07-01", 0.5, 0.6]]))

-    @nottest
     def test_volumetrics_raw_data_to_normalised_data(self):
         raw_data = [
-            ["Ignore"],
-            ["Ignore"],
-            ["Ignore"],
-            ["Channel Descriptions", "", "Transaction", "Apr 2012"],
-            ["Assisted Digital", "Relicensing", "V-V10 Licence Application Post Office", 1000],
-            ["", "", "V-V11 Licence Renewal Reminder Post Office", 1001],
-            ["Ignore"],
-            ["", "SORN", "V-V11 Some transaction", 1003],
-            ["Ignore"],
-            ["Ignore"],
-            ["Fully Digital", "Relicensing", "V-V10 Licence Application EVL", 1006],
-            ["", "", "V-V11 Fleets", 1007],
-            ["", "", "V-V11 Licence Renewal Reminder EVL", 1008],
-            ["", "", "V-V85 and V85/1 HGV Licence Application EVL", 1009],
-            ["Ignore"],
-            ["", "SORN", "V-V11 SORN EVL", 1011],
-            ["", "", "V-V85/1 HGV SORN Declaration EVL", 1012],
-            ["", "", "V-V890 SORN Declaration EVL", 1013],
-            ["", "", "V-V890 SORN Declaration Fleets", 1014],
-            ["Ignore"],
-            ["Ignore"],
-            ["Manual", "Relicensing", "V-V890 Another transaction", 1017],
-            ["", "", "V-V11 Licence Renewal Reminder Local Office", 1018],
-            ["", "", "V-V85 and V85/1 HGV Licence Application", 1019],
-            ["Ignore"],
-            ["", "SORN", "V-V11 SORN Local Office", 1021],
-            ["", "", "V-V85/1 HGV SORN Declaration", 1022],
-            ["", "", "V-V890 SORN Declaration", 1023],
-            ["", "", "V-V890 SORN Declaration Key from Image", 1024],
-            ["", "", "V-V890 SORN Declaration Refunds Input", 1025],
-            ["", "", "V-V890 SORN Declaration Vehicles Input", 1026],
-            ["", "", "V-V890 SORN Declaration Vehicles Triage", 1027],
-            ["Ignore"],
-            ["Ignore"],
-            ["Ignore"]
+            ["_", "_", "_", "_", "_", "_"],
+            ["_", "_", "_", "_", "_", "_"],
+            ["_", "_", "_", "_", "_", "_"],
+            ["Channel Descriptions", "", "Transaction", "Apr 2012", "2012/13 Total", "Mar 2013"],
+            ["Assisted Digital", "Relicensing", "V-V10 Licence Application Post Office", 1000, 2000, 3000],
+            ["", "", "V-V11 Licence Renewal Reminder Post Office", 1001, 2001, 3001],
+            ["_", "_", "_", "_", "_", "_"],
+            ["", "SORN", "V-V11 Some transaction", 1003, 2003, 3003],
+            ["_", "_", "_", "_", "_", "_"],
+            ["_", "_", "_", "_", "_", "_"],
+            ["Fully Digital", "Relicensing", "V-V10 Licence Application EVL", 1006, 2006, 3006],
+            ["", "", "V-V11 Fleets", 1007, 2007, 3007],
+            ["", "", "V-V11 Licence Renewal Reminder EVL", 1008, 2008, 3008],
+            ["", "", "V-V85 and V85/1 HGV Licence Application EVL", 1009, 2008, 3008],
+            ["_", "_", "_", "_", "_", "_"],
+            ["", "SORN", "V-V11 SORN EVL", 1011, 2011, 3011],
+            ["", "", "V-V85/1 HGV SORN Declaration EVL", 1012, 2012, 3012],
+            ["", "", "V-V890 SORN Declaration EVL", 1013, 2013, 3013],
+            ["", "", "V-V890 SORN Declaration Fleets", 1014, 2014, 3014],
+            ["_", "_", "_", "_", "_", "_"],
+            ["_", "_", "_", "_", "_", "_"],
+            ["Manual", "Relicensing", "V-V890 Another transaction", 1017, 2017, 3017],
+            ["", "", "V-V11 Licence Renewal Reminder Local Office", 1018, 2018, 3018],
+            ["", "", "V-V85 and V85/1 HGV Licence Application", 1019, 2019, 3019],
+            ["_", "_", "_", "_", "_", "_"],
+            ["", "SORN", "V-V11 SORN Local Office", 1021, 2021, 3021],
+            ["", "", "V-V85/1 HGV SORN Declaration", 1022, 2022, 3022],
+            ["", "", "V-V890 SORN Declaration", 1023, 2023, 3023],
+            ["", "", "V-V890 SORN Declaration Key from Image", 1024, 2024, 3024],
+            ["", "", "V-V890 SORN Declaration Refunds Input", 1025, 2025, 3025],
+            ["", "", "V-V890 SORN Declaration Vehicles Input", 1026, 2026, 3026],
+            ["", "", "V-V890 SORN Declaration Vehicles Triage", 1027, 2027, 3027],
+            ["_", "_", "_", "_", "_", "_"],
+            ["_", "_", "_", "_", "_", "_"],
+            ["_", "_", "_", "_", "_", "_"],
         ]

         data = volumetrics([[], [], raw_data])
@@ -156,4 +155,4 @@ def test_volumetrics_raw_data_to_normalised_data(self):

         assert_that(header, is_(["_timestamp", "service", "channel", "transaction", "volume"]))
         assert_that(rows[0], is_(["2012-04-01T00:00:00+00:00", "tax-disc", "assisted-digital", "V-V10 Licence Application Post Office", 1000]))
-        assert_that(rows[-1], is_(["2012-04-01T00:00:00+00:00", "sorn", "manual", "V-V890 SORN Declaration Vehicles Triage", 1027]))
+        assert_that(rows[-1], is_(["2013-03-01T00:00:00+00:00", "sorn", "manual", "V-V890 SORN Declaration Vehicles Triage", 3027]))
90 changes: 2 additions & 88 deletions tests/contrib/test_evl_volumetrics.py
@@ -1,94 +1,8 @@
-from itertools import ifilter
-from pprint import pprint
-import re
 import unittest
-from hamcrest import *
-from datetime import datetime
-from backdrop.core.timeutils import as_utc


-def extract_column_header(sheet):
-    HEADER_INDEX = 3
-    return sheet[HEADER_INDEX]
-
-
-def extract_transaction_rows(sheet):
-    TRANSACTION_INDEXES = {
-        4: ["Assisted Digital", "Relicensing"],
-        5: ["Assisted Digital", "Relicensing"],
-        7: ["Assisted Digital", "SORN"],
-        10: ["Fully Digital", "Relicensing"],
-        11: ["Fully Digital", "Relicensing"],
-        12: ["Fully Digital", "Relicensing"],
-        13: ["Fully Digital", "Relicensing"],
-        15: ["Fully Digital", "SORN"],
-        16: ["Fully Digital", "SORN"],
-        17: ["Fully Digital", "SORN"],
-        18: ["Fully Digital", "SORN"],
-        21: ["Manual", "Relicensing"],
-        22: ["Manual", "Relicensing"],
-        23: ["Manual", "Relicensing"],
-        25: ["Manual", "SORN"],
-        26: ["Manual", "SORN"],
-        27: ["Manual", "SORN"],
-        28: ["Manual", "SORN"],
-        29: ["Manual", "SORN"],
-        30: ["Manual", "SORN"],
-        31: ["Manual", "SORN"],
-    }
-
-    def transaction_row(index):
-        channel_service = TRANSACTION_INDEXES[index]
-        return channel_service + sheet[index][2:4]
-
-    return extract_column_header(sheet), map(transaction_row, TRANSACTION_INDEXES.keys())
-
-
-def create_transaction_data(header, row):
-    CHANNEL_INDEX = 0
-    SERVICE_INDEX = 1
-    TRANSACTION_NAME_INDEX = 2
-    DATES_START_INDEX = 3
-    SERVICES = {
-        "Relicensing": "tax-disc",
-        "SORN": "sorn"
-    }
-
-    volumes = zip(header, row)[DATES_START_INDEX:]
-
-    def transaction_data(date_volume):
-        date, volume = date_volume
-        date = as_utc(datetime.strptime(date, "%b %Y"))
-        service = SERVICES[row[SERVICE_INDEX]]
-        channel = row[CHANNEL_INDEX].lower().replace(" ", "-")
-        transaction = row[TRANSACTION_NAME_INDEX]
-
-        return [date.isoformat(), service, channel, transaction, volume]
-
-    return map(transaction_data, volumes)
-
-
-def remove_summary_columns(sheet):
-    DATES_START_INDEX = 3
-    DATE_REGEXP = re.compile("[A-Z][a-z]{2}\s\d{4}")
-
-    header = extract_column_header(sheet)
-
-    def add_date_index(mem, i):
-        if bool(DATE_REGEXP.match(header[i])):
-            mem.append(i)
-            return mem
-        else:
-            return mem
-
-    date_indexes = reduce(add_date_index, range(DATES_START_INDEX, len(header)), [])
-
-    def remove_columns_from_row(row):
-        return row[:DATES_START_INDEX] + [row[i] for i in date_indexes]
-
-
-    return map(remove_columns_from_row, sheet)
+from hamcrest import *

+from backdrop.contrib.evl_volumetrics import extract_transaction_rows, create_transaction_data, remove_summary_columns


 class TestEVLVolumetrics(unittest.TestCase):
13 changes: 6 additions & 7 deletions tests/core/upload/test_parse_excel.py
@@ -2,7 +2,7 @@
 from hamcrest import assert_that, only_contains, contains
 from backdrop.core.errors import ParseError

-from backdrop.core.upload.parse_excel import parse_excel
+from backdrop.core.upload.parse_excel import parse_excel, ExcelError, EXCEL_ERROR
 from tests.support.test_helpers import fixture_path, d_tz


@@ -33,12 +33,11 @@ def test_parse_xls_file(self):
         )))

     def test_parse_xlsx_with_error(self):
-        def traverse_file(filename):
-            for sheet in self._parse_excel(filename):
-                for _ in sheet:
-                    pass
-
-        self.assertRaises(ParseError, traverse_file, "error.xlsx")
+        assert_that(self._parse_excel("error.xlsx"), contains(contains(
+            ["date", "name", "number", "error"],
+            ["2013-12-03T13:30:00+00:00", "test1", 12, EXCEL_ERROR],
+            ["2013-12-04T00:00:00+00:00", "test2", 34, EXCEL_ERROR],
+        )))

     def test_parse_xlsx_with_multiple_sheets(self):
         assert_that(self._parse_excel("multiple_sheets.xlsx"), contains(
