Skip to content

Commit

Permalink
Merge pull request #108 from cfpb/data-automation2
Browse files Browse the repository at this point in the history
Automate the gathering of county limit data
  • Loading branch information
higs4281 committed Feb 2, 2017
2 parents 6ae41dc + d12c6e1 commit e3ca5d6
Show file tree
Hide file tree
Showing 13 changed files with 13,913 additions and 688 deletions.
1 change: 0 additions & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
[run]
source =
countylimits
mortgageinsurance
ratechecker

omit =
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.

## Unreleased
- Add a monitor to watch for changes in census county values
- Add a script and manage.py command to automate county data collection

## 0.9.92 - 2016-12-31
- 2017 update for county-level mortgage-limit data
Expand Down
179 changes: 179 additions & 0 deletions countylimits/data_collection/gather_county_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import os
import datetime
from collections import OrderedDict

import requests
try:
from csvkit import DictReader # for stand-alone use
from csvkit import writer as Writer # for stand-alone use
except ImportError: # pragma: no cover
try:
from paying_for_college.csvkit.csvkit import DictReader
from paying_for_college.csvkit.csvkit import Writer
except ImportError: # unicode errors ahoy
from csv import DictReader
from csv import writer as Writer


# Message prefix returned when any step of the collection pipeline fails.
ERROR_MSG = "Script failed to process all files."
# Repository root: three directory levels up from this module's file.
API_DIR = os.path.abspath(
    os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
)
# Output locations: DATA_DIR holds the final flat file; CSV_DIR the sources.
DATA_DIR = "{}/data".format(API_DIR)
CSV_DIR = "{}/base_data".format(DATA_DIR)
# HUD CHUMS source URLs; '{}' is filled with the four-digit data year.
CHUMS_FHA_URL = 'https://www.hud.gov/pub/chums/cy{}-forward-limits.txt'
CHUMS_GSE_URL = 'https://www.hud.gov/pub/chums/cy{}-gse-limits.txt'
# (field name, (start, end)) character offsets for slicing each fixed-width
# CHUMS record; see HUD's published file layouts (linked in get_chums_data).
CHUMS_SPACING = [
    ('msa-code', (0, 5)),
    ('metro-code', (5, 10)),
    ('metro-name', (10, 60)),
    ('program', (60, 65)),
    ('limit-type', (65, 66)),  # S or H, standard or high
    ('median-price', (66, 73)),
    ('limit-1-unit', (73, 80)),
    ('limit-2-units', (80, 87)),
    ('limit-3-units', (87, 94)),
    ('limit-4-units', (94, 101)),
    ('state', (101, 103)),
    ('county-fips', (103, 106)),
    ('state-name', (106, 132)),
    ('county-name', (132, 147)),
    ('county-transaction-date', (147, 155)),
    ('limit-transaction-date', (155, 163)),
    ('median-price-determining-limit', (163, 170)),
    ('year-for-median-determining-limit', (170, 175))
]
# OrderedDict so output CSV columns keep the CHUMS layout order.
CHUMS_MAP = OrderedDict(CHUMS_SPACING)
# Column headings for the final flat file consumed by load_county_limits.
FINAL_FIELDNAMES = [
    u'State',
    u'State FIPS',
    u'County FIPS',
    u'Complete FIPS',
    u'County Name',
    u'GSE limit',
    u'FHA limit',
    u'VA limit'
]


def load_FIPS():
    """Read the county FIPS reference CSV and return its rows as dicts."""
    source_path = '{}/county_FIPS.csv'.format(CSV_DIR)
    with open(source_path, 'r') as source_file:
        return list(DictReader(source_file))


def translate_data(data_list, data_map):
    """Slice fixed-width text lines into dicts keyed by data_map's fields.

    `data_map` maps each field name to a (start, end) character span;
    sliced values are whitespace-stripped. Returns one dict per line.
    """
    parsed = []
    for raw_line in data_list:
        record = {}
        for field, (start, end) in data_map.items():
            record[field] = raw_line[start:end].strip()
        parsed.append(record)
    return parsed


def download_datafile(url):
    """Fetch a CHUMS text file; return its body, or an error description.

    On a non-OK HTTP status the return value starts with 'Error:' so that
    callers (get_chums_data) can detect the failure by prefix.
    """
    response = requests.get(url)
    if not response.ok:
        return "Error:\n{} {}\n{}".format(
            response.status_code,
            response.reason,
            response.url)
    return response.text


def dump_to_csv(filepath, headings, data):
    """Write `data` (a list of dicts) to `filepath` as CSV.

    `headings` supplies both the header row and the per-row column order;
    it may be any iterable of keys (e.g. an OrderedDict's .keys()).
    """
    with open(filepath, 'w') as f:
        # Materialize once so one-shot iterators aren't consumed by the
        # header row and then silently empty for the data rows.
        fieldnames = list(headings)
        writer = Writer(f)
        writer.writerow(fieldnames)
        for row in data:
            writer.writerow(
                [row[key] for key in fieldnames]
            )


def assemble_final_data(fha_data, gse_data):
    """Merge translated FHA and GSE rows into the final flat dataset.

    Each output row carries the FINAL_FIELDNAMES keys. County names and
    state ANSI codes come from the reference CSV loaded by load_FIPS().
    The VA limit is set equal to the GSE limit.
    Raises KeyError if a CHUMS county is absent from the reference CSV or
    from the GSE file; get_chums_data catches and reports that upstream.
    """
    final_data = []
    county_data = load_FIPS()
    county_by_fips = {row['Complete FIPS']: row for row in county_data}
    # Build the state -> ANSI code map in a single pass, replacing the
    # original O(states * rows) scan; as before, the last reference row
    # seen for a state supplies its code.
    state_fips = {row['State']: row['State ANSI'] for row in county_data}
    for row in fha_data:
        # Skip header/trailer CHUMS lines that lack state or county codes.
        if row['state'] and row['county-fips']:
            FIPS = state_fips[row['state']] + row['county-fips']
            final_data.append({
                u'State': row['state'],
                u'State FIPS': state_fips[row['state']],
                u'County FIPS': row['county-fips'],
                u'Complete FIPS': FIPS,
                u'County Name': county_by_fips[FIPS]['County Name'],
                u'GSE limit': None,  # filled in from gse_data below
                u'FHA limit': int(row['limit-1-unit']),
                u'VA limit': None  # filled in from gse_data below
            })
    gse_by_fips = {}
    for row in gse_data:
        if row['state'] and row['county-fips']:
            FIPS = state_fips[row['state']] + row['county-fips']
            gse_by_fips[FIPS] = int(row['limit-1-unit'])
    for row in final_data:
        limit = gse_by_fips[row['Complete FIPS']]
        row['GSE limit'] = limit
        row['VA limit'] = limit
    return final_data


def get_chums_data(year=None):
    """
    Downloads and processes mortgage data files for the next year.

    Normally, updates are run in December preceding the new data year,
    so the default year is current year + 1.
    If updates need to be run for the current year, or any other year,
    then pass in your desired 'year' value.
    Files are available manually
    at https://www.hud.gov/pub/chums/file_layouts.html

    Returns a human-readable status message. On any failure the message
    starts with ERROR_MSG and includes the error plus progress so far.
    """
    year = year or datetime.date.today().year + 1
    msg = ''
    try:
        # CHUMS files are CRLF-delimited fixed-width text.
        fha = download_datafile(CHUMS_FHA_URL.format(year)).split('\r\n')
        if fha[0].startswith("Error"):
            msg += fha[0]
            raise ValueError(fha[0])
        fha_data = translate_data(fha, CHUMS_MAP)
        dump_to_csv(
            '{}/forward_limits_{}.csv'.format(CSV_DIR, year),
            CHUMS_MAP.keys(),
            fha_data)
        msg += ('FHA limits saved to {}/forward_limits_{}.csv\n'.format(
            CSV_DIR, year))
        gse = download_datafile(CHUMS_GSE_URL.format(year)).split('\r\n')
        if gse[0].startswith("Error"):  # pragma: no cover -- tested above
            msg += gse[0]
            raise ValueError(gse[0])
        gse_data = translate_data(gse, CHUMS_MAP)
        gse_file = '{}/gse_limits_{}.csv'.format(CSV_DIR, year)
        dump_to_csv(gse_file, CHUMS_MAP.keys(), gse_data)
        msg += 'GSE limits saved to {}\n'.format(gse_file)
        final_data = assemble_final_data(fha_data, gse_data)
        yearly_file = '{}/county_limit_data_flat_{}.csv'.format(CSV_DIR, year)
        final_file = '{}/county_limit_data_latest.csv'.format(DATA_DIR)
        dump_to_csv(yearly_file, FINAL_FIELDNAMES, final_data)
        dump_to_csv(final_file, FINAL_FIELDNAMES, final_data)
        msg += ('Final flat file saved to {}\n'.format(final_file))
        msg += ("All county source files processed.\n"
                "Data can be loaded with this command: \n"
                "`python manage.py load_county_limits "
                "data/county_limit_data_latest.csv --confirm=y`")
    except Exception as err:
        # Include the exception so failures that never set `msg` (network
        # errors, KeyErrors during assembly) are not reported blind.
        return "{}\n{}\n{}".format(ERROR_MSG, err, msg)
    return msg
26 changes: 26 additions & 0 deletions countylimits/management/commands/gather_limit_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from django.core.management.base import BaseCommand
from countylimits.data_collection.gather_county_data import get_chums_data

# NOTE: these must be parenthesized. The original relied on implicit string
# concatenation across statements, which doesn't exist: each subsequent
# string literal was a discarded expression, silently truncating the help
# text to its first fragment. Also fixes the 'annaul' typo.
COMMAND_HELP = (
    "Gathers annual county mortgage limit data from the "
    "HUD website and processes it for use in the owning-a-home-api. "
    "Processed source files are saved in the app's /data/base_data/ folder. "
)
PARSER_HELP = (
    "An optional '--year' value may be supplied to process "
    "data files from a particular year. The default year value is the year "
    "after the current year, because this script is normally run at the end "
    "of December for the next year's values."
)


class Command(BaseCommand):
    """Management command that runs the CHUMS county-limit gatherer."""
    help = COMMAND_HELP

    def add_arguments(self, parser):
        # Optional data year; get_chums_data defaults to next year if absent.
        parser.add_argument('--year',
                            help=PARSER_HELP,
                            type=int)

    def handle(self, *args, **options):
        year = options['year']
        if year:
            result = get_chums_data(year=year)
        else:
            result = get_chums_data()
        self.stdout.write(result)
97 changes: 94 additions & 3 deletions countylimits/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,89 @@
get_current_log,
get_base_log,
get_lines)


from countylimits.data_collection.gather_county_data import (
CHUMS_MAP,
ERROR_MSG,
download_datafile,
dump_to_csv,
get_chums_data,
translate_data
)
try:
    # Absolute path to the repository root, with trailing slash, for fixtures.
    BASE_PATH = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__))) + '/'
except Exception:  # pragma: no cover
    # Narrowed from a bare `except:`, which would also swallow
    # KeyboardInterrupt/SystemExit.
    BASE_PATH = ''


class CheckCountyChangesCommand(unittest.TestCase):
class DataAutomationTests(unittest.TestCase):

def setUp(self):
    # Silence console output from the code under test for every case.
    patcher = mock.patch('sys.stdout')
    patcher.start()
    self.addCleanup(patcher.stop)

def test_translate_data(self):
    """Spot-check fixed-width slicing of one real CHUMS record."""
    # One raw CHUMS line (AK / Bethel Census Area) as the fixture.
    test_list = [u'9999900000NON-METRO 203B S02070000275665035295004266250530150AK050ALASKA BETHEL CENSUS A201611292017010102080002015 '] # noqa
    # The full dict expected after slicing with CHUMS_MAP.
    expected_dict = {'county-fips': u'050', 'limit-1-unit': u'0275665', 'limit-2-units': u'0352950', 'county-transaction-date': u'20161129', 'metro-name': u'NON-METRO', 'metro-code': u'00000', 'year-for-median-determining-limit': u'2015', 'median-price-determining-limit': u'0208000', 'county-name': u'BETHEL CENSUS A', 'state': u'AK', 'program': u'203B', 'limit-type': u'S', 'median-price': u'0207000', 'limit-4-units': u'0530150', 'limit-transaction-date': u'20170101', 'msa-code': u'99999', 'limit-3-units': u'0426625', 'state-name': u'ALASKA'} # noqa
    result = translate_data(test_list, CHUMS_MAP)[0]
    # Compare a sample of fields, then check no fields are missing or extra.
    for key in ['county-fips', 'metro-name', 'county-name', 'state']:
        self.assertEqual(result[key], expected_dict[key])
    self.assertEqual(len(result), len(expected_dict))

@mock.patch(
    'countylimits.data_collection.gather_county_data.download_datafile')
@mock.patch(
    'countylimits.data_collection.gather_county_data.translate_data')
@mock.patch(
    'countylimits.data_collection.gather_county_data.dump_to_csv')
def test_get_chums(self, mock_dump, mock_translate, mock_download):
    """Happy path: both source files downloaded, translated, and dumped.

    Mock arguments arrive in reverse decorator order (innermost first).
    """
    mock_download.return_value = '1\r\n2\r\n3\r\n4\r\n'
    mock_translate.return_value = [{'county-fips': '005',
                                    'metro-name': 'Hooverville',
                                    'county-name': 'Barbour County',
                                    'state': 'AL',
                                    'limit-1-unit': '20000'}]
    get_chums_data()
    # Two downloads/translations (FHA + GSE); four CSV dumps
    # (two per-year source files plus the yearly and latest flat files).
    self.assertEqual(mock_download.call_count, 2)
    self.assertEqual(mock_translate.call_count, 2)
    self.assertEqual(mock_dump.call_count, 4)

@mock.patch(
    'countylimits.data_collection.gather_county_data.download_datafile')
def test_get_chums_failure(self, download_patch):
    # A download that raises should be reported in the message, not raised.
    download_patch.side_effect = ValueError('Error: 404')
    result = get_chums_data()
    self.assertIn(ERROR_MSG, result)

@mock.patch(
    'countylimits.data_collection.gather_county_data.download_datafile')
def test_get_chums_download_failure(self, download_patch):
    # A payload starting with "Error" should also produce a failure message.
    download_patch.return_value = "Error:"
    result = get_chums_data()
    self.assertIn(ERROR_MSG, result)

@mock.patch('countylimits.data_collection.gather_county_data.requests.get')
def test_download_datafile(self, get_patch):
    # A successful response should yield the body text unchanged.
    response = mock.Mock()
    response.ok = True
    response.text = 'heading1,heading2\nvalue1,value2'
    get_patch.return_value = response
    result = download_datafile('mockurl.example.com')
    self.assertIn('heading1', result)
    self.assertEqual(get_patch.call_count, 1)

@mock.patch('countylimits.data_collection.gather_county_data.requests.get')
def test_download_datafile_error(self, get_patch):
    # A failed response should yield an error string with the status code.
    response = mock.Mock()
    response.ok = False
    response.status_code = '404'
    response.reason = 'Not found'
    get_patch.return_value = response
    result = download_datafile('mockurl.example.com')
    self.assertIn('404', result)
    self.assertEqual(get_patch.call_count, 1)

@mock.patch(
'countylimits.management.commands.oah_check_county_changes.'
'check_for_county_changes')
Expand All @@ -50,10 +117,34 @@ def test_check_county_with_email(self, mock_check):
call_command('oah_check_county_changes', '--email', 'fake@example.com')
self.assertEqual(mock_check.call_count, 1)

@mock.patch(
    'countylimits.management.commands.gather_limit_data.'
    'get_chums_data')
def test_gather_county_data_no_year(self, chums_patch):
    # The command invokes the gatherer exactly once when no year is given.
    chums_patch.return_value = 'Data downloaded'
    call_command('gather_limit_data')
    self.assertEqual(chums_patch.call_count, 1)

@mock.patch(
    'countylimits.management.commands.gather_limit_data.'
    'get_chums_data')
def test_gather_county_data_with_year(self, chums_patch):
    # A --year option is forwarded to the gatherer as an int keyword.
    chums_patch.return_value = 'Data downloaded'
    call_command('gather_limit_data', '--year', '2017')
    self.assertEqual(chums_patch.call_count, 1)
    chums_patch.assert_called_with(year=2017)


class DataCollectionTest(unittest.TestCase):
"""Test data automation functions"""

def test_dump_to_csv(self):
    # Patch the (Python 2) builtin `open` so no real file is written.
    opener = mock_open()
    with patch("__builtin__.open", opener, create=True):
        dump_to_csv('fakepath', ['a', 'b'], [{'a': '1', 'b': '2'}])
    self.assertEqual(opener.call_count, 1)
    opener.assert_called_with('fakepath', 'w')

def test_get_lines(self):
lines_in = "\n\nline 1\nline 2\n\n\nline 3\n\n"
expected_result = ['line 1', 'line 2', 'line 3']
Expand Down

0 comments on commit e3ca5d6

Please sign in to comment.