Skip to content

Commit

Permalink
Merge pull request #108 from cfpb/data-automation2
Browse files Browse the repository at this point in the history
Automate the gathering of county limit data
  • Loading branch information
higs4281 committed Feb 2, 2017
2 parents 6ae41dc + d12c6e1 commit e3ca5d6
Show file tree
Hide file tree
Showing 13 changed files with 13,913 additions and 688 deletions.
1 change: 0 additions & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
[run]
source =
countylimits
mortgageinsurance
ratechecker

omit =
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.

## Unreleased
- Add a monitor to watch for changes in census county values
- Add a script and manage.py command to automate county data collection

## 0.9.92 - 2016-12-31
- 2017 update for county-level mortgage-limit data
Expand Down
179 changes: 179 additions & 0 deletions countylimits/data_collection/gather_county_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import os
import datetime
from collections import OrderedDict

import requests
try:
from csvkit import DictReader # for stand-alone use
from csvkit import writer as Writer # for stand-alone use
except ImportError: # pragma: no cover
try:
from paying_for_college.csvkit.csvkit import DictReader
from paying_for_college.csvkit.csvkit import Writer
except ImportError: # unicode errors ahoy
from csv import DictReader
from csv import writer as Writer


# Message prefix returned when any step of the collection pipeline fails.
ERROR_MSG = "Script failed to process all files."
# Repository root: three directory levels up from this module's file.
API_DIR = os.path.abspath(
    os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
)
# Output locations: DATA_DIR holds the final flat file; CSV_DIR the sources.
DATA_DIR = "{}/data".format(API_DIR)
CSV_DIR = "{}/base_data".format(DATA_DIR)
# HUD CHUMS source URLs; '{}' is filled with the four-digit data year.
CHUMS_FHA_URL = 'https://www.hud.gov/pub/chums/cy{}-forward-limits.txt'
CHUMS_GSE_URL = 'https://www.hud.gov/pub/chums/cy{}-gse-limits.txt'
# (field name, (start, end)) character offsets for slicing each fixed-width
# CHUMS record; see HUD's published file layouts (linked in get_chums_data).
CHUMS_SPACING = [
    ('msa-code', (0, 5)),
    ('metro-code', (5, 10)),
    ('metro-name', (10, 60)),
    ('program', (60, 65)),
    ('limit-type', (65, 66)),  # S or H, standard or high
    ('median-price', (66, 73)),
    ('limit-1-unit', (73, 80)),
    ('limit-2-units', (80, 87)),
    ('limit-3-units', (87, 94)),
    ('limit-4-units', (94, 101)),
    ('state', (101, 103)),
    ('county-fips', (103, 106)),
    ('state-name', (106, 132)),
    ('county-name', (132, 147)),
    ('county-transaction-date', (147, 155)),
    ('limit-transaction-date', (155, 163)),
    ('median-price-determining-limit', (163, 170)),
    ('year-for-median-determining-limit', (170, 175))
]
# OrderedDict so output CSV columns keep the CHUMS layout order.
CHUMS_MAP = OrderedDict(CHUMS_SPACING)
# Column headings for the final flat file consumed by load_county_limits.
FINAL_FIELDNAMES = [
    u'State',
    u'State FIPS',
    u'County FIPS',
    u'Complete FIPS',
    u'County Name',
    u'GSE limit',
    u'FHA limit',
    u'VA limit'
]


def load_FIPS():
    """Read the county FIPS reference CSV and return its rows as dicts."""
    source_path = '{}/county_FIPS.csv'.format(CSV_DIR)
    with open(source_path, 'r') as source_file:
        return list(DictReader(source_file))


def translate_data(data_list, data_map):
    """Slice fixed-width text lines into dicts keyed by data_map's fields.

    `data_map` maps each field name to a (start, end) character span;
    sliced values are whitespace-stripped. Returns one dict per line.
    """
    parsed = []
    for raw_line in data_list:
        record = {}
        for field, (start, end) in data_map.items():
            record[field] = raw_line[start:end].strip()
        parsed.append(record)
    return parsed


def download_datafile(url):
    """Fetch a CHUMS text file; return its body, or an error description.

    On a non-OK HTTP status the return value starts with 'Error:' so that
    callers (get_chums_data) can detect the failure by prefix.
    """
    response = requests.get(url)
    if not response.ok:
        return "Error:\n{} {}\n{}".format(
            response.status_code,
            response.reason,
            response.url)
    return response.text


def dump_to_csv(filepath, headings, data):
    """Write `data` (a list of dicts) to `filepath` as CSV.

    `headings` supplies both the header row and the per-row column order;
    it may be any iterable of keys (e.g. an OrderedDict's .keys()).
    """
    with open(filepath, 'w') as f:
        # Materialize once so one-shot iterators aren't consumed by the
        # header row and then silently empty for the data rows.
        fieldnames = list(headings)
        writer = Writer(f)
        writer.writerow(fieldnames)
        for row in data:
            writer.writerow(
                [row[key] for key in fieldnames]
            )


def assemble_final_data(fha_data, gse_data):
    """Merge translated FHA and GSE rows into the final flat dataset.

    Each output row carries the FINAL_FIELDNAMES keys. County names and
    state ANSI codes come from the reference CSV loaded by load_FIPS().
    The VA limit is set equal to the GSE limit.
    Raises KeyError if a CHUMS county is absent from the reference CSV or
    from the GSE file; get_chums_data catches and reports that upstream.
    """
    final_data = []
    county_data = load_FIPS()
    county_by_fips = {row['Complete FIPS']: row for row in county_data}
    # Build the state -> ANSI code map in a single pass, replacing the
    # original O(states * rows) scan; as before, the last reference row
    # seen for a state supplies its code.
    state_fips = {row['State']: row['State ANSI'] for row in county_data}
    for row in fha_data:
        # Skip header/trailer CHUMS lines that lack state or county codes.
        if row['state'] and row['county-fips']:
            FIPS = state_fips[row['state']] + row['county-fips']
            final_data.append({
                u'State': row['state'],
                u'State FIPS': state_fips[row['state']],
                u'County FIPS': row['county-fips'],
                u'Complete FIPS': FIPS,
                u'County Name': county_by_fips[FIPS]['County Name'],
                u'GSE limit': None,  # filled in from gse_data below
                u'FHA limit': int(row['limit-1-unit']),
                u'VA limit': None  # filled in from gse_data below
            })
    gse_by_fips = {}
    for row in gse_data:
        if row['state'] and row['county-fips']:
            FIPS = state_fips[row['state']] + row['county-fips']
            gse_by_fips[FIPS] = int(row['limit-1-unit'])
    for row in final_data:
        limit = gse_by_fips[row['Complete FIPS']]
        row['GSE limit'] = limit
        row['VA limit'] = limit
    return final_data


def get_chums_data(year=None):
    """
    Downloads and processes mortgage data files for the next year.

    Normally, updates are run in December preceding the new data year,
    so the default year is current year + 1.
    If updates need to be run for the current year, or any other year,
    then pass in your desired 'year' value.
    Files are available manually
    at https://www.hud.gov/pub/chums/file_layouts.html

    Returns a human-readable status message. On any failure the message
    starts with ERROR_MSG and includes the error plus progress so far.
    """
    year = year or datetime.date.today().year + 1
    msg = ''
    try:
        # CHUMS files are CRLF-delimited fixed-width text.
        fha = download_datafile(CHUMS_FHA_URL.format(year)).split('\r\n')
        if fha[0].startswith("Error"):
            msg += fha[0]
            raise ValueError(fha[0])
        fha_data = translate_data(fha, CHUMS_MAP)
        dump_to_csv(
            '{}/forward_limits_{}.csv'.format(CSV_DIR, year),
            CHUMS_MAP.keys(),
            fha_data)
        msg += ('FHA limits saved to {}/forward_limits_{}.csv\n'.format(
            CSV_DIR, year))
        gse = download_datafile(CHUMS_GSE_URL.format(year)).split('\r\n')
        if gse[0].startswith("Error"):  # pragma: no cover -- tested above
            msg += gse[0]
            raise ValueError(gse[0])
        gse_data = translate_data(gse, CHUMS_MAP)
        gse_file = '{}/gse_limits_{}.csv'.format(CSV_DIR, year)
        dump_to_csv(gse_file, CHUMS_MAP.keys(), gse_data)
        msg += 'GSE limits saved to {}\n'.format(gse_file)
        final_data = assemble_final_data(fha_data, gse_data)
        yearly_file = '{}/county_limit_data_flat_{}.csv'.format(CSV_DIR, year)
        final_file = '{}/county_limit_data_latest.csv'.format(DATA_DIR)
        dump_to_csv(yearly_file, FINAL_FIELDNAMES, final_data)
        dump_to_csv(final_file, FINAL_FIELDNAMES, final_data)
        msg += ('Final flat file saved to {}\n'.format(final_file))
        msg += ("All county source files processed.\n"
                "Data can be loaded with this command: \n"
                "`python manage.py load_county_limits "
                "data/county_limit_data_latest.csv --confirm=y`")
    except Exception as err:
        # Include the exception so failures that never set `msg` (network
        # errors, KeyErrors during assembly) are not reported blind.
        return "{}\n{}\n{}".format(ERROR_MSG, err, msg)
    return msg
26 changes: 26 additions & 0 deletions countylimits/management/commands/gather_limit_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from django.core.management.base import BaseCommand
from countylimits.data_collection.gather_county_data import get_chums_data

# NOTE: these must be parenthesized. The original relied on implicit string
# concatenation across statements, which doesn't exist: each subsequent
# string literal was a discarded expression, silently truncating the help
# text to its first fragment. Also fixes the 'annaul' typo.
COMMAND_HELP = (
    "Gathers annual county mortgage limit data from the "
    "HUD website and processes it for use in the owning-a-home-api. "
    "Processed source files are saved in the app's /data/base_data/ folder. "
)
PARSER_HELP = (
    "An optional '--year' value may be supplied to process "
    "data files from a particular year. The default year value is the year "
    "after the current year, because this script is normally run at the end "
    "of December for the next year's values."
)


class Command(BaseCommand):
    """Management command that runs the CHUMS county-limit gatherer."""
    help = COMMAND_HELP

    def add_arguments(self, parser):
        # Optional data year; get_chums_data defaults to next year if absent.
        parser.add_argument('--year',
                            help=PARSER_HELP,
                            type=int)

    def handle(self, *args, **options):
        year = options['year']
        if year:
            result = get_chums_data(year=year)
        else:
            result = get_chums_data()
        self.stdout.write(result)
97 changes: 94 additions & 3 deletions countylimits/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,89 @@
get_current_log,
get_base_log,
get_lines)


from countylimits.data_collection.gather_county_data import (
CHUMS_MAP,
ERROR_MSG,
download_datafile,
dump_to_csv,
get_chums_data,
translate_data
)
try:
    # Absolute path to the repository root, with trailing slash, for fixtures.
    BASE_PATH = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__))) + '/'
except Exception:  # pragma: no cover
    # Narrowed from a bare `except:`, which would also swallow
    # KeyboardInterrupt/SystemExit.
    BASE_PATH = ''


class CheckCountyChangesCommand(unittest.TestCase):
class DataAutomationTests(unittest.TestCase):

def setUp(self):
    # Silence console output from the code under test for every case.
    patcher = mock.patch('sys.stdout')
    patcher.start()
    self.addCleanup(patcher.stop)

def test_translate_data(self):
    """Spot-check fixed-width slicing of one real CHUMS record."""
    # One raw CHUMS line (AK / Bethel Census Area) as the fixture.
    test_list = [u'9999900000NON-METRO 203B S02070000275665035295004266250530150AK050ALASKA BETHEL CENSUS A201611292017010102080002015 '] # noqa
    # The full dict expected after slicing with CHUMS_MAP.
    expected_dict = {'county-fips': u'050', 'limit-1-unit': u'0275665', 'limit-2-units': u'0352950', 'county-transaction-date': u'20161129', 'metro-name': u'NON-METRO', 'metro-code': u'00000', 'year-for-median-determining-limit': u'2015', 'median-price-determining-limit': u'0208000', 'county-name': u'BETHEL CENSUS A', 'state': u'AK', 'program': u'203B', 'limit-type': u'S', 'median-price': u'0207000', 'limit-4-units': u'0530150', 'limit-transaction-date': u'20170101', 'msa-code': u'99999', 'limit-3-units': u'0426625', 'state-name': u'ALASKA'} # noqa
    result = translate_data(test_list, CHUMS_MAP)[0]
    # Compare a sample of fields, then check no fields are missing or extra.
    for key in ['county-fips', 'metro-name', 'county-name', 'state']:
        self.assertEqual(result[key], expected_dict[key])
    self.assertEqual(len(result), len(expected_dict))

@mock.patch(
    'countylimits.data_collection.gather_county_data.download_datafile')
@mock.patch(
    'countylimits.data_collection.gather_county_data.translate_data')
@mock.patch(
    'countylimits.data_collection.gather_county_data.dump_to_csv')
def test_get_chums(self, mock_dump, mock_translate, mock_download):
    """Happy path: both source files downloaded, translated, and dumped.

    Mock arguments arrive in reverse decorator order (innermost first).
    """
    mock_download.return_value = '1\r\n2\r\n3\r\n4\r\n'
    mock_translate.return_value = [{'county-fips': '005',
                                    'metro-name': 'Hooverville',
                                    'county-name': 'Barbour County',
                                    'state': 'AL',
                                    'limit-1-unit': '20000'}]
    get_chums_data()
    # Two downloads/translations (FHA + GSE); four CSV dumps
    # (two per-year source files plus the yearly and latest flat files).
    self.assertEqual(mock_download.call_count, 2)
    self.assertEqual(mock_translate.call_count, 2)
    self.assertEqual(mock_dump.call_count, 4)

@mock.patch(
    'countylimits.data_collection.gather_county_data.download_datafile')
def test_get_chums_failure(self, download_patch):
    # A download that raises should be reported in the message, not raised.
    download_patch.side_effect = ValueError('Error: 404')
    result = get_chums_data()
    self.assertIn(ERROR_MSG, result)

@mock.patch(
    'countylimits.data_collection.gather_county_data.download_datafile')
def test_get_chums_download_failure(self, download_patch):
    # A payload starting with "Error" should also produce a failure message.
    download_patch.return_value = "Error:"
    result = get_chums_data()
    self.assertIn(ERROR_MSG, result)

@mock.patch('countylimits.data_collection.gather_county_data.requests.get')
def test_download_datafile(self, get_patch):
    # A successful response should yield the body text unchanged.
    response = mock.Mock()
    response.ok = True
    response.text = 'heading1,heading2\nvalue1,value2'
    get_patch.return_value = response
    result = download_datafile('mockurl.example.com')
    self.assertIn('heading1', result)
    self.assertEqual(get_patch.call_count, 1)

@mock.patch('countylimits.data_collection.gather_county_data.requests.get')
def test_download_datafile_error(self, get_patch):
    # A failed response should yield an error string with the status code.
    response = mock.Mock()
    response.ok = False
    response.status_code = '404'
    response.reason = 'Not found'
    get_patch.return_value = response
    result = download_datafile('mockurl.example.com')
    self.assertIn('404', result)
    self.assertEqual(get_patch.call_count, 1)

@mock.patch(
'countylimits.management.commands.oah_check_county_changes.'
'check_for_county_changes')
Expand All @@ -50,10 +117,34 @@ def test_check_county_with_email(self, mock_check):
call_command('oah_check_county_changes', '--email', 'fake@example.com')
self.assertEqual(mock_check.call_count, 1)

@mock.patch(
    'countylimits.management.commands.gather_limit_data.'
    'get_chums_data')
def test_gather_county_data_no_year(self, chums_patch):
    # The command invokes the gatherer exactly once when no year is given.
    chums_patch.return_value = 'Data downloaded'
    call_command('gather_limit_data')
    self.assertEqual(chums_patch.call_count, 1)

@mock.patch(
    'countylimits.management.commands.gather_limit_data.'
    'get_chums_data')
def test_gather_county_data_with_year(self, chums_patch):
    # A --year option is forwarded to the gatherer as an int keyword.
    chums_patch.return_value = 'Data downloaded'
    call_command('gather_limit_data', '--year', '2017')
    self.assertEqual(chums_patch.call_count, 1)
    chums_patch.assert_called_with(year=2017)


class DataCollectionTest(unittest.TestCase):
"""Test data automation functions"""

def test_dump_to_csv(self):
    # Patch the (Python 2) builtin `open` so no real file is written.
    opener = mock_open()
    with patch("__builtin__.open", opener, create=True):
        dump_to_csv('fakepath', ['a', 'b'], [{'a': '1', 'b': '2'}])
    self.assertEqual(opener.call_count, 1)
    opener.assert_called_with('fakepath', 'w')

def test_get_lines(self):
lines_in = "\n\nline 1\nline 2\n\n\nline 3\n\n"
expected_result = ['line 1', 'line 2', 'line 3']
Expand Down

0 comments on commit e3ca5d6

Please sign in to comment.