Merge pull request #386 from amymok/HMDA-2016

Add new county changes
cfpb · Jan 4, 2018 · b3b7ccc · b3b7ccc
2 parents 7b0a266 + 6a152a7
commit b3b7ccc
Show file tree

Hide file tree

Showing 12 changed files with 168 additions and 13 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -4,8 +4,13 @@ python:
 install:
   - |
     echo "DATABASES = {'default': {'ENGINE': 'django.contrib.gis.db.backends.postgis', 'NAME': 'travisdb', 'USER': 'postgres'}}" > mapusaurus/mapusaurus/settings/local_settings.py
-  - pip install -r requirements.txt --use-mirrors
+  - pip install -r requirements_test.txt
   - pip install coverage coveralls
+addons:
+  postgresql: 9.3
+  apt:
+    packages:
+    - postgresql-9.3-postgis-2.3
 before_script:
   - psql -c 'CREATE DATABASE travisdb;' -U postgres
   - psql -c 'CREATE EXTENSION postgis;' -U postgres -d travisdb

diff --git a/mapusaurus/censusdata/management/commands/load_summary_one.py b/mapusaurus/censusdata/management/commands/load_summary_one.py
@@ -32,6 +32,7 @@ def handle(self, *args, **options):
                 recordnum = line[18:25]
                 censustract = line[27:32] + line[54:60]
                 censustract = errors.in_2010.get(censustract, censustract)
+                censustract = errors.change_specific_year(censustract, year)
                 if censustract is not None:
                     geoids_by_record[recordnum] = year + censustract
                 state = line[27:29]

diff --git a/mapusaurus/censusdata/tests/test_loader.py b/mapusaurus/censusdata/tests/test_loader.py
@@ -8,6 +8,7 @@
 from censusdata import models
 from censusdata.management.commands.load_summary_one import Command
 
+import geo.errors
 
 class LoadSummaryDataTest(TestCase):
     fixtures = ['mock_geo']
@@ -38,13 +39,14 @@ def test_handle(self, hf3, hf4, hf5):
         self.assertEqual(positional_args[3]['0007159'], year+'11001000100')
         self.assertEqual(positional_args[3]['0007211'], year+'11001000902')
 
-    @patch('censusdata.management.commands.load_summary_one.errors')
     @patch.object(Command, 'handle_filefive')
     @patch.object(Command, 'handle_filefour')
     @patch.object(Command, 'handle_filethree')
-    def test_handle_errors_dict(self, hf3, hf4, hf5, errors):
+    def test_handle_errors_dict(self, hf3, hf4, hf5):
         year = '2001'
-        errors.in_2010 = {'11001000100': '22002000200', '11001000902': None}
+        old_geo_errors = geo.errors.in_2010
+        geo.errors.in_2010 = {'11001000100': '22002000200', '11001000902': None}
+
         # Create Mock GEO file
         shutil.copyfile(os.path.join("censusdata", "tests", "mock_geo.txt"),
                         os.path.join(self.tempdir, "ZZgeo2010.sf1"))
@@ -57,6 +59,8 @@ def test_handle_errors_dict(self, hf3, hf4, hf5, errors):
         # This entry was converted
         self.assertEqual(positional_args[3]['0007159'], year+'22002000200')
 
+        geo.errors.in_2010 = old_geo_errors
+
     def test_handle_filethree(self):
         shutil.copyfile(os.path.join("censusdata", "tests", "mock_file3.txt"),
                         os.path.join(self.tempdir, "ZZ000032010.sf1"))

diff --git a/mapusaurus/geo/errors.py b/mapusaurus/geo/errors.py
@@ -2,8 +2,12 @@
 # Unfortunately, both HMDA and census population statistics refer to the
 # original, erroneous census tracts. See
 # http://www.census.gov/geo/reference/pdfs/Geography_Notes.pdf
+# See also https://www.census.gov/programs-surveys/acs/technical-documentation/table-and-geography-changes/201X/geography-changes.html, where X is the last digit of the year.
 in_2010 = {
     # Original -> Correct
+
+    # 2012
+    # https://www.census.gov/programs-surveys/acs/technical-documentation/table-and-geography-changes/2012/geography-changes.html
     "04019002701": "04019002704",
     "04019002903": "04019002906",
     "04019410501": "04019004118",
@@ -14,6 +18,8 @@
 
     "06037930401": "06037137000",
 
+    # 2011
+    # https://www.census.gov/programs-surveys/acs/technical-documentation/table-and-geography-changes/2011/geography-changes.html
     "36053940101": "36053030101",
     "36053940102": "36053030102",
     "36053940103": "36053030103",
@@ -32,3 +38,23 @@
     # removing it
     "36085008900": None,
 }
+
+# Census tract identifier changes, grouped by year.
+# Outer keys are integer years; each inner dict maps an original tract
+# identifier string to its replacement (Original -> Correct, the same
+# direction as in_2010 above).
+# change_specific_year() applies a year's mapping only when the requested
+# year is strictly greater than that key.
+changes = {
+    # Source: https://www.census.gov/geo/maps-data/maps/2010ref/stXX_tract.html
+    # (replace XX with the state code)
+    2014: {
+        "51515050100": "51019050100",
+    },
+    2015: {
+        "02270000100": "02158000100",
+        "46113940500": "46102940500",
+        "46113940800": "46102940800",
+        "46113940900": "46102940900",
+        # More soon
+    }
+}
+def change_specific_year(census_tract, year):
+    """Return `census_tract` after applying the remappings in `changes`.
+
+    The per-year mappings are visited in ascending year order, and a
+    mapping is applied only when `int(year)` is strictly greater than its
+    year. A tract with no entry in a given mapping passes through that
+    year unchanged, so the result of one year feeds the next.
+    """
+    tract = census_tract
+    for change_year in sorted(changes):
+        if int(year) <= change_year:
+            # This year's changes are not in effect for the requested year.
+            continue
+        remap = changes[change_year]
+        tract = remap.get(tract, tract)
+    return tract
diff --git a/mapusaurus/hmda/migrations/0003_auto_20171215_1812.py b/mapusaurus/hmda/migrations/0003_auto_20171215_1812.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import models, migrations
+
+
+class Migration(migrations.Migration):
+    # Alters the census_year, geo_year and hmda_year fields of the `year`
+    # model. All three take the same choices: one (value, value) pair for
+    # each year from 1970 through 2049.
+
+    dependencies = [('hmda', '0002_auto_20160219_1957')]
+
+    operations = [
+        migrations.AlterField(
+            model_name='year',
+            name='census_year',
+            field=models.PositiveIntegerField(
+                help_text=b'Year of census data.',
+                choices=list(zip(range(1970, 2050), range(1970, 2050))),
+            ),
+        ),
+        migrations.AlterField(
+            model_name='year',
+            name='geo_year',
+            field=models.PositiveIntegerField(
+                help_text=b'Year that geographic boundaries were recorded.',
+                choices=list(zip(range(1970, 2050), range(1970, 2050))),
+            ),
+        ),
+        migrations.AlterField(
+            model_name='year',
+            name='hmda_year',
+            field=models.PositiveIntegerField(
+                help_text=b'The reporting year of the HMDA record.',
+                serialize=False,
+                primary_key=True,
+                choices=list(zip(range(1970, 2050), range(1970, 2050))),
+            ),
+        ),
+    ]
diff --git a/mapusaurus/respondents/management/commands/load_branch_locations.py b/mapusaurus/respondents/management/commands/load_branch_locations.py
@@ -5,6 +5,9 @@
 class Command(BaseCommand):
     args = "<filename>"
 
+    @staticmethod
+    def normalize(s):
+        """Return `s` stripped of surrounding whitespace and upper-cased."""
+        return s.strip().upper()
+
     def handle(self, *args, **options):
         branch_location_filename = args[0]
         count = 0; 
@@ -14,20 +17,27 @@ def handle(self, *args, **options):
             for branch_location_line in branch_location_reader:
                 record = Branch(
                     year = branch_location_line[0].replace("'", ""),
-                    name = branch_location_line[6],
-                    street = branch_location_line[7] if branch_location_line[7] != '0' else '', 
-                    city = branch_location_line[8],
-                    state = branch_location_line[10],
-                    zipcode = branch_location_line[11],
+                    # normalize() is defined on the class, so it must be
+                    # reached through self: names bound in a class body are
+                    # not visible as bare names inside its methods.
+                    name = self.normalize(branch_location_line[6]),
+                    street = self.normalize(branch_location_line[7]) if branch_location_line[7] != '0' else '',
+                    city = self.normalize(branch_location_line[8]),
+                    state = self.normalize(branch_location_line[10]),
+                    zipcode = self.normalize(branch_location_line[11]),
                     lat = branch_location_line[13], 
                     lon = branch_location_line[12],
                 )
-                record.institution_id = (branch_location_line[0]+branch_location_line[1]+branch_location_line[2]).replace("'", "")
+                record.institution_id = (branch_location_line[0]+branch_location_line[1]+branch_location_line[2]).replace("'", "").replace(" ", "")
                 if Institution.objects.filter(institution_id=record.institution_id).count() > 0:
                     branch_location.append(record)
+                else:
+                    # Report rows whose institution is unknown; they are not inserted.
+                    print "Can't find institution_id"
+                    print '{}\t{}\t{}'.format(record.institution_id, record.name, record.street)
                 if len(branch_location) > 9999:
                     count += len(branch_location)
                     Branch.objects.bulk_create(branch_location, batch_size=1000)
                     print "Record count: " + str(count)
                     branch_location[:] = []
-
+            # Insert whatever is left over after the last full chunk.
+            if len(branch_location) > 0:
+                count += len(branch_location)
+                Branch.objects.bulk_create(branch_location, batch_size=1000)
+                print "Record count: " + str(count)
+                branch_location[:] = []
diff --git a/mapusaurus/respondents/management/commands/load_transmittal.py b/mapusaurus/respondents/management/commands/load_transmittal.py
@@ -16,6 +16,7 @@ def handle(self, *args, **options):
         with open(transmittal_filename) as institutioncsv:
             transmittal_reader = csv.reader(institutioncsv, delimiter='\t')
             institutions = []
+            # count = 1  # use if you want to see which item failed; see the comment below where we create each institution individually
             for inst_line in transmittal_reader:
                 year = inst_line[0]
                 zip_code = inst_line[8]
@@ -37,5 +38,9 @@ def handle(self, *args, **options):
                     assets=int(inst_line[17]),
                 )
 
+                # Debugging aid: to find out exactly which item fails, disable the bulk create below and create each institution individually instead.
+                # Institution.objects.create(inst)
+                # inst.save()
+                # count += 1
                 institutions.append(inst)
             Institution.objects.bulk_create(institutions)
diff --git a/mapusaurus/respondents/migrations/0003_auto_20171215_1812.py b/mapusaurus/respondents/migrations/0003_auto_20171215_1812.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import models, migrations
+
+
+class Migration(migrations.Migration):
+    # Changes the `assets` field of the `institution` model to a
+    # BigIntegerField with default 0.
+
+    # Must run after the previous respondents migration.
+    dependencies = [
+        ('respondents', '0002_auto_20160222_1706'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='institution',
+            name='assets',
+            field=models.BigIntegerField(default=0, help_text=b'Prior year reported assets in thousands of dollars'),
+        ),
+    ]
diff --git a/mapusaurus/respondents/migrations/0004_auto_20171227_2123.py b/mapusaurus/respondents/migrations/0004_auto_20171227_2123.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import models, migrations
+
+
+class Migration(migrations.Migration):
+    # Sets max_length of the `name` field on the `branch` model to 60.
+    # (Migration 0005 in this app alters the same field again, to 100.)
+
+    # Must run after the `assets` BigIntegerField migration.
+    dependencies = [
+        ('respondents', '0003_auto_20171215_1812'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='branch',
+            name='name',
+            field=models.CharField(max_length=60),
+        ),
+    ]
diff --git a/mapusaurus/respondents/migrations/0005_auto_20171227_2322.py b/mapusaurus/respondents/migrations/0005_auto_20171227_2322.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import models, migrations
+
+
+class Migration(migrations.Migration):
+    # Sets max_length of the `name` field on the `branch` model to 100,
+    # matching the value declared for Branch.name in respondents/models.py.
+
+    # Must run after migration 0004, which altered the same field.
+    dependencies = [
+        ('respondents', '0004_auto_20171227_2123'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='branch',
+            name='name',
+            field=models.CharField(max_length=100),
+        ),
+    ]
diff --git a/mapusaurus/respondents/models.py b/mapusaurus/respondents/models.py
@@ -71,7 +71,7 @@ class Institution(models.Model):
     name = models.CharField(max_length=30)
     mailing_address = models.CharField(max_length=40)
     zip_code = models.ForeignKey('ZipCodeCityStateYear', null=False)
-    assets = models.PositiveIntegerField(
+    assets = models.BigIntegerField(
         default=0,
         help_text='Prior year reported assets in thousands of dollars'
     )
@@ -145,7 +145,7 @@ class LenderHierarchy(models.Model):
 class Branch(models.Model):
     year = models.SmallIntegerField()
     institution = models.ForeignKey('Institution', to_field='institution_id')
-    name = models.CharField(max_length=50)
+    name = models.CharField(max_length=100)
     street = models.CharField(max_length=100)
     city = models.CharField(max_length=25)
     state = USStateField()

diff --git a/requirements_test.txt b/requirements_test.txt
@@ -0,0 +1,18 @@
+Django==1.7
+argparse==1.2.1
+django-geojson==2.6.0
+django-haystack==2.3.0
+django-localflavor==1.0
+django-overextends==0.3.2
+djangorestframework==2.3.14
+jsonschema==2.4.0
+mock==1.0.1
+psycopg2
+elasticsearch==1.0
+requests==2.3.0
+simplejson==3.6.3
+six==1.7.3
+sphinx-me==0.3
+wsgiref==0.1.2
+gunicorn==19.1.1
+newrelic==2.60.0.46