## `parse_costar`: statistical NLP for organizing addresses.

### What it does:
All hail [libpostal](https://github.com/openvenues/libpostal) and its Python bindings, [pypostal](https://github.com/openvenues/pypostal). Messy address strings are split into the following:
* Building/street number
* Road
* City
* City district
* State
* State district
* Zip code
* Building name
* Unit
* Level
* Entrance
* P.O. Box
* Suburb

### What you do:
Edit the two lines below so that they point to your existing import `.csv` and your desired export `.csv`.

### Cautions:
* This script makes an NJ-specific assumption that 4- or 8-digit zip codes have lost their leading zero and need to be padded. This may not fit your assumptions.
* Must ~~love dogs~~ run this script on Mac or Linux OS.

In [None]:
# Path of the CSV to read. The parsing cell below looks up the PropertyID,
# Building_Address, Building_City, Building_State and Building_Zip columns.
import_file = "sample_costar.csv"
# Path of the CSV that the parsed address components are written to.
export_file = "parse_costar.csv"

### You should not have to edit anything below here.

In [None]:
import csv
from postal.parser import parse_address
# One dict per input row: the parsed components keyed by libpostal label,
# plus that row's "PropertyID".
parsed_addresses = []
# Every libpostal label that showed up in at least one parsed address.
output_fieldnames = set()
# Column accumulators. The parsing cell appends exactly one value to each list
# per input row (an empty string when the label is absent), so index i across
# all of these lists describes the same property.
prop = []             # row["PropertyID"]
nums = []             # "house_number"
roads = []            # "road"
cities = []           # "city"
districts = []        # "city_district"
states = []           # "state"
state_districts = []  # "state_district"
zips = []             # "postcode"
buildings = []        # "house"
units = []            # "unit"
levels = []           # "level"
entrances = []        # "entrance"
boxes = []            # "po_box"
neighborhoods = []    # "suburb"

In [None]:
import sys


def _normalize_zip(raw_zip):
    """Return the 5-digit form of a zip value taken from the import file.

    Zip codes that start with 0 (as in NJ) are assumed to have lost that
    leading zero upstream, so:

    * 9 or more characters (intact ZIP+4)       -> first 5 characters
    * 8 characters (ZIP+4 minus leading zero)   -> first 4, then re-padded
    * 4 digits (5-digit zip minus leading zero) -> re-padded with "0"

    Anything else is returned unchanged.
    """
    if len(raw_zip) == 8:
        short_zip = raw_zip[:4]
    elif len(raw_zip) > 8:
        short_zip = raw_zip[:5]
    else:
        short_zip = raw_zip
    # Restore the dropped leading zero; only touch purely numeric values so
    # that blanks or placeholder text pass through untouched.
    if len(short_zip) == 4 and short_zip.isdigit():
        short_zip = "0" + short_zip
    return short_zip


# libpostal label -> the column list that collects it.
component_columns = (
    ("house_number", nums),
    ("road", roads),
    ("city", cities),
    ("city_district", districts),
    ("state", states),
    ("state_district", state_districts),
    ("postcode", zips),
    ("house", buildings),
    ("unit", units),
    ("level", levels),
    ("entrance", entrances),
    ("po_box", boxes),
    ("suburb", neighborhoods),
)

# The csv module wants a binary file on Python 2 but a text file opened with
# newline="" on Python 3.
if sys.version_info[0] < 3:
    source = open(import_file, "rb")
else:
    source = open(import_file, "r", newline="")

with source:
    for row in csv.DictReader(source):
        zip_code = _normalize_zip(str(row["Building_Zip"]))
        # Explicit lookups instead of .format(**row): a row with more cells
        # than the header gets a None key, which ** cannot expand.
        addr = "{0}, {1}, {2} {3}".format(
            row["Building_Address"],
            row["Building_City"],
            row["Building_State"],
            zip_code,
        )
        # parse_address yields (value, label) pairs; key them by label.
        # If a label repeats, the last value wins.
        addr_dict = {label: value for value, label in parse_address(addr)}
        # Useful for telling us what parsing categories we're actually using.
        output_fieldnames.update(addr_dict)
        addr_dict["PropertyID"] = row["PropertyID"]
        parsed_addresses.append(addr_dict)
        prop.append(addr_dict["PropertyID"])
        # Every column gets exactly one entry per row so the lists stay
        # aligned; labels libpostal did not produce become empty strings.
        for label, column in component_columns:
            column.append(addr_dict.get(label, ""))

In [None]:
# Column lists in the same order as the export header written further down.
l = [prop, nums, roads, cities, districts, states, state_districts, zips,
     buildings, units, levels, entrances, boxes, neighborhoods]
# Transpose columns into one tuple per property. list() materialises the rows:
# on Python 3 a bare zip() is a one-shot iterator, so re-running the export
# cell would otherwise write nothing but the header.
reconstitute = list(zip(*l))

In [None]:
# Show which libpostal labels actually occurred in this data set.
print(output_fieldnames)

# Output column names, in the same order as the lists zipped into
# `reconstitute`.
header = ["PropertyID", "house_number", "road", "city",
          "city_district", "state", "state_district",
          "zip", "building", "unit", "level", "entrance",
          "po_box", "neighborhood"]

# The with block closes the file on exit, so no explicit close() is needed.
with open(export_file, "w") as out_file:
    writer = csv.writer(out_file, dialect="excel")
    writer.writerow(header)
    writer.writerows(reconstitute)