In [5]:
#!/usr/bin/env python

"""
In this problem set you work with cities infobox data, audit it, come up with a
cleaning idea and then clean it up. In the first exercise we want you to audit
the datatypes that can be found in some particular fields in the dataset.
The possible types of values can be:
- NoneType if the value is a string "NULL" or an empty string ""
- list, if the value starts with "{"
- int, if the value can be cast to int
- float, if the value can be cast to float, but CANNOT be cast to int.
   For example, '3.23e+07' should be considered a float because it can be cast
   as float but int('3.23e+07') will throw a ValueError
- 'str', for all other values

The audit_file function should return a dictionary containing fieldnames and a 
SET of the types that can be found in the field. e.g.
{"field1": set([type(float()), type(int()), type(str())]),
 "field2": set([type(str())]),
  ....
}
The type() function returns a type object describing the argument given to the 
function. You can also use examples of objects to create type objects, e.g.
type(1.1) for a float: see the test function below for examples.

Note that the first three rows (after the header row) in the cities.csv file
are not actual data points. The contents of these rows should note be included
when processing data types. Be sure to include functionality in your code to
skip over or detect these rows.
"""
import codecs
import csv
import json
import pprint

CITIES = 'cities.csv'

FIELDS = ["name", "timeZone_label", "utcOffset", "homepage", "governmentType_label",
          "isPartOf_label", "areaCode", "populationTotal", "elevation",
          "maximumElevation", "minimumElevation", "populationDensity",
          "wgs84_pos#lat", "wgs84_pos#long", "areaLand", "areaMetro", "areaUrban"]

def audit_file(filename, fields):
    fieldtypes = {}

    # YOUR CODE HERE
    for fieldname in fields:
        typeset = set()
        with open(filename, "r") as f:
            reader = csv.DictReader(f)
            header = reader.fieldnames
            for line in reader:
                if line[fieldname].startswith('{'):
                    typeset.add(type([]))
                if (line[fieldname] == 'NULL') or (line[fieldname] == ''):
                    typeset.add(type(None))                
                try:
                    int(line[fieldname])
                    typeset.add(type(1))
                except ValueError:
                    try: 
                        float(line[fieldname])
                        typeset.add(type(1.1))
                    except:
                        pass 
                print line["postalCode"]
                
            fieldtypes[fieldname] = typeset
    
    return fieldtypes


def test():
    fieldtypes = audit_file(CITIES, FIELDS)

    pprint.pprint(fieldtypes)

    assert fieldtypes["areaLand"] == set([type(1.1), type([]), type(None)])
    assert fieldtypes['areaMetro'] == set([type(1.1), type(None)])
    
if __name__ == "__main__":
    test()

http://dbpedia.org/ontology/postalCode
XMLSchema#string
http://www.w3.org/2001/XMLSchema#string
NULL
NULL
NULL
NULL
572130
NULL
35611-35614-35613
36732
35043
35125 35128 35054
35473 35475 35476
99661
99685
94025-94029
99690
99701 99702 99703 99705 99706 99707 99708 99709 99710 99711  99712 99714 99716 99767 99775-(UAF) 99790
99603
99901 99928
99643
98576
98230 (home delivery) and 98231 (post office boxes)
98248
98935
98953
25530
53711
53589
NULL
54601 54602 54603
NULL
NULL
53150
54929
53027
99206
150000
44001-44388
50249
50436
http://dbpedia.org/ontology/postalCode
XMLSchema#string
http://www.w3.org/2001/XMLSchema#string
NULL
NULL
NULL
NULL
572130
NULL
35611-35614-35613
36732
35043
35125 35128 35054
35473 35475 35476
99661
99685
94025-94029
99690
99701 99702 99703 99705 99706 99707 99708 99709 99710 99711  99712 99714 99716 99767 99775-(UAF) 99790
99603
99901 99928
99643
98576
98230 (home delivery) and 98231 (post office boxes)
98248
98935
98953
25530
53711
53589
NULL
54601 54602 54603