# Data Wrangling with Python
### Anthony DeBarros, NICAR 2017 | @anthonydb

##### Among the most unspectacular but necessary tasks in data analysis is reading, transforming, and saving data. You can get a lot done with the Python standard library, and do even more cool things with libraries such as agate. We are going to try sending data back and forth among formats, particularly CSV and JSON, and transforming it along the way.
***

### 1. Open and Transform a CSV using the Python standard library

### Using csv.reader

In [None]:
# Import the csv module

import csv

In [None]:
# Open a file and use the reader function to display each line.
# file_reader is an iterable reader object; each line in the file
# becomes a Python list of strings.
# newline='' is what the csv module documentation asks for: it leaves
# line-ending handling to the csv module, so quoted fields that contain
# embedded newlines are parsed correctly.

with open('us_counties_2010.csv', newline='') as csv_file:
    file_reader = csv.reader(csv_file)
    for row in file_reader:
        # Re-join the parsed fields with commas for display
        print(','.join(row))

In [None]:
# Because each line is a list, we can call specific elements by position.
# Here we print the fields at indexes 0, 1 and 9 of every line
# (the header line included).
# newline='' follows the csv module documentation so the csv module
# handles line endings itself.

with open('us_counties_2010.csv', newline='') as csv_file:
    file_reader = csv.reader(csv_file)
    for row in file_reader:
        print(','.join((row[0], row[1], row[9])))

In [None]:
# We also can slice the reader object with itertools.islice() to remove
# the header and just fetch a few rows.
# csv.reader yields the header as item 0, so islice(file_reader, 1, 4)
# skips it and returns the next three lines.

from itertools import islice

# newline='' follows the csv module documentation so the csv module
# handles line endings itself.
with open('us_counties_2010.csv', newline='') as csv_file:
    file_reader = csv.reader(csv_file)
    for row in islice(file_reader, 1, 4):
        print(','.join((row[0], row[1], row[9])))

### Using csv.DictReader

In [None]:
# DictReader creates a reader object where each row is a dictionary
# (an OrderedDict or a plain dict, depending on the Python version)
# with keys taken from the header row.
# DictReader consumes the header itself, so the first item it yields is
# already the first data row. islice(file_reader, 3) therefore fetches the
# first three rows; a start index of 1 would silently skip the first one.

with open('us_counties_2010.csv', newline='') as csv_file:
    file_reader = csv.DictReader(csv_file)
    for row in islice(file_reader, 3):
        print(row)
        

In [None]:
# Then, we can pull elements of each line via their key.
# DictReader has already consumed the header row, so we take the first
# three data rows with islice(file_reader, 3) rather than skipping one.

with open('us_counties_2010.csv', newline='') as csv_file:
    file_reader = csv.DictReader(csv_file)
    for row in islice(file_reader, 3):
        print(','.join((row['NAME'], row['STUSAB'], row['POP100'])))
        

### 2. Turning Your CSV into JSON

In [None]:
# Dictionaries and lists are easily transformed to JSON.


import json
import collections

# Define an empty list of dictionaries. Each dict will hold data on one
# row of the counties CSV (the variable names say "state", but every row
# read here is stored under the 'cty' key along with its state abbreviation).
state_pop_list = []

# Open and read the CSV. newline='' follows the csv module documentation.
with open('us_counties_2010.csv', newline='') as csv_file:
    file_reader = csv.DictReader(csv_file)

    # Turn each row into an ordered dictionary.
    # DictReader already consumed the header, so islice(file_reader, 3)
    # yields the first three data rows without skipping any.
    for row in islice(file_reader, 3):
        state_dict = collections.OrderedDict()
        state_dict['cty'] = row['NAME']
        state_dict['st'] = row['STUSAB']
        # POP100 is read as text; convert it so JSON stores a number
        state_dict['pop2010'] = int(row['POP100'])
        # Append the dictionary to the list
        state_pop_list.append(state_dict)

# Use the json library to format the list of dicts as JSON and print.
print(json.dumps(state_pop_list, indent=4))


In [None]:
# Serialize the list built above to a compact JSON string, then save
# that string to us_counties_2010.json.
json_out = json.dumps(state_pop_list)

with open('us_counties_2010.json', mode='w') as json_file:
    json_file.write(json_out)


### 3. Reading JSON from an API and Transforming to CSV

In [None]:
import requests

# USGS query returning GeoJSON for the given start/end dates
earthquake_geojson_url = 'http://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=2017-02-25&endtime=2017-02-26'

# requests has no default timeout, so set one to keep a stalled server
# from hanging the notebook indefinitely.
r = requests.get(earthquake_geojson_url, timeout=30)
# Fail loudly on an HTTP error instead of treating an error page as data
r.raise_for_status()
print(r.text)


In [None]:
# Parse the response body into Python objects, then show the 'place'
# property of the first feature in the result.
response = json.loads(r.text)
first_quake = response['features'][0]
print(first_quake['properties']['place'])

In [None]:
# Print the 'place' of up to the first 20 features. Slicing (rather than
# indexing with range(0, 20)) avoids an IndexError when the API returns
# fewer than 20 features.
for feature in response['features'][:20]:
    print(feature['properties']['place'])

In [None]:
# Write the place and magnitude of up to the first 20 quakes to earthquakes.csv.
headers = ['PLACE', 'MAGNITUDE']

# The with-block closes the file even if a row fails to write.
# newline='' is required by the csv module documentation for files passed
# to csv.writer; without it, extra blank lines can appear on platforms
# that use \r\n line endings.
with open('earthquakes.csv', 'w', newline='') as earthquakes:
    quake_writer = csv.writer(earthquakes, delimiter=",")
    quake_writer.writerow(headers)

    # Slice instead of indexing range(0, 20) so a short result set
    # does not raise IndexError.
    for feature in response['features'][:20]:
        properties = feature['properties']
        quake_writer.writerow((properties['place'], properties['mag']))

### 4. Using the agate Library to Read a CSV, Calculate Stats, and Save as JSON

In [None]:
# Silence warnings (deprecation notices included) for the rest of the session.
# With no message/module pattern given, the 'ignore' filter applies to
# every Warning category.
import warnings
warnings.filterwarnings(action='ignore', category=Warning)

In [None]:
# Import the agate library
import agate

In [None]:
# Use agate's from_csv method to load a CSV to a table object.
# No column types are passed here; a later cell reloads the file with
# some column types set explicitly.
us_counties = agate.Table.from_csv('us_counties_2010.csv')

In [None]:
# Now we have a table object. Leaving the bare name as the last line of
# a notebook cell displays the object's representation.
us_counties

In [None]:
# We can view its structure with print() instead of the bare-name display
print(us_counties)

In [None]:
# Change data types of columns if needed.
# These five columns are forced to text, then the CSV is loaded again
# with those overrides applied.
text_columns = ('SUMLEV', 'REGION', 'DIVISION', 'STATE', 'COUNTY')
specified_types = {column_name: agate.Text() for column_name in text_columns}

us_counties = agate.Table.from_csv(
    'us_counties_2010.csv',
    column_types=specified_types,
)

In [None]:
# Print the table again after reloading it with the explicit column types
print(us_counties)

In [None]:
# Viewing table contents: cap the preview at 10 rows and 9 columns
us_counties.print_table(max_rows=10, max_columns=9)

In [None]:
# Similar to a SQL SELECT, we can create a new table with a subset of columns.
# Here we keep only STUSAB and POP100; the result is assigned to a new
# name, so us_counties stays available for later cells.
columns = ['STUSAB', 'POP100']
state_pop_table = us_counties.select(columns)

In [None]:
# Print the two-column table created by select()
print(state_pop_table)

In [None]:
# Let's create a table of states with the total and median population.
# First group the rows by STUSAB, then compute the sum and the median
# of POP100 within each group.
states = state_pop_table.group_by('STUSAB')

summary_columns = [
    ('total_pop', agate.Sum('POP100')),
    ('median_pop', agate.Median('POP100')),
]
state_pop = states.aggregate(summary_columns)

In [None]:
# Take a look at the results: one printed line per row, showing the
# STUSAB value followed by its total and median population.
for state_row in state_pop.rows:
    print(*(state_row[key] for key in ('STUSAB', 'total_pop', 'median_pop')))

In [None]:
# Print the aggregated table, allowing up to 51 rows
# (presumably the 50 states plus DC -- confirm against the data)
state_pop.print_table(max_rows=51)

In [None]:
# Save this as a CSV (written to state_pop.csv)
state_pop.to_csv('state_pop.csv')

In [None]:
# Or save it as JSON (written to state_pop.json)
state_pop.to_json('state_pop.json')

### 5. Using csvkit to Prep for Database Import