# Demo 2: Munging

## SETI Institute Grants
![The redness of the rings remains a mystery](assets/redness_of_rings.png)

## SETI grants: csv

![SETI Institute Grants](assets/seti_csv.png)

### Size

The original data was repeated to create a larger demo file, which ended up with:

* about 438,000 rows
* about 372 MB

### Fixes

1. Add a column for fiscal year
2. Shorten agency names
3. Add state FIPS code

In [None]:
from datetime import datetime

def fiscal_year(yyyymmdd):
    """Map a ``YYYY-MM-DD`` date string to its federal fiscal year.

    The federal fiscal year starts in October, so dates from October
    through December are counted in the following calendar year.
    """
    parsed = datetime.strptime(yyyymmdd, '%Y-%m-%d')
    # October-December roll over into the next fiscal year
    rolls_over = parsed.month >= 10
    return parsed.year + 1 if rolls_over else parsed.year

def agency_abbreviation(agency_code):
    """Translate an awarding agency code into a short agency name.

    The code is left-padded with zeros to three characters before the
    lookup, so ``'80'`` and ``'080'`` are treated alike. Codes without
    a known abbreviation yield ``'UNKNOWN'``.
    """
    short_names = {
        '080': 'NASA',
        '049': 'NSF',
        '014': 'DOI',
    }
    padded = agency_code.zfill(3)
    return short_names.get(padded, 'UNKNOWN')

### Read each row

In [None]:
%%timeit -r 1
import csv
import pprint

# Create a dictionary that maps state abbreviations to FIPS codes.
# newline='' hands newline handling to the csv module, as the csv
# documentation recommends, so quoted fields that contain line breaks
# are parsed correctly.
with open('data/state_fips.csv', newline='') as csv_file:
    fipsreader = csv.DictReader(csv_file)
    fips_dict = {row['abbreviation']: row['fips'] for row in fipsreader}

# Update the grants one row at a time, collecting the modified rows.
seti_updates = []
with open('data/seti_big.csv', newline='') as csv_file:
    setireader = csv.DictReader(csv_file)
    for row in setireader:
        # abbreviate agency name
        row['awarding_agency_name'] = agency_abbreviation(row['awarding_agency_code'])
        # add fiscal year column
        row['action_year'] = fiscal_year(row['action_date'])
        # look up state FIPS code; '99' marks a state code with no match
        row['recipient_state_fips'] = fips_dict.get(row['recipient_state_code'], '99')
        seti_updates.append(row)

# show the first updated row as a sanity check
pprint.pprint(seti_updates[0])

### Convert to dataframe and update

In [None]:
%%timeit -r 1
import pandas as pd

# Read FIPS codes as strings so any leading zeros survive (e.g. '01'),
# the same way the agency codes are read as strings below.
fips_df = pd.read_csv('data/state_fips.csv', dtype={'fips': str})
seti_df = pd.read_csv('data/seti_big.csv', dtype={'awarding_agency_code': str})

# abbreviate agency name
seti_df['awarding_agency_name'] = seti_df['awarding_agency_code'].apply(agency_abbreviation)
# add fiscal year column
seti_df['action_year'] = seti_df['action_date'].apply(fiscal_year)
# merge in state FIPS code; the left join keeps grants whose state code has no match
seti_df = seti_df.merge(fips_df, left_on='recipient_state_code', right_on='abbreviation', how='left')
# mirror the row-by-row version: unmatched states get the placeholder code '99'
seti_df['recipient_state_fips'] = seti_df['fips'].fillna('99')

print(seti_df.head(1))