# Memory Workarounds

## Informed Reading

In [None]:
import numpy as np
import pandas as pd

# Pull in only the first 5 rows: enough to inspect the column names
# before deciding which columns to load.
recent_contracts_subset = pd.read_csv(
    's3://usaspending-files/contracts_prime_transactions_last_three_months.csv',
    nrows=5,
)

# Last expression of the cell: the column names as a plain Python list.
recent_contracts_subset.columns.tolist()

In [None]:
# Columns to load from the contracts file; everything else is skipped.
columns = [
    'award_id_piid',
    'action_date',
    'awarding_agency_name',
    'primary_place_of_performance_state_name',
    'primary_place_of_performance_county_name',
    'naics_description',
    'award_type',
    'federal_action_obligation',
]

# Explicit dtypes for a subset of those columns: two are read as text,
# and the obligation amount is read as a float.
dtypes = {
    **dict.fromkeys(('action_date', 'awarding_agency_name'), np.dtype(str)),
    'federal_action_obligation': np.dtype(float),
}

In [None]:
# Load the local copy of the file, restricted to the selected columns and
# parsed with the explicit dtypes.
# Alternative source (S3):
#   's3://usaspending-files/contracts_prime_transactions_last_three_months.csv'
recent_contracts = pd.read_csv(
    'data/contracts_prime_transactions_last_three_months.csv',
    usecols=columns,
    dtype=dtypes,
)
# Show the last rows as the cell output.
recent_contracts.tail()

## Chunk


In [None]:
# With chunksize set, read_csv hands back an iterator that yields the file
# 25,000 rows at a time instead of a single DataFrame.
# Alternative source (S3):
#   's3://usaspending-files/contracts_prime_transactions_last_three_months.csv'
contract_chunks = pd.read_csv(
    'data/contracts_prime_transactions_last_three_months.csv',
    usecols=columns,
    dtype=dtypes,
    chunksize=25000,
)
# Collect every chunk and stitch them into one frame; ignore_index=True
# renumbers the rows so the per-chunk indexes are not carried over.
recent_contracts_chunked = pd.concat(list(contract_chunks), ignore_index=True)
recent_contracts_chunked.tail()

## Divide and Conquer

There are a few projects that provide a wrapper around the pandas DataFrame to distribute the data and computation.

### Dask
Parallel computing with task scheduling. Two components:

1. Dynamic task scheduling
2. "Big Data" collections

Provides a parallelized DataFrame object based on the pandas DataFrame.

    >>> import dask.dataframe as dd
    >>> df = dd.read_csv('2014-*.csv')

![dask](https://www.continuum.io/sites/default/files/7-anaconda-and-dask.gif)

### Ray

This is bleeding-edge and interesting, but not yet fully functional.

In [None]:
# pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.3.1-cp36-cp36m-macosx_10_6_intel.whl
# ray wheel list: https://pypi.python.org/pypi/ray/0.3.1
from datetime import datetime

import ray.dataframe as pd2


def fiscal_year(yyyymmdd):
    """Map a 'YYYY-MM-DD' date string to its federal fiscal year.

    The federal fiscal year begins in October, so dates from October
    through December count toward the next calendar year's fiscal year;
    all other dates keep their calendar year.
    """
    parsed = datetime.strptime(yyyymmdd, '%Y-%m-%d')
    # month 10 (October) or later rolls over into the following fiscal year
    return parsed.year + 1 if parsed.month >= 10 else parsed.year

def agency_abbreviation(agency_code):
    """Translate an agency code string into a short agency name.

    The code is left-padded with zeros to three characters before the
    lookup; any code without a known abbreviation yields 'UNKNOWN'.
    """
    short_names = {
        '080': 'NASA',
        '049': 'NSF',
        '014': 'DOI',
    }
    return short_names.get(agency_code.zfill(3), 'UNKNOWN')

# Load the state FIPS lookup table and the first 30,000 rows of the awards
# data. The agency code is read as a string because agency_abbreviation()
# calls str.zfill() on it.
fips_df = pd2.read_csv('data/state_fips.csv')
seti_df = pd2.read_csv('data/seti_big.csv', nrows=30000, dtype={'awarding_agency_code': str})

# abbreviate agency name
# The result is bound to a name: previously the value returned by apply()
# was thrown away, so this step had no observable effect.
# NOTE(review): the results are kept as separate objects rather than assigned
# back as columns, since this early ray.dataframe release may not support
# column assignment — TODO confirm and attach to seti_df once it does.
agency_abbreviations = seti_df['awarding_agency_code'].apply(agency_abbreviation)
# compute fiscal year for each action date (kept for the same reason)
fiscal_years = seti_df['action_date'].apply(fiscal_year)
# merge in state FIPS code
# seti_df = seti_df.merge(fips_df, left_on='recipient_state_code', right_on='abbreviation', how='left')
seti_df.head()