In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import quilt
import seaborn as sns
%matplotlib inline

# Install data dependencies
* `force=True` skips a y/N prompt.
* `hash=9eb3337...` specifies the SHA256 hash for a specific commit

In [2]:
# Pin the data package to one exact revision so every run installs the same data.
# force=True skips the y/N confirmation prompt.
PACKAGE = "akarve/nfl_fandom"
PACKAGE_HASH = "9eb3337c3aacb1e955d63eb43bec6fd66fd5b48e9beddf0fffea3f43244c8000"
quilt.install(PACKAGE, hash=PACKAGE_HASH, force=True)

Downloading 6fd94746d08129b1b8868713fcccb007e22f2f4726ae8571d421d434eb041763 (1/3)...
Fragment already installed; skipping.
Downloading 8984ecbb8a5c2ab78bd8ba932b3375a4d92f4dbec3be8eb8337a3721a55823c9 (2/3)...
Fragment already installed; skipping.
Downloading f90b58fe741cee7c53d83cf9bfa7394efaaf5147fad7218906fdcc13646e01dc (3/3)...
Fragment already installed; skipping.


The Python command shown above is equivalent to this shell command:
```
quilt install akarve/nfl_fandom --hash 9eb3337c3aacb1e955d63eb43bec6fd66fd5b48e9beddf0fffea3f43244c8000 --force
```
Note that Quilt de-duplicates data fragments, so if you already have the data installed, Quilt will skip the fragment.

# Import data package
We're going to use the akarve/nfl_fandom package:
https://quiltdata.com/package/akarve/nfl_fandom

In [3]:
from quilt.data.akarve import nfl_fandom as nfl

In [4]:
## browse the package: evaluating the package node displays its top-level contents
nfl

<PackageNode '/Users/karve/code/dsci/Jupyter/quilt_packages/akarve/nfl_fandom'>

README
google
surveymonkey

In [5]:
# attribute access alone only yields the node handle; the data is not loaded until the node is called
nfl.google

<DataNode>

In [6]:
# calling the node loads its contents into memory
goog = nfl.google() # () means "load this node from disk"
# preview the first rows; note the real column labels sit in row 0, not in the header
goog.head()

Unnamed: 0.1,Unnamed: 0,Pct. Of major sports searches,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,DMA,NFL,NBA,MLB,NHL,NASCAR,CBB,CFB,Trump 2016 Vote%
1,Abilene-Sweetwater TX,45%,21%,14%,2%,4%,3%,11%,79.13%
2,Albany GA,32%,30%,9%,1%,8%,3%,17%,59.12%
3,Albany-Schenectady-Troy NY,40%,20%,20%,8%,6%,3%,4%,44.11%
4,Albuquerque-Santa Fe NM,53%,21%,11%,3%,3%,4%,6%,39.58%


# Wrangle the data
We want a format that's conducive to visualization in matplotlib.

In [7]:
# Build `spts`, a tidy copy of the Google search table with numeric columns.
# select the first 9 columns: the DMA name, seven sports search shares, and
# the Trump 2016 vote share. Take an explicit copy so the assignments below
# write to our own frame rather than to a slice of `goog`
# (this avoids pandas' SettingWithCopyWarning).
spts = goog.iloc[:, :9].copy()
# promote row[0] to column names
spts.columns = spts.iloc[0]
# drop the first row, which is the labels
spts = spts.iloc[1:].copy()
# every column after DMA holds "XX%" strings (the vote share included), so
# convert them all to fractions in a single vectorised pass
pct_cols = spts.columns[1:9]
for c in pct_cols:
    # strip only a trailing "%" (not an arbitrary last character), then scale to 0..1
    spts[c] = spts[c].str.rstrip("%").astype(float) / 100
# now peek at our clean data
spts.head()

Unnamed: 0,DMA,NFL,NBA,MLB,NHL,NASCAR,CBB,CFB,Trump 2016 Vote%
1,Abilene-Sweetwater TX,0.45,0.21,0.14,0.02,0.04,0.03,0.11,0.7913
2,Albany GA,0.32,0.3,0.09,0.01,0.08,0.03,0.17,0.5912
3,Albany-Schenectady-Troy NY,0.4,0.2,0.2,0.08,0.06,0.03,0.04,0.4411
4,Albuquerque-Santa Fe NM,0.53,0.21,0.11,0.03,0.03,0.04,0.06,0.3958
5,Alexandria LA,0.42,0.28,0.09,0.01,0.05,0.03,0.12,0.6964


# Push the wrangled version to the package repository
Why should others have to repeat the wrangling we just accomplished?

```
## TODO use generic USERNAME as only akarve can push to akarve :)
## TODO separate this into another notebook?
# whack existing key to avoid error
del nfl.google
# use the _set method to modify the package in memory
nfl._set(["google"], spts)
# rebuild the package from in-memory (wrangled) contents
quilt.build("akarve/nfl_fandom", nfl)
# push the wrangled version
quilt.push("akarve/nfl_fandom")
```
