# Exercise 1. Create a Dataset
Let's use [Socrata-py](https://github.com/socrata/socrata-py) to create a dataset from a CSV.

## Import Libraries

In [4]:
import os
import pandas as pd

from socrata.authorization import Authorization
from socrata import Socrata

## Setup Authentication
- Enter either your Socrata user name and password, or an [API key](https://socrataapikeys.docs.apiary.io) key ID and key secret, as the user name and password values respectively
- Enter your domain if you have publisher or admin access

In [5]:
# Connection settings: on lab machines, replace the environment variable
# lookups below with your own credentials.
domain = 'alicia.data.socrata.com'

# Credentials are read from the environment so they never appear in the notebook.
user_name = os.environ['SOCRATA_KEY_ID']
password = os.environ['SOCRATA_KEY_SECRET']

# Build the authorization object, then the client that later cells reuse.
auth = Authorization(domain, user_name, password)
socrata = Socrata(auth)

## Data
- Household Median Income data for all Arizona places has been saved to `../data/census_detail_household_median_income5.csv`
- Review the first and last 5 records
- Note that the Census returns certain negative values (e.g. `-666666666`) when a valid estimate is not available for a given variable and geography. For more information, see https://census.gov/data/developers/data-sets/acs-1year/notes-on-acs-estimate-and-annotation-values.html

In [7]:
# Metadata used when the dataset is created on the Socrata domain.
dataset_name = 'Arizona Places Median Household Income'
dataset_description = 'source = file'

# Load the CSV into a DataFrame and preview its first five records.
file_name = '../data/census_detail_household_median_income5.csv'
data = pd.read_csv(filepath_or_buffer=file_name)
data.head(n=5)

Unnamed: 0,name,type,variable_description,variable,value,annotation,year,date,geography_id,change_rate
0,"Aguila CDP, Arizona",place,Median Household Income,B19013_001E,33125.0,,2011,2011-12-31,1600000US0400730,
1,"Ajo CDP, Arizona",place,Median Household Income,B19013_001E,25181.0,,2011,2011-12-31,1600000US0400870,
2,"Ak Chin CDP, Arizona",place,Median Household Income,B19013_001E,-666666666.0,Either no sample observations or too few sampl...,2011,2011-12-31,1600000US0400940,
3,"Ak-Chin Village CDP, Arizona",place,Median Household Income,B19013_001E,33083.0,,2011,2011-12-31,1600000US0401090,
4,"Alamo Lake CDP, Arizona",place,Median Household Income,B19013_001E,35938.0,,2011,2011-12-31,1600000US0401170,


In [46]:
# Preview the last five records of the loaded DataFrame.
data.tail(n=5)

Unnamed: 0,name,type,variable_description,variable,value,annotation,year,date,geography_id,change_rate
3152,"Scenic CDP, Arizona",place,Median Household Income,B19013_001E,37174.0,,2017,2017-12-31,1600000US0464650,-14.7131
3153,"Hackberry CDP, Arizona",place,Median Household Income,B19013_001E,46985.0,,2017,2017-12-31,1600000US0430830,3.216098
3154,"Katherine CDP, Arizona",place,Median Household Income,B19013_001E,32857.0,,2017,2017-12-31,1600000US0436920,-0.8090566
3155,"Wikieup CDP, Arizona",place,Median Household Income,B19013_001E,-666666666.0,Either no sample observations or too few sampl...,2017,2017-12-31,1600000US0482880,-1741608.0
3156,"Lazy Y U CDP, Arizona",place,Median Household Income,B19013_001E,126532.0,,2017,2017-12-31,1600000US0440400,2.73957


## Publish dataset directly from the file
- Provide a name for dataset (minimum requirement)
- Set description as `source = file`

In [47]:
# Upload + Transform step.
# The CSV is opened in binary mode and closed as soon as the `with` block exits.
with open(file_name, 'rb') as csv_file:
    # revision is the *change* to the view in the catalog, which has not yet
    # been applied.
    # output is the OutputSchema, which is a change to data which can be
    # applied via the revision.
    (revision, output) = socrata.create(
        name=dataset_name,
        description=dataset_description
    ).csv(csv_file)

# Apply the revision - this will make it public and available to make
# visualizations from.
(ok, job) = revision.apply(output_schema=output)
# Stop here if the apply was rejected, instead of printing a URL for a
# dataset that was never published.
assert ok, job

# Now we can get the unique dataset id and build the dataset's URL.
dataset_id = revision.view_id()
dataset_url = 'https://' + domain + '/d/' + dataset_id

print(dataset_url)

https://alicia.data.socrata.com/d/viac-85k7


## Publish dataset directly from the Pandas dataframe
- Keep the same dataset name
- Set description as `source = pandas`
- Set any infinite values that may exist in `change_rate` to NA
- Set NaNs to blanks

In [48]:
# Prepare the frame for upload: turn infinite values into NaN, then blank out
# every NaN. An explicit replace is used instead of the global
# `pd.options.mode.use_inf_as_na` switch, which is deprecated in recent pandas
# releases and would change NaN handling for the rest of the session.
data = data.replace([float('inf'), float('-inf')], float('nan')).fillna('')

dataset_description = 'source = pandas'

# revision is the pending change to the view in the catalog; output is the
# OutputSchema produced from the DataFrame.
(revision, output) = socrata.create(
    name=dataset_name,
    description=dataset_description
).df(data)

# Apply the revision - this will make it public and available to make
# visualizations from.
(ok, job) = revision.apply(output_schema=output)
# Stop here if the apply was rejected, instead of printing a URL for a
# dataset that was never published.
assert ok, job

# Now we can get the unique dataset id and build the dataset's URL.
dataset_id = revision.view_id()
dataset_url = 'https://' + domain + '/d/' + dataset_id

print(dataset_url)

https://alicia.data.socrata.com/d/ykjb-n8a8


## Make dataset private
https://github.com/socrata/socrata-py#revisions

In [None]:
# Make the dataset private by applying an update revision that carries the
# new permission (no data is uploaded in this cell).
permission = 'private'

# Look up the view for the dataset id produced by the previous publish step.
(ok, view) = socrata.views.lookup(dataset_id)
assert ok, view

# Open an update revision with the same name/description and the new permission.
(ok, revision) = view.revisions.create_update_revision(
    metadata={
        'name': dataset_name,
        'description': dataset_description
    },
    permission=permission
)
assert ok, revision

# Apply the revision and check the result like every other step above, so a
# failed permission change does not go unnoticed.
(ok, job) = revision.apply()
assert ok, job