# HW 1 - Carlos Alvarado

### Problem 1

In [5]:
import os
import pandas as pd
import requests

def get_and_load_data(data_info):
    '''
    Loads the dataset described by data_info from local disk, or downloads
    it if it's not present, and renames its columns.

    The local copy is looked up at "./data/<source>.csv".

    Input: dictionary with information about required data
        'source': dataset id; used for the local file name and handed to
                  helper_download_data when the file is missing
        'rename': iterable of (old_column, new_column) pairs
    Returns: pandas data_frame with the requested columns renamed
    '''

    source = data_info['source']
    filepath = './data/{}.csv'.format(source)

    if os.path.exists(filepath):
        data = pd.read_csv(filepath)
    else:
        # The dataset id comes from data_info; the previous code referenced
        # an undefined name here and raised NameError on a cache miss.
        data = helper_download_data(filepath, source)

    for old_var, new_var in data_info['rename']:
        # index=str is kept as before: besides renaming the column it maps
        # the row labels through str().
        data.rename(index=str, columns={old_var: new_var}, inplace=True)

    return data
    
def helper_download_data(filepath, data_name):
    '''
    Downloads a dataset from data.cityofchicago.org page by page and saves
    it as csv for future use.

    Input:
        filepath: where the downloaded data is written as csv
        data_name: resource id placed in the download url
    Returns: pandas data_frame with every downloaded row (empty data_frame
        if nothing could be downloaded)

    The csv is only written when the download ran to completion, so a failed
    request never leaves a partial file that would later be mistaken for the
    full dataset.
    '''
    import io  # only needed to hand the response body to pd.read_json

    offset = 0
    limit = 50000
    pages = []
    complete = False

    while True:

        url = 'https://data.cityofchicago.org/resource/{}.json?$limit={}&$offset={}'.format(
            data_name, limit, offset)
        print('getting data from', url)

        response = requests.get(url)

        if response.status_code != 200:
            print('Failed to download data')
            break

        json_data = response.content

        # A body this short holds no records, which marks the end of paging
        if len(json_data) <= 4:
            print(json_data)
            complete = True
            break

        # Wrap the bytes in a file-like object rather than passing raw JSON
        pages.append(pd.read_json(io.BytesIO(json_data)))
        offset = offset + limit

    # Concatenate once at the end; ignore_index avoids the repeated
    # 0..limit-1 row labels each page would otherwise contribute.
    if pages:
        data = pd.concat(pages, ignore_index=True)
    else:
        data = pd.DataFrame({})

    #save data to csv for future use (only complete, non-empty downloads)
    if complete and pages:
        directory = os.path.dirname(filepath)
        if directory:
            os.makedirs(directory, exist_ok=True)
        # index=False keeps the row labels out of the file so that reloading
        # it does not add an extra 'Unnamed: 0' column.
        data.to_csv(filepath, index=False)

    return data
    

In [6]:
# One description per 311 dataset:
#   name   -> label stored in the 'Complaint Type' column
#   source -> dataset id handed to get_and_load_data
#   rename -> (old_column, new_column) pairs applied after loading
DATA1 = dict(
    name='Graffiti Removal',
    source='hec5-y4x5',
    rename=[('where_is_the_graffiti_located_', 'Sub Type')],
)

DATA2 = dict(
    name='Vacant and Abandoned Buildings Reported',
    source='7nii-7srd',
    rename=[
        ('is_building_open_or_boarded_', 'Sub Type'),
        ('date_service_request_was_received', 'creation_date'),
    ],
)

DATA3 = dict(
    name='Pot Holes Reported',
    source='7as2-ds3y',
    rename=[
        ('zip', 'zip_code'),
        ('type_of_service_request', 'Sub Type'),
    ],
)

DATA4 = dict(
    name='Sanitation Code Complaints',
    source='me59-5fac',
    rename=[('what_is_the_nature_of_this_code_violation_', 'Sub Type')],
)

SOURCES = [DATA1, DATA2, DATA3, DATA4]

# Stack every dataset into a single frame, tagging rows with their origin
complaints = pd.DataFrame()

for db_data in SOURCES:
    data = get_and_load_data(db_data)
    print(data.columns)
    data['Complaint Type'] = db_data['name']
    complaints = pd.concat((complaints, data), ignore_index=True)

Index(['Unnamed: 0', 'community_area', 'completion_date', 'creation_date',
       'latitude', 'location', 'longitude', 'police_district',
       'service_request_number', 'ssa', 'status', 'street_address',
       'type_of_service_request', 'ward',
       'what_type_of_surface_is_the_graffiti_on_', 'Sub Type', 'x_coordinate',
       'y_coordinate', 'zip_code'],
      dtype='object')
Index(['Unnamed: 0', 'address_street_direction', 'address_street_name',
       'address_street_number', 'address_street_suffix',
       'any_people_using_property_homeless_childen_gangs_', 'community_area',
       'creation_date', 'if_the_building_is_open_where_is_the_entry_point_',
       'Sub Type', 'is_the_building_currently_vacant_or_occupied_',
       'is_the_building_dangerous_or_hazardous_',
       'is_the_building_vacant_due_to_fire_', 'latitude', 'location',
       'location_of_building_on_the_lot_if_garage_change_type_code_to_bgd_',
       'longitude', 'police_district', 'service_request_number',
 

  if self.run_code(code, result):


In [None]:
#create sample database to play with
# Cap the sample size at the number of rows available: asking sample() for
# more rows than exist (without replacement) raises ValueError.
complaints.sample(min(1000, len(complaints))).to_csv('./data/sample.csv')

In [None]:
# Look at a single record: the row whose index label is 10
complaints.loc[10]

In [None]:
# Number of rows for each value of 'Complaint Type'
ctcounts = complaints['Complaint Type'].value_counts()
ctcounts

In [None]:
# Row count for every (complaint type, sub type) combination, as a one-column frame
subtype_sizes = complaints.groupby(by=['Complaint Type', 'Sub Type']).size()
gb = subtype_sizes.to_frame()
print(gb)

In [None]:
# Row count for every (complaint type, community area) combination
gb_community_area = complaints.groupby(['Complaint Type', 'community_area']).size().to_frame()
# size() yields an unnamed Series, so to_frame() labels its only column with
# the integer 0; selecting it with the string '0' raises KeyError.
select = gb_community_area[0]
print(gb_community_area)

### Problem 2

In [7]:
# Adding Block ID to 311 Requests data 
import geopandas as gpd
import pandas as pd
from geopandas import GeoDataFrame
from geopandas.tools import sjoin
from shapely.geometry import Point

# Boundary shapes used to attach a block id to every request
blocks = gpd.read_file('./data/cb_2015_17_bg_500k/cb_2015_17_bg_500k.shp')
# Alternative boundary file, kept for reference:
#blocks = gpd.read_file('./data/boundaries/boundaries_census.geojson')

# Uncomment to work from the saved sample instead of the full data:
#complaints = pd.read_csv('./data/sample.csv')

# Only requests that have both coordinates can be placed on the map
has_coords = complaints['latitude'].notnull() & complaints['longitude'].notnull()
clean_data = complaints[has_coords]

# One point per remaining request, in (longitude, latitude) order
geometry = [Point(lon, lat) for lon, lat in zip(clean_data.longitude, clean_data.latitude)]
clean_data = clean_data.drop(['latitude', 'longitude'], axis='columns')

crs = dict(init='epsg:4269')
geo_complaints = GeoDataFrame(clean_data, geometry=geometry, crs=crs)

# Left join keeps every request, matched or not
data_with_blockid = sjoin(geo_complaints, blocks, how='left')



In [22]:
# Rebuild the geo frame from clean_data/geometry and redo the spatial join
crs = dict(init='epsg:4269')
geo_complaints = GeoDataFrame(clean_data, geometry=geometry, crs=crs)

data_with_blockid = sjoin(geo_complaints, blocks, how='left')

In [14]:
# Getting census data
import requests
import pandas as pd

def get_data_census(varname, label):
    '''
    Downloads one variable from the 2015 ACS 5-year census API for every
    block group in state 17, county 031.

    Input:
        varname: census variable code to request
        label: column name given to that variable in the result
    Returns: pandas data_frame with the columns returned by the API
        (varname renamed to label) plus a 'GEOID' column made by joining
        state, county, tract and block group
    Raises: requests.HTTPError when the API answers with an error status
    '''

    census_api_url = 'http://api.census.gov/data/' + \
                    '2015/acs5?get=NAME,' + varname + \
                    '&for=block+group:*&in=state:17&in=county:031&in=tract:*'

    response = requests.get(census_api_url)
    # Stop on an HTTP error instead of failing later while parsing the body
    response.raise_for_status()

    # The payload is a list of rows whose first row holds the column names,
    # so build the frame from it directly instead of renaming a fixed number
    # of positional columns and dropping the header row afterwards.
    header, *rows = response.json()
    data = pd.DataFrame(rows, columns=header)

    data.rename(columns={varname: label}, inplace=True)

    data['GEOID'] = data['state'].map(str) + data['county'].map(str) + \
                            data['tract'].map(str) + data['block group'].map(str)

    print('downloaded', varname)

    return data

# (census variable code, column label) pairs to download
download_data = [('B01003_001E', 'Total Population'),
                ('B02001_002E', 'White Population'),
                ('B19013_001E', 'Median Household Income'),
                ('B06009_002E', 'Less than highschool')]

# Combine the per-variable downloads into one table keyed by GEOID
for i, (varname, label) in enumerate(download_data):
    downloaded = get_data_census(varname, label)
    if i == 0:
        census_data = downloaded
    else:
        # Every download carries the same NAME/state/county/tract/block group
        # columns. Merging them again on GEOID alone produces _x/_y suffixed
        # duplicates that collide on later merges, so only the new variable
        # is brought over.
        census_data = census_data.merge(downloaded[['GEOID', label]],
                                        on='GEOID', how='outer')

# Attach the census variables to the requests that have a block GEOID
data_with_blockid2 = data_with_blockid.merge(census_data, on='GEOID')

In [23]:
# Redo the merge of the requests with the census table on GEOID
# (default inner join: rows without a matching GEOID are dropped)
data_with_blockid2 = data_with_blockid.merge(census_data, on='GEOID')    

In [None]:
# Look at the merged record at position 10
data_with_blockid2.iloc[10]

### Problem 3

In [None]:
# How many distinct blockce10 values there are (missing values not counted)
blocks2.blockce10.nunique()

In [None]:
# List the columns available in blocks2
# NOTE(review): blocks2 is not defined in the visible cells - confirm where it is loaded
blocks2.columns

In [None]:
# Number of blocks2 rows for each countyfp10 value
blocks2.countyfp10.value_counts()

In [None]:
# Draw the maps inside the notebook
%matplotlib inline
import matplotlib
# NOTE(review): the trailing ';' presumably keeps the notebook from echoing
# the value returned by plot() - confirm before removing it
blocks.plot();

# White base map; the axes it returns are reused for the next layer
# NOTE(review): blocks2 and geo_sel are not defined in the visible cells
base = blocks2.plot(color='white')

# Overlay the selected points as small red circles on the same axes
geo_sel.plot(ax=base, marker='o', color='red', markersize=5);