# HW 1 - Carlos Alvarado

### Problem 1

In [8]:
import os
import pandas as pd
import requests

def get_and_load_data(data_info):
    '''
    Loads "data_name.csv" from local disk, or downloads it if it's not present
    
    Input: dictionary with information about required data
    Returns: pandas data_frame for "data_name"
    '''
    
    filepath = './data/{}.csv'.format(data_info['source'])
    
    if os.path.exists(filepath):
        data = pd.read_csv(filepath)
    else:
        data = helper_download_data(filepath, data_name)
    
    for old_var, new_var in data_info['rename']:
        data.rename(index=str, columns={old_var: new_var}, inplace=True)
        
    return data
    
def helper_download_data(filepath, data_name):
    offset = 0
    limit = 50000
    data = pd.DataFrame({})
    
    while True:
        
        url = 'https://data.cityofchicago.org/resource/{}.json?$limit={}&$offset={}'.format(
            data_name, limit, offset)
        print('getting data from', url)
    
        response = requests.get(url)
        response_code = response.status_code
        
        if response_code != 200: 
            print('Failed to download data')
            break
            
        json_data = response.content

        if len(json_data) > 4:
            data = pd.concat([data, pd.read_json(json_data)])
        
            offset = offset + limit
        else:
            print(json_data)
            break
            
    #save data to csv for future use
    data.to_csv(filepath)
    
    return data
    

In [9]:
DATA1 = {'name': 'Graffiti Removal',
         'source': 'hec5-y4x5',
         'rename': [('where_is_the_graffiti_located_', 'Sub Type')]    
        }

DATA2 = {'name': 'Vacant and Abandoned Buildings Reported',
         'source': '7nii-7srd',
         'rename': [('is_building_open_or_boarded_', 'Sub Type'), 
                    ('date_service_request_was_received', 'creation_date')]
        }

DATA3 = {'name': 'Pot Holes Reported',
         'source': '7as2-ds3y',
         'rename': [('zip', 'zip_code'), 
                    ('type_of_service_request', 'Sub Type')]
        }
    
DATA4 = {'name': 'Sanitation Code Complaints',
         'source': 'me59-5fac',
         'rename': [('what_is_the_nature_of_this_code_violation_', 'Sub Type')]
        }

SOURCES = [DATA1, DATA2, DATA3, DATA4]

complaints = pd.DataFrame({})

for db_data in SOURCES:
    data = get_and_load_data(db_data)
    print(data.columns)
    data['Complaint Type'] = db_data['name']
    complaints = pd.concat([complaints, data], ignore_index=True)

complaints.sample(1000).to_csv('./data/sample.csv')



Index(['Unnamed: 0', 'community_area', 'completion_date', 'creation_date',
       'latitude', 'location', 'longitude', 'police_district',
       'service_request_number', 'ssa', 'status', 'street_address',
       'type_of_service_request', 'ward',
       'what_type_of_surface_is_the_graffiti_on_', 'Sub Type', 'x_coordinate',
       'y_coordinate', 'zip_code'],
      dtype='object')
Index(['Unnamed: 0', 'address_street_direction', 'address_street_name',
       'address_street_number', 'address_street_suffix',
       'any_people_using_property_homeless_childen_gangs_', 'community_area',
       'creation_date', 'if_the_building_is_open_where_is_the_entry_point_',
       'Sub Type', 'is_the_building_currently_vacant_or_occupied_',
       'is_the_building_dangerous_or_hazardous_',
       'is_the_building_vacant_due_to_fire_', 'latitude', 'location',
       'location_of_building_on_the_lot_if_garage_change_type_code_to_bgd_',
       'longitude', 'police_district', 'service_request_number',
 

  if self.run_code(code, result):


In [None]:
complaints.loc[10]

In [None]:
ctcounts = complaints['Complaint Type'].value_counts()
ctcounts

In [None]:
gb = complaints.groupby(['Complaint Type', 'Sub Type']).size().to_frame()
print(gb)

In [None]:
gb_community_area = complaints.groupby(['Complaint Type', 'community_area']).size().to_frame()
select = gb_community_area['0'] 
print(gb_community_area)

### Problem 2

In [1]:
import geopandas as gpd
import os

data_pth = "./data/boundaries/"
blocks = gpd.read_file(os.path.join(data_pth, "geo_export_cdf3020f-dc9a-4e8b-ab13-a2db8609750f.shp"))
blocks.head(1)

Unnamed: 0,blockce10,countyfp10,geoid10,geometry,name10,statefp10,tract_bloc,tractce10
0,2010,31,170316903002010,POLYGON ((-87.62906799941059 41.76908600007152...,Block 2010,17,6903002010,690300


In [10]:
import pandas as pd
from geopandas import GeoDataFrame
from shapely.geometry import Point

#complaints = pd.read_csv('./data/sample.csv')

geometry = [Point(xy) for xy in zip(complaints.longitude, complaints.latitude)]
complaints = complaints.drop(['latitude', 'longitude'], axis=1)
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(complaints, crs=crs, geometry=geometry)
geo_df.head(1).geometry

0    POINT (-87.67258908007462 41.82667996575032)
Name: geometry, dtype: object

In [3]:
geo_df.geometry[:5]

0    POINT (-87.62398757521471 41.89752669856311)
1    POINT (-87.69095886166781 41.82105560481256)
2     POINT (-87.73100420024738 41.8479242100189)
3    POINT (-87.65809289527267 41.77218734497791)
4    POINT (-87.70421646490921 41.94329691488169)
Name: geometry, dtype: object

In [11]:
from geopandas.tools import sjoin
data_with_blockid = sjoin(geo_df, blocks, how="left")

RTreeError: Coordinates must not have minimums more than maximums

In [12]:
data_with_blockid['countyfp10'].value_counts()

031    1000
Name: countyfp10, dtype: int64

In [27]:
for community in list(complaints['community_area'].value_counts().index):
    sel_data = complaints['community_area'] == community
    data = complaints[sel_data]
    print(len(data))
    try:
        print('OK!')
        data_with_blockid = sjoin(data, blocks, how="left")
        print(data_with_blockid.loc[10])
    except:
        print('FAILED')
        print(data['community_area'].value_counts())

77110
OK!
FAILED
24    77110
Name: community_area, dtype: int64
72851
OK!
FAILED
22    72851
Name: community_area, dtype: int64
68387
OK!
FAILED
30    68387
Name: community_area, dtype: int64
52931
OK!
FAILED
19    52931
Name: community_area, dtype: int64
51979
OK!
FAILED
31    51979
Name: community_area, dtype: int64
51690
OK!
FAILED
58    51690
Name: community_area, dtype: int64
43783
OK!
FAILED
16    43783
Name: community_area, dtype: int64
41380
OK!
FAILED
21    41380
Name: community_area, dtype: int64
40289
OK!
FAILED
63    40289
Name: community_area, dtype: int64
36855
OK!
FAILED
6    36855
Name: community_area, dtype: int64
32749
OK!
FAILED
2    32749
Name: community_area, dtype: int64
32282
OK!
FAILED
15    32282
Name: community_area, dtype: int64
31188
OK!
FAILED
25    31188
Name: community_area, dtype: int64
31165
OK!
FAILED
28    31165
Name: community_area, dtype: int64
31039
OK!
FAILED
14    31039
Name: community_area, dtype: int64
30153
OK!
FAILED
66    30153
Name: communi

### Problem 3