# HW 1 - Carlos Alvarado

### Problem 1

In [None]:
import os
import pandas as pd
import requests

def get_and_load_data(data_info):
    '''
    Loads a dataset from its local CSV cache, or downloads it if the cache
    file is not present, then renames the requested columns.

    The cache file is "./data/<source>.csv", where <source> is
    data_info['source'].

    Input: dictionary with information about required data:
        'source' - dataset identifier; names the cache file and is passed
                   to helper_download_data when a download is needed
        'rename' - optional list of (old_column, new_column) pairs
    Returns: pandas data_frame for the dataset, with columns renamed
    '''
    source = data_info['source']
    filepath = './data/{}.csv'.format(source)

    if os.path.exists(filepath):
        data = pd.read_csv(filepath)
    else:
        # Pass the dataset identifier explicitly; the previous code referenced
        # an undefined name here and raised NameError on any uncached run.
        data = helper_download_data(filepath, source)

    # One rename call with the full mapping instead of one call per pair.
    # The row index is left untouched (it used to be converted to strings).
    column_map = dict(data_info.get('rename', ()))
    if column_map:
        data = data.rename(columns=column_map)

    return data
    
def helper_download_data(filepath, data_name, limit=50000):
    '''
    Downloads a dataset from data.cityofchicago.org page by page and caches
    it as a CSV file.

    Pages of `limit` rows are requested at increasing offsets until the
    server answers with an empty page. The CSV is only written when the
    download finished cleanly and produced rows, so a failed request never
    leaves a partial file behind to be mistaken for the full dataset.

    Input:
        filepath  - path of the CSV file to write
        data_name - dataset identifier used in the resource URL
        limit     - number of rows requested per page
    Returns: pandas data_frame with every row downloaded. After a non-200
             response this is whatever was fetched before the failure
             (possibly empty), and nothing is cached.
    '''
    import io  # only needed here, to hand the response body to read_json

    url_template = ('https://data.cityofchicago.org/resource/'
                    '{}.json?$limit={}&$offset={}')
    pages = []
    offset = 0
    complete = False

    while True:
        url = url_template.format(data_name, limit, offset)
        print('getting data from', url)

        # A timeout keeps a stalled connection from hanging forever.
        response = requests.get(url, timeout=120)

        if response.status_code != 200:
            print('Failed to download data')
            break

        # An empty JSON array marks the end of the dataset.
        if not response.json():
            complete = True
            break

        # read_json keeps pandas' dtype inference; wrapping the text in
        # StringIO avoids passing a raw JSON literal, which is deprecated.
        pages.append(pd.read_json(io.StringIO(response.text)))
        offset += limit

    # Concatenate once, with a fresh index, rather than growing a frame
    # inside the loop and repeating index labels on every page.
    data = pd.concat(pages, ignore_index=True) if pages else pd.DataFrame({})

    # save data to csv for future use, but only if it is complete
    if complete and not data.empty:
        directory = os.path.dirname(filepath)
        if directory:
            os.makedirs(directory, exist_ok=True)
        # index=False so reloading does not gain an "Unnamed: 0" column
        data.to_csv(filepath, index=False)

    return data
    

In [None]:
# Each entry describes one dataset:
#   'name'   - label stored in the 'Complaint Type' column
#   'source' - dataset identifier handed to get_and_load_data
#   'rename' - (old, new) column pairs mapping dataset-specific column names
#              onto the names shared across datasets (e.g. 'Sub Type')
DATA1 = {'name': 'Graffiti Removal',
         'source': 'hec5-y4x5',
         'rename': [('where_is_the_graffiti_located_', 'Sub Type')]
        }

DATA2 = {'name': 'Vacant and Abandoned Buildings Reported',
         'source': '7nii-7srd',
         'rename': [('is_building_open_or_boarded_', 'Sub Type'),
                    ('date_service_request_was_received', 'creation_date')]
        }

DATA3 = {'name': 'Pot Holes Reported',
         'source': '7as2-ds3y',
         'rename': [('zip', 'zip_code'),
                    ('type_of_service_request', 'Sub Type')]
        }

DATA4 = {'name': 'Sanitation Code Complaints',
         'source': 'me59-5fac',
         'rename': [('what_is_the_nature_of_this_code_violation_', 'Sub Type')]
        }

SOURCES = [DATA1, DATA2, DATA3, DATA4]

# Collect the per-dataset frames and concatenate once at the end; growing
# `complaints` inside the loop re-copies every earlier row on each pass.
frames = []

for db_data in SOURCES:
    data = get_and_load_data(db_data)
    print(data.columns)
    data['Complaint Type'] = db_data['name']
    frames.append(data)

complaints = pd.concat(frames, ignore_index=True)

# sample() raises when asked for more rows than exist, so cap the size.
sample_size = min(1000, len(complaints))
complaints.sample(sample_size).to_csv('./data/sample.csv')



In [None]:
# Display every column of the record labelled 10.
complaints.loc[10, :]

In [None]:
# Count how many rows fall under each complaint type.
complaint_types = complaints['Complaint Type']
ctcounts = complaint_types.value_counts()
ctcounts

In [None]:
# Number of rows for every (Complaint Type, Sub Type) combination.
grouping_keys = ['Complaint Type', 'Sub Type']
subtype_sizes = complaints.groupby(grouping_keys).size()
gb = subtype_sizes.to_frame()
print(gb)

In [None]:
# Number of rows for every (Complaint Type, community_area) combination.
gb_community_area = complaints.groupby(['Complaint Type', 'community_area']).size().to_frame()
# to_frame() on the unnamed size() result labels its only column with the
# integer 0, so the string key '0' raised KeyError.
select = gb_community_area[0]
print(gb_community_area)

### Problem 2

In [None]:
import geopandas as gpd
import os

# Read the boundaries shapefile and preview its first row.
data_pth = "./data/boundaries/"
shapefile_name = "geo_export_cdf3020f-dc9a-4e8b-ab13-a2db8609750f.shp"
shapefile_path = os.path.join(data_pth, shapefile_name)
blocks = gpd.read_file(shapefile_path)
blocks.head(1)

In [5]:
import geopandas as gpd
import os


/home/calvarado/ml_for_pp_homeworks/hw1


In [None]:
import pandas as pd
from geopandas import GeoDataFrame
from shapely.geometry import Point

# Reload the saved sample and turn each row's coordinates into a point.
complaints = pd.read_csv('./data/sample.csv')

coordinate_pairs = zip(complaints['longitude'], complaints['latitude'])
geometry = list(map(Point, coordinate_pairs))

# The coordinates now live in `geometry`, so drop the raw columns.
complaints = complaints.drop(columns=['latitude', 'longitude'])

crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(complaints, crs=crs, geometry=geometry)
geo_df.geometry.head(1)

In [None]:
# Preview the first five geometries.
geo_df['geometry'][:5]

In [None]:
from geopandas.tools import sjoin

# Left spatial join: every row of geo_df is kept and gains the columns of
# the matching row in blocks, if any.
cities_with_country = sjoin(left_df=geo_df, right_df=blocks, how="left")

In [None]:
# Display every column of the record labelled 10.
geo_df.loc[10, :]

### Problem 3