In [139]:
import os
import csv
import json
import geopy as gp
import numpy as np
import pandas as pd
import pymongo as pm
from arcgis.gis import *
from arcgis.geocoding import get_geocoders, batch_geocode


In [212]:
# code provided from https://developers.arcgis.com/python/guide/batch-geocoding/

# Credentials are taken from the ARCGIS_USERNAME / ARCGIS_PASSWORD environment
# variables when they are set, so a real password never has to be typed into
# (and saved with) the notebook. The literals are the original placeholder
# values and are only used when the variables are absent.
user_name = os.environ.get('ARCGIS_USERNAME', 'apjansing3141')
password = os.environ.get('ARCGIS_PASSWORD', 'NOT_MY_PASSWORD')
my_gis = GIS('https://www.arcgis.com', user_name, password)

# Use the first geocoder registered with this GIS for all geocoding below.
geocoder = get_geocoders(my_gis)[0]

# Show the batch limits the geocoder advertises.
locator_properties = geocoder.properties.locatorProperties
print("MaxBatchSize : " + str(locator_properties.MaxBatchSize))
print("SuggestedBatchSize : " + str(locator_properties.SuggestedBatchSize))


MaxBatchSize : 1000
SuggestedBatchSize : 150


In [3]:
# Open a MongoDB client with no explicit arguments and select the "syr" database.
connection = pm.MongoClient()
syr = connection["syr"]

In [4]:
def readGeoJson(FILE_PATH):
    """Load a GeoJSON file and return its parsed contents.

    Parameters
    ----------
    FILE_PATH : path of the file to read.

    Returns
    -------
    Whatever ``json.load`` produces for the file.

    The file is opened explicitly as UTF-8 (the encoding RFC 7946 specifies
    for GeoJSON) rather than with the platform default, so non-ASCII
    property values decode the same way on every machine.
    """
    with open(FILE_PATH, 'r', encoding='utf-8') as f:
        return json.load(f)

In [9]:
def ingestGeoJsonFeatureToMongoDB(geojson, collection):
    """Insert every entry of ``geojson['features']`` into ``collection``.

    Each feature is written with its own ``collection.insert_one`` call, in
    the order the features appear. Nothing is returned.
    """
    features = geojson['features']
    for doc in features:
        collection.insert_one(doc)

In [163]:
# Folder holding the CSV datasets. File names are appended to this value by
# plain string concatenation, so the trailing slash is required.
directory = "../../datasets/"

In [82]:
def dfToCollection(df, collection):
    """Insert every row of a DataFrame into a MongoDB collection.

    Parameters
    ----------
    df : pandas.DataFrame whose rows become documents (one per row, keyed by
        column name).
    collection : object exposing ``insert_many`` (e.g. a PyMongo collection).

    Rows are round-tripped through pandas' JSON serializer so the documents
    contain only plain JSON types. ``orient="records"`` yields the row
    dictionaries directly, which avoids transposing the frame and also works
    when the index contains duplicate labels.

    ``insert_many`` replaces the deprecated ``Collection.insert`` (removed in
    PyMongo 4). It rejects an empty document list, so an empty frame is
    skipped instead of raising.
    """
    records = json.loads(df.to_json(orient="records"))
    if records:
        collection.insert_many(records)

In [120]:
def getLocs(Loc):
    """Build one GeoJSON-style Point dict per row from the 'X' and 'Y' entries of ``Loc``.

    Returns a NumPy array holding dicts of the form
    ``{"type": "Point", "coordinates": [x, y]}``, in row order.
    """
    # Stack X and Y, then transpose so each row is one (x, y) pair.
    xy_pairs = np.array([Loc['X'], Loc['Y']]).T
    points = []
    for pair in xy_pairs:
        points.append({"type": "Point", "coordinates": [pair[0], pair[1]]})
    return np.array(points)


In [121]:
# Rebuild the code_violations collection from its CSV export:
# drop -> read -> attach Point locations -> index -> insert.
syr.code_violations.drop()
code_violations_csv = directory + "Code_violations.csv"
code_violations = pd.read_csv(
    code_violations_csv,
    sep=",",
    header=0,
    dtype=None,
    na_values=None,
)
code_violations['location'] = getLocs(code_violations)
# The 2dsphere index on "location" is what the later $near queries rely on.
syr.code_violations.create_index([("location", "2dsphere")])
dfToCollection(code_violations, syr.code_violations)

  This is separate from the ipykernel package so we can avoid doing imports until


In [126]:
# Rebuild the lead_violations collection from its CSV export:
# drop -> read -> attach Point locations -> index -> insert.
syr.lead_violations.drop()
lead_violations_csv = directory + "Lead_Violation_Data.csv"
lead_violations = pd.read_csv(
    lead_violations_csv,
    sep=",",
    header=0,
    dtype=None,
    na_values=None,
)
lead_violations['location'] = getLocs(lead_violations)
syr.lead_violations.create_index([("location", "2dsphere")])
dfToCollection(lead_violations, syr.lead_violations)

  This is separate from the ipykernel package so we can avoid doing imports until


In [251]:
# Rebuild the water_services collection from its CSV export:
# drop -> read -> attach Point locations -> index -> insert.
syr.water_services.drop()
water_services_csv = directory + "Water_Services.csv"
water_services = pd.read_csv(
    water_services_csv,
    sep=",",
    header=0,
    dtype=None,
    na_values=None,
)
water_services['location'] = getLocs(water_services)
syr.water_services.create_index([("location", "2dsphere")])
dfToCollection(water_services, syr.water_services)

  This is separate from the ipykernel package so we can avoid doing imports until


In [252]:
# Rebuild the water_main_breaks collection from its CSV export:
# drop -> read -> attach Point locations -> index -> insert.
syr.water_main_breaks.drop()
water_main_breaks_csv = directory + "Water_Main_Breaks.csv"
water_main_breaks = pd.read_csv(
    water_main_breaks_csv,
    sep=",",
    header=0,
    dtype=None,
    na_values=None,
)
water_main_breaks['location'] = getLocs(water_main_breaks)
syr.water_main_breaks.create_index([("location", "2dsphere")])
dfToCollection(water_main_breaks, syr.water_main_breaks)


  This is separate from the ipykernel package so we can avoid doing imports until


In [133]:
# Take one lead violation and prepare a cursor over the single code violation
# nearest to its location. `near` is consumed by the next cell.
lead = next(syr.lead_violations.find().limit(1))
nearest_query = {"location": {"$near": {"$geometry": lead['location']}}}
near = syr.code_violations.find(nearest_query).limit(1)
lead


{'ObjectId': 1,
 'TNT_NAME': 'Northside',
 'X': -76.1610040034,
 'Y': 43.0712832611,
 '_id': ObjectId('5bde1b600921aa05cc548ce1'),
 'case_number': 'L00477',
 'case_open_date': '2018-08-09T00:00:00.000Z',
 'case_status': 'Open',
 'case_type': 'Lead Violations',
 'identifier': '002.-19-18.0',
 'lat': 43.0712832611,
 'location': {'coordinates': [-76.1610040034, 43.0712832611], 'type': 'Point'},
 'long': -76.1610040034,
 'nature_of_complaint': 'Lead Paint Inspection/Health',
 'neighborhood': 'Washington Square',
 'property_address': '1106 Carbon St',
 'property_id': '002.-19-18.0',
 'property_owner_address': '313 E. Willow St ',
 'property_owner_city': 'Syracuse',
 'property_owner_name': 'Frank Canzano',
 'property_owner_state': 'NY',
 'property_owner_zip': '"13203"',
 'property_zip': '"13208"',
 'vacant_property': 'N'}

In [134]:
# Fetch and display the document the `near` cursor points at.
next(near)

{'ObjectId': 811,
 'TNT_NAME': 'Northside',
 'X': -76.1610040034,
 'Y': 43.0712832611,
 '_id': ObjectId('5bde1b2d0921aa05cc545a4a'),
 'case_number': '2017-11140',
 'case_open_date': '2017-05-02T00:00:00.000Z',
 'case_type': 'Trash/Debris-Private, Occ',
 'comply_by_date': '2017-05-17T00:00:00.000Z',
 'inspector_id': 252,
 'lat': 43.0712832611,
 'location': {'coordinates': [-76.1610040034, 43.0712832611], 'type': 'Point'},
 'long': -76.1610040034,
 'owner_address': '313 E. Willow St ',
 'owner_city': 'Syracuse',
 'owner_state': 'NY',
 'owner_zip': '"13203"',
 'property_address': '1106 Carbon St',
 'property_id': '002.-19-18.0',
 'property_neighborhood': 'Washington Square',
 'property_owner_name': 'Frank Canzano',
 'property_zip': '"13208"',
 'vacant_property': 'N',
 'violation_date': '2017-05-02T00:00:00.000Z',
 'violation_name': 'SPCC - Section 27-72 (e) -Trash & Debris',
 'violation_status': 'Closed'}

In [188]:
# Load the two 2018 weekly crime-offense CSV exports (part 1 and part 2).
weekly_part_1_crime = pd.read_csv(
    directory + "Weekly_Part_1_Crime_Offenses_2018.csv",
    sep=",",
    header=0,
    dtype=None,
    na_values=None,
)
weekly_part_2_crime = pd.read_csv(
    directory + "Weekly_Part_2_Crime_Offenses_2018.csv",
    sep=",",
    header=0,
    dtype=None,
    na_values=None,
)


In [208]:
# Pull column 4 out of each crime table as a NumPy array.
# NOTE(review): presumably this is the street-address column - confirm against the CSV header.
part_1_values = weekly_part_1_crime.values
part_2_values = weekly_part_2_crime.values
crime_1_addresses = part_1_values[:, 4]
crime_2_addresses = part_2_values[:, 4]

In [221]:
def batch_geocode_addresses(addresses, max_num_in_batch = 150, geocode = None):
    """Geocode street addresses in fixed-size batches.

    Parameters
    ----------
    addresses : sequence of str
        Street addresses; ", Syracuse, NY" is appended to each one before it
        is sent to the geocoder.
    max_num_in_batch : int, default 150
        Maximum number of addresses per geocoder call. Must be at least 1.
    geocode : callable, optional
        Function taking a list of address strings and returning a list of
        results. Defaults to ``batch_geocode``.

    Returns
    -------
    list
        The geocoder results of all batches, concatenated in batch order.

    Raises
    ------
    ValueError
        If ``max_num_in_batch`` is smaller than 1 (the loop could otherwise
        never advance).

    Only non-empty batches are submitted. The previous ``i <= len(addresses)``
    condition issued one extra call with an empty list whenever the input was
    empty or its length was an exact multiple of the batch size.
    """
    if max_num_in_batch < 1:
        raise ValueError("max_num_in_batch must be at least 1")
    if geocode is None:
        geocode = batch_geocode

    coordinates = []
    for start in range(0, len(addresses), max_num_in_batch):
        chunk = addresses[start:start + max_num_in_batch]
        coordinates += geocode([a + ", Syracuse, NY" for a in chunk])
    return coordinates

In [225]:
# LINES COMMENTED OUT BECAUSE THEY ARE MONETARILY EXPENSIVE
# NOTE(review): `coords` and `coords2` are only assigned by the commented-out
# calls below, so the len() checks (and every later cell that reads them)
# work only while results from an earlier run are still in the kernel.

# coords = batch_geocode_addresses(crime_1_addresses)
len(coords)

# coords2 = batch_geocode_addresses(crime_2_addresses)
len(coords2)

3634

In [233]:
# Inspect the 'location' entry of the first result in coords2.
coords2[0]['location']

{'x': -76.15881953299998, 'y': 43.031716827000025}

In [234]:
def _as_geojson_point(match):
    """Build a GeoJSON-style Point dict from the 'location' x/y of one geocode result."""
    position = match['location']
    return {"type": "Point", "coordinates": [position['x'], position['y']]}


# Convert both geocoding result lists into arrays of Point dicts.
coords_1 = np.array([_as_geojson_point(match) for match in coords])
coords2_1 = np.array([_as_geojson_point(match) for match in coords2])


In [235]:
# Display the Point dicts built from `coords`.
coords_1

array([{'type': 'Point', 'coordinates': [-76.13904932099996, 43.052123288000075]},
       {'type': 'Point', 'coordinates': [-76.14701099999996, 43.05083154000005]},
       {'type': 'Point', 'coordinates': [-76.14213465099994, 43.04533000200007]},
       ...,
       {'type': 'Point', 'coordinates': [-76.17424752999995, 43.06991330900007]},
       {'type': 'Point', 'coordinates': [-76.10372816199998, 43.053524656000036]},
       {'type': 'Point', 'coordinates': [-76.17424752999995, 43.06991330900007]}],
      dtype=object)

In [239]:
# Attach the Point dicts to the crime tables as a 'location' column.
# NOTE(review): assumes each array has exactly one entry per row, in row order - confirm.
weekly_part_1_crime['location'] = coords_1
weekly_part_2_crime['location'] = coords2_1

In [247]:
# Rebuild the weekly_crime collection from both crime tables.
# The collection is dropped first, as in the other ingest cells, so that
# re-running this cell does not insert every record a second time.
syr.weekly_crime.drop()
syr.weekly_crime.create_index([("location", "2dsphere")])
dfToCollection(weekly_part_1_crime, syr.weekly_crime)
dfToCollection(weekly_part_2_crime, syr.weekly_crime)

  This is separate from the ipykernel package so we can avoid doing imports until


In [248]:
# Take one weekly_crime record and prepare a cursor over the single code
# violation nearest to it. Despite its name, `lead` holds a crime record here.
lead = next(syr.weekly_crime.find().limit(1))
nearest_query = {"location": {"$near": {"$geometry": lead['location']}}}
near = syr.code_violations.find(nearest_query).limit(1)
lead


{'ADDRESS': '500 BURNET AV',
 'Arrest': None,
 'Attempt': None,
 'CODE_DEFINED': 'LARCENY',
 'DATE': '2017-03-28T04:00:00.000Z',
 'DRNUMB': 18218804,
 'FID': 1,
 'LarcenyCode': 'All Other',
 'TIMEEND': 700,
 'TIMESTART': 700,
 '_id': ObjectId('5bde387e0921aa05cc552e2f'),
 'location': {'coordinates': [-76.139049321, 43.052123288],
  'spatialReference': {'wkid': 4326},
  'type': 'Point'}}

In [249]:
# Fetch and display the document the `near` cursor points at.
next(near)

{'ObjectId': 5256,
 'TNT_NAME': 'Northside',
 'X': -76.1389442919,
 'Y': 43.0528443064,
 '_id': ObjectId('5bde1b2d0921aa05cc547d98'),
 'case_number': '2018-27219',
 'case_open_date': '2018-09-04T00:00:00.000Z',
 'case_type': 'Overgrowth: Private, Occ',
 'comply_by_date': '2018-09-14T00:00:00.000Z',
 'inspector_id': 259,
 'lat': 43.0528443064,
 'location': {'coordinates': [-76.1389442919, 43.0528443064], 'type': 'Point'},
 'long': -76.1389442919,
 'owner_address': '4403 Oak Orchard Rd ',
 'owner_city': 'Clay',
 'owner_state': 'NY',
 'owner_zip': '"13041"',
 'property_address': '210 Howard St',
 'property_id': '030.-02-48.0',
 'property_neighborhood': 'Hawley Green',
 'property_owner_name': 'Gary Parker',
 'property_zip': '"13203"',
 'vacant_property': 'N',
 'violation_date': '2018-09-06T00:00:00.000Z',
 'violation_name': 'SPCC - Section 27-72 (f) - Overgrowth',
 'violation_status': 'Closed'}

In [213]:
# Geocode a single address; the first result's attributes are used below to pick the map's place name.
results = batch_geocode(["500 Avery Lane, Rome, NY"])

In [172]:
# if the maps don't show, run the following in your terminal
# jupyter nbextension install --py --sys-prefix arcgis
# jupyter nbextension enable --py --sys-prefix arcgis

# Open a map at "City,RegionAbbr" of the first geocoding match.
# 9 is passed as the second positional argument (presumably the zoom level).
first_match = results[0]['attributes']
place_name = ",".join([first_match['City'], first_match['RegionAbbr']])
map = my_gis.map(place_name, 9)
map

In [246]:
# Draw at most the first 200 geocoding results on the map. Iterating over a
# slice, instead of indexing range(200), avoids an IndexError when fewer than
# 200 results are available.
for match in coords[:200]:
    map.draw(match)