First, load the 311 complaints dataset available here https://data.cityofnewyork.us/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9

In [2]:
CSV_PATH = './data/311.csv'
import subprocess
from tqdm import tqdm
import pandas as pd 
import os
from h3 import h3
import warnings
warnings.filterwarnings('ignore')


def file_len(fname):
    p = subprocess.Popen(['wc', '-l', fname], stdout=subprocess.PIPE, 
                                              stderr=subprocess.PIPE)
    result, err = p.communicate()
    if p.returncode != 0:
        raise IOError(err)
    return int(result.strip().split()[0])+1

n_rows = file_len(CSV_PATH)
print (f'Exact number of rows: {n_rows}')
df_tmp = pd.read_csv(CSV_PATH, nrows=5)
df_tmp.head()


Exact number of rows: 27382105


Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,...,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Latitude,Longitude,Location
0,27797840,04/07/2014 04:18:00 PM,04/08/2014 11:33:00 AM,DOT,Department of Transportation,Street Light Condition,Street Light Out,,11229.0,,...,,,,,,,,40.597262,-73.951006,"(40.59726241863315, -73.95100611242795)"
1,34969370,12/09/2016 10:14:44 PM,12/09/2016 10:15:17 PM,HRA,HRA Benefit Card Replacement,Benefit Card Replacement,Food Stamp,NYC Street Address,,,...,,,,,,,,,,
2,34969353,12/07/2016 08:59:03 AM,,DOT,Department of Transportation,Street Condition,Wear & Tear,,11233.0,HANCOCK STREET,...,,,,,,,,,,
3,34969558,12/09/2016 03:37:25 PM,12/12/2016 11:33:02 AM,DOF,Personal Exemption Unit,DOF Property - Reduction Issue,Personal STAR Exemption,"1-, 2- and 3- Family Home",11370.0,,...,,,,,,,,,,
4,34969585,12/08/2016 03:16:00 PM,12/09/2016 12:00:00 PM,DSNY,BCC - Staten Island,Overflowing Litter Baskets,6 Overflowing Litter Baskets,Sidewalk,,MAGUIRE AVENUE,...,,,,,,,,,,


In [3]:
types = {
    #'Unique Key': 'float32',
              'Complaint Type': 'category', 
              'Longitude': 'float32',
              'Latitude': 'float32',
#              'Incident Zip': 'int',
#              'Created Date': 'str',
#              'Closed Date': 'str',

        }
cols = list(types.keys())
chunksize = 10_000_000

df_list = [] # list to hold the batch dataframe

def process_date(df_chunk, key):
    df_chunk[key] = df_chunk[key].str.slice(0, 16)
    #df_chunk[key] = pd.to_datetime(df_chunk[key], utc=True, format='%Y-%m-%d %H:%M')
    
for df_chunk in tqdm(pd.read_csv(CSV_PATH, usecols=cols, dtype=types, chunksize=chunksize)):
    # Neat trick from https://www.kaggle.com/btyuhas/bayesian-optimization-with-xgboost
    # Using parse_dates would be much slower!
#    process_date(df_chunk, 'Closed Date')
#    process_date(df_chunk, 'Created Date')
    df_list.append(df_chunk) 

    
pdf = pd.concat(df_list)

# Delete the dataframe list to release memory
del df_list

# See what we have loaded
pdf.info()
#takes 1:30 minutes

3it [00:49, 16.58s/it]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27382103 entries, 0 to 27382102
Data columns (total 3 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Complaint Type  object 
 1   Latitude        float32
 2   Longitude       float32
dtypes: float32(2), object(1)
memory usage: 417.8+ MB


In [4]:
pd.options.display.max_rows = 1000

counts = pdf['Complaint Type'].value_counts()
print(counts)



Noise - Residential                              2575599
HEAT/HOT WATER                                   1589587
Illegal Parking                                  1424426
Blocked Driveway                                 1188263
Street Condition                                 1090368
Street Light Condition                           1035400
HEATING                                           887869
Noise - Street/Sidewalk                           863368
PLUMBING                                          780618
Water System                                      739604
UNSANITARY CONDITION                              570281
Noise                                             564961
General Construction/Plumbing                     563138
GENERAL CONSTRUCTION                              500863
Traffic Signal Condition                          500226
Noise - Commercial                                446898
Sewer                                             419190
PAINT/PLASTER                  

In [45]:
APERTURE_SIZE = 10
hex_col = 'hex'+str(APERTURE_SIZE)


pdf[hex_col] = 0
import json

print(1)
def binRow(df311):
    # find hexs containing the points
    df311[hex_col] = df311.apply(lambda x: h3.geo_to_h3(x.Latitude,x.Longitude,APERTURE_SIZE),1)

    # aggregate the points
    df311g = df311.groupby(hex_col).size()#.to_frame('cnt').reset_index()

    #find center of hex for visualization
#     df311g['lat'] = df311g[hex_col].apply(lambda x: h3.h3_to_geo(x)[0])
#     df311g['lng'] = df311g[hex_col].apply(lambda x: h3.h3_to_geo(x)[1])
    return (df311g)


def saveComplaint(complaint):
    print(complaint)
    data = pdf.loc[pdf['Complaint Type'] ==(complaint)]
    binned = binRow(data)
    complaint = complaint.replace(' ', '-').replace('/','-')
    hi = [(hex, count)  for hex,count in binned.iteritems()]
    with open('./data/'+complaint+'.json', 'w') as outfile: json.dump(hi, outfile)
    
    

stuff = [
    'Noise - Residential',                         
#'HEAT/HOT WATER' ,                              
#'Street Condition'   ,                           
#'Illegal Parking ',                              
#'Blocked Driveway'  ,                            
#'Street Light Condition' ,                       
#'HEATING'          ,                             
#'PLUMBING'     ,                                 
#'Water System'  ,                                
#'Noise - Street/Sidewalk' ,                      
# 'GENERAL CONSTRUCTION'  ,                                                                
#'UNSANITARY CONDITION'
        ]                          


    
    
import csv, json
from geojson import Feature, FeatureCollection, Point, dump


    
def saveComplaint2(complaint):
    
    data = pdf.loc[pdf['Complaint Type'] ==(complaint)]
    binned = binRow(data)
    complaint = complaint.replace(' ', '-').replace('-','')
    print(complaint)
    ##hi = [(hex, count)  for hex,count in binned.iteritems()]
    features = []
    for key,val in binned.iteritems():
        #print(key,val)
        if h3.string_to_h3(key) == 0x0: continue
        else: 
            lat,long = h3.h3_to_geo(key)
            #print(lat, long)
            features.append(
                Feature(
                    geometry = Point((long, lat)),
                    properties = {
                        'noise_value': val,
                    }
                )
            )
    #return
    print(features[0])
    collection = FeatureCollection(features)
    with open('./data/' + complaint + ".geojson", "w") as f:
        print ('writing to ' + complaint + ".geojson",)
        #f.write('%s' % collection)
        dump(collection, f)

    #with open('./data/'+complaint+'.json', 'w') as outfile: json.dump(hi, outfile)
 
    #     with open(complaint + '.csv', 'w') as csvfile:
    #         reader = csv.reader(csvfile, delimiter=',')
    #         for latitude, longitude, weather, temp in reader:
    #             latitude, longitude = map(float, (latitude, longitude))
          

   

        
for complaint in stuff:
    saveComplaint2(complaint)

1
NoiseResidential
{"geometry": {"coordinates": [-73.780891, 40.855039], "type": "Point"}, "properties": {"noise_value": 1}, "type": "Feature"}
writing to NoiseResidential.geojson


In [None]:
#automatically upload all csv to s3
import boto3, os