# Plotting data, including Google Maps

gmplot library:
https://github.com/vgm64/gmplot

In [None]:
# !pip install gmplot --upgrade

In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

import gmplot

### Dataset

In [None]:
# Location of the NYC 311 extract, relative to the notebook's working directory.
datafile = "../_data/nyc_311_data_subset-2.csv"

# Index the rows by the 'Unique Key' column. low_memory=False makes pandas
# parse the file in one pass rather than in chunks.
df = pd.read_csv(
    datafile,
    low_memory=False,
    index_col='Unique Key',
)

In [None]:
#Add the fix_zip function
def fix_zip(input_zip):
    """Normalise a raw zip-code value to a 5-digit string.

    Accepts numbers and numeric strings (e.g. ``10001``, ``10001.0``,
    ``"10001"``) as well as dash-separated strings such as
    ``"10001-1234"``, of which only the part before the first dash is used.

    Returns the zip code as a ``str`` when it lies in the range
    10000-19999 inclusive. Returns ``np.nan`` for values outside that
    range and for anything that cannot be parsed (including ``None``
    and NaN).
    """
    try:
        zip_number = int(float(input_zip))
    except (TypeError, ValueError, OverflowError):
        # Not directly numeric (text, None, NaN or infinity): fall back to
        # reading the digits in front of the first dash.
        try:
            zip_number = int(input_zip.split('-')[0])
        except (AttributeError, TypeError, ValueError):
            return np.nan
    if zip_number < 10000 or zip_number > 19999:
        return np.nan
    return str(zip_number)

In [None]:
# Clean every 'Incident Zip' entry with fix_zip.
df['Incident Zip'] = (
    df['Incident Zip']
    .apply(fix_zip)
)

In [None]:
# Keep only complete records: discard every row containing at least one NaN.
df = df.dropna(axis='index', how='any')

In [None]:
# Exclude records whose borough is recorded as 'Unspecified'.
df = df.loc[df['Borough'].ne('Unspecified')]

In [None]:
# Parse the timestamp strings into datetimes and derive how long each
# request stayed open. pd.to_datetime with an explicit format converts a whole
# column at once instead of calling strptime once per row.
DATE_FORMAT = '%m/%d/%Y %I:%M:%S %p'
df['Created Date'] = pd.to_datetime(df['Created Date'], format=DATE_FORMAT)
df['Closed Date'] = pd.to_datetime(df['Closed Date'], format=DATE_FORMAT)
df['processing_time'] = df['Closed Date'] - df['Created Date']

In [None]:
# Discard records whose closing time precedes their creation time
# (i.e. a negative processing time).
df = df.loc[df['processing_time'] >= datetime.timedelta()]

### GoogleMapPlotter constructor
<ul>
<li>GoogleMapPlotter(center_lat, center_lng, zoom)</li>
<li>from_geocode(location_string, zoom)</li>
</ul>


In [None]:
# Centre the map on New York City (40.7128 N, 74.0059 W) at zoom level 8.
# Western longitudes are negative in decimal degrees; a positive 74.0059
# would centre the map in Central Asia, far from the plotted incidents.
gmap = gmplot.GoogleMapPlotter(40.7128, -74.0059, 8)
# Alternative: centre the map by place name instead of coordinates.
# gmap = gmplot.GoogleMapPlotter.from_geocode("New York", 10)

### Generate the heatmap

In [None]:
# Add a heat-map layer built from every incident's coordinates.
incident_lats = df['Latitude']
incident_lngs = df['Longitude']
gmap.heatmap(incident_lats, incident_lngs)

### Save heatmap to an HTML file

In [None]:
# Write the map to 'incidents3.html' (path is relative to the working directory).
gmap.draw('incidents3.html')

### Display gmap HTML
NOTE/TODO: this file is too large (30 MB+) to show inline.

In [None]:
from IPython.display import HTML
# HTML('<iframe src=incidents3.html width=700 height=450></iframe>')

Another way to display HTML inline:

In [None]:
%%html
<!-- Embed the saved map page (incidents3.html) in a 700x450 inline frame. -->
<iframe src=incidents3.html width=700 height=450></iframe>

## Incidents by Borough

In [None]:
# Count incidents per borough and draw the counts as a bar chart.
# Other plot kinds for a Series include 'barh', 'hist' and 'pie'.
borough_group = df.groupby('Borough')
incidents_per_borough = borough_group.size()
incidents_per_borough.plot.bar();

## Incidents by Agency

In [None]:
# Count incidents per agency and draw the counts as a bar chart.
agency_group = df.groupby('Agency')
incidents_per_agency = agency_group.size()
incidents_per_agency.plot.bar();

## Incidents combined by borough and agency

In [None]:
# Group by (Agency, Borough) pairs and chart the number of incidents in each pair.
agency_borough = df.groupby(['Agency', 'Borough'])
pair_counts = agency_borough.size()
pair_counts.plot.bar(figsize=(18, 8));

### Unstack groups to get borough by agency

In [None]:
# Pivot the Borough level into columns so that each agency gets one bar per borough.
counts_by_agency = agency_borough.size().unstack()
counts_by_agency.plot.bar(
    title="Incidents in each Agency by Borough",
    figsize=(18, 8),
);

<h1>Incidents by time</h1>
<p>We know the creation date of each incident, so we can build a bar graph of the number of incidents by month.</p>
<p>This is not particularly useful with only a few months of data, but if we had all data from 2010, we could use this sort of
analysis to eyeball trends and seasonality.</p>
<p>We're going to need to do some data manipulation for this.</p>

### Convert date to yyyymm

In [None]:
# Look at five randomly chosen creation timestamps.
df['Created Date'].sample(n=5)

In [None]:
# Preview the year-month ('%Y%m') formatting on five randomly chosen creation dates.
sampled_dates = pd.to_datetime(df['Created Date'].sample(5))
sampled_dates.dt.strftime('%Y%m')

In [None]:
# Make sure 'Created Date' holds datetimes before the .dt accessor is used on it.
df['Created Date'] = pd.to_datetime(
    df['Created Date']
)

In [None]:
# Store each incident's creation year and month as a 'yyyymm' string.
df['yyyymm'] = (
    df['Created Date']
    .dt.strftime('%Y%m')
)

In [None]:
# Count incidents per (month, agency) and chart them with one group of bars
# per month and one bar per agency.
date_agency = df.groupby(['yyyymm', 'Agency'])
monthly_counts = date_agency.size().unstack()
monthly_counts.plot.bar(figsize=(18, 8));

<h1>Examining agencies</h1>

<h2>We'll look at the frequency by agency and report the top 5 values</h2>

In [None]:
# Compare the two ways of counting per agency. Only the last expression of a
# cell is rendered automatically, so display() is called explicitly to show
# both results instead of silently discarding the first one.
from IPython.display import display

agency_groups = df.groupby('Agency')
display(agency_groups.count())  # non-null values in each column, per agency
display(agency_groups.size())   # number of rows per agency

In [None]:
# Rank agencies from most to fewest incidents.
agency_counts = df.groupby('Agency').size()
agency_counts.sort_values(ascending=False)

In [None]:
# Bar chart of incident counts per agency, largest first.
agency_ranking = df.groupby('Agency').size().sort_values(ascending=False)
agency_ranking.plot.bar(figsize=(20, 4));

<h3>We can drill down into complaints by Agency by borough</h3>

In [None]:
# Cross-tabulate incident counts: one row per agency, one column per borough.
# Note: this rebinds `agency_borough`, which earlier held a GroupBy object.
agency_borough = (
    df.groupby(['Agency', 'Borough'])
    .size()
    .unstack()
)

In [None]:
# Display the current value of agency_borough.
agency_borough

<h3>We can create 'top 5 Agency' subplots for each borough</h3>

In [None]:
# Peek at the first pair that .items() yields, kept as a one-element list.
column_pairs = list(agency_borough.items())
column_pairs[:1]

### Plot top-n agencies per borough

In [None]:
ROW_NUM, COL_NUM = 3, 2
# Fixed rather than read with input() so that "Restart & Run All" completes
# without waiting for keyboard input; edit this value to change the cut-off.
N = 5
# squeeze=False always yields a 2-D array of Axes, whatever the grid size.
fig, axes = plt.subplots(ROW_NUM, COL_NUM, figsize=(15, 8), squeeze=False)
# Row-major flattening: panel i sits at row i // COL_NUM, column i % COL_NUM.
panels = axes.ravel()

borough_count = len(agency_borough.columns)
assert borough_count <= panels.size, (
    "more boroughs than subplot panels; increase ROW_NUM or COL_NUM"
)

for ax, (borough, agency) in zip(panels, agency_borough.items()):
    # The N agencies with the most incidents in this borough, largest first.
    top_agencies = agency.sort_values(ascending=False).head(N)
    top_agencies.plot(kind='barh', ax=ax)
    ax.set_title(borough)

# Hide the panels left over when there are fewer boroughs than grid cells.
for ax in panels[borough_count:]:
    ax.set_visible(False)

plt.tight_layout();

<h1>Processing time</h1>
<h2>We can compute simple statistics on processing time</h2>

In [None]:
# Group the processing times by borough for per-borough statistics.
time_and_borough = df[['processing_time', 'Borough']]
grouped = time_and_borough.groupby('Borough')

In [None]:
# Descriptive statistics for each group in `grouped`.
grouped.describe()

### Convert timedelta processing_time into days (float)

In [None]:
# Express each processing time as a (fractional) number of days. Dividing the
# whole timedelta column by a one-day timedelta is vectorised, so no per-row
# lambda is needed.
df['float_time'] = df['processing_time'] / np.timedelta64(1, 'D')

In [None]:
# Show the first rows of the frame to check the new 'float_time' column.
df.head()

### Compute stats

In [None]:
# Descriptive statistics of processing time (in days) for each agency.
time_and_agency = df[['float_time', 'Agency']]
grouped = time_and_agency.groupby('Agency')
grouped.describe()

In [None]:
# Rank agencies by mean processing time in days, slowest first.
grouped = df[['float_time', 'Agency']].groupby('Agency')
mean_days = grouped.mean()
mean_days.sort_values(by='float_time', ascending=False)

In [None]:
# Histogram (40 bins) of processing times in days across all incidents.
processing_days = df['float_time']
processing_days.hist(figsize=(20, 8), bins=40);