In [1]:
# Standard tools for data analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Tools specific for geospatial data analysis
from mpl_toolkits.basemap import Basemap
import shapely
from shapely.geometry import shape, mapping, Point, Polygon
import geopandas as gpd
import geojsonio
from matplotlib.collections import PatchCollection
from descartes import PolygonPatch

# Tools from the Python Standard Library
import os
import re

from IPython.display import display
%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 6)

### Read in data

In [26]:
DATADIR = '../data/'
print('Data:\n')
!ls $DATADIR

RESULTSDIR = '../results/'
print('\nResults:\n')
!ls $RESULTSDIR

Data:

CrimeWatch_Maps_Past_90-Days.csv
Data_Delivery_EDF_AQ_Team_20170515.xlsx
Service_requests_received_by_the_Oakland_Call_Center.csv
oakland_neighborhoods.geojson
residential_zones_300_ft_of_industrial_areas.geojson

Results:

01-air_quality.h5      01-neighborhoods.h5    02-service_requests.h5
01-crime.h5            01-service_requests.h5


As before, let's read in the neighborhood shapes:

In [28]:
# Load the neighborhood shapes and keep only the rows from position 137 onward.
# NOTE(review): why the first 137 rows are skipped is not visible in this
# notebook ("As before" suggests it was established earlier) — confirm.
neighborhoods_path = DATADIR + 'oakland_neighborhoods.geojson'
neighborhoods = gpd.read_file(neighborhoods_path).iloc[137:]

Read in the cleaned crime data:

In [11]:
# Load the cleaned crime table from the results directory.
crime_path = RESULTSDIR + '01-crime.h5'
crime = pd.read_hdf(crime_path)

In [12]:
# Preview the first five rows of the crime table.
crime.head(n=5)

Unnamed: 0,CRIMETYPE,DATETIME,CASENUMBER,DESCRIPTION,Location,coordinates
0,VANDALISM,10/24/2017 07:00:00 PM,17-916879,VANDALISM,"500 20TH ST\nOakland, CA\n(37.809581, -122.269...","(37.809581, -122.269628)"
1,VANDALISM,09/08/2017 07:00:00 PM,17-914153,VANDALISM,"500 27TH ST\nOakland, CA\n(37.816128, -122.267...","(37.816128, -122.267219)"
2,MOTOR VEHICLE THEFT,10/06/2017 11:27:00 AM,17-052428,VEHICLE THEFT - AUTO,"3300 GEORGIA ST\nOakland, CA\n(37.794807, -122...","(37.794807, -122.20291)"
3,ASSAULT,09/09/2017 11:16:00 PM,17-047455,INFLICT CORPORAL INJURY ON SPOUSE/COHABITANT,"5900 HARMON AV\nOakland, CA\n(37.768042, -122....","(37.768042, -122.194877)"
4,ASSAULT,09/16/2017 04:37:00 PM,17-048647,BATTERY:SPOUSE/EX SPOUSE/DATE/ETC,"2400 96TH AV\nOakland, CA\n(37.750884, -122.16...","(37.750884, -122.160601)"


How many entries are there?

In [13]:
# (number of rows, number of columns) of the crime table.
crime.shape

(9679, 6)

How many of those have coordinates that we can plot easily (without having to parse the address)?

In [14]:
# Shape of the subset of rows whose 'coordinates' value is present.
has_coordinates = crime['coordinates'].notna()
crime.loc[has_coordinates].shape

(8820, 6)

In [17]:
# Preview the first rows whose 'coordinates' value is missing.
missing_coordinates = crime['coordinates'].isna()
crime.loc[missing_coordinates].head()

Unnamed: 0,CRIMETYPE,DATETIME,CASENUMBER,DESCRIPTION,Location,coordinates
21,THEFT/LARCENY,09/05/2017 08:30:00 PM,17-914017,BURGLARY-AUTO,"2200 MARTIN LUTHER KING JR W\nOakland, CA\n",
29,THEFT/LARCENY,08/30/2017 08:25:00 AM,17-913651,BURGLARY-AUTO,"FRUITVALE AVE\nOakland, CA\n",
38,THEFT/LARCENY,09/09/2017 10:59:00 PM,17-914256,BURGLARY-AUTO,"22ND ST/ BROADWAY/ TELEGRAPH ST\nOakland, CA\n",
61,THEFT/LARCENY,11/22/2017 10:00:00 PM,17-061166,BURGLARY-AUTO,"PARKING GARAGE AT 881 69TH AVE AV\nOakland, CA\n",
63,VANDALISM,10/23/2017 12:00:00 AM,17-916774,VANDALISM,"2600 BLK 26TH ST\nOakland, CA\n",


A number of these entries have no coordinates because their locations contain too little information to pinpoint (e.g., the second one above only says that it's on Fruitvale Ave.). Since that's the case, let's simply drop every row that contains a null value:

In [21]:
# Drop every row that has a null in any column; this includes the rows with
# missing coordinates shown above. The result is rebound to `crime` rather than
# using `inplace=True`, so the frame displayed by earlier cells is not mutated
# behind the reader's back and the statement can be chained or extended later.
# NOTE(review): the narrative only motivates removing rows without coordinates;
# if that is the sole intent, use `dropna(subset=['coordinates'])` — confirm.
crime = crime.dropna()

## Crime types
Let's take a look at the crime types, first listing the distinct categories and then viewing their distribution via a bar plot:

In [23]:
# Distinct crime categories, in order of first appearance.
crime.loc[:, 'CRIMETYPE'].unique()

array(['VANDALISM', 'MOTOR VEHICLE THEFT', 'ASSAULT', 'THEFT/LARCENY',
       'ROBBERY', 'BURGLARY', 'SEX CRIMES', 'DRUGS/ALCOHOL VIOLATIONS',
       'FRAUD', 'ARSON', 'WEAPONS', 'DISTURBING THE PEACE', 'DUI',
       'HOMICIDE'], dtype=object)