## EDA

In [1]:
import pandas as pd
# Load the monthly property-crime table from the CSV next to the notebook.
CSV_PATH = 'Monthly_Property_Crime_2005_to_2015.csv'
sf_crime = pd.read_csv(CSV_PATH)

In [2]:
# Convert the 'Date' column to pandas datetimes (column position is unchanged).
sf_crime = sf_crime.assign(Date=pd.to_datetime(sf_crime['Date']))

In [3]:
# Number of rows recorded for each crime category.
sf_crime['Category'].value_counts()

VEHICLE THEFT      132
BURGLARY           132
LARCENY/THEFT      132
VANDALISM          132
STOLEN PROPERTY    132
ARSON              132
Name: Category, dtype: int64

In [4]:
# Preview the first rows of the long-format frame (Date, Category, IncidntNum).
sf_crime.head()

Unnamed: 0,Date,Category,IncidntNum
0,2014-02-01,BURGLARY,506
1,2007-02-01,VANDALISM,531
2,2012-07-01,BURGLARY,522
3,2013-07-01,LARCENY/THEFT,3318
4,2010-08-01,VANDALISM,694


In [5]:
# Rows per category again; same computation as the earlier value_counts cell.
sf_crime.Category.value_counts()

VEHICLE THEFT      132
BURGLARY           132
LARCENY/THEFT      132
VANDALISM          132
STOLEN PROPERTY    132
ARSON              132
Name: Category, dtype: int64

In [6]:
# Reshape long -> wide: one row per Date, one column per crime category,
# cell values taken from 'IncidntNum'.
sf_crime_separate = (
    sf_crime
    .pivot(index='Date', columns='Category', values='IncidntNum')
    .reset_index()  # move Date from the index back to an ordinary column
)
sf_crime_separate.head()

Category,Date,ARSON,BURGLARY,LARCENY/THEFT,STOLEN PROPERTY,VANDALISM,VEHICLE THEFT
0,2005-01-01,21,698,2038,56,493,1685
1,2005-02-01,17,547,1734,40,462,1366
2,2005-03-01,19,654,2035,46,585,1476
3,2005-04-01,24,551,1920,41,581,1594
4,2005-05-01,13,622,2181,56,658,1590


In [7]:
# Column labels of the wide frame; the columns axis keeps the name 'Category'
# that the pivot assigned to it.
sf_crime_separate.columns

Index(['Date', 'ARSON', 'BURGLARY', 'LARCENY/THEFT', 'STOLEN PROPERTY',
       'VANDALISM', 'VEHICLE THEFT'],
      dtype='object', name='Category')

In [8]:
# Total incidents per month across all categories.
# 'IncidntNum' is selected explicitly: a bare groupby('Date').sum() would also
# try to aggregate the string 'Category' column (silently dropped by older
# pandas, concatenated into one long string by pandas >= 2.0).
sf_crime_total = (
    sf_crime
    .groupby('Date')['IncidntNum']
    .sum()
    .reset_index()
)
sf_crime_total.head()

Unnamed: 0,Date,IncidntNum
0,2005-01-01,4991
1,2005-02-01,4166
2,2005-03-01,4815
3,2005-04-01,4711
4,2005-05-01,5120


In [23]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.graph_objs import *

### Line charts of monthly total crime number (Plotly)

In [24]:
# Single line-and-marker trace of the monthly totals.
total_trace = go.Scatter(
    x=sf_crime_total['Date'],
    y=sf_crime_total['IncidntNum'],
    mode='lines+markers',
)
data = [total_trace]

# Fixed x-axis window for the chart.
layout = {
    'title': "Monthly Total Crime Number",
    'xaxis': {'range': ['2004-12-01', '2016-01-01']},
}
fig = {'data': data, 'layout': layout}
iplot(fig)

### Multi Line charts of monthly crime number of each category (Plotly)

In [25]:
# One line trace per crime category. The categories are read from the wide
# frame's columns (everything except 'Date') rather than a hand-typed list, so
# the chart cannot drift out of sync with the data.
category_cols = [c for c in sf_crime_separate.columns if c != 'Date']
data = [
    go.Scatter(x=sf_crime_separate['Date'], y=sf_crime_separate[c], name=c)
    for c in category_cols
]
layout = dict(
    title="Monthly Crime Number of Each Category",
    xaxis=dict(range=['2004-12-01', '2016-01-01']),
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='simple-connectgaps')

### Bar charts of monthly total crime number (Plotly)

In [26]:
# One bar per month showing the all-category total.
total_bars = go.Bar(
    x=sf_crime_total['Date'],
    y=sf_crime_total['IncidntNum'],
)
data = [total_bars]

# Fixed x-axis window for the chart.
layout = {
    'title': "Monthly Total Crime Number",
    'xaxis': {'range': ['2004-12-01', '2016-01-01']},
}
fig = {'data': data, 'layout': layout}
iplot(fig)

### Stacked bar charts of monthly crime number of each category (Plotly)

In [27]:
# One bar trace per crime category. The categories are read from the wide
# frame's columns (everything except 'Date') rather than a hand-typed list.
category_cols = [c for c in sf_crime_separate.columns if c != 'Date']
data = [
    go.Bar(x=sf_crime_separate['Date'], y=sf_crime_separate[c], name=c)
    for c in category_cols
]

# barmode='stack' piles the category bars on top of each other for each month.
layout = go.Layout(
    title="Monthly Crime Number of Each Category",
    xaxis=dict(range=['2004-12-01', '2016-01-01']),
    barmode='stack',
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='stacked-bar')

### Heatmap of monthly crime number of each category (Plotly)

In [28]:
# Heatmap input: one row of z values per crime category (every column of the
# wide frame except 'Date'), one column per month.
cols = sf_crime_separate.drop(['Date'], axis=1).columns
data_list = [list(sf_crime_separate[col]) for col in cols]

# The built-in 'Viridis' scale is used here, so no custom palette is defined.
data = [
    go.Heatmap(
        z=data_list,
        x=sf_crime_separate.Date,
        y=cols,
        colorscale='Viridis',
    )
]

layout = go.Layout(
    title='Monthly Crime Number of Each Category',
    xaxis=dict(ticks=''),  # no tick marks on either axis
    yaxis=dict(ticks=''),
    margin=dict(l=120, r=50, b=50, t=100, pad=4),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='datetime-heatmap')

In [29]:
# Heatmap input: one row of z values per crime category, one column per month.
cols = sf_crime_separate.drop(['Date'], axis=1).columns
data_list = [list(sf_crime_separate[col]) for col in cols]

# Ten-colour palette; each colour covers one tenth of the normalised z range.
colors = ["#6b8c85", "#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]

# Discrete colorscale: every colour is listed twice, at the lower and at the
# upper edge of its band ([0.0, c0], [0.1, c0], [0.1, c1], [0.2, c1], ...), so
# each band is a flat colour instead of a gradient between neighbours.
n_bands = float(len(colors))
colorscale = []
for band, color in enumerate(colors):
    colorscale.append([band / n_bands, color])
    colorscale.append([(band + 1) / n_bands, color])

data = [
    go.Heatmap(
        z=data_list,
        x=sf_crime_separate.Date,
        y=cols,
        colorscale=colorscale,
    )
]

layout = go.Layout(
    title='Monthly Crime Number of Each Category',
    xaxis=dict(ticks=''),  # no tick marks on either axis
    yaxis=dict(ticks=''),
    margin=dict(l=120, r=50, b=50, t=100, pad=4),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='datetime-heatmap')

### Animated bubble charts of monthly crime number of each category (Plotly)

In [30]:
# Bubble colour per category. Keys are the category names with '/' and ' '
# replaced by '_' (the form the trace loop looks them up under).
colors = {
    'ARSON': '#1f77b4',
    'BURGLARY': '#ff7f0e',
    'LARCENY_THEFT': '#2ca02c',
    'STOLEN_PROPERTY': "#7e7e7e",
    'VANDALISM': '#d62728',
    'VEHICLE_THEFT': '#9467bd',
}

# Marker 'size' values are interpreted as areas.
sizemode = 'area'

# Scaling reference for the marker sizes: the largest monthly total divided by
# 100 squared, times 8.
largest_total = sf_crime_total['IncidntNum'].max()
sizeref = largest_total / 1e2**2 * 8

# Define a trace-generating function (returns a Scatter object)
def make_trace(X, cat, sizes, color):
    """Build the bubble trace for one crime category.

    Parameters
    ----------
    X : DataFrame
        Rows of a single category; its 'Date' and 'IncidntNum' columns are
        used as the x and y values.
    cat : str
        Name given to the trace.
    sizes : sequence of numbers
        One marker size per row of ``X``.
    color : str
        Marker colour.

    Returns
    -------
    plotly.graph_objs.Scatter
        Markers-only trace whose sizes are scaled by the notebook-level
        ``sizeref`` and ``sizemode`` variables.
    """
    # Plain dicts replace the deprecated star-imported ``Marker`` / ``Line``
    # helpers; plotly validates them against the scatter marker schema.
    return go.Scatter(
        x=X['Date'],              # month on the x-axis
        y=X['IncidntNum'],        # incident count on the y-axis
        name=cat,
        mode='markers',           # bubbles only, no connecting line
        marker=dict(
            color=color,
            size=sizes,
            sizeref=sizeref,
            sizemode=sizemode,
            opacity=0.6,          # partly transparent so overlapping bubbles stay visible
            line=dict(width=0.0)  # no marker border
        )
    )
# Collect one bubble trace per crime category in a plain list (the deprecated
# ``Data`` container is not needed; figures accept a list of traces).
data = []

for cat, X in sf_crime.groupby('Category'):
    # The colour table is keyed with '_' in place of '/' and ' ', e.g.
    # 'LARCENY/THEFT' -> 'LARCENY_THEFT'; the same form is used as trace name.
    cat = cat.replace('/', '_').replace(' ', '_')
    sizes = X['IncidntNum']   # bubble size follows the incident count
    color = colors[cat]
    data.append(make_trace(X, cat, sizes, color))

In [34]:
# Plot and axis titles
title = "Monthly Crime Number of Each Category (Size = Number of Crimes)"
x_title = "Month"
y_title = "Number of Crimes"

# Style options shared by both axes
axis_style = dict(
    zeroline=False,       # remove thick zero line
    gridcolor='#FFFFFF',  # white grid lines
    ticks='outside',      # draw ticks outside axes
    ticklen=8,            # tick length
    tickwidth=1.5         #   and width
)

# Each axis is the shared style plus its own title. Plain dicts replace the
# deprecated star-imported ``XAxis`` / ``YAxis`` helpers.
layout = go.Layout(
    title=title,
    plot_bgcolor='#EFECEA',  # light grey plot background
    xaxis=dict(axis_style, title=x_title),
    yaxis=dict(axis_style, title=y_title),
)
fig = go.Figure(data=data, layout=layout)

# Render inline with the offline ``iplot``.
iplot(fig, filename='s3_life-gdp')

### Joy Plot of the distribution of Incident Number for each Crime (Bokeh)

In [35]:
# Imports and output configuration for the joy plot.
from numpy import linspace
# Import from the public namespace: ``scipy.stats.kde`` is a private module
# whose use is deprecated in recent SciPy releases.
from scipy.stats import gaussian_kde

from bokeh.io import output_file, output_notebook, show
from bokeh.models import ColumnDataSource, FixedTicker, PrintfTickFormatter
from bokeh.plotting import figure
# NOTE(review): ``probly`` is not used in the visible cells and importing it
# needs the Bokeh sample data to be installed - drop it if nothing relies on it.
from bokeh.sampledata.perceptions import probly

import colorcet as cc

# Configure both notebook output and an HTML file target for ``show``.
output_notebook()
output_file("joyplot.html")

def joy(category, data, scale=100):
    """Pair each value with its category for plotting on a categorical axis.

    Parameters
    ----------
    category : hashable
        Label that every returned pair starts with.
    data : array-like of numbers
        Values to scale, e.g. a density evaluated on a grid.
    scale : number, optional
        Factor each value is multiplied by (default 100).

    Returns
    -------
    list of tuple
        ``(category, scale * value)`` for every element of ``data``, in order.
    """
    import numpy as np  # local import: the notebook only imports ``linspace`` from numpy

    # Convert first so the multiplication is element-wise; ``scale * data`` on a
    # plain list would repeat the list instead of scaling its values.
    scaled = np.asarray(data) * scale
    return [(category, value) for value in scaled]

# Categories in reverse order of first appearance; used as the categorical y-range.
cats = list(reversed(sf_crime['Category'].unique()))

# 17 colours taken at every 15th entry of colorcet's rainbow map; the loop
# below uses one per category.
palette = [cc.rainbow[15 * k] for k in range(17)]

# Grid on which each density is evaluated; also the shared 'x' column.
x = linspace(0, 3600, 500)
source = ColumnDataSource(data={'x': x})

p = figure(y_range=cats, plot_width=900, x_range=(0, 3600), toolbar_location=None)

# One filled outline per category: a Gaussian KDE of that category's monthly
# counts, evaluated on the grid and attached to the category's row.
for idx, cat in enumerate(cats):
    counts = sf_crime.loc[sf_crime['Category'] == cat, 'IncidntNum']
    density = gaussian_kde(counts)
    source.add(joy(cat, density(x)), cat)
    p.patch('x', cat, color=palette[idx], alpha=0.6, line_color="black", source=source)

# Frame and background
p.outline_line_color = None
p.background_fill_color = "#efefef"

# x ticks every 500, with the vertical grid lines on the same positions
p.xaxis.ticker = FixedTicker(ticks=list(range(0, 3600, 500)))
p.xgrid.ticker = p.xaxis[0].ticker
p.xgrid.grid_line_color = "#dddddd"
p.ygrid.grid_line_color = None

# Hide tick marks and axis lines on both axes
p.axis.minor_tick_line_color = None
p.axis.major_tick_line_color = None
p.axis.axis_line_color = None

p.y_range.range_padding = 0.12

show(p)