In [41]:
from __future__ import division, print_function
import pandas as pd
import numpy as np
import bokeh as bk
from bokeh.plotting import figure, output_notebook, show, vplot, output_file
from bokeh.io import push_notebook
from bokeh.models import ColumnDataSource, HoverTool, DatetimeTickFormatter, DatetimeTicker, Span, Range1d, LinearAxis
from bokeh.charts import TimeSeries

In [37]:
# Configure Bokeh to render plots inline in the notebook.
output_notebook()

In [11]:
file_name = 'DOHMH_New_York_City_Restaurant_Inspection_Results.csv'

# Columns that are left out when the data is loaded.
excluded_columns = ['PHONE', 'VIOLATION DESCRIPTION', 'RECORD DATE', 'BUILDING', 'STREET']

# Read only the header row (nrows=0) through pandas' CSV parser, so quoted
# column names and line endings are handled correctly instead of splitting
# the first line on commas by hand.
header_columns = pd.read_csv(file_name, nrows=0).columns

# Fail loudly if an expected column is absent, as list.remove() used to do.
missing = [name for name in excluded_columns if name not in header_columns]
if missing:
    raise ValueError('columns not found in {}: {}'.format(file_name, missing))

# Every header column except the excluded ones, in file order.
column_names = [name for name in header_columns if name not in excluded_columns]

In [12]:
# Load only the retained columns; the two date columns are parsed as datetimes.
df = pd.read_csv(
    file_name,
    parse_dates=['INSPECTION DATE', 'GRADE DATE'],
    usecols=column_names,
)
df.head()

Unnamed: 0,CAMIS,DBA,BORO,ZIPCODE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,INSPECTION TYPE
0,41310578,EASTEND BAR & GRILL,MANHATTAN,10028,American,2014-09-26,Violations were cited in the following area(s).,04H,Critical,28.0,,NaT,Cycle Inspection / Initial Inspection
1,41656176,BEST LUNCHEONETTE,BROOKLYN,11214,American,2014-01-27,Violations were cited in the following area(s).,10F,Not Critical,10.0,A,2014-01-27,Cycle Inspection / Re-inspection
2,41558513,ESSEN DELI,BROOKLYN,11230,Jewish/Kosher,2014-05-15,Violations were cited in the following area(s).,04L,Critical,33.0,,NaT,Cycle Inspection / Initial Inspection
3,41685035,HADJA MARLEY RESTAURANT,BROOKLYN,11238,African,2014-01-25,No violations were recorded at the time of thi...,,Not Applicable,,,NaT,Administrative Miscellaneous / Initial Inspection
4,41644127,OLEANDR RESTAURANT,BROOKLYN,11235,Russian,2014-11-26,Violations were cited in the following area(s).,20D,Not Critical,,,NaT,Administrative Miscellaneous / Initial Inspection


# No. of restaurants (inspected) by cuisine 

In [7]:
# Encode each distinct ACTION label as a small integer, numbered from 1 in
# order of first appearance in the data.
action_types = df['ACTION'].unique()
action_dict = dict(enumerate(action_types, start=1))  # code -> label
label_to_code = {label: code for code, label in action_dict.items()}

# Assign the result back instead of calling replace(..., inplace=True) on the
# df['ACTION'] selection: the in-place form mutates an intermediate object and
# does not reliably write through to df (pandas warns about it, and with
# Copy-on-Write df is left unchanged).
df['ACTION'] = df['ACTION'].replace(label_to_code)

In [90]:
# Distinct establishments (CAMIS) per cuisine among rows with ACTION code 3.
# NOTE(review): the meaning of code 3 depends on the order in which ACTION
# labels first appear in the data - confirm it is the closure action.
cuisine_closed_num = (
    df.loc[df['ACTION'] == 3]
      .groupby('CUISINE DESCRIPTION')['CAMIS']
      .nunique()
)

In [92]:
# Show the per-cuisine counts of distinct ACTION == 3 establishments, largest first.
cuisine_closed_num.sort_values(ascending=False)

CUISINE DESCRIPTION
Chinese                                                             310
American                                                            259
Latin (Cuban, Dominican, Puerto Rican, South & Central American)    108
Pizza                                                               103
Mexican                                                              85
Caribbean                                                            82
Japanese                                                             75
Spanish                                                              68
Bakery                                                               67
Italian                                                              54
CafÃ©/Coffee/Tea                                                     49
Chicken                                                              42
Indian                                                               37
Thai                                        

In [8]:
# Number of rows with ACTION code 3 per cuisine (rows, not distinct establishments).
cuisine_closed = df.loc[df['ACTION'] == 3, 'CUISINE DESCRIPTION'].value_counts()

In [9]:
# Peek at the cuisines with the most ACTION == 3 rows.
cuisine_closed.head()

Chinese                                                             2126
American                                                            1498
Latin (Cuban, Dominican, Puerto Rican, South & Central American)     716
Mexican                                                              559
Pizza                                                                544
Name: CUISINE DESCRIPTION, dtype: int64

In [13]:
# Distinct establishments (CAMIS) per cuisine across the whole data set.
cuisine_numbers = df['CAMIS'].groupby(df['CUISINE DESCRIPTION']).nunique()

In [33]:
# Make sure SCORE is floating point, then summarise it per cuisine.
df['SCORE'] = df['SCORE'].astype(float)
cuisine_score = (
    df.groupby('CUISINE DESCRIPTION')['SCORE']
      .describe()
)

In [93]:
# Combine the three per-cuisine series side by side, aligned on cuisine name.
# `keys` labels each column at concat time: two of the inputs are both named
# 'CAMIS', so the bare concat produced duplicate column labels and relied on a
# later positional rename to tell them apart.
cuisine_df = pd.concat(
    [cuisine_numbers, cuisine_closed, cuisine_closed_num],
    axis=1,
    keys=['number', 'closed inspections', 'closed numbers'],
)

In [44]:
# Repeat display of the first rows of cuisine_closed (same call as an earlier cell).
cuisine_closed.head()

Chinese                                                             2126
American                                                            1498
Latin (Cuban, Dominican, Puerto Rican, South & Central American)     716
Mexican                                                              559
Pizza                                                                544
Name: CUISINE DESCRIPTION, dtype: int64

In [97]:
# Label the three columns by position, then treat cuisines with no matching
# ACTION == 3 rows (NaN after the concat) as zero.
cuisine_df.columns = [
    'number',
    'closed inspections',
    'closed numbers',
]
cuisine_df = cuisine_df.fillna(value=0)
cuisine_df.head()

Unnamed: 0,number,closed inspections,closed numbers
Afghan,12,17.0,2.0
African,73,79.0,15.0
American,6104,1498.0,259.0
Armenian,39,0.0,0.0
Asian,363,236.0,32.0


In [104]:
# The NaN fill left both closure columns as floats; cast them back to 64-bit
# integers and name the index.
cuisine_df['closed inspections'] = cuisine_df['closed inspections'].astype('int64')
cuisine_df['closed numbers'] = cuisine_df['closed numbers'].astype('int64')
cuisine_df.index.name = 'cuisine'

In [105]:
# Order cuisines from most to fewest distinct establishments.
cuisine_df = cuisine_df.sort_values(by='number', ascending=False)

In [109]:
# Display the full per-cuisine summary table.
cuisine_df

Unnamed: 0_level_0,number,closed inspections,closed numbers
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
American,6104,1498,259
Chinese,2399,2126,310
CafÃ©/Coffee/Tea,1369,228,49
Pizza,1173,544,103
Other,1135,0,0
Italian,1051,319,54
"Latin (Cuban, Dominican, Puerto Rican, South & Central American)",882,716,108
Mexican,817,559,85
Japanese,790,472,75
Bakery,719,468,67


In [122]:
# Column totals, used as the end points of the proportional line in the scatter plot.
number_sum = cuisine_df['number'].sum()
closed_sum = cuisine_df['closed numbers'].sum()

In [137]:
# Show the totals of all three columns.
cuisine_df.sum()

number                25911
closed inspections    11363
closed numbers         1850
dtype: int64

# old_copy

In [273]:
# Scatter plot with one point per cuisine: distinct eateries inspected (x)
# against distinct eateries closed (y). Written to heroku/closed.html.
source = ColumnDataSource(cuisine_df.to_dict("list"))
source.add(cuisine_df.index, name="type")  # cuisine label for the hover tooltip
hover = HoverTool(tooltips='@type')

p = figure(
    title='No. of NYC eateries inspected & closed by cuisine type',
    x_axis_label='No. of unique eateries inspected by DOH (Jan 2012 - Apr 2016)',
    y_axis_label='No. of unique eateries closed by DOH',
    tools="pan,wheel_zoom,box_zoom,reset,resize",
    x_range=(-300, 6400),
    y_range=(-15, 325),
)
p.add_tools(hover)

# Font sizes for the title and for both axis labels.
axis_font = '11pt'
p.title_text_font_size = '14pt'
p.xaxis.axis_label_text_font_size = axis_font
p.yaxis.axis_label_text_font_size = axis_font

# One circle per cuisine, read from the shared data source.
p.circle('number', 'closed numbers', size=10, source=source,
         legend='cuisine type (hover for info)')

# Straight line through the origin with slope closed_sum / number_sum.
p.line([-number_sum, number_sum], [-closed_sum, closed_sum],
       legend='proportional line')
p.legend.location = 'bottom_right'

# Save the figure as a standalone HTML file and display it.
output_file('heroku/closed.html')
t = show(p)

ERROR:C:\Anaconda\lib\site-packages\bokeh\core\validation\check.pyc:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: closed number [renderer: GlyphRenderer, ViewModel:GlyphRenderer, ref _id: 771360c2-8489-4bf9-88e7-36e019b80382]
ERROR:C:\Anaconda\lib\site-packages\bokeh\core\validation\check.pyc:W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure, ViewModel:Plot, ref _id: f4f4481c-5589-4660-9581-bc09a98799aa
ERROR:C:\Anaconda\lib\site-packages\bokeh\core\validation\check.pyc:W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure, ViewModel:Plot, ref _id: f63842a5-f238-48be-a4e0-feaf1cc6cc4e
ERROR:C:\Anaconda\lib\site-packages\bokeh\core\validation\check.pyc:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: mean [renderer: GlyphRenderer, ViewModel:GlyphRenderer, ref _id: 40ed4ab0-6277-4631-84ec-c32d9bc45c46]
ERROR:C:\Anaconda\lib\site-packages\bokeh\core\validation\check.pyc:W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Fig

# Plot 2

In [None]:
# NOTE(review): `cuisine_inspected` is not defined in any visible cell and this
# cell has no execution count; `cuisine_numbers` may be the intended series -
# confirm before running.
cuisine_inspected.sort_values().tail()

In [149]:
# Preview rows whose inspection date is not 1900-01-01.
df.loc[df['INSPECTION DATE'] != pd.Timestamp('1900-01-01')].head()

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,INSPECTION TYPE
0,41310578,EASTEND BAR & GRILL,MANHATTAN,1664,1 AVENUE,10028,American,2014-09-26,1,04H,Critical,28.0,,NaT,Cycle Inspection / Initial Inspection
1,41656176,BEST LUNCHEONETTE,BROOKLYN,1758,BATH AVENUE,11214,American,2014-01-27,1,10F,Not Critical,10.0,A,2014-01-27,Cycle Inspection / Re-inspection
2,41558513,ESSEN DELI,BROOKLYN,1359,CONEY ISLAND AVENUE,11230,Jewish/Kosher,2014-05-15,1,04L,Critical,33.0,,NaT,Cycle Inspection / Initial Inspection
3,41685035,HADJA MARLEY RESTAURANT,BROOKLYN,1139,FULTON STREET,11238,African,2014-01-25,2,,Not Applicable,,,NaT,Administrative Miscellaneous / Initial Inspection
4,41644127,OLEANDR RESTAURANT,BROOKLYN,410,BRIGHTON BEACH AVE,11235,Russian,2014-11-26,1,20D,Not Critical,,,NaT,Administrative Miscellaneous / Initial Inspection


In [237]:
# Keep inspections dated from 2012-01-01 (inclusive) to 2016-04-01 (exclusive).
time_df = df.loc[
    (df['INSPECTION DATE'] >= pd.Timestamp('2012-01-01'))
    & (df['INSPECTION DATE'] < pd.Timestamp('2016-04-01'))
]

In [238]:
# Remove columns the time-series analysis does not use.
# DataFrame.drop returns a new frame, so the result must be assigned; the
# original loop discarded it and never dropped anything. errors='ignore'
# skips names that are not present (BUILDING and STREET are already excluded
# by `usecols` when the CSV is loaded).
time_df = time_df.drop(
    ['INSPECTION TYPE', 'GRADE DATE', 'BUILDING', 'STREET', 'BORO', 'ZIPCODE'],
    axis=1,
    errors='ignore',
)

In [239]:
# Replace missing scores with 0. Build a new frame with assign() rather than
# calling fillna(inplace=True) on the time_df['SCORE'] selection: the chained
# in-place form acts on an intermediate object taken from a slice of df, which
# triggers SettingWithCopy warnings and does not update time_df under
# Copy-on-Write.
# NOTE(review): zero-filling makes unscored rows count as 0 in the monthly
# mean/std computed later, whereas leaving NaN would exclude them - confirm
# this is intended.
time_df = time_df.assign(SCORE=time_df['SCORE'].fillna(0))

In [240]:
# Index by inspection date so the rows can be grouped by calendar period.
time_df2 = time_df.set_index('INSPECTION DATE')

In [242]:
# Group rows by month on the datetime index. pd.Grouper(freq=...) replaces
# pd.TimeGrouper, which was deprecated and later removed from pandas.
time_group = time_df2.groupby(pd.Grouper(freq='M'))

In [243]:
# Per-month row count (non-null CAMIS) plus the mean and standard deviation of
# SCORE. `keys` labels each column at concat time, so the frame never carries
# the duplicate 'SCORE' labels that the bare concat produced.
time_table = pd.concat(
    [time_group['CAMIS'].count(), time_group['SCORE'].mean(), time_group['SCORE'].std()],
    axis=1,
    keys=['number', 'score mean', 'score std'],
)

In [245]:
# Name the columns by position: row count, mean of SCORE, std of SCORE.
time_table.columns = ['number','score mean','score std']

In [283]:
# Time-series plot: monthly number of inspections on the default (left) axis
# and monthly mean score with +/- one standard deviation bars on a second
# (right) axis. Written to heroku/time_series.html.
source = ColumnDataSource(time_table.to_dict("list"))
source.add(time_table.index, name='mo')  # period timestamps used as x values

p2 = figure(title='Restaurant inspection and mean score over time',
    x_axis_type="datetime")
# Style this figure's own title. The original set the attribute on `p`, the
# figure from the earlier scatter plot, so p2's title was never styled.
p2.title_text_font_size = '12pt'

# Secondary y-range for scores, drawn as an extra axis on the right-hand side.
p2.extra_y_ranges = {"foo": Range1d(start=0, end=40)}
p2.add_layout(LinearAxis(y_range_name="foo"), 'right')

# Mean score as a line with circle markers, both on the secondary range.
p2.line('mo','score mean',color='red',source=source,y_range_name="foo",legend='mean score (lower is better)')
p2.circle('mo','score mean',color='red',source=source,y_range_name="foo",legend='mean score (lower is better)')

# One vertical segment per period spanning mean - std to mean + std. Columns
# are addressed by name rather than by position.
err_xs = [(month, month) for month in time_table.index]
err_ys = [(mean - std, mean + std)
          for mean, std in zip(time_table['score mean'], time_table['score std'])]

p2.multi_line(err_xs, err_ys, color='red',y_range_name="foo",alpha=0.4)

# Dashed horizontal reference lines on the score axis.
# NOTE(review): 13.5 and 27.5 are presumably the A and B grade cut-offs - confirm.
A_threshold = Span(location=13.5,dimension='width',line_dash='dashed',line_color='red',y_range_name="foo")
B_threshold = Span(location=27.5,dimension='width',line_dash='dashed',line_color='red',y_range_name="foo")

p2.renderers.extend([A_threshold, B_threshold ])

## number
# Monthly count as a line with square markers on the default y-range.
p2.line('mo','number',source=source,legend='number of inspections')
p2.square('mo','number',source=source,legend='number of inspections')

# Year-level ticks are formatted as the four-digit year.
p2.xaxis[0].formatter = DatetimeTickFormatter(formats=dict(years=["%Y"]))
p2.legend.location = 'bottom_right'

# Save the figure as a standalone HTML file and display it.
output_file('heroku/time_series.html')
show(p2)

ERROR:C:\Anaconda\lib\site-packages\bokeh\core\validation\check.pyc:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: closed number [renderer: GlyphRenderer, ViewModel:GlyphRenderer, ref _id: 771360c2-8489-4bf9-88e7-36e019b80382]
ERROR:C:\Anaconda\lib\site-packages\bokeh\core\validation\check.pyc:W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure, ViewModel:Plot, ref _id: f4f4481c-5589-4660-9581-bc09a98799aa
ERROR:C:\Anaconda\lib\site-packages\bokeh\core\validation\check.pyc:W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Figure, ViewModel:Plot, ref _id: f63842a5-f238-48be-a4e0-feaf1cc6cc4e
ERROR:C:\Anaconda\lib\site-packages\bokeh\core\validation\check.pyc:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: mean [renderer: GlyphRenderer, ViewModel:GlyphRenderer, ref _id: 40ed4ab0-6277-4631-84ec-c32d9bc45c46]
ERROR:C:\Anaconda\lib\site-packages\bokeh\core\validation\check.pyc:W-1001 (NO_DATA_RENDERERS): Plot has no data renderers: Fig

In [260]:
# Inspect the first period's row as a sanity check.
time_table.iloc[0]

number        74.000000
score mean    14.216216
score std      8.912467
Name: 2012-01-31 00:00:00, dtype: float64