[Data description](http://www.opendatanetwork.com/dataset/data.cityofnewyork.us/9w7m-hzhe) 

Articles

* [Fines lowered](http://www.nydailynews.com/new-york/restaurants-cheer-plan-slash-penalties-article-1.1430407) (nice window photo)
* [Grading pooh-poohed](http://www.nydailynews.com/opinion/city-council-throws-rotten-tomatoes-restaurant-grades-article-1.1036545) (grade placards)

# Future ideas

1. Hall of shame (high scorers)
2. Grade not posted
3. Violations '7' (obstruction) and 10A (no toilet paper)

# Requirements

In [1]:
from __future__ import division, print_function
import pandas as pd
import numpy as np
import bokeh as bk
import pickle

import my_functions as mf

from bokeh.plotting import figure, output_notebook, show, output_file
from bokeh.io import push_notebook, reset_output
from bokeh.models import ColumnDataSource, HoverTool, Span, Range1d
from bokeh.embed import components

In [3]:
output_notebook()

# Load data

In [4]:
df15 = pd.read_csv('inspected_2015.csv',
                   parse_dates=['INSPECTION DATE','GRADE DATE'])
original_file = 'DOHMH_New_York_City_Restaurant_Inspection_Results_new.csv'

In [5]:
with open('dicts.pickle','rb') as f:
    dict_action, dict_violation = pickle.load(f)

# Grade breakdown

In [5]:
grades = df15[df15['GRADE'].isin(['A','B','C'])].groupby(['CAMIS'])['GRADE'].unique()
grades = grades.apply(lambda x: x[0]).value_counts()

In [6]:
dft = pd.concat([grades, grades/grades.sum()], axis=1)
dft.columns = ['count','percent']

In [8]:
percents = np.concatenate(([0],np.cumsum(dft['percent']).values))
ends = [-p*2*np.pi for p in percents[:-1]]
starts = [-p*2*np.pi for p in percents[1:]]

reset_output()
p = figure(title = 'NYC restaurant grade distribution 2015',
           x_range=(-1,1), y_range=(-1,1),
           width = 450,
           height = 380,
           tools='save')

colors = [mf.A_color, mf.B_color, mf.C_color]

for i in range(3):
    p.wedge(x=0, y=-.2, radius=.5, 
            start_angle= starts[i], end_angle= ends[i], 
            color=colors[i],
            legend='{}: {:>3.0f}%'.format(dft.index[i], dft['percent'][i]*100))

p.title_text_font_size = mf.title_font

p.outline_line_color = None
p.axis.axis_line_color = None
p.grid.grid_line_color = None

p.axis.major_tick_line_color = None
p.axis.minor_tick_line_color = None
p.axis.axis_label_text_color = None
p.axis.major_label_text_color = None

p.min_border_bottom = 0

mf.plot_output(p,'Grade_distribution')

## Score distribution

## Initial inspection

In [6]:
inspection_type = ['Pre-permit (Operational) / Initial Inspection',
                   'Cycle Inspection / Initial Inspection']

print('Restaurants covered:', df15[df15['INSPECTION TYPE'].isin(inspection_type)]['CAMIS'].nunique())

dft = df15[df15['INSPECTION TYPE'].isin(inspection_type)].groupby(
                  ['INSPECTION DATE','CAMIS'])['SCORE'].unique()
dft = dft.map(lambda x: x[0])
st = dft.value_counts()
st = st[st.index>=0] # Get rid of -1 scores

Restaurants covered: 22360


In [29]:
# mf.score_distribution_plot(st,'Initial inspection score distribution')
# def score_distribution_plot(score_distribution,title):

score_distribution = st
title = 'Initial inspection score distribution 2015'

title_font = '13pt'
axis_font = '10pt'
days_of_week = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']

A_color = '#1f77b4'
B_color = 'limegreen'
C_color = 'orange'


reset_output()
n = score_distribution.sum()

score_distribution = score_distribution.to_frame(name='freq')
score_distribution['half'] = score_distribution['freq']/2
score_distribution['percent'] = score_distribution['freq']/n

dt = score_distribution[score_distribution.index < 13.5] # A 
A_source = ColumnDataSource(dt.to_dict('list'))
A_source.add(dt.index,name='score')

dt = score_distribution[(score_distribution.index > 13.5) & (score_distribution.index < 27.5)] # B 
B_source = ColumnDataSource(dt.to_dict('list'))
B_source.add(dt.index,name='score')

dt = score_distribution[score_distribution.index > 27.5] # C 
C_source = ColumnDataSource(dt.to_dict('list'))
C_source.add(dt.index,name='score')

wh = 450
p = figure(title=title,
          x_axis_label='score',
          y_axis_label='count',
          plot_width = wh,
          plot_height = wh,
          x_range = Range1d(-3,120),
          y_range = Range1d(0,3000))
hover = HoverTool(tooltips=[('score','@score'),
                            ('count','@freq'),
                            ('percent','@percent{0.0%}')])
p.add_tools(hover)

def histo(p,source,color,grade):
    tally = sum(source.data['freq'])
    p.rect('score','half',
           width=1,
           height='freq',
           fill_color=color,
           line_color=color,
           hover_fill_color=color,
           hover_line_color='#333333',
           legend='{}: {:>3.0f}%'.format(grade, tally*100/n),
           source=source)

histo(p, A_source, A_color,'A')
histo(p, B_source, B_color,'B')
histo(p, C_source, C_color,'C')

p.title.text_font_size = title_font
p.xaxis.axis_label_text_font_size = axis_font
p.yaxis.axis_label_text_font_size = axis_font

from bokeh.models import Arrow, NormalHead, Label
crevass_arrow = Arrow(end=NormalHead(size=5),
                        x_start= 28, y_start=1300,
                      x_end=15, y_end=1000)
p.add_layout(crevass_arrow)
crevass_label = Label(x=28, y=1300, 
                      text='unnatural crevasse', render_mode='css')
p.add_layout(crevass_label)

x_pos = 40
y_pos = 700
no_crevass_arrow = Arrow(end=NormalHead(size=5),
                        x_start=x_pos, y_start=y_pos,
                      x_end=27.5, y_end=400)
p.add_layout(no_crevass_arrow)
no_crevass_label = Label(x=x_pos, y=y_pos, 
                      text='no crevasse', render_mode='css')
p.add_layout(no_crevass_label)



# Label(x=70, y=70, x_units='screen' text='Some Stuff', render_mode='css',
#      border_line_color='black', border_line_alpha=1.0,
#     background_fill_color='white', background_fill_alpha=1.0)


mf.plot_output(p,title.replace(' ','_'))

In [7]:
from bokeh.models import Label

## Reinspection

In [13]:
inspection_type = ['Pre-permit (Operational) / Re-inspection',
                   'Cycle Inspection / Re-inspection']

print('Restaurants covered:', df15[df15['INSPECTION TYPE'].isin(inspection_type)]['CAMIS'].nunique())

dft = df15[df15['INSPECTION TYPE'].isin(inspection_type)].groupby(
                  ['INSPECTION DATE','CAMIS'])['SCORE'].unique()
dft = dft.map(lambda x: x[0])
st = dft.value_counts()
st = st[st.index>=0] # Get rid of -1 scores

Restaurants covered: 11491


In [14]:
mf.score_distribution_plot(st,'Re-inspection score distribution')

# Closed

In [14]:
closed = df15[df15['ACTION']==2].groupby(['CAMIS','INSPECTION DATE'])['SCORE'].unique().map(lambda x: x[0])

In [19]:
closed = closed[closed >= 0]

In [20]:
closed.describe()

count    631.000000
mean      47.437401
std       18.690176
min        0.000000
25%       38.000000
50%       47.000000
75%       56.000000
max      117.000000
Name: SCORE, dtype: float64

In [24]:
dft = closed.value_counts()

In [None]:
mf.score_distribution_plot(dft, 'closed score distribution')

# Common violations

In [15]:
violation_prevalence = df15.groupby('VIOLATION CODE')['CAMIS'].nunique()
restaurant_number = df15['CAMIS'].nunique()

violation_prevalence = pd.concat([violation_prevalence,
                                  violation_prevalence*100/restaurant_number,
                                  pd.Series(dict_violation)],
                                 join='inner',axis=1)
violation_prevalence.index.name = 'violation_code'
violation_prevalence.columns = ['number','percentage','description']
violation_prevalence.sort_values('percentage',inplace=True,ascending=False)

from bokeh.models import Range1d
from bokeh.models import NumeralTickFormatter

dft = violation_prevalence[violation_prevalence['percentage']>=20]
dft.loc[:,'order'] = range(len(dft),0,-1) # chart start at 1
dft.loc[:,'half'] = dft['percentage']/2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [16]:
hover = HoverTool(tooltips=[('code','@violation_code'),
                            ('prevalence','@percentage{1.1}%'),
                            ('description','@description')])

wh = 450
reset_output()
p = figure(title = 'Common violations in 2015',
           plot_width = wh,
           plot_height = wh,
           x_axis_label = 'percent of restaurants inspected',
#           y_axis_label = 'violation code (hover for description)',
           tools = 'save',
           x_range = Range1d(0,70), 
           y_range = dft.index.tolist()[::-1])
p.add_tools(hover)

# Critical violations
phh_set = ['02G','02B','04H']

phh = ColumnDataSource(dft[dft.index.isin(phh_set)])
critical = ColumnDataSource(dft[(dft.index < '08') & ~dft.index.isin(phh_set)])
general = ColumnDataSource(dft[dft.index > '08'])

sources = [phh,critical,general]
legends = ['critical - PHH','critical - other','general']
colors = ['crimson','coral','#1f77b4']

for i in range(len(sources)):
    p.rect('half', 'order', width='percentage', 
           height=0.4,
           color=colors[i],
           legend=legends[i],
           source=sources[i])

p.legend.orientation = 'bottom_right'
p.xaxis[0].formatter = NumeralTickFormatter(format="0")
p.yaxis.major_tick_line_color = None
p.ygrid.grid_line_color = None

p.title_text_font_size = mf.title_font
p.xaxis.axis_label_text_font_size = mf.axis_font
p.yaxis.axis_label_text_font_size = mf.axis_font


mf.plot_output(p,'violations')

  super(HasProps, self).__setattr__(name, value)


# Violations

In [44]:
print('critical violations:', sum(x < '08' for x in dict_violation.keys()))
print('general violations:', sum((x > '08') & (x < '11')  for x in dict_violation.keys()) + 1)
# includes '10K' (not in scoresheet) and + 1  for '99B' ('Other general')
print('unscored violations:', sum((x > '15') & (x < '99')  for x in dict_violation.keys()))
print('total:', len(dict_violation))

critical violations: 50
general violations: 18
unscored violations: 30
total: 98


In [17]:
from bokeh.models import Range1d
from bokeh.models import NumeralTickFormatter

def violation_prevalence_plot(violation_set, df=df15, upper=40):
    if not isinstance(violation_set,list):
        violation_set = [violation_set]

    initial_type = ['Pre-permit (Operational) / Initial Inspection',
               'Cycle Inspection / Initial Inspection']

    reinspect_type = ['Pre-permit (Operational) / Re-inspection',
               'Cycle Inspection / Re-inspection']

    inspection_type = [initial_type, reinspect_type]

    s_list = range(2)

    for i in range(2):
        dft = df[df['INSPECTION TYPE'].isin(inspection_type[i]) 
                 & (df['SCORE'] <= upper)][[
                'INSPECTION DATE','CAMIS','VIOLATION CODE','SCORE']]
        score_count = dft.groupby(['SCORE','INSPECTION DATE'])['CAMIS'].nunique().sum(level=0) 
        violation_count = dft[dft['VIOLATION CODE'].isin(violation_set)].groupby(
            ['SCORE','INSPECTION DATE'])['CAMIS'].nunique().sum(level=0)
        percentage = violation_count/score_count
        percentage = percentage.reindex(range(percentage.index.max()+1))
        percentage.name = 'percent'
        s_list[i] = percentage.fillna(0)

    source = range(2)
    source[0] = ColumnDataSource(s_list[0].to_frame())
    source[1] = ColumnDataSource(s_list[1].to_frame())

    wh = 450
    reset_output()
    p = figure(title='Prevalence of {} violation by score'.format(', '.join(violation_set)),
           x_axis_label='score',
           width = wh,
           height = wh,
           x_range = Range1d(0,upper),
           y_range = Range1d(0,1),
               tools = ['resize','save','reset']
          )
    hover = HoverTool(tooltips='@SCORE, @percent{1.%}')
    p.add_tools(hover)

    color_list = ['#1f77b4','coral']
    legend_list = ['initial inspection','re-inspection']

    for i in range(2):        
        p.line('SCORE', 'percent', color=color_list[i], legend=legend_list[i], source=source[i],
        #          line_width=1
              )
        p.square('SCORE', 'percent', color=color_list[i], legend=legend_list[i], source=source[i],
        #            size=4
                )

    p.line([13.5,13.5],[-2,2],line_dash='dashed',color='limegreen')
    p.line([27.5,27.5],[-2,2],line_dash='dashed',color='orange')    


    p.yaxis[0].formatter = NumeralTickFormatter(format="0%")

    p.title_text_font_size = mf.title_font
    p.xaxis.axis_label_text_font_size = mf.axis_font
    p.yaxis.axis_label_text_font_size = mf.axis_font

    mf.plot_output(p,'{}_prevalence'.format('_'.join(violation_set)))

In [18]:
violation_prevalence_plot('02G')

In [19]:
violation_prevalence_plot('08A')

In [20]:
violation_prevalence_plot('10F')

# Sandbox

In [130]:
dfx.columns = ['A','B','C']

In [128]:
d = {'a':[1,1,2,3,1],
     'b':[1,2,2,1,2],
    'c':['x','y','z','y','x']}
dfx = pd.DataFrame(d)

In [197]:
q.fillna(0)

0    1
1    1
2    2
3    3
4    1
Name: A, dtype: int64

In [177]:
q = dfx['A']

In [192]:
x = range(2)
x[0] = q

In [189]:
a = dfx['A']
b = dfx['B']
Q = [a,b]

In [187]:
a = ColumnDataSource(dfx)
b = ColumnDataSource(dfx)
[a,b]

[<bokeh.models.sources.ColumnDataSource at 0xc14ad30>,
 <bokeh.models.sources.ColumnDataSource at 0xc14ada0>]

In [29]:
a = dfx.groupby(['a','b'])['c'].unique()

In [109]:
list1 = ['1', '2', '3']
str1 = ''.join(list1)

In [114]:
df3 = pd.DataFrame({'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]})
# df3.where(lambda x: x > 4, lambda x: x + 10)

In [118]:
x = ColumnDataSource(df3)
x.data

{'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9], 'index': [0, 1, 2]}

In [371]:
d = {'a':[1,1,2,3,1],
     'b':[1,2,2,1,2],
     'c':['x','x','x','x','x']}
df1 = pd.DataFrame(d)
df1.groupby(['a','b']).count() 

Unnamed: 0_level_0,Unnamed: 1_level_0,c
a,b,Unnamed: 2_level_1
1,1,1
1,2,2
2,2,1
3,1,1


In [376]:
df2 = df1.drop('c',1)
q = df2.groupby(['a'])['b'].value_counts() 