In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from tqdm.autonotebook import tqdm
import traceback

from sklearn import preprocessing

from utils.utilities import mk_heatmap, value_heatmap, select_by_date, update_grade
from utils.strings import DSNY_311, DEP_311, DOHMH_311, DATE, VIOLATIONS

  


# Second round of thoughts

I've done some more thinking about it, looking at the data, googling around, and reading about how NYC does their inspection process. There are basically two types of inspections... ones that center around food handling and ones that don't. The ones that have to do with food handling contribute to the grade posted in the window. 

* From NYC: 

The New York City Health Department inspects all food service establishments to make sure they meet Health Code requirements, which helps prevent foodborne illness. How often a restaurant is inspected depends on its inspection score. Restaurants that receive a low score on the initial or first inspection in the inspection cycle are inspected less often than those that receive a high score.

https://www1.nyc.gov/assets/doh/downloads/pdf/rii/inspection-cycle-overview.pdf


* From the blue book 

Only certain inspections result in a grade. Every food service establishment is scheduled for at least one inspection per year. A restaurant that scores 0 to 13 violation points on its first inspection will receive an A-grade card that must be posted immediately. An establishment that does not score an A on its initial inspection will not have to post a grade until it has had the opportunity to improve its sanitary conditions and is re-inspected. If an A is issued on re-inspection, the A-grade card must be posted immediately. An establishment receiving a B or C grade on re-inspection receives two cards: one showing the letter grade and one that says “Grade Pending”; one of those cards must be posted immediately. The final grade is determined at OATH. The frequency of inspections depends on a restaurant’s score. Restaurants with A grades are inspected less often than those with B or C grades. Frequent inspections of poorer-performing establishments enable the Health Department to closely monitor their food safety practices, while giving them more opportunities to improve their grades. The letter grade or “Grade Pending” card must be posted in a place where it is easily seen by people passing by. It must be on the front window, door or an outside-facing wall. The card must be within 5 feet from the front door or other entrance, and within 6 feet from the ground or floor.

https://www1.nyc.gov/assets/doh/downloads/pdf/rii/blue-book.pdf


### Things we need to change from the initial try:

* Only focus on cycle inspections
* Only focus on violations that are listed in the blue book
* ???


In [2]:
# Load the three raw inputs from ./data: restaurant inspections (with the DATE
# column parsed to datetimes at read time), 311 complaints, and historical NYC
# weather. The 311 and weather date columns are read as-is here and converted
# with pd.to_datetime in a later cell.
inspecs = pd.read_csv('./data/inspections.csv', parse_dates=[DATE])
threeoneone = pd.read_csv('./data/311.csv')
weather = pd.read_csv('./data/NYC_historical_weather.csv')

In [3]:
# Keep only rows whose inspection_type contains the literal text "Cycle".
# Rows with a missing inspection_type count as non-matches (na=False).
is_cycle_inspection = inspecs['inspection_type'].str.contains('Cycle', regex=False, na=False)
inspecs_cycles = inspecs.loc[is_cycle_inspection]

In [4]:
# Check the row count, dtypes and per-column non-null counts of the cycle-only frame.
inspecs_cycles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 334879 entries, 0 to 398252
Data columns (total 15 columns):
camis                    334879 non-null int64
dba                      334879 non-null object
boro                     334879 non-null object
zipcode                  329883 non-null float64
cuisine_description      334879 non-null object
inspection_date          334879 non-null datetime64[ns]
action                   334879 non-null object
violation_code           333864 non-null object
violation_description    332858 non-null object
critical_flag            332858 non-null object
score                    334879 non-null float64
inspection_type          334879 non-null object
latitude                 334608 non-null float64
longitude                334608 non-null float64
grade                    179091 non-null object
dtypes: datetime64[ns](1), float64(4), int64(1), object(9)
memory usage: 40.9+ MB


In [5]:
# Save just the cycle inspections (one row per cited violation) to CSV,
# without writing the DataFrame index as a column.
inspecs_cycles.to_csv('./data/cycle_inspections.csv', index=False)

## Remove all the non-food violations

codes from the blue book

It removes ~30 violations, so we are just going to overwrite `inspecs_cycles` here.

In [6]:
# Keep only rows whose violation_code is listed in VIOLATIONS (rows with a
# missing code are dropped too), then save the filtered frame.
mask = inspecs_cycles['violation_code'].isin(VIOLATIONS)
inspecs_cycles = inspecs_cycles.loc[mask]
inspecs_cycles.to_csv('./data/cycle_inspections_onlybbcodes.csv', index=False)

In [7]:
# Re-check row and non-null counts after the violation-code filter.
inspecs_cycles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 332842 entries, 0 to 398252
Data columns (total 15 columns):
camis                    332842 non-null int64
dba                      332842 non-null object
boro                     332842 non-null object
zipcode                  327925 non-null float64
cuisine_description      332842 non-null object
inspection_date          332842 non-null datetime64[ns]
action                   332842 non-null object
violation_code           332842 non-null object
violation_description    332842 non-null object
critical_flag            332842 non-null object
score                    332842 non-null float64
inspection_type          332842 non-null object
latitude                 332574 non-null float64
longitude                332574 non-null float64
grade                    177537 non-null object
dtypes: datetime64[ns](1), float64(4), int64(1), object(9)
memory usage: 40.6+ MB


# Group all the violations together

Just using some code from notebook 03.

In [8]:
# Collapse the per-violation rows into one row per visit. A visit is identified
# by the (camis, inspection date) pair; drop_duplicates keeps the first row of
# each pair. DATE is the inspection-date column parsed when the CSV was loaded.
visit_keys = ['camis', DATE]
inspecs_single_visits = inspecs_cycles.drop_duplicates(subset=visit_keys)

# Per-violation columns that are turned into one list per visit.
cols_to_merge = ['violation_code', 'violation_description', 'critical_flag', 'inspection_type']

for col in tqdm(cols_to_merge):
    # Aggregate from the filtered frame itself rather than from the unfiltered
    # `inspecs`, so the values and the grouping keys come from the same rows
    # instead of relying on implicit index re-alignment.
    df_tmp = inspecs_cycles.groupby(visit_keys)[col].apply(list).reset_index()

    # Swap the scalar column for its list-valued version; dropping first avoids
    # the _x/_y suffix shuffle and still leaves the new column at the end.
    inspecs_single_visits = (
        inspecs_single_visits
        .drop(columns=col)
        .merge(df_tmp, how='left', on=visit_keys)
    )

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [9]:
# Preview the first five visits; the four merged columns now hold lists.
inspecs_single_visits.head(5)

Unnamed: 0,camis,dba,boro,zipcode,cuisine_description,inspection_date,action,score,latitude,longitude,grade,violation_code,violation_description,critical_flag,inspection_type
0,50072444,THE SPOT CAFE,Queens,11385.0,Café/Coffee/Tea,2019-07-09,Violations were cited in the following area(s).,28.0,40.707167,-73.896938,,"[08A, 06A, 04L, 06B, 04H]",[Facility not vermin proof. Harborage or condi...,"[N, Y, Y, Y, Y]","[Cycle Inspection / Initial Inspection, Cycle ..."
1,41660537,AMERICAN BAR,Manhattan,10075.0,American,2017-09-20,Violations were cited in the following area(s).,12.0,40.770759,-73.953849,A,"[10H, 04A]",[Proper sanitization not provided for utensil ...,"[N, Y]","[Cycle Inspection / Initial Inspection, Cycle ..."
2,50037997,JALAPENOS MEXICAN,Brooklyn,11220.0,Mexican,2017-09-21,Violations were cited in the following area(s).,24.0,40.641044,-74.01455,,"[06A, 04N, 06D, 08A, 10F]",[Personal cleanliness inadequate. Outer garmen...,"[Y, Y, Y, N, N]","[Cycle Inspection / Initial Inspection, Cycle ..."
3,40365904,MEE SUM CAFE,Manhattan,10013.0,Café/Coffee/Tea,2017-02-14,Violations were cited in the following area(s).,9.0,40.714861,-73.9982,A,"[10F, 09A, 06D]",[Non-food contact surface improperly construct...,"[N, N, Y]","[Cycle Inspection / Initial Inspection, Cycle ..."
4,41346103,LA MAISON DU MACARON,Manhattan,10011.0,French,2018-08-31,Violations were cited in the following area(s).,13.0,40.743346,-73.993887,A,"[10B, 10H, 02B]",[Plumbing not properly installed or maintained...,"[N, N, Y]","[Cycle Inspection / Re-inspection, Cycle Inspe..."


In [10]:
# Recompute the grade of every visit row by row with the project helper
# update_grade and store it back in the existing `grade` column.
inspecs_single_visits['grade'] = inspecs_single_visits.apply(update_grade, axis=1)

In [11]:
# Absolute number of visits per grade after the update.
inspecs_single_visits.grade.value_counts()

A    73954
B    27197
C    12897
Name: grade, dtype: int64

In [12]:
# Same distribution expressed as fractions of all graded visits.
inspecs_single_visits.grade.value_counts(normalize=True)

A    0.648446
B    0.238470
C    0.113084
Name: grade, dtype: float64

# Add more info

More code from notebook 03

In [13]:
# --- Coordinates ---------------------------------------------------------------
# A latitude/longitude of exactly 0 is treated as "missing": turn it into NaN and
# drop rows without a usable position. Results are assigned back explicitly;
# `df[col].replace(..., inplace=True)` acts on a temporary and is not guaranteed
# to modify the frame (it is a no-op under pandas copy-on-write).
coord_cols = ['latitude', 'longitude']
inspecs_single_visits[coord_cols] = inspecs_single_visits[coord_cols].replace(0, np.nan)
threeoneone[coord_cols] = threeoneone[coord_cols].replace(0, np.nan)
inspecs_single_visits = inspecs_single_visits.dropna(subset=coord_cols)
threeoneone = threeoneone.dropna(subset=coord_cols)

# --- Dates ---------------------------------------------------------------------
inspecs_single_visits[DATE] = pd.to_datetime(inspecs_single_visits[DATE])
threeoneone['created_date'] = pd.to_datetime(threeoneone['created_date'])
weather['DATE'] = pd.to_datetime(weather['DATE'])

# --- Critical violations -------------------------------------------------------
# critical_flag holds one list of flags per visit; 'Y' marks a critical violation.
nested = inspecs_single_visits['critical_flag'].values.tolist()
# Does the inspection have at least one critical violation?
inspecs_single_visits['critical'] = [1 if 'Y' in sublist else 0 for sublist in nested]
# How many critical violations were cited?
inspecs_single_visits['num_critical'] = [sublist.count('Y') for sublist in nested]

# --- Weekday -------------------------------------------------------------------
# Store the day name (e.g. 'Monday') rather than the numeric weekday.
inspecs_single_visits['weekday'] = inspecs_single_visits['inspection_date'].dt.day_name()

# --- Weather -------------------------------------------------------------------
# Same-day maximum temperature. merge() defaults to an inner join, so visits on
# dates that are absent from the weather table are dropped here.
inspecs_single_visits = (
    inspecs_single_visits
    .merge(weather[['DATE', 'TMAX']], left_on=DATE, right_on='DATE')
    .drop(columns='DATE')               # extra date column brought in by the merge
    .rename(columns={'TMAX': 'tmax'})   # lower-case like the other columns
)

# Mean maximum temperature over the three days before the inspection:
# closed='left' excludes the inspection day itself and min_periods=3 leaves NaN
# whenever fewer than three observations fall inside the window.
threeday = weather[['DATE', 'TMAX']].rolling('3D', min_periods=3, on='DATE', closed='left').mean()
inspecs_single_visits = (
    inspecs_single_visits
    .merge(threeday[['DATE', 'TMAX']], left_on=DATE, right_on='DATE')
    .drop(columns='DATE')
    .rename(columns={'TMAX': 'tmax_3d'})
)

# --- History of each restaurant ------------------------------------------------
# The lag features below rely on rows being in chronological order per camis.
inspecs_single_visits = inspecs_single_visits.sort_values(['camis', 'inspection_date'])

# Days since the previous visit of the same camis; 0 for the first visit on record.
inspecs_single_visits['time_since_last'] = (
    inspecs_single_visits.groupby('camis')['inspection_date'].diff().dt.days.fillna(0)
)
# Did the previous visit have a critical violation? 0 when there is no previous visit.
inspecs_single_visits['past_critical'] = (
    inspecs_single_visits.groupby(['camis'])['critical'].shift().fillna(0)
)
# Grade and score of the previous visit; left as NaN for the first visit on
# record so that a later step can decide how to impute them.
inspecs_single_visits['past_grade'] = inspecs_single_visits.groupby(['camis'])['grade'].shift()
inspecs_single_visits['past_score'] = inspecs_single_visits.groupby(['camis'])['score'].shift()

In [14]:
# Transposed preview of the first five rows so every column shows up as a row.
inspecs_single_visits.head(5).T

Unnamed: 0,13387,66368,89621,12847,63036
camis,30075445,30075445,30075445,30075445,30075445
dba,MORRIS PARK BAKE SHOP,MORRIS PARK BAKE SHOP,MORRIS PARK BAKE SHOP,MORRIS PARK BAKE SHOP,MORRIS PARK BAKE SHOP
boro,Bronx,Bronx,Bronx,Bronx,Bronx
zipcode,10462,10462,10462,10462,10462
cuisine_description,Bakery,Bakery,Bakery,Bakery,Bakery
inspection_date,2017-05-18 00:00:00,2018-05-11 00:00:00,2019-05-16 00:00:00,2019-06-11 00:00:00,2019-10-21 00:00:00
action,Violations were cited in the following area(s).,Violations were cited in the following area(s).,Violations were cited in the following area(s).,Violations were cited in the following area(s).,Violations were cited in the following area(s).
score,7,5,14,6,17
latitude,40.8482,40.8482,40.8482,40.8482,40.8482
longitude,-73.856,-73.856,-73.856,-73.856,-73.856


In [15]:
# Check row and non-null counts after the merges and the new feature columns.
inspecs_single_visits.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111854 entries, 13387 to 14288
Data columns (total 24 columns):
camis                    111854 non-null int64
dba                      111854 non-null object
boro                     111854 non-null object
zipcode                  111854 non-null float64
cuisine_description      111854 non-null object
inspection_date          111854 non-null datetime64[ns]
action                   111854 non-null object
score                    111854 non-null float64
latitude                 111854 non-null float64
longitude                111854 non-null float64
grade                    111854 non-null object
violation_code           111854 non-null object
violation_description    111854 non-null object
critical_flag            111854 non-null object
inspection_type          111854 non-null object
critical                 111854 non-null int64
num_critical             111854 non-null int64
weekday                  111854 non-null object
tmax         

In [None]:
# Save the one-row-per-visit frame to CSV without the index column.
# NOTE(review): the list-valued columns are written as their string form — confirm
# that downstream readers parse them back as intended.
inspecs_single_visits.to_csv('./data/inspecs_single_visits.csv', index=False)

# Add the Heatmap Data

In [16]:
def score_heatmap(in_frame, out_frame, heatmap_frame, date_key='inspection_date', 
                  out_key='heat_score', window=90, s=2, bins=1000):
    """Write a heat-map value into ``out_frame[out_key]`` for every row of ``in_frame``.

    Rows of ``in_frame`` are grouped by ``date_key``. For each date, the rows of
    ``heatmap_frame`` selected by ``select_by_date`` for the ``window`` days
    ending on that date are turned into a heat map with ``mk_heatmap``, and
    ``value_heatmap`` is evaluated at the longitude/latitude of every row in the
    group.

    Parameters
    ----------
    in_frame : DataFrame
        Must provide ``date_key``, ``longitude`` and ``latitude`` columns.
    out_frame : DataFrame
        Receives the results. Rows are addressed by the index *labels* of
        ``in_frame``, so both frames are expected to share the same index.
    heatmap_frame : DataFrame
        Events used to build the heat map; must provide ``longitude`` and
        ``latitude`` plus whatever ``select_by_date`` filters on.
    date_key : str
        Column of ``in_frame`` to group by.
    out_key : str
        Column of ``out_frame`` to write to.
    window : int
        Look-back length in days.
    s, bins
        Forwarded unchanged to ``mk_heatmap`` (presumably smoothing and grid
        resolution — see that helper to confirm).

    Returns
    -------
    DataFrame
        ``out_frame``, modified in place. Rows whose date has no events in the
        window keep their existing value.
    """
    grouped = in_frame.groupby(date_key)
    lookback = pd.to_timedelta(window, unit='days')

    for date, group in tqdm(grouped, total=len(grouped)):

        # events from the `window` days leading up to this inspection date
        end_date = pd.to_datetime(date)
        start_date = end_date - lookback
        heatmap_frame_date = select_by_date(heatmap_frame, start_date, end_date)

        # nothing to build a heat map from: leave the existing values untouched
        if not heatmap_frame_date.shape[0]:
            continue

        img, extent, xedges, yedges = mk_heatmap(heatmap_frame_date.longitude.values,
                                                 heatmap_frame_date.latitude.values, s, bins=bins)

        for idx, lo, la in zip(group.index, group.longitude.values, group.latitude.values):
            # `idx` is an index LABEL. The previous `out_frame[out_key].iloc[idx]`
            # treated it as a position, which writes to the wrong row as soon as
            # the index is not 0..n-1 in order (e.g. after sort_values), and was
            # a chained assignment on top of that.
            out_frame.at[idx, out_key] = value_heatmap(lo, la, xedges, yedges, img)

    return out_frame

# One heat-score column per 311 complaint family, in the order they are computed.
heat_score_sources = [
    ('dsny_heat_score', DSNY_311),
    ('dep_heat_score', DEP_311),
    ('dohmh_heat_score', DOHMH_311),
]

# Result frame shares the visits' index; every score starts at 0.0.
target_frame = pd.DataFrame(index=inspecs_single_visits.index)
for score_col, _ in heat_score_sources:
    target_frame[score_col] = 0.0

# Build the heat map from the matching complaints and score every visit against it.
for score_col, complaint_types in heat_score_sources:
    heatmap_frame = threeoneone[threeoneone.complaint_type.isin(complaint_types)]
    target_frame = score_heatmap(inspecs_single_visits, target_frame, heatmap_frame, 'inspection_date', score_col)

HBox(children=(FloatProgress(value=0.0, max=1239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1239.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1239.0), HTML(value='')))




In [17]:
# Summary statistics of the three heat-score columns.
target_frame.describe()

Unnamed: 0,dsny_heat_score,dep_heat_score,dohmh_heat_score
count,111854.0,111854.0,111854.0
mean,0.221378,0.17015,0.152492
std,0.220264,0.195124,0.167318
min,0.0,0.0,0.0
25%,0.067819,0.052735,0.039844
50%,0.164837,0.118542,0.110121
75%,0.306675,0.224206,0.21813
max,4.574699,4.522346,10.359245


In [47]:
# Attach the heat scores to the visits by matching index labels on both sides.
inspecs_scores = pd.merge(
    inspecs_single_visits,
    target_frame,
    left_index=True,
    right_index=True,
)

In [48]:
# Transposed preview of the first five rows of the merged frame.
inspecs_scores.head(5).T

Unnamed: 0,13387,66368,89621,12847,63036
camis,30075445,30075445,30075445,30075445,30075445
dba,MORRIS PARK BAKE SHOP,MORRIS PARK BAKE SHOP,MORRIS PARK BAKE SHOP,MORRIS PARK BAKE SHOP,MORRIS PARK BAKE SHOP
boro,Bronx,Bronx,Bronx,Bronx,Bronx
zipcode,10462,10462,10462,10462,10462
cuisine_description,Bakery,Bakery,Bakery,Bakery,Bakery
inspection_date,2017-05-18 00:00:00,2018-05-11 00:00:00,2019-05-16 00:00:00,2019-06-11 00:00:00,2019-10-21 00:00:00
action,Violations were cited in the following area(s).,Violations were cited in the following area(s).,Violations were cited in the following area(s).,Violations were cited in the following area(s).,Violations were cited in the following area(s).
score,7,5,14,6,17
latitude,40.8482,40.8482,40.8482,40.8482,40.8482
longitude,-73.856,-73.856,-73.856,-73.856,-73.856


# Decide if the restaurants are chains

In [49]:
# Group the visits by business name (dba).
groups = inspecs_scores.groupby(['dba'])

In [50]:
# A business name (dba) that appears under more than one distinct camis value is
# flagged as a chain (1); every other row gets 0.
# Computed in one vectorised pass: the previous `df['is_chain'].iloc[locs] = 1`
# was a chained assignment (not guaranteed to write through to the frame) and
# called get_loc once per row.
camis_per_dba = inspecs_scores.groupby('dba')['camis'].transform('nunique')
inspecs_scores['is_chain'] = (camis_per_dba > 1).astype('int64')

In [51]:
# Transposed preview of the first five rows after adding is_chain.
inspecs_scores.head(5).T

Unnamed: 0,13387,66368,89621,12847,63036
camis,30075445,30075445,30075445,30075445,30075445
dba,MORRIS PARK BAKE SHOP,MORRIS PARK BAKE SHOP,MORRIS PARK BAKE SHOP,MORRIS PARK BAKE SHOP,MORRIS PARK BAKE SHOP
boro,Bronx,Bronx,Bronx,Bronx,Bronx
zipcode,10462,10462,10462,10462,10462
cuisine_description,Bakery,Bakery,Bakery,Bakery,Bakery
inspection_date,2017-05-18 00:00:00,2018-05-11 00:00:00,2019-05-16 00:00:00,2019-06-11 00:00:00,2019-10-21 00:00:00
action,Violations were cited in the following area(s).,Violations were cited in the following area(s).,Violations were cited in the following area(s).,Violations were cited in the following area(s).,Violations were cited in the following area(s).
score,7,5,14,6,17
latitude,40.8482,40.8482,40.8482,40.8482,40.8482
longitude,-73.856,-73.856,-73.856,-73.856,-73.856


In [52]:
# Check which columns still contain missing values before imputing.
inspecs_scores.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111854 entries, 13387 to 14288
Data columns (total 28 columns):
camis                    111854 non-null int64
dba                      111854 non-null object
boro                     111854 non-null object
zipcode                  111854 non-null float64
cuisine_description      111854 non-null object
inspection_date          111854 non-null datetime64[ns]
action                   111854 non-null object
score                    111854 non-null float64
latitude                 111854 non-null float64
longitude                111854 non-null float64
grade                    111854 non-null object
violation_code           111854 non-null object
violation_description    111854 non-null object
critical_flag            111854 non-null object
inspection_type          111854 non-null object
critical                 111854 non-null int64
num_critical             111854 non-null int64
weekday                  111854 non-null object
tmax         

# Deal with missing values

Two columns have missing values, both of which have to do with the past scoring. 

There are a couple of things we can try. 

1. Fill in all the "past" initial scores with the median of all initial scores
2. Try to fetch each store's pre-opening inspection score. (hard?)
3. Create a flag that says whether or not it is an initial cycle inspection

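Some stores don't have a pre-opening inspection (e.g., camis = 30075445), so option 2 is not always possible. Instead we use a combination of 1 and 3: flag the initial inspections, then fill their missing "past" values with the median initial score and its grade.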

In [54]:
# Flag visits that have no earlier visit on record (their past_score is still
# missing at this point): 1 for those rows, 0 for all others.
inspecs_scores['init_inspec'] = inspecs_scores['past_score'].isnull().astype('int64')

In [55]:
# Median score over the visits that have no previous score on record.
no_prior_visit = inspecs_scores['past_score'].isnull()
score_init = inspecs_scores.loc[no_prior_visit, 'score'].median()

In [56]:
# The median initial score (score_init) is an A grade, so 'A' is used as the
# stand-in "past grade" for visits without a previous one.
# NOTE(review): this is hard-coded — re-check it against score_init if the data changes.
grade_init = 'A'

In [57]:
# Impute the missing "past" values of first-on-record visits with the median
# initial score and its grade. Assign the result back explicitly:
# `df.col.fillna(..., inplace=True)` acts on a temporary Series and is not
# guaranteed to update the frame (it is a no-op under pandas copy-on-write).
inspecs_scores['past_score'] = inspecs_scores['past_score'].fillna(score_init)
inspecs_scores['past_grade'] = inspecs_scores['past_grade'].fillna(grade_init)

In [58]:
# Confirm that no column has missing values left after the imputation.
inspecs_scores.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111854 entries, 13387 to 14288
Data columns (total 29 columns):
camis                    111854 non-null int64
dba                      111854 non-null object
boro                     111854 non-null object
zipcode                  111854 non-null float64
cuisine_description      111854 non-null object
inspection_date          111854 non-null datetime64[ns]
action                   111854 non-null object
score                    111854 non-null float64
latitude                 111854 non-null float64
longitude                111854 non-null float64
grade                    111854 non-null object
violation_code           111854 non-null object
violation_description    111854 non-null object
critical_flag            111854 non-null object
inspection_type          111854 non-null object
critical                 111854 non-null int64
num_critical             111854 non-null int64
weekday                  111854 non-null object
tmax         

In [61]:
# Full list of columns in the final feature frame.
inspecs_scores.columns.tolist()

['camis',
 'dba',
 'boro',
 'zipcode',
 'cuisine_description',
 'inspection_date',
 'action',
 'score',
 'latitude',
 'longitude',
 'grade',
 'violation_code',
 'violation_description',
 'critical_flag',
 'inspection_type',
 'critical',
 'num_critical',
 'weekday',
 'tmax',
 'tmax_3d',
 'time_since_last',
 'past_critical',
 'past_grade',
 'past_score',
 'dsny_heat_score',
 'dep_heat_score',
 'dohmh_heat_score',
 'is_chain',
 'init_inspec']

In [60]:
# Save the final feature frame to CSV without the index column.
inspecs_scores.to_csv('./data/inspec_scores.csv', index=False)

In [None]:

# this is how we are gonna dummy encode all the violation codes
# s = inspecs_single_visits.violation_code
# pd.get_dummies(s.apply(pd.Series).stack()).sum(level=0)