# Initial ETL

This notebook runs the first round of ETL to get the data ready for the model: it loads the inspection, 311 and weather data, cleans it up, and builds the initial set of features. I might need to come back and do some more ETL as I work on the modeling, but this is the initial round.

In [None]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from tqdm.autonotebook import tqdm
import traceback

from sklearn import preprocessing

from utils.utilities import mk_heatmap, value_heatmap, select_by_date, update_grade
from utils.strings import DSNY_311, DEP_311, DOHMH_311

In [None]:
# Read the three raw CSV inputs: inspections, 311 complaints and historical weather
inspecs, threeoneone, weather = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/inspections.csv',
        './data/311.csv',
        './data/NYC_historical_weather.csv',
    )
)

### But there are multiple citations for each restaurant in the df

I initially missed the fact that each row in the inspections data contains a single violation, so one inspection visit is spread across several rows. We need to merge all the rows that share a restaurant (`camis`) and an inspection date into a single row per inspection.

In [None]:
# Every row of `inspecs` holds a single violation, so one visit (one restaurant
# on one date) spans several rows. Collapse each visit to a single row and keep
# the per-violation columns as lists.
visit_keys = ['camis', 'inspection_date']
cols_to_merge = ['violation_code', 'violation_description', 'critical_flag', 'inspection_type']

# One list per visit for each per-violation column, built in a single groupby
# pass instead of one groupby + merge per column.
violation_lists = inspecs.groupby(visit_keys)[cols_to_merge].agg(list).reset_index()

# Keep the first row of every visit for all remaining columns, drop its scalar
# per-violation values and attach the lists in their place. The merge also
# gives the result a fresh 0..n-1 index.
inspecs_single_visits = (
    inspecs
    .drop_duplicates(subset=visit_keys)
    .drop(columns=cols_to_merge)
    .merge(violation_lists, how='left', on=visit_keys)
)

In [None]:
# Sanity check: (rows, columns) of the merged per-visit frame.
inspecs_single_visits.shape

In [None]:
# From here on `inspecs` refers to the per-visit frame. This binds a second name
# to the same object (no copy is made), so in-place changes made through
# `inspecs` are also visible through `inspecs_single_visits`.
inspecs = inspecs_single_visits

# Clean up some of the dtypes and replace missing values with better values

In [None]:
# Missing coordinates are stored as 0: turn them into NaN, then drop every row
# that has no latitude or longitude.
coord_cols = ['latitude', 'longitude']
for frame in (inspecs, threeoneone):
    # Assign the result back rather than calling replace(..., inplace=True) on
    # the selected column: with pandas copy-on-write the selected column is a
    # temporary object, so an in-place edit on it never reaches the frame.
    frame[coord_cols] = frame[coord_cols].replace(0, np.nan)
    frame.dropna(subset=coord_cols, inplace=True)

# Parse the date columns into datetimes
inspecs['inspection_date'] = pd.to_datetime(inspecs['inspection_date'])
threeoneone['created_date'] = pd.to_datetime(threeoneone['created_date'])
weather['DATE'] = pd.to_datetime(weather['DATE'])

# update the grades where a score is given but the letter grade is missing
inspecs['grade'] = inspecs.apply(update_grade, axis=1)


# Merge (or create) extra data into the inspections frame

In [None]:
# Does the inspection have a critical violation? `critical_flag` holds one list
# of flags per inspection; the inspection is critical when any flag is 'Y'.
inspecs['critical'] = [int('Y' in flags) for flags in inspecs['critical_flag']]

# Day of the week, stored as the day name (e.g. 'Monday')
inspecs['weekday'] = inspecs['inspection_date'].dt.day_name()
threeoneone['weekday'] = threeoneone['created_date'].dt.day_name()

# Attach the max temperature of the inspection day as `tmax`, then put the rows
# in chronological order per restaurant for the temporal features below.
# NOTE: merge defaults to an inner join, so inspections whose date has no row
# in `weather` are dropped here.
inspecs = (
    inspecs
    .merge(weather[['DATE', 'TMAX']], left_on='inspection_date', right_on='DATE')
    .drop(columns='DATE')                # the extra date column from weather
    .rename(columns={'TMAX': 'tmax'})    # lower case like the other columns
    .sort_values(['camis', 'inspection_date'])
    # Re-number the rows after sorting so that index labels match row positions
    # again; the heat-map scoring further down addresses rows by these values.
    .reset_index(drop=True)
)

# Temporal features, all relative to the same restaurant's previous inspection
visits_by_restaurant = inspecs.groupby('camis')

# days since the previous inspection
inspecs['time_since_last'] = visits_by_restaurant['inspection_date'].diff().dt.days
# did the previous inspection have a critical violation?
inspecs['past_critical'] = visits_by_restaurant['critical'].shift()
# grade and score of the previous inspection
inspecs['past_grade'] = visits_by_restaurant['grade'].shift()
inspecs['past_score'] = visits_by_restaurant['score'].shift()

# A restaurant's first inspection has no predecessor: use 0 for both features.
# Assigned back (not replaced in place on the selected column) so the change
# reaches the frame under pandas copy-on-write as well.
inspecs['time_since_last'] = inspecs['time_since_last'].fillna(0)
inspecs['past_critical'] = inspecs['past_critical'].fillna(0)

In [None]:
# Preview the first five rows, transposed so that every column shows up as a row.
inspecs.head(5).T

## Now we need to add all the heat map data

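This is going to be the longest and most challenging part. For every inspection date we build a heat map from the 311 complaints filed in the preceding window (90 days by default) and record the heat-map value at each inspected restaurant's location.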

In [None]:
def score_heatmap(in_frame, out_frame, heatmap_frame, date_key='inspection_date',
                  out_key='heat_score', window=90, s=2, bins=1000):
    """Score every row of ``in_frame`` against a heat map of recent events.

    The rows of ``in_frame`` are grouped by ``date_key``. For each date, the
    rows of ``heatmap_frame`` that ``select_by_date`` returns for the
    ``window`` days ending on that date are turned into a heat map with
    ``mk_heatmap``; the value ``value_heatmap`` reports at each row's
    (longitude, latitude) is stored in ``out_frame[out_key]`` under the row's
    index label.

    Parameters
    ----------
    in_frame : pandas.DataFrame
        Rows to score. Needs ``date_key``, ``longitude`` and ``latitude``
        columns.
    out_frame : pandas.DataFrame
        Receives the scores and is modified in place. Its index must contain
        every index label of ``in_frame``.
    heatmap_frame : pandas.DataFrame
        Events the heat maps are built from. Needs ``longitude`` and
        ``latitude`` columns plus whatever ``select_by_date`` filters on.
    date_key : str
        Column of ``in_frame`` holding the date each row is scored for.
    out_key : str
        Column of ``out_frame`` to write to; created with 0.0 if missing.
    window : int
        Length in days of the look-back period that ends on each date.
    s, bins
        Passed through to ``mk_heatmap``.

    Returns
    -------
    pandas.DataFrame
        ``out_frame``. Rows whose date has no events in the window keep their
        existing value.

    Raises
    ------
    ValueError
        If ``in_frame`` has index labels that ``out_frame`` lacks.
    """
    if not in_frame.index.isin(out_frame.index).all():
        raise ValueError('out_frame must contain every index label of in_frame')

    if out_key not in out_frame.columns:
        out_frame[out_key] = 0.0

    grouped = in_frame.groupby(date_key)

    for date, group in tqdm(grouped, total=len(grouped)):

        # look-back period that ends on the date being scored
        end_date = pd.to_datetime(date)
        start_date = end_date - pd.to_timedelta(window, unit='days')

        # events that fall inside the period
        heatmap_frame_date = select_by_date(heatmap_frame, start_date, end_date)

        # nothing happened in the period: leave the existing scores untouched
        if not heatmap_frame_date.shape[0]:
            continue

        img, extent, xedges, yedges = mk_heatmap(heatmap_frame_date.longitude.values,
                                                 heatmap_frame_date.latitude.values, s, bins=bins)

        for idx, lo, la in zip(group.index, group.longitude.values, group.latitude.values):
            # `idx` is an index *label*, so write through the label-based `.at`.
            # Positional `.iloc[idx]` lands on a different row as soon as the
            # frame has been sorted or filtered, and assigning through
            # `out_frame[out_key]` first is chained assignment, which does not
            # reach the frame under pandas copy-on-write.
            out_frame.at[idx, out_key] = value_heatmap(lo, la, xedges, yedges, img)

    return out_frame

In [None]:
# One zero-initialised score column per 311 complaint family, indexed like `inspecs`
target_frame = pd.DataFrame(
    0.0,
    index=inspecs.index,
    columns=['dsny_heat_score', 'dep_heat_score', 'dohmh_heat_score'],
)

In [None]:
# Score the inspections against each family of 311 complaints in turn; every
# pass fills its own column of `target_frame`.
for complaint_types, score_column in (
    (DSNY_311, 'dsny_heat_score'),
    (DEP_311, 'dep_heat_score'),
    (DOHMH_311, 'dohmh_heat_score'),
):
    heatmap_frame = threeoneone[threeoneone.complaint_type.isin(complaint_types)]
    target_frame = score_heatmap(inspecs, target_frame, heatmap_frame, 'inspection_date', score_column)

In [None]:
# Summary statistics for the heat-score columns of `target_frame`.
target_frame.describe()

In [None]:
# Attach the heat scores to the inspections, matching rows on the index
inspecs_scores = pd.merge(
    inspecs,
    target_frame,
    left_index=True,
    right_index=True,
)

In [None]:
# Preview the first five rows, transposed so that every column shows up as a row.
inspecs_scores.head(5).T

# Write things to a CSV !!!

In [None]:
# Save the inspections together with their heat scores; index=False keeps the
# row index out of the file.
inspecs_scores.to_csv('./data/inspecs_heat_scores.csv', index=False)