In [27]:
%matplotlib notebook
import pandas as pd
import numpy as np
from utilities import mk_heatmap, value_heatmap, select_by_date, update_grade
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn import preprocessing

In [2]:
# Load all the data
inspecs = pd.read_csv('./data/inspections.csv')
threeoneone = pd.read_csv('./data/311.csv')
nypd = pd.read_csv('./data/nypd.csv')

In [None]:
# 

In [3]:
# Replace the missing lats and lons with nan
inspecs['latitude'].replace(0, np.nan, inplace=True)
inspecs['longitude'].replace(0, np.nan, inplace=True)
threeoneone['latitude'].replace(0, np.nan, inplace=True)
inspecs['longitude'].replace(0, np.nan, inplace=True)
nypd['latitude'].replace(0, np.nan, inplace=True)
inspecs['longitude'].replace(0, np.nan, inplace=True)

In [4]:
# drop rows that don't have a lat and lon attached
inspecs.dropna(subset = ['latitude', 'longitude'], inplace=True)
threeoneone.dropna(subset = ['latitude', 'longitude'], inplace=True)
nypd.dropna(subset = ['latitude', 'longitude'], inplace=True)

In [5]:
# Do date things
inspecs.inspection_date = pd.to_datetime(inspecs.inspection_date)
threeoneone.created_date = pd.to_datetime(threeoneone.created_date)
nypd.complaint_date = pd.to_datetime(nypd.complaint_date)


# make a column to store which day of the week it is... 
# Remember that Monday is 0 Sunday = 6
inspecs['weekday'] = inspecs.inspection_date.dt.weekday
threeoneone['weekday'] = threeoneone.created_date.dt.weekday
nypd['weekday'] = nypd.complaint_date.dt.weekday


start_date = '2016-01-01'
end_date = '2016-12-31'

inspecs_2016 = select_by_date(inspecs, start_date, end_date)
threeoneone_2016 = select_by_date(threeoneone, start_date, end_date)
nypd_2016 = select_by_date(nypd, start_date, end_date)

In [6]:
# clean up the grades in the inspections
inspecs.grade = inspecs.apply(update_grade, axis=1)

In [7]:
# add the 311 and nypd scores to the inspections frame

In [17]:
inspecs_2016['311_score'] = 0
inspecs_2016['nypd_score'] = 0

for month in range(1, 13):
    df_311_month = threeoneone_2016[threeoneone['created_date'].dt.month == month]
    df_nypd_month = nypd_2016[nypd_2016['complaint_date'].dt.month == month]
    df_inspec_month = inspecs_2016[inspecs_2016['inspection_date'].dt.month == month]
    
    if not df_nypd_month.shape[0]:
        continue
    elif not df_311_month.shape[0]:
        continue
    
    # build the heataps for the 311 and nypd
    s = 2 # sigmas
    bins = 1000
    img_311, extent_311, xedges_311, yedges_311 = mk_heatmap(df_311_month.longitude, 
                                                             df_311_month.latitude, s, bins=bins)
    img_nypd, extent_nypd, xedges_nypd, yedges_nypd = mk_heatmap(df_nypd_month.longitude, 
                                                             df_nypd_month.latitude, s, bins=bins)
    
    for i, row in df_inspec_month.iterrows():
        inspecs_2016.loc[i, '311_score'] = value_heatmap(row.longitude, 
                                                          row.latitude, 
                                                          xedges_311, 
                                                          yedges_311, 
                                                          img_311)
        inspecs_2016.loc[i, 'nypd_score'] = value_heatmap(row.longitude, 
                                                          row.latitude, 
                                                          xedges_nypd, 
                                                          yedges_nypd, 
                                                          img_nypd)

    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  """


In [18]:
inspecs_2016.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27666 entries, 0 to 28179
Data columns (total 18 columns):
camis                    27666 non-null int64
dba                      27666 non-null object
boro                     27666 non-null object
zipcode                  27666 non-null float64
cuisine_description      27666 non-null object
inspection_date          27666 non-null datetime64[ns]
action                   27666 non-null object
violation_code           27343 non-null object
violation_description    27321 non-null object
critical_flag            27321 non-null object
score                    26295 non-null float64
grade                    14021 non-null object
inspection_type          27666 non-null object
latitude                 27666 non-null float64
longitude                27666 non-null float64
weekday                  27666 non-null int64
311_score                27666 non-null float64
nypd_score               27666 non-null float64
dtypes: datetime64[ns](1), float6

In [50]:
# update the null values in critical flag to 'N'
inspecs_2016.critical_flag.replace(np.nan, 'N', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [51]:
# convert some of the catagorical data into numerical data
enc = preprocessing.OrdinalEncoder()

boros = inspecs_2016.boro.unique()

enc.fit(boros.reshape(-1,1))

inspecs_2016['boro_enc'] = enc.transform(inspecs_2016.boro.values.reshape(-1, 1)).flatten()

crit_flags = inspecs_2016.critical_flag.unique()

enc.fit(crit_flags.reshape(-1,1))

inspecs_2016['crit_flag_enc'] = enc.transform(inspecs_2016.critical_flag.values.reshape(-1, 1)).flatten()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [52]:
inspecs_2016.to_csv('./data/inspecs_2016_map.csv', index=False)

In [48]:
inspecs_2016[inspecs_2016.critical_flag.isnull()].iloc[0]

camis                                                           50000590
dba                                                            CAFE BIBA
boro                                                            Brooklyn
zipcode                                                            11249
cuisine_description                                      Café/Coffee/Tea
inspection_date                                      2016-01-28 00:00:00
action                   Violations were cited in the following area(s).
violation_code                                                       NaN
violation_description                                                NaN
critical_flag                                                        NaN
score                                                                  0
grade                                                                  A
inspection_type                    Cycle Inspection / Initial Inspection
latitude                                           

In [19]:
df = nypd_2016

In [21]:
month_hist = df.complaint_date.dt.month
month_hist = month_hist.value_counts().sort_index()
month_hist.index = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
plt.bar(month_hist.index, month_hist.values)
plt.xlabel("Month")
plt.ylabel("Count of Inspections")

ValueError: Length mismatch: Expected axis has 8 elements, new values have 12 elements

In [22]:
month_hist

1    35990
2    34738
3    39777
4    39665
5    41993
6    41944
7    41824
8    21562
Name: complaint_date, dtype: int64

In [None]:
day_hist = pd.to_datetime(df["inspection_date"]).dt.weekday
day_hist = day_hist.value_counts().sort_index()
day_hist.index = ["Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun"]
plt.bar(day_hist.index, day_hist.values)
plt.xlabel("Day of Week")
plt.ylabel("Count of Inspections")

In [None]:
df[df.inspection_date.dt.month == 1][['latitude', 'longitude']]

In [None]:
score_heatmap_month(y=40.597078, x=-73.941255, df=df, month=0)

In [None]:
fig, ax = plt.subplots(1,1)

x = df.longitude[df.inspection_date.dt.month == 1]
y = df.latitude[df.inspection_date.dt.month == 1]

s = 2

img, extent, xedges, yedges = mk_heatmap(x, y, s, bins=1000)
ax.imshow(img, extent=extent, origin='lower', cmap=cm.PuRd)
ax.set_title("NYPD  $\sigma$ = %d" % s)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')