In [58]:
# Data manipulation
import numpy as np
import pandas as pd

# Ploting
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Scientific computing
import scipy as sp

# Machine Learning
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.cross_validation import KFold
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import f1_score
import random
import datetime as dt
from sklearn import tree

In [4]:
df = pd.read_csv('data/inspections_clean.csv')

In [40]:
#last 2 months
df.inspection_dt = df.inspection_dt.astype('datetime64[ns]')
last2_df = df[((df.inspection_dt.dt.month==9) | (df.inspection_dt.dt.month==10)) & (df.inspection_dt.dt.year==2016)]

In [52]:
last2_df.inspection_dt.groupby(last2_df.inspection_dt.dt.day).count()

inspection_dt
1      57
2      64
3       2
4      63
5      51
6     111
7      68
8      71
9      59
11     47
12      2
13    126
14    109
15     62
16     70
17     33
18     51
19     77
20     96
21     71
22     58
23     49
24     33
25     44
26     71
27    106
28     61
29     56
30     59
31     13
Name: inspection_dt, dtype: int64

In [54]:
#assume 30 /day since that's the average
last2_df.shape[0]/60

30

In [55]:
last2_df.columns

Index([u'Unnamed: 0', u'License #', u'risk_description', u'Zip',
       u'inspection_date_string', u'Y_description', u'Latitude', u'Longitude',
       u'Y', u'Y_fail', u'reinspection?', u'recent_inspection?',
       u'task_force?', u'special_event?', u'canvass?', u'closeup?', u'liquor?',
       u'fire?', u'child?', u'no_entry?', u'complaint?', u'license?', u'risk',
       u'inspection_type', u'inspection_dt', u'prev_fail',
       u'cumulative_failures', u'ever_failed', u'cumulative_inspections',
       u'proportion_past_failures', u'days_since_last_inspection'],
      dtype='object')

In [95]:
y = pd.DataFrame()
y["License #"] = last2_df["License #"]
y["Y_fail"] = last2_df["Y_fail"]
y["inspection_dt"] = last2_df["inspection_dt"].astype('datetime64[ns]')


y_hat = pd.DataFrame()
y_hat["License #"] = last2_df["License #"]
#fake predictions (random)
y_hat["Y_fail"] = np.random.randint(0,2,size=y.shape[0]) #random pass/fail
#y_hat["inspection_dt"] = last2_df["inspection_dt"].astype('datetime64[ns]')



#now for the sake of naieve testing, we'll group the fails and assign 30/day

dates = []

for o in range(736208,736269): #ordinals for 9/1 to 10/31
    dates.append(dt.date.fromordinal(o))

y_hat = y_hat.sort(['Y_fail'], ascending=[0])
#y_hat["inspection_dt"] = y_hat["inspection_dt"].astype('datetime64[ns]')

j=0
k=0
for i,item in y_hat.iterrows():
    #print i, item
    #item["inspection_dt"] = dates[k]
    #y_hat.loc[y_hat.iloc[i], 'inspection_dt'] = dates[k]
    y_hat.ix[i, "inspection_dt"] = dates[k]
    
    j = j+1
    
    if j > 30: #max allowed per day
        j = 1 #reset the counter

        if k < len(dates) - 1:
            k = k + 1 #move to next day

In [115]:
#change its type to match
y_hat["inspection_dt"] = y_hat["inspection_dt"].astype('datetime64[ns]')

In [208]:
def score(predicted, actual):
    #iterate each actual
    deltas = []
    
    actual_fails = actual[actual["Y_fail"] == 1]
    
    for i,item in actual_fails.iterrows(): 
        #find the corresponding predicted and calculate a date delta for fails only
        predicted_dt = predicted[predicted.index == item.name]["inspection_dt"]
        diff = (item.inspection_dt - predicted_dt).dt.days
        diff = diff.reset_index()
        deltas.append(diff["inspection_dt"].astype('int'))
    #avg them
    s = np.mean(scores)
    return s
    
score(y_hat, y)

-4.4830917874396139

This means that this (random) model caught failures 4.4 days better than the actual inspection process, on average.

In [214]:
#Another way to measure is the proportion of actual 
#failures caught in the first 30 days
def score2(predicted, actual):
    actual_fails = actual[actual["Y_fail"] == 1]
    
    predicted_first_month = predicted[predicted.inspection_dt.dt.month == 9]
    
    overlap = actual_fails[actual_fails["License #"].isin(predicted_first_month["License #"])]["License #"].count()

    return overlap/float(actual_fails.shape[0])

score2(y_hat, y)

0.70289855072463769

This is saying that this model caught 70% of the failures in the first month