## Load dataset and dependencies.

In [9]:
# Import required dependencies for data processing.
import pandas as pd
# XGBoost builds many weak prediction trees, then combines them into a strong predictive model.
import xgboost as xbg
# Logistic regression works well when the outcome is a categorical variable (critical violation, etc.)
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV as LogisticRegression
# Random Forest will help improve predictive accuracy and limit over-fitting.
from sklearn.ensemble import RandomForestClassifier
# Support vector classifier: a classifier defined by a separating hyperplane.
from sklearn.svm import SVC

%matplotlib inline

In [10]:
# Load the master dataset from the project's datasets directory.
loc = "/mnt/c/Users/anton/Documents/Projects/Predicting_WaterlooHealthInspections/datasets/"
data = pd.read_csv("{}master_dataset.csv".format(loc))

# Columns that will not be used as features:
#   Unnamed: * -- row index left over from before the data was written out to file;
#                 ignored during analysis.
#   FACILITY_ID -- treated as the same feature as BUSINESS_NAME, so only one is needed.

# Preview the first few rows.
data.head()

Unnamed: 0.1,Unnamed: 0,FACILITY_ID,BUSINESS_NAME,ADDR,CITY,CAT_1,CAT_2,SUBCAT_1,SUBCAT_2,SUBCAT_3,INFRACTION_TYPE,RESULT,INFRACTION_TOTAL,INSPECTION_DATE,REQUIRE_REINSPECTION,CERTIFIED_FOOD_HANDLER,INSPECTION_TYPE
0,0,E2E63D72-B389-401F-B812-0048575D3A87,C'EST CHEESE PLEASE,40 GRAND AVE N,CAMBRIDGE,Food,General,Food Take Out,,,NON-CRITICAL,Corrected During Inspection,1.0,2017-05-13,N,No,Compliance Inspection
1,1,C087ED29-934D-4297-862C-00697C89075C,"WIRED UP PUGS CAFE-BISTRO, THE",93 GRAND AVE S,CAMBRIDGE,Food,General,Restaurant,,,NON-CRITICAL,Not in Compliance,2.0,2016-05-25,N,No,Compliance Inspection
2,2,C087ED29-934D-4297-862C-00697C89075C,"WIRED UP PUGS CAFE-BISTRO, THE",93 GRAND AVE S,CAMBRIDGE,Food,General,Restaurant,,,NON-CRITICAL,Not in Compliance,3.0,2017-08-23,N,No,Compliance Inspection
3,3,C087ED29-934D-4297-862C-00697C89075C,"WIRED UP PUGS CAFE-BISTRO, THE",93 GRAND AVE S,CAMBRIDGE,Food,General,Restaurant,,,NON-CRITICAL,Corrected During Inspection,1.0,2017-08-23,N,No,Compliance Inspection
4,4,C087ED29-934D-4297-862C-00697C89075C,"WIRED UP PUGS CAFE-BISTRO, THE",93 GRAND AVE S,CAMBRIDGE,Food,General,Restaurant,,,NON-CRITICAL,Not in Compliance,2.0,2017-06-28,Y,No,Compliance Inspection


## Explore the data.

In [11]:
# Basic properties of the dataset: row count, distinct businesses and infraction tallies.

# Total number of rows.
n_rows = len(data)

# Distinct establishments, identified by business name.
n_establishments = len(data["BUSINESS_NAME"].unique())

# Rows flagged as a critical infraction.
n_critical_violations = len(data[data["INFRACTION_TYPE"] == "CRITICAL"])

# Rows flagged as a non-critical infraction.
n_noncritical_violations = len(data[data["INFRACTION_TYPE"] == "NON-CRITICAL"])

# Rows with no infraction type recorded (i.e. no violation).
n_no_violations = len(data[data["INFRACTION_TYPE"].isnull()])

# Report each figure on its own line.
for template, count in (
    ("Total data rows: {}", n_rows),
    ("Unique Establishments: {}", n_establishments),
    ("Critical violations: {}", n_critical_violations),
    ("Non-Critical violations: {}", n_noncritical_violations),
    ("No violations: {}", n_no_violations),
):
    print(template.format(count))

# Observations:
#  There are more infraction rows than establishments. That is expected: a single
#  establishment can have several infractions in one inspection, and several
#  inspections over the lifetime of the dataset.
#  Ranking by row count: 1. Non-critical, 2. No violations, 3. Critical violations.

Total data rows: 16410
Unique Establishments: 2481
Critical violations: 4147
Non-Critical violations: 7126
No violations: 5137


In [12]:
# Data exploration, continued: how many distinct establishments fall into each category.
# An establishment is counted in a category when at least one of its rows matches it,
# so the three categories can overlap and the percentages need not sum to 100%.

# Establishments with at least one critical violation on record.
n_establishment_critical = len(
    data.loc[data["INFRACTION_TYPE"] == "CRITICAL", "BUSINESS_NAME"].unique()
)

# Establishments with at least one non-critical violation on record.
n_establishment_noncritical = len(
    data.loc[data["INFRACTION_TYPE"] == "NON-CRITICAL", "BUSINESS_NAME"].unique()
)

# Establishments with at least one row that carries no infraction type.
n_establishment_none = len(
    data.loc[data["INFRACTION_TYPE"].isnull(), "BUSINESS_NAME"].unique()
)

# Report the totals, each with its share of all unique establishments.
print("Unique Establishments: {}".format(n_establishments))
for template, count in (
    ("Establishments with critical violations: {} ({:.2%})", n_establishment_critical),
    ("Establishments with non-critical violations: {} ({:.2%})", n_establishment_noncritical),
    ("Establishments with no violations: {} ({:.2%})", n_establishment_none),
):
    print(template.format(count, count / n_establishments))

# Observations:
#  Over half of the businesses in the dataset had a critical violation in the past.
#  Without a timeline for violations, it is difficult to see whether critical
#  violations decreased over time (which would be a sign of progress).

Unique Establishments: 2481
Establishments with critical violations: 1357 (54.70%)
Establishments with non-critical violations: 1613 (65.01%)
Establishments with no violations: 2057 (82.91%)
