# Gathering all the data, then cleaning it into a master dataset.

In [1]:
# Import the needed libraries
import itertools
import os
from datetime import datetime as dt

import numpy as np
import pandas as pd


In [2]:
# Load CSV data sets into a dataframe

# Directory holding the raw CSV exports. The original machine-specific path is
# kept as the default; set WATERLOO_INSPECTIONS_DATA_DIR to run the notebook
# from another machine. os.path.join(..., "") guarantees the trailing path
# separator that the `loc + filename` concatenations rely on.
loc = os.path.join(
    os.environ.get(
        "WATERLOO_INSPECTIONS_DATA_DIR",
        "/mnt/c/Users/anton/Documents/Projects/Predicting_WaterlooHealthInspections/datasets/",
    ),
    "",
)

raw_dataset_0 = pd.read_csv(loc + 'Facilities_OpenData.csv')   # facilities
raw_dataset_1 = pd.read_csv(loc + 'Infractions_OpenData.csv')  # infractions
raw_dataset_2 = pd.read_csv(loc + 'Inspections_OpenData.csv')  # inspections

In [3]:
# Parse date strings into date objects

def parse_date1(date):
    """Parse a 'YYYY-MM-DD' string into a ``datetime.date``.

    Missing values -- an empty string, ``None``, or the NaN that
    ``pd.read_csv`` produces for blank cells -- are returned as ``None``.

    Raises
    ------
    ValueError
        If a non-missing value does not match the '%Y-%m-%d' format.
    """
    # NaN is a float, so the '' comparison alone would let it reach strptime
    # and fail with a TypeError; check for missing values first.
    if pd.isna(date) or date == '':
        return None
    return dt.strptime(date, '%Y-%m-%d').date()

# two different date formats present
def parse_date2(date):
    """Parse a 'YYYY/MM/DD' string into a ``datetime.date``.

    Missing values -- an empty string, ``None``, or the NaN that
    ``pd.read_csv`` produces for blank cells -- are returned as ``None``.

    Raises
    ------
    ValueError
        If a non-missing value does not match the '%Y/%m/%d' format.
    """
    # NaN is a float, so the '' comparison alone would let it reach strptime
    # and fail with a TypeError; check for missing values first.
    if pd.isna(date) or date == '':
        return None
    return dt.strptime(date, '%Y/%m/%d').date()


# Replace the raw date strings with parsed values; the infractions and
# inspections files each go through their own parser.
raw_dataset_1["InspectionDate"] = raw_dataset_1["InspectionDate"].apply(parse_date1)
raw_dataset_2["INSPECTION_DATE"] = raw_dataset_2["INSPECTION_DATE"].apply(parse_date2)


In [4]:
# Get relevant columns per raw dataset

facilities_col_req = ["FACILITYID", "BUSINESS_NAME", "ADDR", "CITY", "CATEGORY", "SUBCATEGORY"]
infractions_col_req = ["INSPECTION_ID", "INFRACTION_TYPE", "Result", "InspectionDate"] #certain fields ignored, no value or inability to parse in a useful manner
inspections_col_req = ["INSPECTION_ID", "FACILITYID", "INSPECTION_DATE", "REQUIRE_REINSPECTION", "CERTIFIED_FOOD_HANDLER", "INSPECTION_TYPE"]

# Create new dataframe with required columns
# .copy() makes each selection an independent frame, so columns can be added
# to or dropped from it later without pandas' SettingWithCopyWarning and
# without any ambiguity about whether the raw frame is affected.

data_facilities = raw_dataset_0[facilities_col_req].copy()
data_infractions = raw_dataset_1[infractions_col_req].copy()
data_inspections = raw_dataset_2[inspections_col_req].copy()

In [5]:
# Clean-up the new dataframes by either splitting columns or merging multiple rows

# Facilities dataset combines several attributes into one column, split the attributes
def facilities_column_split(facilities):
    """Split the combined CATEGORY / SUBCATEGORY columns into separate columns.

    CATEGORY is split on the first "," into CAT_1 and CAT_2; SUBCATEGORY is
    split on the first two "/" into SUBCAT_1, SUBCAT_2 and SUBCAT_3. Parts
    that a row does not have are left missing. The two source columns are
    dropped from the result.

    Parameters
    ----------
    facilities : pandas.DataFrame
        Frame containing the string columns "CATEGORY" and "SUBCATEGORY".

    Returns
    -------
    pandas.DataFrame
        A new frame with the five split columns appended; the input frame is
        left unmodified.
    """
    # Work on a copy so the caller's frame (often a column selection of the
    # raw data) is never mutated.
    result = facilities.copy()

    split_specs = (
        ("CATEGORY", ",", ["CAT_1", "CAT_2"]),
        ("SUBCATEGORY", "/", ["SUBCAT_1", "SUBCAT_2", "SUBCAT_3"]),
    )
    for source, separator, targets in split_specs:
        # expand=True returns one column per part. reindex pads the frame to
        # the expected width, so the assignment below still works when no row
        # contains the maximum number of parts.
        parts = (
            result[source]
            .str.split(separator, n=len(targets) - 1, expand=True)
            .reindex(columns=range(len(targets)))
        )
        for position, target in enumerate(targets):
            result[target] = parts[position]

    # drop the old columns
    return result.drop(columns=["CATEGORY", "SUBCATEGORY"])

def infractions_aggregate_rows(df):
    """Collapse identical infraction rows into one row carrying a count.

    Rows are considered identical when they agree on INSPECTION_ID,
    INFRACTION_TYPE, Result and InspectionDate. The first occurrence of each
    combination is kept and a new INFRACTION_TOTAL column holds how many rows
    shared that combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four key columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with duplicates removed and
        the INFRACTION_TOTAL column added.
    """
    keys = ["INSPECTION_ID", "INFRACTION_TYPE", "Result", "InspectionDate"]

    # Count rows per key combination via a temporary counter column.
    # dropna=False keeps combinations with a missing key value in the count;
    # by default groupby excludes them, which would leave their total as NaN
    # even though drop_duplicates below still collapses those rows.
    totals = (
        df.assign(InfractionCount=1)
        .groupby(keys, dropna=False, sort=False)["InfractionCount"]
        .transform('sum')
    )

    # keep only the first occurrence of each duplicated combination
    return df.assign(INFRACTION_TOTAL=totals).drop_duplicates(subset=keys, keep='first')

# Apply changes

# Rebind each name to the frame returned by its clean-up helper.
data_facilities = facilities_column_split(data_facilities)
data_infractions = infractions_aggregate_rows(data_infractions)

In [8]:
# Merge the three datasets into one master set


# merge helper
def merge_frames(df1, df2, merge_key):
    """Outer-join ``df1`` and ``df2`` on ``merge_key``.

    Rows without a match in the other frame are kept, with the columns from
    the other frame left missing.
    """
    return df1.merge(df2, how='outer', on=merge_key)

# Join the infractions to their inspections on INSPECTION_ID, then attach the
# facility details on FACILITYID to complete the merge process.
data_master = merge_frames(
    merge_frames(data_infractions, data_inspections, merge_key="INSPECTION_ID"),
    data_facilities,
    merge_key="FACILITYID",
)


In [9]:
# Clean up the master data set.

# Change input to upper case.
def upper_case(data):
    """Return ``data`` upper-cased when it is exactly a ``str``; any other value is returned unchanged."""
    return data.upper() if type(data) is str else data

#  upper-case the text columns,
#  drop irrelevant columns,
#  rename and reorder columns
def cleanup_master(df):
    """Return a tidied copy of the merged master frame.

    Steps:
      * upper-case BUSINESS_NAME, ADDR and CITY (CITY contains duplicates
        that differ only by case),
      * drop INSPECTION_ID and InspectionDate (the latter duplicates
        INSPECTION_DATE),
      * rename Result -> RESULT and FACILITYID -> FACILITY_ID,
      * reorder the columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged frame containing every column referenced above and in
        ``col_order`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    cleaned = df.copy()

    # Normalise case; non-string values (e.g. missing data) pass through
    # upper_case unchanged.
    for column in ("BUSINESS_NAME", "ADDR", "CITY"):
        cleaned[column] = cleaned[column].apply(upper_case)

    # Drop irrelevant data columns.
    cleaned = cleaned.drop(columns=["INSPECTION_ID", "InspectionDate"])

    # Rename columns for consistency. Not essential, but personal preference.
    # index=str is retained from the original code: it converts the row
    # labels to strings, so the returned index is unchanged by this rewrite.
    renaming_seq = {"Result": "RESULT", "FACILITYID": "FACILITY_ID"}
    cleaned = cleaned.rename(index=str, columns=renaming_seq)

    # Reorder the columns, also personal preference.
    col_order = ['FACILITY_ID', 'BUSINESS_NAME', 'ADDR', 'CITY', 'CAT_1', 'CAT_2', 'SUBCAT_1', 'SUBCAT_2', 'SUBCAT_3',
                 'INFRACTION_TYPE', 'RESULT', 'INFRACTION_TOTAL', 'INSPECTION_DATE', 'REQUIRE_REINSPECTION',
                 'CERTIFIED_FOOD_HANDLER', 'INSPECTION_TYPE'
                ]
    return cleaned[col_order]

# clean up the master data set
# (data_master is rebound to the frame returned by cleanup_master)
data_master = cleanup_master(data_master)


In [10]:
# Write data_master to disk
# NOTE(review): to_csv also writes the row index by default, which comes back
# as an unnamed first column when the file is read again -- confirm whether
# downstream readers rely on it before passing index=False.
data_master.to_csv(loc + "master_dataset.csv")

In [11]:
# Preview the first rows of the master data set
data_master.head()

Unnamed: 0,FACILITY_ID,BUSINESS_NAME,ADDR,CITY,CAT_1,CAT_2,SUBCAT_1,SUBCAT_2,SUBCAT_3,INFRACTION_TYPE,RESULT,INFRACTION_TOTAL,INSPECTION_DATE,REQUIRE_REINSPECTION,CERTIFIED_FOOD_HANDLER,INSPECTION_TYPE
0,E2E63D72-B389-401F-B812-0048575D3A87,C'EST CHEESE PLEASE,40 GRAND AVE N,CAMBRIDGE,Food,General,Food Take Out,,,NON-CRITICAL,Corrected During Inspection,1.0,2017-05-13,N,No,Compliance Inspection
1,C087ED29-934D-4297-862C-00697C89075C,"WIRED UP PUGS CAFE-BISTRO, THE",93 GRAND AVE S,CAMBRIDGE,Food,General,Restaurant,,,NON-CRITICAL,Not in Compliance,2.0,2016-05-25,N,No,Compliance Inspection
2,C087ED29-934D-4297-862C-00697C89075C,"WIRED UP PUGS CAFE-BISTRO, THE",93 GRAND AVE S,CAMBRIDGE,Food,General,Restaurant,,,NON-CRITICAL,Not in Compliance,3.0,2017-08-23,N,No,Compliance Inspection
3,C087ED29-934D-4297-862C-00697C89075C,"WIRED UP PUGS CAFE-BISTRO, THE",93 GRAND AVE S,CAMBRIDGE,Food,General,Restaurant,,,NON-CRITICAL,Corrected During Inspection,1.0,2017-08-23,N,No,Compliance Inspection
4,C087ED29-934D-4297-862C-00697C89075C,"WIRED UP PUGS CAFE-BISTRO, THE",93 GRAND AVE S,CAMBRIDGE,Food,General,Restaurant,,,NON-CRITICAL,Not in Compliance,2.0,2017-06-28,Y,No,Compliance Inspection
