<a href="https://colab.research.google.com/github/andrew66882011/qss20_slides_activities/blob/main/activities/01_pandas_datacleaning_examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pandas as pd
import numpy as np
import re

## repeated printouts
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Read a small sample (first 500 rows) of the Chicago restaurant
# inspections data directly from the city's data portal URL
inspec_url = 'https://data.cityofchicago.org/api/views/4ijn-s7e5/rows.csv?accessType=DOWNLOAD'
inspec = pd.read_csv(inspec_url, nrows = 500)

## quick look at the dimensions, column dtypes, and non-null counts
inspec.shape
inspec.info()

(500, 17)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Inspection ID    500 non-null    int64  
 1   DBA Name         500 non-null    object 
 2   AKA Name         494 non-null    object 
 3   License #        500 non-null    int64  
 4   Facility Type    491 non-null    object 
 5   Risk             500 non-null    object 
 6   Address          500 non-null    object 
 7   City             500 non-null    object 
 8   State            500 non-null    object 
 9   Zip              500 non-null    int64  
 10  Inspection Date  500 non-null    object 
 11  Inspection Type  500 non-null    object 
 12  Results          500 non-null    object 
 13  Violations       95 non-null     object 
 14  Latitude         497 non-null    float64
 15  Longitude        497 non-null    float64
 16  Location         497 non-null    object 
dtypes: float64(2), i

## Cleaning columns and list comprehension

In [None]:
## Inspect the raw column labels and the type of object that holds them
print(inspec.columns, type(inspec.columns), sep="\n")

Index(['Inspection ID', 'DBA Name', 'AKA Name', 'License #', 'Facility Type',
       'Risk', 'Address', 'City', 'State', 'Zip', 'Inspection Date',
       'Inspection Type', 'Results', 'Violations', 'Latitude', 'Longitude',
       'Location'],
      dtype='object')
<class 'pandas.core.indexes.base.Index'>


In [None]:
## Standardize the column names so they are easy to type and work with
## attribute access (inspec.colname). For each name we want to:
## 1. remove spaces
## 2. make lowercase
## 3. remove char like # that will cause issues

## raw string so the regex escape \s reaches `re` untouched; in a plain
## string "\s" is an invalid Python escape and triggers a warning on
## newer Python versions
cleaner_cols = [re.sub(r"[\s#]", '', col.lower()) for col in inspec.columns]
print(cleaner_cols)

## assign back to main data
inspec.columns = cleaner_cols

## list comprehensions can also pull out columns with a particular
## pattern in their name -- e.g., anything with "name" in the string
name_cols = [col for col in inspec.columns if "name" in col]
print(name_cols)

['inspectionid', 'dbaname', 'akaname', 'license', 'facilitytype', 'risk', 'address', 'city', 'state', 'zip', 'inspectiondate', 'inspectiontype', 'results', 'violations', 'latitude', 'longitude', 'location']
['dbaname', 'akaname']


## Checking datatypes and recasting if needed

In [None]:
inspec.dtypes

## two major issues show up in the dtypes:
## 1. license is stored as an int, but we want to treat it as a string
##    (esp. a risk if an identifier has leading zeros, which an
##    integer type cannot keep)
## 2. inspectiondate is stored as object (strings); we want to make
##    sure it is treated as datetime

## see here for good crosswalk between pandas series 
## dtypes and numpy dtypes
## https://pbpython.com/pandas_dtypes.html

inspectionid        int64
dbaname            object
akaname            object
license             int64
facilitytype       object
risk               object
address            object
city               object
state              object
zip                 int64
inspectiondate     object
inspectiontype     object
results            object
violations         object
latitude          float64
longitude         float64
location           object
dtype: object

In [None]:
## recasting license # (in this case, i'm writing as new var)
## could also write over original var by just saving with same name
inspec['license_derived'] = inspec.license.astype(str)
inspec.dtypes

inspec[['license', 'license_derived']].head()

## then, recasting inspectiondate to be datetime (no time in this case)
## the dates are month/day/year strings (e.g., 11/23/2020); spelling out
## the format avoids day-vs-month guessing and raises an error on any
## value that does not follow the pattern
inspec['inspectiondate_derived'] = pd.to_datetime(inspec.inspectiondate,
                                                  format = "%m/%d/%Y")
inspec[[col for col in inspec.columns if "inspectiondate" in col]].head()



inspectionid         int64
dbaname             object
akaname             object
license              int64
facilitytype        object
risk                object
address             object
city                object
state               object
zip                  int64
inspectiondate      object
inspectiontype      object
results             object
violations          object
latitude           float64
longitude          float64
location            object
license_derived     object
dtype: object

Unnamed: 0,license,license_derived
0,1193207,1193207
1,2484004,2484004
2,2535924,2535924
3,2069755,2069755
4,66021,66021


Unnamed: 0,inspectiondate,inspectiondate_derived
0,12/11/2020,2020-12-11
1,11/23/2020,2020-11-23
2,11/18/2020,2020-11-18
3,11/05/2020,2020-11-05
4,11/05/2020,2020-11-05


## Creating boolean indicators

In [None]:
inspec.facilitytype.value_counts()

## want to create an indicator for whether
## the facility is food-related

### slower way - write out all the cats want to include
### (here non-exhaustive)
food_strings_manual = ['Restaurant', 'Grocery Store', 'Bakery', 'TAVERN', 
                      'COOKING SCHOOL', 'CATERING', 'Golden Diner',
                      'DELI/GROCERY', 'CULINARY SCHOOL', 
                      'FROZEN DESSERT PUSHCARTS']

### isin already returns a boolean Series (missing facility types come
### back False), so no np.where(..., True, False) wrapper is needed.
### note: isin is an exact, case-sensitive match, so 'CATERING' in the
### list does not pick up a facility recorded as 'Catering'
inspec['is_foodfac_derived'] = inspec.facilitytype.isin(food_strings_manual)

### usually good to do a check on categorization
pd.crosstab(inspec.facilitytype, inspec.is_foodfac_derived)



is_foodfac_derived,False,True
facilitytype,Unnamed: 1_level_1,Unnamed: 2_level_1
Bakery,0,6
CHARTER SCHOOL,2,0
COOKING SCHOOL,0,2
CULINARY SCHOOL,0,1
Catering,2,0
Children's Services Facility,13,0
DELI/GROCERY,0,1
Daycare (2 - 6 Years),3,0
Daycare Above and Under 2 Years,5,0
FROZEN DESSERT PUSHCARTS,0,1


In [None]:
### stricter check than eyeballing the crosstab: assert that no facility
### type among the rows flagged False is one of the strings in the food list
all_false = inspec.loc[~inspec.is_foodfac_derived].copy()
all_facility_infalse = all_false.facilitytype.unique()

### overlap between the food list and the facility types flagged False;
### an empty set means the check passes
compare_fac = set(food_strings_manual) & set(all_facility_infalse)
assert not compare_fac

In [None]:
## quicker alternative: lowercase partial patterns to use with str.contains
## (trade-off: also more likely to lead to false positives)
food_strings_partial = [
    'restaurant', 'grocery', 'bakery',
    'tavern', 'diner', 'culinary',
    'dessert', 'catering', 'cooking',
]

## "|" is regex alternation, so the joined string matches any one pattern
food_strings_join = str.join("|", food_strings_partial)
food_strings_join

'restaurant|grocery|bakery|tavern|diner|culinary|dessert|catering|cooking'

In [None]:
## flag rows whose facility type contains any of the partial patterns.
## case = False makes the regex match case-insensitive, and na = False
## marks missing facility types as False (instead of first converting
## them to the text 'nan', which a pattern could accidentally match).
## str.contains then returns a boolean Series directly, so no
## np.where(..., True, False) wrapper is needed
inspec['is_foodfac_derived_2'] = inspec.facilitytype.str.contains(food_strings_join,
                                                                  case = False,
                                                                  na = False)

inspec.is_foodfac_derived_2.value_counts()
inspec.is_foodfac_derived.value_counts()

## rows where the exact-match and partial-match indicators disagree
## (2 rows in this sample)
inspec.loc[inspec.is_foodfac_derived != inspec.is_foodfac_derived_2,
          ['facilitytype', 'is_foodfac_derived', 
          'is_foodfac_derived_2']]



True     435
False     65
Name: is_foodfac_derived_2, dtype: int64

True     433
False     67
Name: is_foodfac_derived, dtype: int64

Unnamed: 0,facilitytype,is_foodfac_derived,is_foodfac_derived_2
6,Catering,False,True
376,Catering,False,True


In [None]:

def describe_dates(dates):
    """Return the numeric-style summary of a datetime Series.

    First tries ``describe(datetime_is_numeric=True)``, which pandas 1.x
    needs to summarize datetimes numerically. pandas 2.0 removed that
    argument (the numeric-style summary became the default) and raises
    TypeError when it is passed, in which case plain ``describe()`` is used.
    """
    try:
        return dates.describe(datetime_is_numeric=True)
    except TypeError:
        return dates.describe()

## create a date-based boolean indicator
describe_dates(inspec.inspectiondate_derived)

## include everything after christmas 2019
## (.copy() so the subset is its own frame rather than a slice of inspec)
inspec_postxmas = inspec[inspec.inspectiondate_derived > '2019-12-25'].copy()

describe_dates(inspec_postxmas.inspectiondate_derived)

count                              500
mean     2019-11-17 21:04:19.199999744
min                2019-06-17 00:00:00
25%                2019-08-13 00:00:00
50%                2019-10-24 12:00:00
75%                2020-01-30 00:00:00
max                2020-12-11 00:00:00
Name: inspectiondate_derived, dtype: object

count                              168
mean     2020-04-01 00:17:08.571428608
min                2019-12-27 00:00:00
25%                2020-01-30 00:00:00
50%                2020-02-28 00:00:00
75%                2020-04-22 12:00:00
max                2020-12-11 00:00:00
Name: inspectiondate_derived, dtype: object

# Using dictionaries and map to combine categories

In [None]:
inspec.inspectiontype.value_counts()

## distinct inspection types, dropping missing values first: a missing
## value is a float NaN and would break the .lower() calls below
inspec_types = inspec.inspectiontype.dropna().unique()

## get all inspections that result from canvassing
canvass_all = [itype for itype in inspec_types if "canvass" in itype.lower()]
canvass_all

## get all inspections that result from complaints
comp_all = [itype for itype in inspec_types if "complaint" in itype.lower()]
comp_all

## create a dictionary: broad category -> list of detailed types
cat_largesmall = {'Complaint': comp_all, 
                 'Canvass': canvass_all}

cat_largesmall

## to remap, the dictionary keys need to be the values that appear in
## the inspectiontype column, so reverse what's a key and what's a
## value: detailed type -> broad category
cat_smalllarge = {detailed: broad 
                  for broad, detailed_list in cat_largesmall.items() 
                  for detailed in detailed_list}

## types that are not in the dictionary map to NaN, so fill those
## back in with the original inspection type
inspec['simplified_type_derived'] = inspec['inspectiontype'].map(cat_smalllarge).fillna(inspec['inspectiontype'])

inspec.simplified_type_derived.value_counts()

Canvass                    290
License                    104
Canvass Re-Inspection       36
Complaint                   32
License Re-Inspection       16
Complaint Re-Inspection     11
Short Form Complaint        10
Non-Inspection               1
Name: inspectiontype, dtype: int64

['Canvass', 'Canvass Re-Inspection']

['Complaint', 'Short Form Complaint', 'Complaint Re-Inspection']

{'Complaint': ['Complaint', 'Short Form Complaint', 'Complaint Re-Inspection'],
 'Canvass': ['Canvass', 'Canvass Re-Inspection']}

Canvass                  326
License                  104
Complaint                 53
License Re-Inspection     16
Non-Inspection             1
Name: simplified_type_derived, dtype: int64