In [1]:
import pandas as pd

In [2]:
# Load the cleaned CFI inspections dataset from a local pickle file.
# NOTE: unpickling can execute arbitrary code, so this file must come from a trusted source.
cfi_dataset = pd.read_pickle('datasets/cleaned_inspections.pickle')

#### Chain flags

In [3]:
# Load the fast food restaurants dataset (CSV); its 'name' column is used to detect chains
fast_foods = pd.read_csv('datasets/FastFoodRestaurants.csv')

In [4]:
# Count how often each restaurant name occurs and keep only the names listed more than once
chain_counts = fast_foods['name'].value_counts()
fast_food_chains = chain_counts.loc[chain_counts.gt(1)].index.tolist()

In [5]:
# Distinct (name, address, latitude, longitude) combinations present in the inspections
additional_chains = (
    cfi_dataset[['DBA Name', 'Address', 'Latitude', 'Longitude']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Number of distinct combinations recorded for each establishment name
grouped_by_dbaname = (
    additional_chains
    .groupby(by=['DBA Name'])
    .size()
    .reset_index(name="Count")
)

# Names with more than one distinct combination are treated as chains;
# reset_index() keeps the previous row labels in an 'index' column
establishments_repeted = (
    grouped_by_dbaname
    .loc[lambda counts: counts['Count'] > 1]
    .reset_index()
)
other_fast_food_chains = set(establishments_repeted['DBA Name'].tolist())

In [6]:
# Extend the chain names with the multi-location establishments found in the CFI dataset.
# The union is built in a single non-mutating step so the cell can be re-run safely:
# once fast_food_chains is a set, calling .extend on it would raise AttributeError.
fast_food_chains = set(fast_food_chains).union(other_fast_food_chains)

In [7]:
# Normalise the chain names: drop apostrophes and lowercase them.
# Building a set collapses names that become identical after normalisation.
fast_food_chains = {name.replace("\'", "").lower() for name in fast_food_chains}

In [8]:
# Build a one-column boolean frame ("Chain flag temp"):
# True when the establishment's normalised name (apostrophes removed, lowercased)
# is present in fast_food_chains, False otherwise.
boolean_foodchains = (
    cfi_dataset['DBA Name']
    .str.replace("\'", "")
    .str.lower()
    .isin(fast_food_chains)
    .to_frame(name="Chain flag temp")
)

In [9]:
# Copy the boolean 'Chain flag temp' column into the main CFI dataset as the new column 'Chain flag'.
# The assignment aligns on the index; boolean_foodchains was derived from cfi_dataset, so the labels match.
cfi_dataset['Chain flag'] = boolean_foodchains['Chain flag temp']

#### Yelp

In [10]:
# Load the Yelp business details from a local pickle file (only unpickle trusted files).
# NOTE(review): unlike the other inputs, this path is not under 'datasets/' - confirm the location.
yelp = pd.read_pickle('business_details.pickle')

In [11]:
# Build the merge keys on the Yelp side: lowercased AND stripped, so that they are
# normalised like the CFI keys and stray surrounding whitespace cannot prevent a match.
# Missing values stay NaN here (no cast to str).
yelp['Lowercased Address'] = yelp['location.address1'].str.lower().str.strip()
yelp['Lowercased Name'] = yelp['name'].str.lower().str.strip()

# Drop rows in which every column is missing and renumber the remaining rows from 0
yelp = yelp.dropna(how='all').reset_index(drop=True)

In [12]:
# Build normalised merge keys on the CFI side: lowercase, cast to str, then strip
# surrounding whitespace. The cast to str also turns missing values into strings.
cfi_dataset['Lowercased Name'] = cfi_dataset['DBA Name'].str.lower().astype(str).str.strip()
cfi_dataset['Lowercased Address'] = cfi_dataset['Address'].str.lower().astype(str).str.strip()

In [14]:
# Left join: keep every inspection row and attach the Yelp details whose normalised
# name and address both match.
# NOTE(review): if yelp holds several rows for one (name, address) pair, the matching
# inspection rows are repeated in the result - confirm the keys are unique.
merged_cfi_yelp = pd.merge(
    cfi_dataset,
    yelp,
    how='left',
    on=['Lowercased Name', 'Lowercased Address'],
)

In [17]:
# Persist the merged CFI + Yelp frame as a pickle (the 'pickles/' directory must already exist)
merged_cfi_yelp.to_pickle('pickles/merged_cfi_yelp')

In [19]:
# Display the column labels of the merged frame (CFI columns followed by the Yelp columns)
merged_cfi_yelp.columns

Index(['Inspection ID', 'DBA Name', 'AKA Name', 'License #', 'Facility Type',
       'Risk', 'Address', 'Inspection Date', 'Inspection Type', 'Results',
       'Violations', 'Latitude', 'Longitude', 'Location', 'Community Area',
       'Violation Numbers', 'Violation Comments', 'Chain flag',
       'Lowercased Name', 'Lowercased Address', 'alias', 'categories',
       'coordinates.latitude', 'coordinates.longitude', 'display_phone',
       'error.code', 'error.description', 'hours', 'id', 'image_url',
       'is_claimed', 'is_closed', 'location.address1', 'location.address2',
       'location.address3', 'location.city', 'location.country',
       'location.cross_streets', 'location.display_address', 'location.state',
       'location.zip_code', 'messaging.url', 'messaging.use_case_text', 'name',
       'phone', 'photos', 'price', 'rating', 'review_count', 'special_hours',
       'transactions', 'url'],
      dtype='object')