# Exploratory Data Analysis

In this EDA, I will be examining various data sources from [opendata.vancouver.ca](https://opendata.vancouver.ca/pages/home/).

## Imports

In [528]:
import numpy as np
import pandas as pd
import re
import requests
import os
import time
import pickle

from matplotlib import pyplot as plt
from pyspark.sql import SparkSession

In [38]:
from pyspark.sql.functions import coalesce, max

In [264]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import array, col, count, mean, sum, udf, when, last, first, regexp_extract
from pyspark.sql.types import DoubleType, IntegerType, StringType, Row
from pyspark.sql.functions import sum, col, udf
from pyspark.sql.types import *

## Reading in the main dataset

In [4]:
spark = SparkSession.builder.appName('Ops').getOrCreate()

In [5]:
# allows for cleaner output
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

> This creates a Spark DataFrame which is very similar to a Pandas DataFrame. Spark DataFrames build on top of RDDs and also has great SQL integration for users who are familiar with the popular language. The main difference with Spark DataFrames is that operations can run on multiple nodes and they are lazy evaluators meaning operations won't be executed until the output is needed.

In [6]:
licences_df = (spark.read
          .option('header','true')
          .option('inferSchema','true')
          .option('sep', ';')
          .csv('data/business-licences-2.csv'))

In [7]:
f'There are %i observations in this dataset' % licences_df.count() 

'There are 504842 observations in this dataset'

In [8]:
licences_df.printSchema()

root
 |-- FOLDERYEAR: integer (nullable = true)
 |-- LicenceRSN: integer (nullable = true)
 |-- LicenceNumber: string (nullable = true)
 |-- LicenceRevisionNumber: integer (nullable = true)
 |-- BusinessName: string (nullable = true)
 |-- BusinessTradeName: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- IssuedDate: timestamp (nullable = true)
 |-- ExpiredDate: timestamp (nullable = true)
 |-- BusinessType: string (nullable = true)
 |-- BusinessSubType: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- House: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Province: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- PostalCode: string (nullable = true)
 |-- LocalArea: string (nullable = true)
 |-- NumberofEmployees: double (nullable = true)
 |-- FeePaid: integer (nullable = true)
 |-- ExtractDate: timestamp (nullable = true)
 |-- Geom: st

In [196]:
# show the first 10 rows 
licences_df.limit(10)

FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,BusinessSubType,Unit,UnitType,House,Street,City,Province,Country,PostalCode,LocalArea,NumberofEmployees,FeePaid,ExtractDate,Geom
15,2345113,15-115406,0,(Shun Chen),ACC Security Syst...,Gone Out of Business,,,Electrical-Securi...,,,,,,Surrey,BC,CA,,,2.0,,2019-07-21 13:48:02,
15,2345119,15-115412,0,Praetorian Securi...,Amax Praetorian,Issued,2014-12-01 14:04:29,2015-12-31 00:00:00,Electrical-Securi...,,,,,,Chilliwack,BC,CA,,,0.0,133.0,2019-07-21 13:48:02,
15,2345120,15-115413,0,Safe-Tec Security...,,Gone Out of Business,,,Electrical-Securi...,,,,,,Surrey,BC,CA,,,0.0,,2019-07-21 13:48:02,
15,2345121,15-115414,0,Husky Alarm Integ...,,Issued,2015-02-10 08:05:11,2015-12-31 00:00:00,Electrical-Securi...,,,,,,Port Moody,BC,CA,,,2.0,173.0,2019-07-21 13:48:02,
15,2345122,15-115415,0,Sea to Sky Securi...,,Pending,,,Electrical-Securi...,,,,,,North Vancouver,BC,CA,,,3.0,,2019-07-21 13:48:02,
15,2345188,15-115481,0,(Kenneth Bradley),Ken Bradley Truck...,Issued,2014-11-18 11:35:48,2015-12-31 00:00:00,Equipment Operator,,,,,,Coquitlam,BC,CA,,,0.0,162.0,2019-07-21 13:48:02,
15,2345191,15-115484,0,Litz Crane Servic...,,Issued,2014-11-06 14:31:00,2015-12-31 00:00:00,Equipment Operator,,,,,,Port Coquitlam,BC,CA,,,5.0,162.0,2019-07-21 13:48:02,
15,2345195,15-115488,0,(Wayne Nichols),W Nichols Trucking,Issued,2014-11-05 12:50:58,2015-12-31 00:00:00,Equipment Operator,,,,,,Delta,BC,CA,,,1.0,162.0,2019-07-21 13:48:02,
15,2345196,15-115489,0,(Harry Powar),Harry Powar Trucking,Issued,2014-12-04 18:49:08,2015-12-31 00:00:00,Equipment Operator,,,,,,Burnaby,BC,CA,,,1.0,162.0,2019-07-21 13:48:02,
15,2345203,15-115496,0,J Brown Trucking Inc,,Issued,2014-12-06 09:23:32,2015-12-31 00:00:00,Equipment Operator,,,,,,Delta,BC,CA,,,2.0,162.0,2019-07-21 13:48:02,


In [197]:
licences_df.groupBy('FOLDERYEAR').count()

FOLDERYEAR,count
13,60915
16,61394
20,64003
19,70771
15,60938
17,60060
14,60581
18,66180


## Initial Wrangling 

In [10]:
business_types = licences_df.select('BusinessType').distinct().collect()
business_list = [business_types[i].BusinessType for i in range(len(business_types))]

r = re.compile(".*[Ff]ood.*|.*[Rr]estaurant.*|.*[Ll]iquor.*")
relevant_list = list(filter(r.match, business_list)) 
print(relevant_list)

['Manufacturer - Food with Anc. Retail', 'Ltd Service Food Establishment', 'Temp Liquor Licence Amendment', 'Liquor Delivery Services', 'Liquor Establishment Standard', 'Retail Dealer - Food', 'Restaurant Class 2', 'Food Processing', 'Liquor Establishment Extended', 'Wholesale Dealer - Food with Anc. Retail', 'Restaurant Class 1', 'Liquor License Application', 'Warehouse Operator - Food', 'Manufacturer - Food', 'Liquor Retail Store', 'Wholesale Dealer - Food']


> These are the list of business types that have the word food, restaurant, or liquor.

In [11]:
rest_1 = (licences_df.BusinessType == 'Restaurant Class 1')
rest_2 = (licences_df.BusinessType == 'Restaurant Class 2')
rest_3 = (licences_df.BusinessType == 'Ltd Service Food Establishment')
rest_4 = (licences_df.BusinessType == 'Temp Liquor Licence Amendment') & (licences_df.BusinessSubType == 'Area Extension')
rest_5 = (licences_df.BusinessType == 'Liquor Establishment Standard') & (licences_df.BusinessSubType == 'Class 1  0-65 Seats')
rest_6 = (licences_df.BusinessType == 'Liquor Establishment Extended') & (licences_df.BusinessSubType == 'Class 1  0-65 Seats')


licences_rest_df = licences_df.filter(rest_1 | rest_2 | rest_3 | rest_4 | rest_5 | rest_6)

In [12]:
licences_rest_df.groupBy("Status").count()

Status,count
Cancelled,604
Gone Out of Business,2914
Issued,24579
Inactive,662
Pending,1420


In [13]:
f'There are %i businesses with missing name values' % licences_rest_df.filter('BusinessTradeName is null').select('BusinessName').distinct().count()

'There are 955 businesses with missing name values'

> I will replace these entries with their business names. 

In [14]:
licences_rest_df = licences_rest_df.withColumn('BusinessTradeName', coalesce('BusinessTradeName', 'BusinessName'))

In [15]:
licences_rest_df = licences_rest_df.dropna(subset = ['BusinessName'])

In [16]:
licences_rest_df.count()

30148

In [17]:
# Search for names using regex

def search_restaurant(search_term, n = 20):
    rows = licences_rest_df.BusinessTradeName.rlike('.*{}.*'.format(search_term))
    return licences_rest_df.filter(rows).limit(n)

In [24]:
search_restaurant('Shizen', 5)

FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,BusinessSubType,Unit,UnitType,House,Street,City,Province,Country,PostalCode,LocalArea,NumberofEmployees,FeePaid,ExtractDate,Geom
16,2603912,16-145075,0,Nagomi Project Ltd,Shizen Ya,Issued,2015-11-05 11:26:12,2016-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,1102,W BROADWAY,Vancouver,BC,CA,V6H 1G5,Fairview,10.0,1042,2019-07-21 13:49:20,"""{""""type"""": """"Poi..."
17,2819772,17-143449,0,Nagomi Project Ltd,Shizen Ya,Issued,2016-11-29 09:10:08,2017-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,1102,W BROADWAY,Vancouver,BC,CA,V6H 1G5,Fairview,10.0,1063,2020-01-01 02:32:04,"""{""""type"""": """"Poi..."
14,2157871,14-145460,0,Nagomi Project Ltd,Shizen Ya,Issued,2013-12-13 10:56:37,2014-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,1102,W BROADWAY,Vancouver,BC,CA,V6H 1G5,Fairview,8.0,999,2019-07-21 13:49:12,"""{""""type"""": """"Poi..."
15,2376219,15-146481,0,Nagomi Project Ltd,Natural Food Shiz...,Issued,2014-10-31 16:52:01,2015-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,985,HORNBY ST,Vancouver,BC,CA,V6Z 1V3,Downtown,10.0,878,2019-07-21 13:49:16,"""{""""type"""": """"Poi..."
14,2158834,14-146423,0,Nagomi Project Ltd,Natural Food Shiz...,Issued,2013-12-10 22:39:49,2014-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,985,HORNBY ST,Vancouver,BC,CA,V6Z 1V3,Downtown,7.0,860,2019-07-21 13:49:12,"""{""""type"""": """"Poi..."


In [106]:
# number of missing values of each column
licences_rest_df.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in licences_rest_df.columns))

FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,BusinessSubType,Unit,UnitType,House,Street,City,Province,Country,PostalCode,LocalArea,NumberofEmployees,FeePaid,ExtractDate,Geom
0,0,0,0,0,0,0,3261,3260,0,12929,25242,25238,17,17,0,0,2,243,87,0,2537,0,831


In [751]:
# use this table to populate the features from yelp

rest_distinct_df = licences_rest_df.select(['FOLDERYEAR', 'Status', 
                                            'BusinessTradeName', 'Geom', 
                                            'PostalCode', 'Unit', 'House', 
                                            'Street', 'City']).distinct()

In [736]:
f'There are %i restaurant locations' % licences_rest_df.select(['Geom', 'PostalCode', 'Unit', 
                                                                'House', 'Street', 'City']).distinct().count()

'There are 4504 restaurant locations'

In [733]:
f'There are %i closed/open restaurants locations' % licences_rest_df.select(['BusinessTradeName', 'Geom', 'PostalCode', 
                                                                             'Unit', 'House', 'Street', 'City']).distinct().count()

# this includes restuarants that changed locations

'There are 6637 closed/open restaurants locations'

In [441]:
rest_distinct_df.filter('House is null')\
                .filter('Street is null')\
                .select('BusinessTradeName')\
                .distinct()

BusinessTradeName
Fresh Counter & C...
Siga-Siga's Chine...
Siga-Siga's Filip...
Panz Veggie
Craft Beer Market...
Fiery Pizza


> The Yelp API requires an address of these restaurants and 6 restaurants do not have any address. Since this is an insignificant proportion of the data, I will simply remove them.

In [752]:
rest_distinct_df = rest_distinct_df.dropna(how = 'all', subset = ['House', 'Street'])

> Problem: can't get information for old closed restuarants 
> - initial thoughts: have to find an external API that has archived webpages
> - maybe a solution: businesses/search doesn't give closed restuarants but businesses/matches does (have to have exact address) and then get the ID from this output and input into businesses/{id}

In [753]:
# Here I am ordering the status such that if in a single year, there are multiple statuses, I will take the greatest

rest_distinct_df = rest_distinct_df.withColumn('Status', 
                            when(col('Status') == 'Inactive', 0)
                            .when(col('Status') == 'Cancelled', 1)
                            .when(col('Status') == 'Gone Out of Business', 2)
                            .when(col('Status') == 'Pending', 3)
                            .when(col('Status') == 'Issued', 4))

In [754]:
# This will get the latest location of a restaurant
# if in a single year, the restuarant is both cancelled and gone out of businesss, aggregate into 1 category
# if in a single year, the restuarant is both cancelled and issued, aggregate into 1 category - this means the restuarant moved locations
latest_year = rest_distinct_df.sort(['FOLDERYEAR', 'Status'])\
                              .groupBy('BusinessTradeName')\
                              .agg(last('Status').alias('Status'), 
                                   last('FOLDERYEAR').alias('FOLDERYEAR'))

rest_distinct_df = rest_distinct_df.join(latest_year, on = ['Status', 'FOLDERYEAR', 'BusinessTradeName'], how = 'leftsemi')

In [445]:
rest_distinct_df.count()

5740

In [446]:
# create new columns for longitude and latitude
rest_distinct_df = rest_distinct_df.withColumn('longitude', regexp_extract(col('Geom'), '\[(.+), (.+)\]', 1).cast('float'))\
                                   .withColumn('latitude', regexp_extract(col('Geom'), '\[(.+), (.+)\]', 2).cast('float'))\
                                   .drop('Geom')

In [447]:
rest_distinct_df.limit(5)

Status,FOLDERYEAR,BusinessTradeName,PostalCode,Unit,House,Street,City,longitude,latitude
1,17,George Simnos & J...,V5V 4E9,,4153,FRASER ST,Vancouver,-123.09054,49.248013
2,15,Sodexo @Vancouver...,V5T 4V5,,1120,E 7TH AV,Vancouver,-123.08036,49.2633
2,16,Cilantro Cafe,V6Z 2L2,110.0,840,HOWE ST,Vancouver,-123.121574,49.281162
2,17,O-Bowl,V6K 2H3,,3132,W BROADWAY,Vancouver,-123.17483,49.26392
2,17,Swiss Chalet Roti...,V6K 2H4,,3204,W BROADWAY,Vancouver,-123.176216,49.26395


Is the restaurant part of a chain?  
How many other restaurants are nearby (within one mile)?  
How does it compare to nearby restaurants (based on price, but also its average rating — and the number of reviews)?  
How old is the restaurants?

## YELP API

https://www.yelp.ca/developers/documentation/v3/get_started   
https://github.com/Yelp/yelp-fusion#code-samples


In [None]:
rest_distinct_list = rest_distinct_df.collect()

In [1550]:
def get_params(rest):
    
    name = rest.BusinessTradeName
    
    house = rest.House
    street = rest.Street
    if rest.Unit == None:
        unit = ''
    else:
        unit = rest.Unit
    address = ' '.join([unit, house, street])
    
    if rest.PostalCode == None:
        postal_code = ''
    else:
        postal_code = rest.PostalCode
            
    if rest.latitude == None:
        latitude = None
        longitude = None
    else:    
        latitude = rest.latitude
        longitude = rest.longitude
        
    city = rest.City
    
    return {'apikey': api_key,
            'name': name,
            'address1': address,
            'zip_code': postal_code,
            'latitude': latitude,
            'longitude': longitude,
            'city': city,
            'state': 'BC',
            'country': 'CA',
            'match_threshold': 'none',
            'limit': 1}

def make_call_1(headers, params):
    
    request_1 = requests.get('https://api.yelp.com/v3/businesses/matches', 
                             headers = headers, params = params)

    result_1 = request_1.json()
    
    if request_1.status_code == 500:

        print('There was an internal yelp matches call error for: ' + params['name'])
        print('Redoing yelp call... ', end = '')
        time.sleep(5)
        request_1 = requests.get('https://api.yelp.com/v3/businesses/matches', 
                                 headers = headers, params = params)
        result_1 = request_1.json()
        
        if request_1.status_code != 500:
            print('Fixed!')
    
    return request_1, result_1


def make_call_2(headers, params, request_1, result_1):
    # checks if the request was bad or output was empty   
    if request_1.status_code != 200 or len(result_1['businesses']) == 0:
        result_2 = None
        request_2 = request_1
        data['yelp_call'].append(result_2)

    else:
        rest_id = result_1['businesses'][0]['id']
        request_2 = requests.get('https://api.yelp.com/v3/businesses/{}'
                                 .format(rest_id), headers = headers)
        result_2 = request_2.json()
        
        data['yelp_call'].append(result_2)
        
        if request_2.status_code == 500:
            print('There was an internal yelp businesses call error for: ' + params['name'])
            print('Redoing yelp call... ', end = '')
            time.sleep(5)
            request_2 = requests.get('https://api.yelp.com/v3/businesses/{}'
                                    .format(rest_id), headers = headers)
            result_2 = request_2.json()
            
            if request_2.status_code != 500:
                print('Fixed!')
                
            data['yelp_call'][i] = result_2

    return request_2
    
def remaining_time():
    
    current_time = datetime.strptime(datetime.utcnow().strftime("%H:%M"), '%H:%M')
    reset_time = datetime.strptime('00:00', '%H:%M')
    
    diff = reset_time - current_time
    hours = int(diff.seconds // (60 * 60))
    mins = int((diff.seconds // 60) % 60)
    
    return '{} hour(s) and {} min(s)'.format(hours, mins)

In [1551]:
api_key = os.getenv('YELP_API_KEY')
headers = {'Authorization': 'Bearer %s' % api_key}

data = dict(identifier = [],
            name = [],
            unit = [],
            house = [],
            street = [],
            yelp_call = [])

In [1603]:
for i in range(len(rest_distinct_list)):

    params = get_params(rest_distinct_list[i])
    
    identifier = ''.join([params['name'], 
                          params['address1'], 
                          params['zip_code']])

    # doesn't make a request if already have the data
    if identifier not in data['identifier']:
        request_1, result_1 = make_call_1(headers, params)
        
        time.sleep(0.5)

        if request_1.status_code == 429:
            print('Yelp API call limit reached, re-run in {}'.format(remaining_time()))
            break

        data['identifier'].append(identifier)
        
        # for joining back to original dataframe
        data['name'].append(rest_distinct_list[i].BusinessTradeName)
        data['house'].append(rest_distinct_list[i].House)
        data['street'].append(rest_distinct_list[i].Street)
        data['unit'].append(rest_distinct_list[i].Unit)
        
        request_2 = make_call_2(headers, params, request_1, result_1)
        
        time.sleep(0.5)
        
    if (i % 200 == 0) & (i != 0):
        print(i, 'restaurants collected')

print('\n{} / {} restaurants collected'.format(i, len(rest_distinct_list)))
print('Calls remaining:', request_2.headers['ratelimit-remaining'])
path = 'data/yelp/'
filename = 'rest_yelp_api' + time.strftime('_%Y-%m-%d') + '.pkl'
with open(path + filename, 'wb') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
print('Data saved in {} as {}'.format(path, filename))


200 restaurants collected
400 restaurants collected
600 restaurants collected
800 restaurants collected
1000 restaurants collected
1200 restaurants collected
Yelp API call limit reached, re-run in 19 hour(s) and 10 min(s)

1228 / 5740 restaurants collected
Calls remaining: 0
Data saved in data/yelp/ as rest_yelp_api_2020-05-11.pkl


# Creating the Cleaned Data

In [1070]:
with open(path + filename, 'rb') as f:
    data = pickle.load(f)

In [1599]:
data_clean = dict(name = [],
                  categories = [],
                  rating = [],
                  review_count = [],
                  price = [])

In [1602]:
for name in data['name']:
    data_clean['name'].append(name)
for call in data['yelp_call']:
    if call == None:
        data_clean['categories'].append(None)
        data_clean['rating'].append(None)
        data_clean['review_count'].append(None)
        data_clean['price'].append(None)
    else:
        if 'categories' not in call:
            data_clean['categories'].append(None)
        else:
            data_clean['categories'].append(call['categories'])

        if 'rating' not in call:
            data_clean['rating'].append(None)
        else:
            data_clean['rating'].append(call['rating'])

        if 'review_count' not in call:
            data_clean['review_count'].append(None)
        else:
            data_clean['review_count'].append(call['review_count'])

        if 'price' not in call:
            data_clean['price'].append(None)
        else:
            data_clean['price'].append(call['price'])

path = 'data/yelp/'
filename = 'rest_cleaned' + time.strftime('_%Y-%m-%d') + '.pkl'
with open(path + filename, 'wb') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
print('Data saved in {} as {}'.format(path, filename))

Data saved in data/yelp/ as rest_cleaned_2020-05-11.pkl


In [1601]:
pd.DataFrame(data_clean)

Unnamed: 0,name,categories,rating,review_count,price
0,George Simnos & Joakim Bostner & Stelio Loukakos,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",3.5,49.0,$$
1,Sodexo @Vancouver Community College,"[{'alias': 'collegeuniv', 'title': 'Colleges &...",2.5,3.0,
2,Cilantro Cafe,"[{'alias': 'cafes', 'title': 'Cafes'}, {'alias...",5.0,1.0,
3,O-Bowl,"[{'alias': 'asianfusion', 'title': 'Asian Fusi...",3.5,5.0,$$
4,Swiss Chalet Rotisserie + Grill,"[{'alias': 'comfortfood', 'title': 'Comfort Fo...",2.5,40.0,$$
...,...,...,...,...,...
1223,Junsei River Japanese Restaurant,"[{'alias': 'japanese', 'title': 'Japanese'}]",2.5,35.0,$$
1224,D's Cafe And Roasting,,,,
1225,House of Canton Chinese Restaurant,"[{'alias': 'chinese', 'title': 'Chinese'}]",3.0,14.0,$$
1226,Martteoh Enterprises Inc,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",2.5,11.0,$


> Here I will create a PySpark dataframe to eventually join it back to the `rest_distinct_df` dataframe.

In [285]:
rest_distinct_info = spark.createDataFrame(pd.DataFrame(data))
rest_distinct_info

name,category,rating,review_count,price
Morris J. Wosk Ce...,,,,
Banana Leaf Malay...,[[title -> Malays...,3.5,211.0,$$
Top of Vancouver,[[title -> Americ...,3.0,307.0,$$$$
Portobello Ristor...,,,,
Park Theatre,[[title -> Cinema...,4.0,31.0,
The Bake Shop,[[title -> Bakeri...,4.5,42.0,$$
Liquid Nutrition,[[title -> Juice ...,1.5,6.0,$$$$
The Dish Fabulous...,[[title -> Massag...,3.5,28.0,$$$
Euro Bagel Cafe,[[title -> Coffee...,4.0,49.0,$
Thida Thai Restau...,"[[title -> Thai, ...",3.0,53.0,$$


In [281]:
rest_distinct_df.join(rest_distinct_info, rest_distinct_df.BusinessTradeName == rest_distinct_info.name, how = 'left')

BusinessTradeName,Geom,PostalCode,House,Street,name,house,street,postalcode,geom,category,rating,review_count,price
"""Phnom Penh """"a""""...","""{""""type"""": """"Poi...",V6A 1Z7,244,E Georgia St,,,,,,,,,
49th Parallel Cof...,"""{""""type"""": """"Poi...",V6K 1N6,2198,W 4TH AV,,,,,,,,,
49th Parallel Cof...,"""{""""type"""": """"Poi...",V6K 1N6,2152,W 4TH AV,,,,,,,,,
49th Parallel Cof...,"""{""""type"""": """"Poi...",V6E 0A1,689,THURLOW ST,,,,,,,,,
49th Parallel Cof...,"""{""""type"""": """"Poi...",V5T 3G3,2902,MAIN ST,,,,,,,,,
Annalena,"""{""""type"""": """"Poi...",V6J 5B8,1809,W 1ST AV,,,,,,,,,
Bing Sheng Restau...,,V5M 3H9,1800,RENFREW ST,,,,,,,,,
Brado,"""{""""type"""": """"Poi...",V5L 3X5,1399,COMMERCIAL DRIVE,,,,,,,,,
Capstone,"""{""""type"""": """"Poi...",V6G 1C1,1429,ROBSON ST,,,,,,,,,
Casa Shawarma,"""{""""type"""": """"Poi...",V5V 3E3,1385,KINGSWAY,,,,,,,,,


> quantify similarity of restaurant cuisines to see how similar restuarants in the same proximity do