# Exploratory Data Analysis

In this EDA, I will be examining various data sources from [opendata.vancouver.ca](https://opendata.vancouver.ca/pages/home/).

## Imports

In [1]:
import json
import os
import re

import numpy as np
import pandas as pd
import requests
from matplotlib import pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [38]:
from pyspark.sql.functions import coalesce, max

In [264]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import array, col, count, mean, sum, udf, when, last, first
from pyspark.sql.types import DoubleType, IntegerType, StringType, Row
from pyspark.sql.functions import sum, col, udf
from pyspark.sql.types import *

## Reading in the main dataset

In [4]:
# Start (or reuse) the Spark session that every later cell works through
spark = (SparkSession.builder
         .appName('Ops')
         .getOrCreate())

In [5]:
# allows for cleaner output: with eager evaluation enabled, a DataFrame left as the
# last expression of a cell is rendered as a table
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

> This creates a Spark DataFrame, which is very similar to a pandas DataFrame. Spark DataFrames are built on top of RDDs and integrate well with SQL, which helps users who are already familiar with that language. The main differences are that Spark DataFrame operations can run across multiple nodes and are lazily evaluated, meaning they are not executed until their output is needed.

In [6]:
# Load the business-licence export: semicolon-separated, with a header row,
# letting Spark infer the column types
licences_reader = spark.read.options(header='true', inferSchema='true', sep=';')
licences_df = licences_reader.csv('data/business-licences-2.csv')

In [7]:
# Total number of licence records (count() triggers the actual read)
f'There are {licences_df.count()} observations in this dataset'

'There are 504842 observations in this dataset'

In [8]:
licences_df.printSchema()

root
 |-- FOLDERYEAR: integer (nullable = true)
 |-- LicenceRSN: integer (nullable = true)
 |-- LicenceNumber: string (nullable = true)
 |-- LicenceRevisionNumber: integer (nullable = true)
 |-- BusinessName: string (nullable = true)
 |-- BusinessTradeName: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- IssuedDate: timestamp (nullable = true)
 |-- ExpiredDate: timestamp (nullable = true)
 |-- BusinessType: string (nullable = true)
 |-- BusinessSubType: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- House: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Province: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- PostalCode: string (nullable = true)
 |-- LocalArea: string (nullable = true)
 |-- NumberofEmployees: double (nullable = true)
 |-- FeePaid: integer (nullable = true)
 |-- ExtractDate: timestamp (nullable = true)
 |-- Geom: st

In [196]:
# show the first 10 rows 
licences_df.limit(10)

FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,BusinessSubType,Unit,UnitType,House,Street,City,Province,Country,PostalCode,LocalArea,NumberofEmployees,FeePaid,ExtractDate,Geom
15,2345113,15-115406,0,(Shun Chen),ACC Security Syst...,Gone Out of Business,,,Electrical-Securi...,,,,,,Surrey,BC,CA,,,2.0,,2019-07-21 13:48:02,
15,2345119,15-115412,0,Praetorian Securi...,Amax Praetorian,Issued,2014-12-01 14:04:29,2015-12-31 00:00:00,Electrical-Securi...,,,,,,Chilliwack,BC,CA,,,0.0,133.0,2019-07-21 13:48:02,
15,2345120,15-115413,0,Safe-Tec Security...,,Gone Out of Business,,,Electrical-Securi...,,,,,,Surrey,BC,CA,,,0.0,,2019-07-21 13:48:02,
15,2345121,15-115414,0,Husky Alarm Integ...,,Issued,2015-02-10 08:05:11,2015-12-31 00:00:00,Electrical-Securi...,,,,,,Port Moody,BC,CA,,,2.0,173.0,2019-07-21 13:48:02,
15,2345122,15-115415,0,Sea to Sky Securi...,,Pending,,,Electrical-Securi...,,,,,,North Vancouver,BC,CA,,,3.0,,2019-07-21 13:48:02,
15,2345188,15-115481,0,(Kenneth Bradley),Ken Bradley Truck...,Issued,2014-11-18 11:35:48,2015-12-31 00:00:00,Equipment Operator,,,,,,Coquitlam,BC,CA,,,0.0,162.0,2019-07-21 13:48:02,
15,2345191,15-115484,0,Litz Crane Servic...,,Issued,2014-11-06 14:31:00,2015-12-31 00:00:00,Equipment Operator,,,,,,Port Coquitlam,BC,CA,,,5.0,162.0,2019-07-21 13:48:02,
15,2345195,15-115488,0,(Wayne Nichols),W Nichols Trucking,Issued,2014-11-05 12:50:58,2015-12-31 00:00:00,Equipment Operator,,,,,,Delta,BC,CA,,,1.0,162.0,2019-07-21 13:48:02,
15,2345196,15-115489,0,(Harry Powar),Harry Powar Trucking,Issued,2014-12-04 18:49:08,2015-12-31 00:00:00,Equipment Operator,,,,,,Burnaby,BC,CA,,,1.0,162.0,2019-07-21 13:48:02,
15,2345203,15-115496,0,J Brown Trucking Inc,,Issued,2014-12-06 09:23:32,2015-12-31 00:00:00,Equipment Operator,,,,,,Delta,BC,CA,,,2.0,162.0,2019-07-21 13:48:02,


In [197]:
licences_df.groupBy('FOLDERYEAR').count()

FOLDERYEAR,count
13,60915
16,61394
20,64003
19,70771
15,60938
17,60060
14,60581
18,66180


## Initial Wrangling 

In [10]:
# Collect every distinct business type to the driver
business_types = licences_df.select('BusinessType').distinct().collect()
# BusinessType is nullable; a null would make the regex call below raise, so skip it
business_list = [row.BusinessType for row in business_types if row.BusinessType is not None]

# Keep the types that mention food, restaurants or liquor.
# search() scans the whole string, so no leading/trailing ".*" is needed.
r = re.compile("[Ff]ood|[Rr]estaurant|[Ll]iquor")
relevant_list = [business_type for business_type in business_list if r.search(business_type)]
print(relevant_list)

['Manufacturer - Food with Anc. Retail', 'Ltd Service Food Establishment', 'Temp Liquor Licence Amendment', 'Liquor Delivery Services', 'Liquor Establishment Standard', 'Retail Dealer - Food', 'Restaurant Class 2', 'Food Processing', 'Liquor Establishment Extended', 'Wholesale Dealer - Food with Anc. Retail', 'Restaurant Class 1', 'Liquor License Application', 'Warehouse Operator - Food', 'Manufacturer - Food', 'Liquor Retail Store', 'Wholesale Dealer - Food']


> This is the list of business types whose names contain the word food, restaurant, or liquor.

In [11]:
# Licence categories treated as "restaurants" for this analysis
business_type = licences_df['BusinessType']
business_subtype = licences_df['BusinessSubType']

# Plain restaurant / limited-service food licences
rest_1 = business_type == 'Restaurant Class 1'
rest_2 = business_type == 'Restaurant Class 2'
rest_3 = business_type == 'Ltd Service Food Establishment'

# Liquor licences only count for specific sub-types
rest_4 = (business_type == 'Temp Liquor Licence Amendment') & (business_subtype == 'Area Extension')
rest_5 = (business_type == 'Liquor Establishment Standard') & (business_subtype == 'Class 1  0-65 Seats')
rest_6 = (business_type == 'Liquor Establishment Extended') & (business_subtype == 'Class 1  0-65 Seats')

# A licence is kept when it satisfies any one of the six conditions
is_restaurant = rest_1 | rest_2 | rest_3 | rest_4 | rest_5 | rest_6
licences_rest_df = licences_df.filter(is_restaurant)

In [12]:
licences_rest_df.groupBy("Status").count()

Status,count
Cancelled,604
Gone Out of Business,2914
Issued,24579
Inactive,662
Pending,1420


In [13]:
# Distinct business names among the rows that have no trade name
missing_trade_name_count = (licences_rest_df
                            .filter('BusinessTradeName is null')
                            .select('BusinessName')
                            .distinct()
                            .count())
f'There are {missing_trade_name_count} businesses with missing name values'

'There are 955 businesses with missing name values'

> I will replace these entries with their business names. 

In [14]:
# Where the trade name is null, fall back to the registered business name
licences_rest_df = licences_rest_df.withColumn('BusinessTradeName', coalesce('BusinessTradeName', 'BusinessName'))

In [15]:
# Drop the rows that have no business name at all
licences_rest_df = licences_rest_df.dropna(subset = ['BusinessName'])

In [16]:
licences_rest_df.count()

30148

In [17]:
from pyspark.sql.functions import regexp_extract, col

# Search for names using regex

def search_restaurant(search_term, n = 20, df = None):
    """Return up to `n` licence rows whose trade name matches a regular expression.

    Parameters
    ----------
    search_term : str
        Regular expression looked for anywhere inside BusinessTradeName.
        It is used as-is, so regex metacharacters keep their special meaning.
    n : int, default 20
        Maximum number of rows to return.
    df : DataFrame, optional
        DataFrame to search; must expose a BusinessTradeName column.
        Defaults to the notebook-level `licences_rest_df`.

    Returns
    -------
    DataFrame
        The matching rows, limited to `n`.
    """
    if df is None:
        # Resolved at call time so the function always sees the current notebook DataFrame
        df = licences_rest_df
    matches = df.BusinessTradeName.rlike('.*{}.*'.format(search_term))
    return df.filter(matches).limit(n)

In [24]:
search_restaurant('Shizen', 5)

FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,BusinessSubType,Unit,UnitType,House,Street,City,Province,Country,PostalCode,LocalArea,NumberofEmployees,FeePaid,ExtractDate,Geom
16,2603912,16-145075,0,Nagomi Project Ltd,Shizen Ya,Issued,2015-11-05 11:26:12,2016-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,1102,W BROADWAY,Vancouver,BC,CA,V6H 1G5,Fairview,10.0,1042,2019-07-21 13:49:20,"""{""""type"""": """"Poi..."
17,2819772,17-143449,0,Nagomi Project Ltd,Shizen Ya,Issued,2016-11-29 09:10:08,2017-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,1102,W BROADWAY,Vancouver,BC,CA,V6H 1G5,Fairview,10.0,1063,2020-01-01 02:32:04,"""{""""type"""": """"Poi..."
14,2157871,14-145460,0,Nagomi Project Ltd,Shizen Ya,Issued,2013-12-13 10:56:37,2014-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,1102,W BROADWAY,Vancouver,BC,CA,V6H 1G5,Fairview,8.0,999,2019-07-21 13:49:12,"""{""""type"""": """"Poi..."
15,2376219,15-146481,0,Nagomi Project Ltd,Natural Food Shiz...,Issued,2014-10-31 16:52:01,2015-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,985,HORNBY ST,Vancouver,BC,CA,V6Z 1V3,Downtown,10.0,878,2019-07-21 13:49:16,"""{""""type"""": """"Poi..."
14,2158834,14-146423,0,Nagomi Project Ltd,Natural Food Shiz...,Issued,2013-12-10 22:39:49,2014-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,985,HORNBY ST,Vancouver,BC,CA,V6Z 1V3,Downtown,7.0,860,2019-07-21 13:49:12,"""{""""type"""": """"Poi..."


> Use an external dataset such as Yelp or Zomato to get restaurant cuisine, price range, and other features. 

In [98]:
# NOTE(review): the output recorded under this cell (30148) does not look like the
# result of this call, and the metric name passed in is empty - this appears to be a
# leftover experiment; confirm whether the cell is still needed.
from pyspark.ml.stat import Summarizer
Summarizer.metrics('')

30148

In [106]:
# Missing values per column: every null is cast to 1 and summed under the column's own name.
# `sum` and `col` are the pyspark.sql.functions versions imported at the top of the notebook.
null_count_exprs = []
for column_name in licences_rest_df.columns:
    is_missing = col(column_name).isNull().cast("int")
    null_count_exprs.append(sum(is_missing).alias(column_name))

licences_rest_df.select(*null_count_exprs)

FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,BusinessSubType,Unit,UnitType,House,Street,City,Province,Country,PostalCode,LocalArea,NumberofEmployees,FeePaid,ExtractDate,Geom
0,0,0,0,0,0,0,3261,3260,0,12929,25242,25238,17,17,0,0,2,243,87,0,2537,0,831


In [315]:
# Distinct (year, status, trade name, location) rows; this is the table that the
# Yelp features are later looked up for
rest_distinct_columns = ['FOLDERYEAR', 'Status', 'BusinessTradeName',
                         'Geom', 'PostalCode', 'Unit', 'House', 'Street', 'City']

rest_distinct_df = licences_rest_df.select(rest_distinct_columns).distinct()

In [313]:
# A location is one distinct combination of the address/geometry columns
location_count = (licences_rest_df
                  .select(['Geom', 'PostalCode', 'Unit', 'House', 'Street', 'City'])
                  .distinct()
                  .count())
f'There are {location_count} restaurant locations'

'There are 4504 restaurant locations'

In [314]:
# A restaurant is one distinct (trade name, location) pair, so a restaurant that
# changed locations is counted once per location
restaurant_count = (licences_rest_df
                    .select(['BusinessTradeName', 'Geom', 'PostalCode', 'Unit', 'House', 'Street', 'City'])
                    .distinct()
                    .count())
f'There are {restaurant_count} closed/open restaurants'

'There are 6637 closed/open restaurants'

In [304]:
# The Geom locations are not unique
licences_rest_df.select(['Geom']).distinct().count()

2273

In [194]:
# Rows in which every one of the location fields below is missing
(rest_distinct_df
    .filter('Geom is null')
    .filter('PostalCode is null')
    .filter('Unit is null')
    .filter('House is null')
    .filter('Street is null'))


FOLDERYEAR,Status,BusinessTradeName,Geom,PostalCode,Unit,House,Street,City
14,Pending,Siga-Siga's Chine...,,,,,,Vancouver
18,Inactive,Siga-Siga's Filip...,,,,,,Vancouver
15,Pending,Panz Veggie,,,,,,Vancouver
16,Issued,Siga-Siga's Filip...,,,,,,Vancouver
15,Issued,Siga-Siga's Chine...,,,,,,Vancouver
13,Issued,Siga-Siga's Chine...,,,,,,Vancouver
15,Issued,Fiery Pizza,,,,,,Vancouver


> 4 of these restaurants do not have any information on the location. Since this is an insignificant proportion of the data, I will simply remove them.

In [316]:
# how='all': a row is dropped only when every listed location column is null
rest_distinct_df = rest_distinct_df.dropna(how = 'all', subset = ['Geom', 'PostalCode', 'Unit', 'House', 'Street'])

> I've noticed that in order for the yelp API to work well, coordinate location is the most important factor. Thus, I will need to use another API to collect coordinates.   
> Problem: can't get information for old, closed restaurants 
> - initial thoughts: have to find an external API that has archived webpages
> - maybe a solution: businesses/search doesn't return closed restaurants but businesses/matches does (it needs the exact address); take the ID from that output and pass it to businesses/{id}

In [317]:
# Replace each Status label with an ordinal rank (its position in the list below) so that,
# when a single year carries several statuses, the greatest one can be picked.
# Labels that are not listed end up as null.
status_rank = None
for rank, label in enumerate(['Inactive', 'Cancelled', 'Gone Out of Business', 'Pending', 'Issued']):
    has_label = col('Status') == label
    if status_rank is None:
        status_rank = when(has_label, rank)
    else:
        status_rank = status_rank.when(has_label, rank)

rest_distinct_df = rest_distinct_df.withColumn('Status', status_rank)

In [319]:
# Keep, for every restaurant name, only the rows from its most recent licence year and,
# within that year, its highest-ranked status. This gives the latest location of a restaurant.
# - if in a single year the restaurant is both cancelled and gone out of business, aggregate into 1 category
# - if in a single year the restaurant is both cancelled and issued, aggregate into 1 category
#   (this means the restaurant moved locations)
#
# Structs compare field by field, so max(struct(FOLDERYEAR, Status)) is the latest year paired
# with the greatest status seen in that year. This replaces sort() followed by last(): row order
# is not guaranteed to survive the groupBy shuffle, which made the previous result non-deterministic.
latest_year = (rest_distinct_df
               .groupBy('BusinessTradeName')
               .agg(F.max(F.struct('FOLDERYEAR', 'Status')).alias('latest'))
               .select('BusinessTradeName', 'latest.FOLDERYEAR', 'latest.Status'))

# Semi-join: keep the original columns, but only for rows matching the selected (year, status)
rest_distinct_df = rest_distinct_df.join(latest_year, on = ['Status', 'FOLDERYEAR', 'BusinessTradeName'], how = 'leftsemi')

In [320]:
rest_distinct_df.count()

5741

In [227]:
search_restaurant('Shizen').orderBy('BusinessTradeName')

FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,BusinessSubType,Unit,UnitType,House,Street,City,Province,Country,PostalCode,LocalArea,NumberofEmployees,FeePaid,ExtractDate,Geom
13,1817812,13-198393,0,Nagomi Project Ltd,Natural Food Shiz...,Issued,2012-12-07 09:34:10,2013-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,985,HORNBY ST,Vancouver,BC,CA,V6Z 1V3,Downtown,7.0,851,2019-07-21 13:49:09,"""{""""type"""": """"Poi..."
15,2376219,15-146481,0,Nagomi Project Ltd,Natural Food Shiz...,Issued,2014-10-31 16:52:01,2015-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,985,HORNBY ST,Vancouver,BC,CA,V6Z 1V3,Downtown,10.0,878,2019-07-21 13:49:16,"""{""""type"""": """"Poi..."
17,2820494,17-144171,0,Nagomi Project Ltd,Natural Food Shiz...,Inactive,2017-01-03 10:44:27,2017-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,985,HORNBY ST,Vancouver,BC,CA,V6Z 1V3,Downtown,10.0,914,2020-01-01 02:32:04,"""{""""type"""": """"Poi..."
14,2158834,14-146423,0,Nagomi Project Ltd,Natural Food Shiz...,Issued,2013-12-10 22:39:49,2014-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,985,HORNBY ST,Vancouver,BC,CA,V6Z 1V3,Downtown,7.0,860,2019-07-21 13:49:12,"""{""""type"""": """"Poi..."
16,2604702,16-145864,0,Nagomi Project Ltd,Natural Food Shiz...,Issued,2015-11-05 11:16:47,2016-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,985,HORNBY ST,Vancouver,BC,CA,V6Z 1V3,Downtown,10.0,896,2019-07-21 13:49:20,"""{""""type"""": """"Poi..."
16,2603912,16-145075,0,Nagomi Project Ltd,Shizen Ya,Issued,2015-11-05 11:26:12,2016-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,1102,W BROADWAY,Vancouver,BC,CA,V6H 1G5,Fairview,10.0,1042,2019-07-21 13:49:20,"""{""""type"""": """"Poi..."
19,3273800,19-148040,0,Nagomi Project Ltd,Shizen Ya,Inactive,2018-11-27 12:38:24,2019-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,1102,W BROADWAY,Vancouver,BC,CA,V6H 1G5,Fairview,10.0,1151,2020-04-01 02:32:25,"""{""""type"""": """"Poi..."
17,2819772,17-143449,0,Nagomi Project Ltd,Shizen Ya,Issued,2016-11-29 09:10:08,2017-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,1102,W BROADWAY,Vancouver,BC,CA,V6H 1G5,Fairview,10.0,1063,2020-01-01 02:32:04,"""{""""type"""": """"Poi..."
14,2157871,14-145460,0,Nagomi Project Ltd,Shizen Ya,Issued,2013-12-13 10:56:37,2014-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,1102,W BROADWAY,Vancouver,BC,CA,V6H 1G5,Fairview,8.0,999,2019-07-21 13:49:12,"""{""""type"""": """"Poi..."
13,1841771,13-177394,1,Nagomi Project Ltd,Shizen Ya,Issued,2013-09-25 10:41:40,2013-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,1102,W BROADWAY,Vancouver,BC,CA,V6H 1G5,Fairview,0.0,466,2019-07-21 13:49:09,"""{""""type"""": """"Poi..."


> Have to take into account restaurants that change ownership and also change names, or even move locations 

Is the restaurant part of a chain?  
How many other restaurants are nearby (within one mile)?  
How does it compare to nearby restaurants (based on price, but also its average rating — and the number of reviews)?  
How old is the restaurant?

## YELP API

https://www.yelp.ca/developers/documentation/v3/get_started   
https://github.com/Yelp/yelp-fusion#code-samples


In [312]:
test = rest_distinct_df.take(15)

In [35]:
# Bearer-token header built from the key stored in the YELP_API_KEY environment variable
api_key = os.getenv('YELP_API_KEY')
headers = {'Authorization': 'Bearer %s' % api_key}

# defines the values inside Geom 
Point = 'point' 
coordinates = 'coordinates' 

# One list per collected field; position i of every list describes the same restaurant
data = {field: [] for field in ('name', 'category', 'rating', 'review_count', 'price')}

# Bring the distinct restaurants to the driver so they can be looped over in plain Python
rest_distinct_list = rest_distinct_df.collect()

In [392]:
def _build_address(row):
    """Join the non-null House, Street and PostalCode of a licence row into one address string."""
    parts = (row.House, row.Street, row.PostalCode)
    return ' '.join(str(part) for part in parts if part is not None)


def _parse_geom_coordinates(geom):
    """Return (longitude, latitude) read from a Geom cell, or (None, None) if unavailable.

    The Geom text is GeoJSON that may still carry its CSV quoting (an outer pair of
    double quotes with every inner quote doubled); that quoting is undone before the
    text is parsed with json.loads. Nothing from the file is ever evaluated as code.
    """
    if geom is None:
        return None, None
    text = geom.strip()
    if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
        text = text[1:-1].replace('""', '"')
    try:
        longitude, latitude = json.loads(text)['coordinates'][:2]
    except (ValueError, KeyError, TypeError):
        # Malformed or unexpected geometry: behave as if no coordinates were given
        return None, None
    return longitude, latitude


def _fetch_yelp_business(name, address, longitude, latitude):
    """Return the top Yelp business-search hit as a dict, or None when there is no usable hit.

    Authentication travels in the notebook-level `headers` (Bearer token); the API key
    is deliberately not repeated in the query string.
    Raises requests.RequestException on network failures and timeouts.
    """
    params = {'term': name, 'limit': 1}
    if address:
        params['location'] = address
    if longitude is not None and latitude is not None:
        params['longitude'] = longitude
        params['latitude'] = latitude

    response = requests.get('https://api.yelp.com/v3/businesses/search',
                            headers = headers, params = params, timeout = 10)
    if not response.ok:
        return None
    try:
        result = response.json()
    except ValueError:
        # Body was not JSON
        return None
    businesses = result.get('businesses') or []
    return businesses[0] if businesses else None


# If an earlier run was interrupted part-way through a record, the lists can have
# different lengths; trim them back to the last complete record before resuming.
complete_records = min(len(values) for values in data.values())
for values in data.values():
    del values[complete_records:]

# Names already collected, kept in a set for constant-time membership checks
seen_names = set(data['name'])

for row in rest_distinct_list:
    name = row.BusinessTradeName

    # doesn't make a request if already have the data
    # (restaurants sharing a trade name therefore share the first result found)
    if name in seen_names:
        continue

    longitude, latitude = _parse_geom_coordinates(row.Geom)

    try:
        business = _fetch_yelp_business(name, _build_address(row), longitude, latitude)
    except requests.RequestException:
        # Transient network problem: leave the name unrecorded so a re-run retries it
        continue

    # No hit (or a failed request) is recorded as a row of missing values
    business = business or {}

    # Append every field together so the lists always stay the same length
    data['name'].append(name)
    data['category'].append(business.get('categories'))
    data['rating'].append(business.get('rating'))
    data['review_count'].append(business.get('review_count'))
    data['price'].append(business.get('price'))
    seen_names.add(name)


KeyboardInterrupt: 

In [561]:
test = rest_distinct_df.filter('PostalCode is null').filter('Geom is null').collect()
#len(data['price'])

In [563]:
test

[Row(BusinessTradeName="Purdy's Chocolates", Geom=None, PostalCode=None, Unit=None, House='4255', Street='Arbutus St', City='Vancouver')]

In [185]:
test = rest_distinct_df.toPandas()
test[test.duplicated('BusinessTradeName')]

Unnamed: 0,BusinessTradeName,Geom,PostalCode,Unit,House,Street,City
35,7-Eleven Store,"""{""""type"""": """"Point"""", """"coordinates"""": [-123....",V6G 2M4,,1055,Denman St,Vancouver
61,7-Eleven Store,"""{""""type"""": """"Point"""", """"coordinates"""": [-123....",V6E 2R1,104,1199,W PENDER ST,Vancouver
76,Tim Hortons,"""{""""type"""": """"Point"""", """"coordinates"""": [-123....",V6A 3W9,,306,TERMINAL AV,Vancouver
134,Freshii,"""{""""type"""": """"Point"""", """"coordinates"""": [-123....",V6Z 0C8,128,765,NELSON ST,Vancouver
137,Freshii,"""{""""type"""": """"Point"""", """"coordinates"""": [-123....",V5X 3H1,,6541,MAIN ST,Vancouver
...,...,...,...,...,...,...,...
6420,Extreme Pita,"""{""""type"""": """"Point"""", """"coordinates"""": [-123....",,222,1055,W GEORGIA ST,Vancouver
6422,Uncle Fatih's Pizza,"""{""""type"""": """"Point"""", """"coordinates"""": [-123....",V6B 0E1,,638,ABBOTT ST,Vancouver
6430,Freshslice Pizza,"""{""""type"""": """"Point"""", """"coordinates"""": [-123....",V5L 1T5,,771,DAVIE ST,Vancouver
6433,Simpatico Ristorante,"""{""""type"""": """"Point"""", """"coordinates"""": [-123....",V6K 1N8,,2222,W 4th Av,Vancouver


In [181]:
    # Probe the business-match endpoint with one exact address.
    # Authentication is carried by the Bearer token in `headers`, so the API key is
    # kept out of the query string (URLs tend to end up in logs and saved outputs).
    params = {'name': "Shizen Ya",
              'address1': '160 1333 W BROADWAY',
              'zip_code': 'V6H 1G5',
              'city': 'Vancouver',
              'state': 'BC',
              'country': 'CA',
              'match_threshold': 'none',
              'limit': 10
             }
    
    # timeout keeps the cell from hanging indefinitely on a stalled connection
    request = requests.get('https://api.yelp.com/v3/businesses/matches', 
                           headers = headers, params = params, timeout = 10)

In [182]:
request.json()

{'businesses': [{'id': 'VcaNoLa56gpBWPxufnTw4w',
   'alias': 'shizenya-on-broadway-vancouver-3',
   'name': 'Shizenya on Broadway',
   'coordinates': {'latitude': 49.263888, 'longitude': -123.134264},
   'location': {'address1': '160-1333 W Broadway',
    'address2': '',
    'address3': '',
    'city': 'Vancouver',
    'zip_code': 'V6H 1G9',
    'country': 'CA',
    'state': 'BC',
    'display_address': ['160-1333 W Broadway',
     'Vancouver, BC V6H 1G9',
     'Canada']},
   'phone': '+16045693721',
   'display_phone': '+1 604-569-3721'},
  {'id': 'Z8wd5qc9kB1K4I3lj8y31g',
   'alias': 'boston-pizza-vancouver-2',
   'name': 'Boston Pizza',
   'coordinates': {'latitude': 49.2636494, 'longitude': -123.134374},
   'location': {'address1': '1333 W Broadway',
    'address2': 'Suite 190',
    'address3': '',
    'city': 'Vancouver',
    'zip_code': 'V6H 4C1',
    'country': 'CA',
    'state': 'BC',
    'display_address': ['1333 W Broadway',
     'Suite 190',
     'Vancouver, BC V6H 4C1',
   

In [71]:
# Fetch the details of a single business by its Yelp ID.
# The Bearer token in `headers` authenticates the call, so the API key is not sent as a
# query parameter; timeout keeps the cell from hanging on a stalled connection.
requests.get('https://api.yelp.com/v3/businesses/G3jGcliONiZdPKBEWdvM4w', headers = headers, timeout = 10).json()

{'id': 'G3jGcliONiZdPKBEWdvM4w',
 'alias': 'la-brasserie-vancouver',
 'name': 'La Brasserie',
 'image_url': 'https://s3-media3.fl.yelpcdn.com/bphoto/bzNDth0ahpRqxz6d2O6vUA/o.jpg',
 'is_claimed': True,
 'is_closed': True,
 'url': 'https://www.yelp.com/biz/la-brasserie-vancouver?adjust_creative=5sgRUqt0gRSL0aY24fgXOg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_lookup&utm_source=5sgRUqt0gRSL0aY24fgXOg',
 'phone': '+17783795400',
 'display_phone': '+1 778-379-5400',
 'review_count': 211,
 'categories': [{'alias': 'french', 'title': 'French'},
  {'alias': 'german', 'title': 'German'},
  {'alias': 'breakfast_brunch', 'title': 'Breakfast & Brunch'}],
 'rating': 4.0,
 'location': {'address1': '1091 Davie Street',
  'address2': '',
  'address3': '',
  'city': 'Vancouver',
  'zip_code': 'V6E 1M5',
  'country': 'CA',
  'state': 'BC',
  'display_address': ['1091 Davie Street', 'Vancouver, BC V6E 1M5', 'Canada'],
  'cross_streets': ''},
 'coordinates': {'latitude': 49.2802877, 'longitude': 

In [107]:
    # Probe the business-search endpoint by name within a city.
    # Authentication is carried by the Bearer token in `headers`, so the API key is
    # kept out of the query string.
    params = {'term': 'Natural Food Shizen Ya',
              'location': 'Vancouver',
            #  'longitude': -123.141291,
             # 'latitude': 49.287830,
              'limit': 10,
             # 'sort_by': 'distance'
             }
    
    # timeout keeps the cell from hanging indefinitely on a stalled connection
    request = requests.get('https://api.yelp.com/v3/businesses/search', 
                           headers = headers, params = params, timeout = 10)

In [108]:
request.json()

{'businesses': [{'id': 'as2xQt48ltKKDUQbMCBrMw',
   'alias': 'shizen-ya-vancouver-10',
   'name': 'Shizen Ya',
   'image_url': 'https://s3-media3.fl.yelpcdn.com/bphoto/d2hPY0p4s7mhBqf_qncXRQ/o.jpg',
   'is_closed': False,
   'url': 'https://www.yelp.com/biz/shizen-ya-vancouver-10?adjust_creative=5sgRUqt0gRSL0aY24fgXOg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=5sgRUqt0gRSL0aY24fgXOg',
   'review_count': 547,
   'categories': [{'alias': 'japanese', 'title': 'Japanese'},
    {'alias': 'sushi', 'title': 'Sushi Bars'},
    {'alias': 'bars', 'title': 'Bars'}],
   'rating': 4.0,
   'coordinates': {'latitude': 49.2808844, 'longitude': -123.1246278},
   'transactions': [],
   'price': '$$',
   'location': {'address1': '965 Hornby Street',
    'address2': '',
    'address3': None,
    'city': 'Vancouver',
    'zip_code': 'V6Z 1V3',
    'country': 'CA',
    'state': 'BC',
    'display_address': ['965 Hornby Street',
     'Vancouver, BC V6Z 1V3',
     'Canada']},
   'ph

# Creating the Cleaned Data

> Here I will create a PySpark dataframe to eventually join it back to the `rest_distinct_df` dataframe.

In [285]:
# Turn the collected Yelp fields into a pandas DataFrame, then into a Spark DataFrame.
# NOTE(review): pd.DataFrame(data) presumably needs every list in `data` to have the
# same length - confirm the collection loop finished (or was trimmed) before running this.
rest_distinct_info = spark.createDataFrame(pd.DataFrame(data))
rest_distinct_info

name,category,rating,review_count,price
Morris J. Wosk Ce...,,,,
Banana Leaf Malay...,[[title -> Malays...,3.5,211.0,$$
Top of Vancouver,[[title -> Americ...,3.0,307.0,$$$$
Portobello Ristor...,,,,
Park Theatre,[[title -> Cinema...,4.0,31.0,
The Bake Shop,[[title -> Bakeri...,4.5,42.0,$$
Liquid Nutrition,[[title -> Juice ...,1.5,6.0,$$$$
The Dish Fabulous...,[[title -> Massag...,3.5,28.0,$$$
Euro Bagel Cafe,[[title -> Coffee...,4.0,49.0,$
Thida Thai Restau...,"[[title -> Thai, ...",3.0,53.0,$$


In [281]:
rest_distinct_df.join(rest_distinct_info, rest_distinct_df.BusinessTradeName == rest_distinct_info.name, how = 'left')

BusinessTradeName,Geom,PostalCode,House,Street,name,house,street,postalcode,geom,category,rating,review_count,price
"""Phnom Penh """"a""""...","""{""""type"""": """"Poi...",V6A 1Z7,244,E Georgia St,,,,,,,,,
49th Parallel Cof...,"""{""""type"""": """"Poi...",V6K 1N6,2198,W 4TH AV,,,,,,,,,
49th Parallel Cof...,"""{""""type"""": """"Poi...",V6K 1N6,2152,W 4TH AV,,,,,,,,,
49th Parallel Cof...,"""{""""type"""": """"Poi...",V6E 0A1,689,THURLOW ST,,,,,,,,,
49th Parallel Cof...,"""{""""type"""": """"Poi...",V5T 3G3,2902,MAIN ST,,,,,,,,,
Annalena,"""{""""type"""": """"Poi...",V6J 5B8,1809,W 1ST AV,,,,,,,,,
Bing Sheng Restau...,,V5M 3H9,1800,RENFREW ST,,,,,,,,,
Brado,"""{""""type"""": """"Poi...",V5L 3X5,1399,COMMERCIAL DRIVE,,,,,,,,,
Capstone,"""{""""type"""": """"Poi...",V6G 1C1,1429,ROBSON ST,,,,,,,,,
Casa Shawarma,"""{""""type"""": """"Poi...",V5V 3E3,1385,KINGSWAY,,,,,,,,,


> Next step: quantify the similarity of restaurant cuisines to see how similar restaurants in the same proximity perform