# Exploratory Data Analysis

In this EDA, I will be examining various data sources from [opendata.vancouver.ca](https://opendata.vancouver.ca/pages/home/).

In [45]:
import numpy as np
import pandas as pd
import re
import requests

from matplotlib import pyplot as plt
from pyspark.sql import SparkSession

In [6]:
from pyspark.sql.functions import coalesce

In [7]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import array, col, count, mean, sum, udf, when
from pyspark.sql.types import DoubleType, IntegerType, StringType, Row
from pyspark.sql.functions import sum, col, udf
from pyspark.sql.types import *

In [8]:
spark = SparkSession.builder.appName('Ops').getOrCreate()

In [9]:
# allows for cleaner output
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [6]:
licences_df = (spark.read
          .option('header','true')
          .option('inferSchema','true')
          .option('sep', ';')
          .csv('data/business-licences.csv'))

In [120]:
licences_df.count()

443158

> There are 443 158 observations in this dataset. 

In [7]:
licences_df.printSchema()

root
 |-- FOLDERYEAR: integer (nullable = true)
 |-- LicenceRSN: integer (nullable = true)
 |-- LicenceNumber: string (nullable = true)
 |-- LicenceRevisionNumber: integer (nullable = true)
 |-- BusinessName: string (nullable = true)
 |-- BusinessTradeName: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- IssuedDate: timestamp (nullable = true)
 |-- ExpiredDate: timestamp (nullable = true)
 |-- BusinessType: string (nullable = true)
 |-- BusinessSubType: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- House: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Province: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- PostalCode: string (nullable = true)
 |-- LocalArea: string (nullable = true)
 |-- NumberofEmployees: double (nullable = true)
 |-- FeePaid: integer (nullable = true)
 |-- ExtractDate: timestamp (nullable = true)
 |-- Geom: st

In [19]:
licences_df

FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,BusinessSubType,Unit,UnitType,House,Street,City,Province,Country,PostalCode,LocalArea,NumberofEmployees,FeePaid,ExtractDate,Geom
15,2380055,15-150315,0,Brandi Nicole Eue...,Spa Haven Boutiqu...,Gone Out of Business,,,Therapeutic Touch...,,,,,,Vancouver,BC,CA,,Kensington-Cedar ...,1.0,,2019-07-21 13:49:17,
15,2380056,15-150316,0,Kelly Ashley Bake...,,Pending,,,Therapeutic Touch...,,,,,,Vancouver,BC,CA,,Kitsilano,1.0,,2019-07-21 13:49:17,
15,2380058,15-150318,0,Heart To Mind Cra...,Heart To Mind,Issued,2014-12-20 10:39:10,2015-12-31 00:00:00,Therapeutic Touch...,,,,,,Vancouver,BC,CA,,Kerrisdale,1.0,133.0,2019-07-21 13:49:17,
15,2380062,15-150322,0,(Donna Sam),,Issued,2014-12-02 15:12:32,2015-12-31 00:00:00,Therapeutic Touch...,,,,,,Vancouver,BC,CA,,South Cambie,0.0,155.0,2019-07-21 13:49:17,
15,2380065,15-150325,0,James Joseph Thom...,James Thompson RST,Gone Out of Business,,,Therapeutic Touch...,,,,,,Vancouver,BC,CA,,Grandview-Woodland,0.0,,2019-07-21 13:49:17,
15,2380066,15-150326,0,(Bunchu Praichit),Bunchu Praichit T...,Issued,2014-12-09 21:26:32,2015-12-31 00:00:00,Therapeutic Touch...,,,,,,Vancouver,BC,CA,,Kensington-Cedar ...,0.0,133.0,2019-07-21 13:49:17,
15,2380078,15-150338,0,Tourland Travel Ltd,,Issued,2014-11-24 10:30:17,2015-12-31 00:00:00,Travel Agent,,7.0,Unit,900.0,W GEORGIA ST,Vancouver,BC,CA,V6C 2W6,Downtown,2.0,133.0,2019-07-21 13:49:17,"""{""""type"""": """"Poi..."
15,2380079,15-150339,0,Jetway Travel Inc,,Issued,2015-02-27 14:16:59,2015-12-31 00:00:00,Travel Agent,,,,1796.0,RENFREW ST,Vancouver,BC,CA,V5M 3H8,Hastings-Sunrise,2.0,173.0,2019-07-21 13:49:17,"""{""""type"""": """"Poi..."
15,2380080,15-150340,0,Skyline Connectio...,,Issued,2014-12-29 11:34:29,2015-12-31 00:00:00,Travel Agent,,,,5318.0,VICTORIA DRIVE,Vancouver,BC,CA,V5P 3V7,Kensington-Cedar ...,2.0,133.0,2019-07-21 13:49:17,"""{""""type"""": """"Poi..."
15,2380082,15-150342,0,The Flight Shops Inc,Flight Centre,Issued,2014-12-03 09:37:03,2015-12-31 00:00:00,Travel Agent,,,,1232.0,DAVIE ST,Vancouver,BC,CA,V6E 1N3,West End,4.0,133.0,2019-07-21 13:49:17,"""{""""type"""": """"Poi..."


In [9]:
licences_df.groupBy('FOLDERYEAR').count()

FOLDERYEAR,count
,1
13.0,60915
16.0,61394
20.0,62366
94.0,1
96.0,11
19.0,70771
15.0,60938
14.0,60581
18.0,66180


In [26]:
licences_df.select('BusinessType').distinct().count()
licences_df.select('BusinessType').groupBy('BusinessType').count()

BusinessType,count
Referral Services,1835
Janitorial Services,4366
Financial Institu...,1480
Public Market Ope...,20
Non-profit Housing,1674
Motel,7
Printing Services,1010
Retail Dealer - M...,74
Contractor,27394
Private Hospital,53


In [123]:
business_types = licences_df.select('BusinessType').distinct().collect()
business_list = [business_types[i].BusinessType for i in range(len(business_types))]

r = re.compile(".*[Ff]ood.*|.*[Rr]estaurant.*|.*[Ll]iquor.*")
newlist = list(filter(r.match, business_list)) 
print(newlist)

['Manufacturer - Food with Anc. Retail', 'Ltd Service Food Establishment', 'Temp Liquor Licence Amendment', 'Liquor Delivery Services', 'Liquor Establishment Standard', 'Retail Dealer - Food', 'Restaurant Class 2', 'Food Processing', 'Liquor Establishment Extended', 'Wholesale Dealer - Food with Anc. Retail', 'Restaurant Class 1', 'Liquor License Application', 'Warehouse Operator - Food', 'Manufacturer - Food', 'Liquor Retail Store', 'Wholesale Dealer - Food']


In [None]:
restaurants_1 = (licences_df['BusinessType'] == 'Restaurant Class 1')
restaurants_2 = (licences_df['BusinessType'] == 'Restaurant Class 2')
restaurants_3 = (licences_df['BusinessType'] == 'Ltd Service Food Establishment')


licences_rest_df = licences_df.filter(restaurants_1 | restaurants_2 | restaurants_3)

> Use an external dataset such as yelp or zomato to get restuarant cuisine, price range, and other features. 

In [139]:
licences_rest_df.groupBy("Status").count()

Status,count
Cancelled,510
Gone Out of Business,2488
Issued,20666
Inactive,518
Pending,1380


> There are 3693 businesses with missing name values so I will replace these entries with their business names. 

In [216]:
licences_rest_df.filter('BusinessTradeName is null')

3693

In [233]:
licences_rest_df = licences_rest_df.withColumn('BusinessTradeName', coalesce('BusinessTradeName', 'BusinessName'))

In [241]:
licences_rest_df.na.drop(subset = ['BusinessName']).count()

25538

> How many stores have changed owners? (BusinessName is the owner, BusinessTradeName is the name of the shop). There are 5682 different owners and 5150 different restaurants. 

In [240]:
print(licences_rest_df.select('BusinessName').distinct().count())
print(licences_rest_df.select('BusinessTradeName').distinct().count())

5682
5150


In [None]:
tripadvisor API, google maps API

https://developers.zomato.com/api
https://github.com/RapidSoftwareSolutions/Marketplace-Zomato-Package
api key: 99beac6a17fef3d9816ec61eb532bab0

In [93]:
import os

print(os.getenv('ZOMATO_API_KEY'))

None


In [141]:
# categories
params = {'apikey': '99beac6a17fef3d9816ec61eb532bab0',
          'city_id': None}
categories = requests.get('https://developers.zomato.com/api/v2.1/categories', params = params).json()


In [143]:
# cuisines
params = {'apikey': '99beac6a17fef3d9816ec61eb532bab0',
          'city_id': 256}
cuisines = requests.get('https://developers.zomato.com/api/v2.1/cuisines', params = params).json()


In [145]:
# types
params = {'apikey': '99beac6a17fef3d9816ec61eb532bab0',
          'city_id': 256}
types = requests.get('https://developers.zomato.com/api/v2.1/establishments', params = params).json()

In [160]:
# restaurants
params = {'apikey': '99beac6a17fef3d9816ec61eb532bab0',
          'entity_id': 256,
          'entity_type': 'city',
          'start': 40,
          'count': 100}
restaurants = requests.get('https://developers.zomato.com/api/v2.1/search', params = params).json()

In [161]:
restaurants

{'results_found': 8078,
 'results_start': 40,
 'results_shown': 20,
 'restaurants': [{'restaurant': {'R': {'has_menu_status': {'delivery': -1,
      'takeaway': -1},
     'res_id': 16617275},
    'apikey': '99beac6a17fef3d9816ec61eb532bab0',
    'id': '16617275',
    'name': 'Dinesty Dumpling House',
    'url': 'https://www.zomato.com/vancouver/dinesty-dumpling-house-2-central-richmond?utm_source=api_basic_user&utm_medium=api&utm_campaign=v2.1',
    'location': {'address': 'Richport Town Centre, 160-8111 Ackroyd Road, Richmond V6X3J9',
     'locality': 'Richport Town Centre, Richmond',
     'city': 'Richmond',
     'city_id': 256,
     'latitude': '49.1721920000',
     'longitude': '-123.1349620000',
     'zipcode': 'V6X3J9',
     'country_id': 37,
     'locality_verbose': 'Richport Town Centre, Richmond, Richmond'},
    'switch_to_order_menu': 0,
    'cuisines': 'Shanghai, Chinese, Asian',
    'timings': '11 AM to 3 PM, 5:30 PM to 10 PM (Mon-Fri),11 AM to 3 PM, 4:30 PM to 10 PM (Sat-S

## requests.get('https://developers.zomato.com/api/v2.1/geocode?lat=%2249.2827%22&lon=%22123.1207%22', params = params).json()

In [121]:
zomato_request = requests.get('https://developers.zomato.com/api/v2.1/cuisines?city_id=256', params = params)

In [126]:
zomato_request.json()

{'cuisines': [{'cuisine': {'cuisine_id': 1035, 'cuisine_name': 'Afghan'}},
  {'cuisine': {'cuisine_id': 152, 'cuisine_name': 'African'}},
  {'cuisine': {'cuisine_id': 1, 'cuisine_name': 'American'}},
  {'cuisine': {'cuisine_id': 4, 'cuisine_name': 'Arabian'}},
  {'cuisine': {'cuisine_id': 3, 'cuisine_name': 'Asian'}},
  {'cuisine': {'cuisine_id': 401, 'cuisine_name': 'Asian Fusion'}},
  {'cuisine': {'cuisine_id': 131, 'cuisine_name': 'Australian'}},
  {'cuisine': {'cuisine_id': 201, 'cuisine_name': 'Austrian'}},
  {'cuisine': {'cuisine_id': 193, 'cuisine_name': 'BBQ'}},
  {'cuisine': {'cuisine_id': 955, 'cuisine_name': 'Bagels'}},
  {'cuisine': {'cuisine_id': 5, 'cuisine_name': 'Bakery'}},
  {'cuisine': {'cuisine_id': 227, 'cuisine_name': 'Bar Food'}},
  {'cuisine': {'cuisine_id': 132, 'cuisine_name': 'Belgian'}},
  {'cuisine': {'cuisine_id': 270, 'cuisine_name': 'Beverages'}},
  {'cuisine': {'cuisine_id': 159, 'cuisine_name': 'Brazilian'}},
  {'cuisine': {'cuisine_id': 182, 'cuisine_n

In [16]:

schema = StructType([
    StructField('Category', StringType(), True),
    StructField('Count', IntegerType(), True),
    StructField('Description', StringType(), True)
])

# Create data frame
json_file_path = "data/yelp_dataset/yelp_academic_dataset_business.json"
df = spark.read.json(json_file_path)
print(df.schema)
df.show()

StructType(List(StructField(address,StringType,true),StructField(attributes,StructType(List(StructField(AcceptsInsurance,StringType,true),StructField(AgesAllowed,StringType,true),StructField(Alcohol,StringType,true),StructField(Ambience,StringType,true),StructField(BYOB,StringType,true),StructField(BYOBCorkage,StringType,true),StructField(BestNights,StringType,true),StructField(BikeParking,StringType,true),StructField(BusinessAcceptsBitcoin,StringType,true),StructField(BusinessAcceptsCreditCards,StringType,true),StructField(BusinessParking,StringType,true),StructField(ByAppointmentOnly,StringType,true),StructField(Caters,StringType,true),StructField(CoatCheck,StringType,true),StructField(Corkage,StringType,true),StructField(DietaryRestrictions,StringType,true),StructField(DogsAllowed,StringType,true),StructField(DriveThru,StringType,true),StructField(GoodForDancing,StringType,true),StructField(GoodForKids,StringType,true),StructField(GoodForMeal,StringType,true),StructField(HairSpecial

In [44]:
df.filter(df['city'] == 'Toronto')

address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
700 Kipling Avenu...,"[,,,,,,, False,,,...",EosRKXIGeSWFYWwpk...,"Martial Arts, Gym...",Toronto,"[5:30-23:0, 5:30-...",1,43.6245394916,-79.5291079302,Xtreme Couture,M8Z 5G3,16,3.0,ON
306 Yonge Street,"[,,,,,,, True,,, ...",1wWneWD_E1pBIyVpd...,"Shopping, Shoe St...",Toronto,"[10:0-21:0, 10:0-...",1,43.6565424,-79.3813076,Air Jordan Store,M5B 1R4,9,4.0,ON
415 Horner Avenue,"[,,,,,,, False,,,...",007Dg4ESDVacWcC4V...,"Shopping, Food, O...",Toronto,"[9:0-16:0, 9:0-16...",0,43.603232,-79.5384244,Front Door Organics,M8W 4W3,8,4.0,ON
843 Kipling Avenue,"[,, u'none', {'ro...",rVBPQdeayMYht4Uv_...,"Restaurants, Burg...",Toronto,,0,43.6332914,-79.5317683,Gourmet Burger Co...,M8Z 5G9,13,3.0,ON
1900 Eglintion Ave E,"[,,,,,,,,,, {'gar...",9JCjKd6eFXsAMVwou...,Cosmetics & Beaut...,Toronto,,1,43.7271887,-79.2930079,Bath and Body Works,M1L 2L9,7,3.5,ON
688 Gerrard Street E,"[,, u'beer_and_wi...",0QjROMVW9ACKjhSEf...,"Vietnamese, Resta...",Toronto,"[11:0-22:0, 11:0-...",1,43.6663763388,-79.3487726589,Mi Mi Restaurant,M4M 1Y3,116,4.0,ON
510 Coronation Dr...,"[,,,,,,,,,,,,,,,,...",OT-8IUWo_2M-rHddj...,Event Planning & ...,Toronto,"[, 9:0-17:0,,,,,]",1,43.7652657335,-79.1669769712,Equipment Sales a...,M1E 4X6,3,1.5,ON
143 Sheppard Aven...,"[,,,,,,, True,,, ...",umDBj-8WUNkNBODa6...,"Health & Medical,...",Toronto,"[10:0-20:0, 0:0-0...",1,43.7596753,-79.4181223,Natural Scents,M2N 1M7,8,4.5,ON
198 Spadina Avenue,"[,,,,,,,,,,,,,,,,...",rZSS1JzizAKTIWXxU...,Fitness & Instruc...,Toronto,"[17:0-21:0, 18:0-...",0,43.6502316,-79.3972233,Shaolin Temple Qu...,M5T 2C2,3,5.0,ON
287 College Street,,53Q2c9qMLEjD9r1wM...,"Shopping, Computers",Toronto,,1,43.6576212,-79.4008849,CJ Laptop Service...,M5T 1S2,4,4.0,ON
