In [5]:
import numpy as np
import pandas as pd
import re
import requests
import os
import time
import pickle
import json
import matplotlib.pyplot as plt
from datetime import datetime
from geopy.distance import geodesic

from matplotlib import pyplot as plt
from pyspark.sql import SparkSession

In [6]:
from pyspark.sql.functions import coalesce, max

In [7]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import array, col, count, mean, sum, udf, when, last, first, regexp_extract
from pyspark.sql.types import DoubleType, IntegerType, StringType, Row
from pyspark.sql.types import *

In [8]:
# Start (or reuse) the Spark session used throughout the notebook.
spark = SparkSession.builder.appName('Ops').getOrCreate()

# allows for cleaner output: a DataFrame that is the last expression of a cell
# is rendered as a table
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# The licence export is ';'-separated with a header row; column types are inferred.
licences_df = (spark.read
          .option('header','true')
          .option('inferSchema','true')
          .option('sep', ';')
          .csv('../data/business-licences.csv'))

# (original column name, snake_case name) pairs, applied in order.
COLUMN_RENAMES = [
    ('FOLDERYEAR', 'year'),
    ('LicenceRSN', 'licence_rsn'),
    ('LicenceNumber', 'licence_num'),
    ('LicenceRevisionNumber', 'licence_revision_num'),
    ('BusinessName', 'owner_name'),
    ('BusinessTradeName', 'name'),
    ('Status', 'status'),
    ('IssuedDate', 'issued_date'),
    ('ExpiredDate', 'expired_date'),
    ('BusinessType', 'type'),
    ('BusinessSubType', 'subtype'),
    ('Unit', 'unit'),
    ('UnitType', 'unit_type'),
    ('House', 'house'),
    ('Street', 'street'),
    ('City', 'city'),
    ('Province', 'province'),
    ('Country', 'country'),
    ('PostalCode', 'postal_code'),
    ('LocalArea', 'area'),
    ('NumberofEmployees', 'n_employees'),
    ('FeePaid', 'fee'),
    ('ExtractDate', 'extract_date'),
    ('Geom', 'geom'),
]
for source_name, target_name in COLUMN_RENAMES:
    licences_df = licences_df.withColumnRenamed(source_name, target_name)

# Licence type / subtype combinations that are kept as "restaurants".
rest_1 = (licences_df.type == 'Restaurant Class 1')
rest_2 = (licences_df.type == 'Restaurant Class 2')
rest_3 = (licences_df.type == 'Ltd Service Food Establishment')
rest_4 = (licences_df.type == 'Temp Liquor Licence Amendment') & (licences_df.subtype == 'Area Extension')
rest_5 = (licences_df.type == 'Liquor Establishment Standard') & (licences_df.subtype == 'Class 1  0-65 Seats')
rest_6 = (licences_df.type == 'Liquor Establishment Extended') & (licences_df.subtype == 'Class 1  0-65 Seats')

# Raw string: '\[' and '\]' are regex escapes, not Python escapes. The previous
# non-raw literal relied on Python passing invalid escape sequences through,
# which newer Python versions warn about.
GEOM_COORDS_PATTERN = r'\[(.+), (.+)\]'

licences_rest_df = (
    licences_df
    .filter(rest_1 | rest_2 | rest_3 | rest_4 | rest_5 | rest_6)
    # fall back to the owner name when the trade name is missing
    .withColumn('name', coalesce('name', 'owner_name'))
    .dropna(subset = ['owner_name'])
    # create new columns for longitude and latitude: capture group 1 of the
    # bracketed pair in `geom` becomes longitude, group 2 becomes latitude
    .withColumn('longitude', regexp_extract(col('geom'), GEOM_COORDS_PATTERN, 1).cast('float'))
    .withColumn('latitude', regexp_extract(col('geom'), GEOM_COORDS_PATTERN, 2).cast('float'))
    .drop('geom')
)

# rest_distinct_df = licences_rest_df.select(['year', 'status', 
#                                             'name', 'longitude', 'latitude', 
#                                             'postal_code', 'unit', 'house', 
#                                             'street', 'city']).distinct()
# rest_distinct_df = rest_distinct_df.dropna(how = 'all', subset = ['longitude', 'latitude', 'house', 'street'])
# rest_distinct_df = rest_distinct_df.withColumn('status', 
#                             when(col('status') == 'Inactive', 0)
#                             .when(col('status') == 'Cancelled', 1)
#                             .when(col('status') == 'Gone Out of Business', 2)
#                             .when(col('status') == 'Pending', 3)
#                             .when(col('status') == 'Issued', 4))

# latest_year = rest_distinct_df.sort(['year', 'status'])\
#                               .groupBy('name')\
#                               .agg(last('status').alias('status'), 
#                                    last('year').alias('year'))

# rest_distinct_df = rest_distinct_df.join(latest_year, on = ['status', 'year', 'name'], how = 'leftsemi')

In [9]:
# Preview the filtered licence frame (last expression of the cell, so it is displayed).
licences_rest_df

year,licence_rsn,licence_num,licence_revision_num,owner_name,name,status,issued_date,expired_date,type,subtype,unit,unit_type,house,street,city,province,country,postal_code,area,n_employees,fee,extract_date,longitude,latitude
14,2146519,14-134109,0,Penner & Associat...,Penner & Associat...,Gone Out of Business,2014-01-10 14:29:53,2014-12-31 00:00:00,Ltd Service Food ...,,,,159,W HASTINGS ST,Vancouver,BC,CA,V6B 1H4,Downtown,1.0,130,2019-07-21 13:49:12,-123.10915,49.282604
15,2610766,15-309912,0,Tuck Shoppe Inc,The Tuck Shoppe,Issued,2016-01-07 12:09:20,2015-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,237,UNION ST,Vancouver,BC,CA,V6A 2B2,Strathcona,6.0,133,2019-07-21 13:49:21,-123.09864,49.277897
15,2610959,15-310101,0,Thi Kim Trang Le ...,Pho Win Vietnames...,Pending,,,Ltd Service Food ...,,,,2138,E HASTINGS ST,Vancouver,BC,CA,V5L 1V1,Grandview-Woodland,2.0,52,2019-07-21 13:49:21,-123.060555,49.280895
15,2610959,15-310101,0,Thi Kim Trang Le ...,Pho Win Vietnames...,Pending,,,Ltd Service Food ...,,,,2690,MCGILL ST,Vancouver,BC,CA,V5K 1H3,Grandview-Woodland,2.0,52,2019-07-21 13:49:21,-123.04977,49.288372
15,2611426,15-310562,0,Happy 8 Healthier...,Happy 8 Healthier...,Issued,2015-11-10 14:14:15,2015-12-31 00:00:00,Ltd Service Food ...,,110.0,Unit,1610,ROBSON ST,Vancouver,BC,CA,V6G 1C7,West End,0.0,92,2019-07-21 13:49:21,-123.132935,49.289455
16,2612012,16-151523,0,0980081 BC Ltd,Forum Gold Court,Gone Out of Business,2016-01-11 10:08:02,2016-12-31 00:00:00,Restaurant Class 1,No Liquor Service,,,5155,VICTORIA DRIVE,Vancouver,BC,CA,V5P 3V1,Kensington-Cedar ...,0.0,714,2019-07-21 13:49:21,-123.065796,49.237865
16,2612180,16-151536,0,Papillon Cafe & B...,Papillon Cafe & B...,Inactive,2015-10-30 15:46:48,2016-12-31 00:00:00,Restaurant Class 1,No Liquor Service,,,3403,W BROADWAY,Vancouver,BC,CA,V6R 2B4,Kitsilano,7.0,714,2019-07-21 13:49:21,-123.18019,49.264526
15,2616490,15-315321,0,BS4U Japanese Res...,Broadway Station ...,Issued,2015-12-02 09:28:39,2015-12-31 00:00:00,Restaurant Class 1,With Liquor Service,101.0,Unit,1638,E BROADWAY,Vancouver,BC,CA,V5N 1W1,Kensington-Cedar ...,7.0,234,2019-07-21 13:49:21,-123.07069,49.262005
15,2616789,15-315500,0,CBH Enterprise Ltd,Waves Coffee Hous...,Issued,2015-11-23 14:38:10,2015-12-31 00:00:00,Ltd Service Food ...,,,,590,BEATTY ST,Vancouver,BC,CA,NOT APPLIC,Downtown,0.0,133,2019-07-21 13:49:21,-123.11008,49.27979
15,2617227,15-181071,1,Hongdae Chicken E...,Hongdae Pocha Cafe,Issued,2015-11-05 09:11:59,2015-12-31 00:00:00,Restaurant Class 1,With Liquor Service,,,1642,ROBSON ST,Vancouver,BC,CA,V6G 1C7,West End,6.0,10,2019-07-21 13:49:21,-123.13331,49.289722


In [10]:
# Load the previously cleaned dataset from its pickle snapshot.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open('../data/data_cleaned_2020-08-26.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [19]:
# Drop the "cord" column. errors="ignore" keeps this self-referential cell
# idempotent: re-running it after the column is already gone is a no-op
# instead of raising KeyError.
data = data.drop(columns = "cord", errors = "ignore")

In [21]:
# Build a Spark DataFrame from `data` so it can be joined with the licence frame.
rest_features = spark.createDataFrame(data = data)

In [30]:
# Preview the Spark version of the feature table (last expression of the cell, so it is displayed).
rest_features

name,name_yelp,postal_code,unit,house,street,categories,is_claimed,is_closed,rating,review_count,price,city,distance_from_bike,distance_from_meter,avg_meter_price,is_chain,density
George Simnos & J...,Bows & Arrows,V5V 4E9,,4153,FRASER ST,"[Coffee & Tea, Br...",True,True,3.5,48.0,2.0,Vancouver,484.2691754472128,737.8257296024146,3.5,False,45
Cilantro Cafe,Cilantro Cafe,V6Z 2L2,110.0,840,HOWE ST,"[Cafes, Breakfast...",False,False,5.0,1.0,,Vancouver,139.56780655131266,20.701937073581217,5.0,False,335
O-Bowl,O-bowl By Rustic ...,V6K 2H3,,3132,W BROADWAY,[Asian Fusion],True,True,3.5,5.0,2.0,Vancouver,67.98454403785654,20.196598532042703,2.0,False,84
Swiss Chalet Roti...,Swiss Chalet Roti...,V6K 2H4,,3204,W BROADWAY,"[Comfort Food, Am...",True,True,2.5,40.0,2.0,Vancouver,125.78171845506904,17.6601663847749,2.0,False,79
Rouge Bistro Inc,Rouge Bistro,V5Y 1B1,,91,W 2ND AV,[American (Tradit...,True,True,4.0,36.0,1.0,Vancouver,89.99906972522209,22.75352193411247,1.0,False,49
Old Xian's Food,Old Xian's Food,V5R 5L7,,3510,KINGSWAY,[Chinese],True,True,3.5,45.0,2.0,Vancouver,245.88966413022936,339.0376057014121,1.0,False,65
Cornerstone Coffee,Cornerstone Coffee,V6K 1P7,,2698,W 4TH AV,[Coffee & Tea],False,True,3.0,34.0,2.0,Vancouver,92.0675647037376,23.382429963619824,1.0,False,60
Buttermere Patiss...,Buttermere Café,V6A 2V3,,636,MAIN ST,[Patisserie/Cake ...,False,False,4.0,12.0,,Vancouver,33.1631618333977,19.77814909942711,3.0,False,114
Grillzilla Bistro,Victoria Restaurant,V5P 3X7,,6482,VICTORIA DRIVE,[Breakfast & Brun...,True,True,3.5,19.0,1.0,Vancouver,241.93793838832,1821.8254537861617,1.0,False,15
Bob Likes Thai Fo...,Bob Likes Thai Food,V5V 3N8,,3755,MAIN ST,[Thai],True,False,3.5,162.0,2.0,Vancouver,32.31496423025161,25.71638365841581,1.0,False,63


In [39]:

# Left join: every licence row is kept and the feature columns are attached
# where name and address fields match. The result is only displayed, not stored.
# NOTE(review): `unit` is not a join key and exists in both frames, so the
# joined result carries two `unit` columns.
licences_rest_df.join(
    other = rest_features,
    on = ['name', 'postal_code', 'house', 'street', 'city'],
    how = 'left',
)

name,postal_code,house,street,city,year,licence_rsn,licence_num,licence_revision_num,owner_name,status,issued_date,expired_date,type,subtype,unit,unit_type,province,country,area,n_employees,fee,extract_date,longitude,latitude,name_yelp,unit.1,categories,is_claimed,is_closed,rating,review_count,price,distance_from_bike,distance_from_meter,avg_meter_price,is_chain,density
A&W Restaurant (7...,V6Z 1A1,778,ROBSON ST,Vancouver,16,2604146,16-145308,0,Fast Urban Foods Inc,Issued,2015-11-05 12:03:07,2016-12-31 00:00:00,Restaurant Class 1,No Liquor Service,,,BC,CA,Downtown,16.0,714.0,2019-07-21 13:49:20,-123.12095,49.281548,A&W,,"[Fast Food, Burgers]",True,False,2.5,27.0,1.0,135.5136388303206,23.458617408143432,5.0,False,352.0
A&W Restaurant (7...,V6Z 1A1,778,ROBSON ST,Vancouver,15,2375592,15-145854,0,Fast Urban Foods Inc,Issued,2014-11-07 15:43:47,2015-12-31 00:00:00,Restaurant Class 1,No Liquor Service,,,BC,CA,Downtown,16.0,700.0,2019-07-21 13:49:16,-123.12095,49.281548,A&W,,"[Fast Food, Burgers]",True,False,2.5,27.0,1.0,135.5136388303206,23.458617408143432,5.0,False,352.0
A&W Restaurant (7...,V6Z 1A1,778,ROBSON ST,Vancouver,14,2158137,14-145726,0,Fast Urban Foods Inc,Issued,2013-11-21 17:26:06,2014-12-31 00:00:00,Restaurant Class 1,No Liquor Service,,,BC,CA,Downtown,14.0,686.0,2019-07-21 13:49:12,-123.12095,49.281548,A&W,,"[Fast Food, Burgers]",True,False,2.5,27.0,1.0,135.5136388303206,23.458617408143432,5.0,False,352.0
A&W Restaurant (7...,V6Z 1A1,778,ROBSON ST,Vancouver,17,2819990,17-143667,0,Fast Urban Foods Inc,Issued,2016-11-16 16:07:03,2017-12-31 00:00:00,Restaurant Class 1,No Liquor Service,,,BC,CA,Downtown,16.0,728.0,2020-01-01 02:32:04,-123.12095,49.281548,A&W,,"[Fast Food, Burgers]",True,False,2.5,27.0,1.0,135.5136388303206,23.458617408143432,5.0,False,352.0
A&W Restaurant (7...,V6Z 1A1,778,ROBSON ST,Vancouver,13,1817007,13-197588,0,Fast Urban Foods Inc,Issued,2012-12-04 13:38:05,2013-12-31 00:00:00,Restaurant Class 1,No Liquor Service,,,BC,CA,Downtown,14.0,679.0,2019-07-21 13:49:08,-123.12095,49.281548,A&W,,"[Fast Food, Burgers]",True,False,2.5,27.0,1.0,135.5136388303206,23.458617408143432,5.0,False,352.0
A&W Restaurant (7...,V6Z 1A1,778,ROBSON ST,Vancouver,19,3273995,19-148235,0,Fast Urban Foods Inc,Issued,2018-11-24 00:35:46,2019-12-31 00:00:00,Restaurant Class 1,No Liquor Service,,,BC,CA,Downtown,16.0,787.0,2020-04-01 02:32:25,-123.12095,49.281548,A&W,,"[Fast Food, Burgers]",True,False,2.5,27.0,1.0,135.5136388303206,23.458617408143432,5.0,False,352.0
A&W Restaurant (7...,V6Z 1A1,778,ROBSON ST,Vancouver,18,3040045,18-471714,0,Fast Urban Foods Inc,Issued,2017-11-07 13:59:35,2018-12-31 00:00:00,Restaurant Class 1,No Liquor Service,,,BC,CA,Downtown,16.0,757.0,2020-04-01 02:32:19,-123.12095,49.281548,A&W,,"[Fast Food, Burgers]",True,False,2.5,27.0,1.0,135.5136388303206,23.458617408143432,5.0,False,352.0
A&W Restaurant (7...,V6Z 1A1,778,ROBSON ST,Vancouver,20,3503656,20-150415,0,Fast Urban Foods Inc,Issued,2019-12-23 19:39:37,2020-12-31 00:00:00,Restaurant Class 1,No Liquor Service,,,BC,CA,Downtown,16.0,807.0,2020-05-01 00:08:43,-123.12095,49.281548,A&W,,"[Fast Food, Burgers]",True,False,2.5,27.0,1.0,135.5136388303206,23.458617408143432,5.0,False,352.0
Addis Cafe,V5N 4B1,2017,COMMERCIAL DRIVE,Vancouver,17,2801084,17-124787,0,Fitsum Alemu Tari...,Issued,2016-12-24 17:57:21,2017-12-31 00:00:00,Ltd Service Food ...,,,,BC,CA,Grandview-Woodland,2.0,503.0,2020-01-01 02:32:03,-123.07007,49.26663,Addis Cafe Ethiop...,,"[Ethiopian, Veget...",True,False,4.5,29.0,2.0,254.21374987351172,37.58309347043261,1.5,False,84.0
Addis Cafe,V5N 4B1,2017,COMMERCIAL DRIVE,Vancouver,16,2669019,16-126212,1,Fitsum Alemu Tari...,Cancelled,,,Restaurant Class 1,With Liquor Service,,,BC,CA,Grandview-Woodland,0.0,10.0,2019-07-21 13:49:21,-123.07007,49.26663,Addis Cafe Ethiop...,,"[Ethiopian, Veget...",True,False,4.5,29.0,2.0,254.21374987351172,37.58309347043261,1.5,False,84.0


In [None]:
# sort by 
# TODO: what about cafes that close and later reopen?

## Merging Back to the Original Dataframe

> Here I will create a PySpark dataframe so that it can eventually be joined back to the `rest_distinct_df` dataframe.

In [6]:
# Convert the Spark licence frame to pandas for the pandas merge that follows.
# The name is rebound to a different type, so guard the conversion: without
# the check, re-running this cell fails because a pandas DataFrame has no
# `toPandas` method.
# NOTE(review): `licences_rest_df` must already exist in the kernel; run the
# cell that builds it first.
if not isinstance(licences_rest_df, pd.DataFrame):
    licences_rest_df = licences_rest_df.toPandas()

NameError: name 'licences_rest_df' is not defined

In [1068]:
# Left merge: every row of `data_clean` is kept and licence columns are
# attached where all key columns match.
# NOTE(review): `data_clean` is not defined in the visible cells; confirm where it comes from.
merge_keys = ['name', 'postal_code', 'unit', 'house', 'street']
licences_rest_info = pd.merge(
    left = data_clean,
    right = licences_rest_df,
    how = 'left',
    on = merge_keys,
)

> Quantify the similarity of restaurant cuisines to see how similar restaurants in close proximity perform.