# Exploratory Data Analysis

In this EDA, I will be examining various data sources from [opendata.vancouver.ca](https://opendata.vancouver.ca/pages/home/).

In [32]:
import numpy as np
import pandas as pd
import re

from matplotlib import pyplot as plt
from pyspark.sql import SparkSession

In [230]:
from pyspark.sql.functions import coalesce

In [3]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import array, col, count, mean, sum, udf, when
from pyspark.sql.types import DoubleType, IntegerType, StringType, Row
from pyspark.sql.functions import sum, col, udf
from pyspark.sql.types import *

In [4]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('Ops')
    .getOrCreate()
)

In [5]:
# Turn on eager evaluation so that a DataFrame left as the last expression of a
# cell is rendered as a table of its rows (allows for cleaner output).
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [6]:
# Load the business licences extract: a semicolon-delimited CSV with a header
# row; column types are inferred by Spark rather than declared up front.
csv_reader = spark.read.options(header='true', inferSchema='true', sep=';')
licences_df = csv_reader.csv('data/business-licences.csv')

In [120]:
# Total number of rows loaded from the licences CSV.
licences_df.count()

443158

> There are 443 158 observations in this dataset. 

In [7]:
# Print the column names and the types Spark inferred while reading the CSV.
licences_df.printSchema()

root
 |-- FOLDERYEAR: integer (nullable = true)
 |-- LicenceRSN: integer (nullable = true)
 |-- LicenceNumber: string (nullable = true)
 |-- LicenceRevisionNumber: integer (nullable = true)
 |-- BusinessName: string (nullable = true)
 |-- BusinessTradeName: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- IssuedDate: timestamp (nullable = true)
 |-- ExpiredDate: timestamp (nullable = true)
 |-- BusinessType: string (nullable = true)
 |-- BusinessSubType: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- House: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Province: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- PostalCode: string (nullable = true)
 |-- LocalArea: string (nullable = true)
 |-- NumberofEmployees: double (nullable = true)
 |-- FeePaid: integer (nullable = true)
 |-- ExtractDate: timestamp (nullable = true)
 |-- Geom: st

In [19]:
# Leaving the DataFrame as the cell's last expression renders a preview of its
# rows (eager evaluation is enabled on the session).
licences_df

FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,BusinessSubType,Unit,UnitType,House,Street,City,Province,Country,PostalCode,LocalArea,NumberofEmployees,FeePaid,ExtractDate,Geom
15,2380055,15-150315,0,Brandi Nicole Eue...,Spa Haven Boutiqu...,Gone Out of Business,,,Therapeutic Touch...,,,,,,Vancouver,BC,CA,,Kensington-Cedar ...,1.0,,2019-07-21 13:49:17,
15,2380056,15-150316,0,Kelly Ashley Bake...,,Pending,,,Therapeutic Touch...,,,,,,Vancouver,BC,CA,,Kitsilano,1.0,,2019-07-21 13:49:17,
15,2380058,15-150318,0,Heart To Mind Cra...,Heart To Mind,Issued,2014-12-20 10:39:10,2015-12-31 00:00:00,Therapeutic Touch...,,,,,,Vancouver,BC,CA,,Kerrisdale,1.0,133.0,2019-07-21 13:49:17,
15,2380062,15-150322,0,(Donna Sam),,Issued,2014-12-02 15:12:32,2015-12-31 00:00:00,Therapeutic Touch...,,,,,,Vancouver,BC,CA,,South Cambie,0.0,155.0,2019-07-21 13:49:17,
15,2380065,15-150325,0,James Joseph Thom...,James Thompson RST,Gone Out of Business,,,Therapeutic Touch...,,,,,,Vancouver,BC,CA,,Grandview-Woodland,0.0,,2019-07-21 13:49:17,
15,2380066,15-150326,0,(Bunchu Praichit),Bunchu Praichit T...,Issued,2014-12-09 21:26:32,2015-12-31 00:00:00,Therapeutic Touch...,,,,,,Vancouver,BC,CA,,Kensington-Cedar ...,0.0,133.0,2019-07-21 13:49:17,
15,2380078,15-150338,0,Tourland Travel Ltd,,Issued,2014-11-24 10:30:17,2015-12-31 00:00:00,Travel Agent,,7.0,Unit,900.0,W GEORGIA ST,Vancouver,BC,CA,V6C 2W6,Downtown,2.0,133.0,2019-07-21 13:49:17,"""{""""type"""": """"Poi..."
15,2380079,15-150339,0,Jetway Travel Inc,,Issued,2015-02-27 14:16:59,2015-12-31 00:00:00,Travel Agent,,,,1796.0,RENFREW ST,Vancouver,BC,CA,V5M 3H8,Hastings-Sunrise,2.0,173.0,2019-07-21 13:49:17,"""{""""type"""": """"Poi..."
15,2380080,15-150340,0,Skyline Connectio...,,Issued,2014-12-29 11:34:29,2015-12-31 00:00:00,Travel Agent,,,,5318.0,VICTORIA DRIVE,Vancouver,BC,CA,V5P 3V7,Kensington-Cedar ...,2.0,133.0,2019-07-21 13:49:17,"""{""""type"""": """"Poi..."
15,2380082,15-150342,0,The Flight Shops Inc,Flight Centre,Issued,2014-12-03 09:37:03,2015-12-31 00:00:00,Travel Agent,,,,1232.0,DAVIE ST,Vancouver,BC,CA,V6E 1N3,West End,4.0,133.0,2019-07-21 13:49:17,"""{""""type"""": """"Poi..."


In [9]:
# Number of licence records per folder year.
(licences_df
    .groupby('FOLDERYEAR')
    .count())

FOLDERYEAR,count
,1
13.0,60915
16.0,61394
20.0,62366
94.0,1
96.0,11
19.0,70771
15.0,60938
14.0,60581
18.0,66180


In [26]:
# Number of distinct business types. Printed explicitly: a notebook cell only
# displays its last expression, so a bare call here would run a Spark job and
# silently drop the result.
print(licences_df.select('BusinessType').distinct().count())

# Number of licence records per business type.
licences_df.groupBy('BusinessType').count()

BusinessType,count
Referral Services,1835
Janitorial Services,4366
Financial Institu...,1480
Public Market Ope...,20
Non-profit Housing,1674
Motel,7
Printing Services,1010
Retail Dealer - M...,74
Contractor,27394
Private Hospital,53


In [123]:
# Collect every distinct business type to the driver as a list of Rows.
business_types = licences_df.select('BusinessType').distinct().collect()
# Iterate over the Rows directly and skip a null BusinessType: re.match raises
# TypeError when handed None.
business_list = [row.BusinessType for row in business_types
                 if row.BusinessType is not None]

# Keep the business types that mention food, restaurants or liquor.
r = re.compile(".*[Ff]ood.*|.*[Rr]estaurant.*|.*[Ll]iquor.*")
newlist = list(filter(r.match, business_list))
print(newlist)

['Manufacturer - Food with Anc. Retail', 'Ltd Service Food Establishment', 'Temp Liquor Licence Amendment', 'Liquor Delivery Services', 'Liquor Establishment Standard', 'Retail Dealer - Food', 'Restaurant Class 2', 'Food Processing', 'Liquor Establishment Extended', 'Wholesale Dealer - Food with Anc. Retail', 'Restaurant Class 1', 'Liquor License Application', 'Warehouse Operator - Food', 'Manufacturer - Food', 'Liquor Retail Store', 'Wholesale Dealer - Food']


In [None]:
# The three licence categories treated as restaurants in this analysis.
business_type = licences_df['BusinessType']
restaurants_1 = business_type == 'Restaurant Class 1'
restaurants_2 = business_type == 'Restaurant Class 2'
restaurants_3 = business_type == 'Ltd Service Food Establishment'

# Keep only the rows whose business type is one of those categories.
is_restaurant = restaurants_1 | restaurants_2 | restaurants_3
licences_rest_df = licences_df.where(is_restaurant)

> Use an external dataset such as Yelp or Zomato to get restaurant cuisine, price range, and other features.

In [139]:
# Number of restaurant licence records in each status.
(licences_rest_df
    .groupby("Status")
    .count())

Status,count
Cancelled,510
Gone Out of Business,2488
Issued,20666
Inactive,518
Pending,1380


> There are 3693 businesses with a missing trade name (`BusinessTradeName`), so I will replace these entries with their business name (`BusinessName`).

In [216]:
# Count the restaurant records that have no trade name. Without .count() the
# cell would display the matching rows rather than how many there are.
licences_rest_df.filter('BusinessTradeName is null').count()

3693

In [233]:
# Fall back to the business name wherever the trade name is null.
trade_name_or_business_name = coalesce(col('BusinessTradeName'), col('BusinessName'))
licences_rest_df = licences_rest_df.withColumn('BusinessTradeName', trade_name_or_business_name)

In [241]:
# Number of restaurant records that have a non-null business name.
licences_rest_df.dropna(subset=['BusinessName']).count()

25538

> How many stores have changed owners? (BusinessName is the owner, BusinessTradeName is the name of the shop). There are 5682 different owners and 5150 different restaurants. 

In [240]:
# Print the number of distinct values for the owner name and then the shop name.
# NOTE: a null value, if present, is counted as one distinct entry.
for column_name in ('BusinessName', 'BusinessTradeName'):
    print(licences_rest_df.select(column_name).distinct().count())

5682
5150
