In [3]:
# Load all required libraries for the notebook, including data package
if(!require("opendatatoronto")) {
    install.packages("opendatatoronto")
    library(opendatatoronto) # data source
}
if(!require("tidyjson")) {
    install.packages("tidyjson")
    library(tidyjson)
}
if(!require("tidygeocoder")) {
    install.packages("tidygeocoder")
    library(tidygeocoder)
}
if(!require("sf")) {
    install.packages("sf")
    library(sf)
}

if(!require("ggspatial")) {
    install.packages("ggspatial")
    library(ggspatial)
}
if(!require("stringr")) {
    install.packages("stringr")
    library(stringr)
}
if(!require("dplyr")) {
    install.packages("dplyr")
    library(dplyr)
}
if(!require("ggplot2")) {
    install.packages("ggplot2")
    library(ggplot2)
}
if(!require("tidyverse")) {
    install.packages("tidyverse")
    library(tidyverse)
}
if(!require("FNN")) {
    install.packages("FNN")
    library(FNN)
}

if(!require("mapview")) {
    install.packages("mapview")
    library(mapview)
}

Loading required package: opendatatoronto



Loading required package: tidyjson


Attaching package: 'tidyjson'


The following object is masked from 'package:stats':

    filter


Loading required package: tidygeocoder

Loading required package: sf

Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE

Loading required package: ggspatial

Loading required package: stringr

Loading required package: dplyr


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Loading required package: ggplot2

Loading required package: tidyverse

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mpurrr    [39m 1.0.2     [32m✔[39m [34mtidyr    [39m 1.3.0
── [1mConflicts

In [4]:
# Get Data - Traffic
# output Data Description:
# Dataframe with all intersection and daily count ( peak 4 hours), including lng / lat

# # ? and todo:
# # is separate street name needed
# # direction of the street to be determined. How?


# # package_traffic <- show_package("traffic-volumes-at-intersections-for-all-modes")

# # get all resources for this package
# resources <- list_package_resources("traffic-volumes-at-intersections-for-all-modes")

# # identify datastore resources; by default, Toronto Open Data sets datastore resource format to CSV for non-geospatial and GeoJSON for geospatial resources
# datastore_resources <- filter(resources, tolower(format) %in% c("csv", "geojson"))

# # load data # The method of loading data directly using turns out to be insufficient as the get_resource() only returns first 32000 rows of record. 
# location <- filter(datastore_resources, row_number() == 1) %>% get_resource()
# traffic1 <- filter(datastore_resources, row_number() == 3) %>% get_resource()
# traffic2 <- filter(datastore_resources, row_number() == 4) %>% get_resource()
# traffic3 <- filter(datastore_resources, row_number() == 5) %>% get_resource()
# traffic4 <- filter(datastore_resources, row_number() == 6) %>% get_resource()
# traffic5 <- filter(datastore_resources, row_number() == 7) %>% get_resource()

# To Run the code download the raw files from 
# https://open.toronto.ca/dataset/traffic-volumes-at-intersections-for-all-modes/ 
# and save the files in .csv format to the path: ../Data/Toronto/Traffic, 7 Files below. 
# count_metadata.csv
# locations.csv
# raw-data-1980-1989.csv
# raw-data-1990-1999.csv
# raw-data-2000-2009.csv
# raw-data-2010-2019.csv
# raw-data-2020-2029.csv
location <- read.csv("../Data/Toronto/Traffic/locations.csv") # ensure these path are correct if you have to download the files manually. 
traffic1 <- read.csv("../Data/Toronto/Traffic/raw-data-1980-1989.csv")
traffic2 <- read.csv("../Data/Toronto/Traffic/raw-data-1990-1999.csv")
traffic3 <- read.csv("../Data/Toronto/Traffic/raw-data-2000-2009.csv")
traffic4 <- read.csv("../Data/Toronto/Traffic/raw-data-2010-2019.csv")
traffic5 <- read.csv("../Data/Toronto/Traffic/raw-data-2020-2029.csv")
all_traffic = bind_rows(traffic1,traffic2,traffic3,traffic4,traffic5) # combine all raw data into one data frame

In [249]:
# clean and transform load - Traffic Data
# Output data for modelling CleanTraffic
# define parameters for cleaning
peakhours <- 4 # number of peak hours of data per day. value should be between 1 and 10

# get full intersection volume for each intersection based on peak hours of each day. 
# get average vol per intersection per year. 
# get number of years list, sort from low to high

# 

CleanTraffic <- all_traffic %>%
  filter(centreline_type == 2) %>% # only need intersection data
  mutate(counthour = str_extract(time_start, "(?<=T)(\\d+)(?=\\:)")) %>% # extract hour
  mutate(total_int_traffic = sb_cars_r + sb_cars_t + sb_cars_l +
    nb_cars_r + nb_cars_t + nb_cars_l + wb_cars_r + wb_cars_t +
    wb_cars_l + eb_cars_r + eb_cars_t + eb_cars_l) %>% # get total sum
  select(one_of(c(
    "count_date", "location_id", "location", "lng", "lat", "counthour",
    "total_int_traffic"
  ))) %>% # remove raw attributes, retain aggregate only
  group_by(across(all_of(c("count_date", "location_id", "location", "lng", "lat", "counthour")))) %>%
  summarise_at("total_int_traffic",sum) %>% # agregate hourly volume
  group_by(across(all_of(c("count_date", "location_id", "location", "lng", "lat")))) %>%
  slice_max(order_by = total_int_traffic, n = peakhours) %>% # filter top peak hour volume
  group_by(across(all_of(c("count_date", "location_id", "location", "lng", "lat")))) %>%
  summarise_at("total_int_traffic", sum)%>% # aggregate daily peak hour volume
  mutate(count_date= as.Date(count_date, format("%Y-%m-%d"))) %>% 
  mutate(year = as.numeric(format(count_date,'%Y'))) %>% #add year
  group_by(across(all_of(c("year", "location_id", "location", "lat","lng")))) %>% # group by year to get the average per intersection per year
  summarise(AvgTotal = mean(total_int_traffic), .groups = "drop")  # average traffic volume for that year and location
traffic_years <- unique(CleanTraffic$year) # get number of years list, sort from low to high

In [250]:
# Get Green P Parking package from Open Data-Toronto
package <- show_package("b66466c3-69c8-4825-9c8b-04b270069193")

data=as.data.frame('Green P Parking')  # read the dataset Green P Parking in the package 
data<-show_package(package)

resources<-list_package_resources(package)

# Identify resources
data_resources <- filter(resources, tolower(format) %in% c("csv", "json"))

# Load Green P Parking 2019 data
data <- filter(data_resources, row_number() == 1) %>% get_resource()

# Extract required columns from main data
data<- data.frame(address=data$carparks$address,
                                     lat=data$carparks$lat,
                                     lng=data$carparks$lng,
                                      carpark_type=data$carparks$carpark_type_str,
                                      rate_half_hr=data$carparks$rate_half_hour,
                                      capacity=data$carparks$capacity,
                                      rate=data$carparks$rate_half_hour
                  )

# Check class of each attribute
sapply(data, class) 

# Convert char to numeric class
data$lat<-as.numeric(data$lat)
data$lng<-as.numeric(data$lng)
data$rate_half_hr<-as.numeric(data$rate_half_hr)
data$capacity<-as.numeric(data$capacity)

sapply(data, class) 

# Check for missing values
any(is.na(data))

# Extract street name from address
data <- data %>%
  mutate(extracted_address = str_replace_all(data$address, "\\(.*?\\)",""))

data$extracted_address<-str_replace_all(data$extracted_address, "-.*", "")
data$extracted_address<-str_replace_all(data$extracted_address, ",.*", "")

# Extract data with carpark_type as 'Surface'
data <- data %>%
  filter(carpark_type == "Surface")



In [251]:
# Extracting address, latitude, and longitude
data <- data[, c("address", "lat", "lng")]

# Adding the new rows
new_rows <- data.frame(
  address = c(
    "365 Lippincott Street",
    "35 Erindale Avenue",
    "14 Arundel Avenue",
    "265 Armadale Avenue",
    "1612 Danforth Avenue",
    "117 Hammersmith Avenue",
    "166 Woodbine Ave",
    "19 Spadina Road",
    "2300 Lake shore Boulevard West",
    "35 Erindale Avenue",
    "265 Armadale Avenue",
    "2300 Lake shore Boulevard West"
  ),
  lat = c(
    43.665054,
    43.688543,
    43.695882,
    43.721568,
    43.684903,
    43.693819,
    43.674124,
    43.677754,
    43.623545,
    43.688543,
    43.721568,
    43.623545
  ),
  lng = c(
    -79.409662,
    -79.411305,
    -79.42134,
    -79.427191,
    -79.325679,
    -79.428246,
    -79.319325,
    -79.403948,
    -79.479056,
    -79.411305,
    -79.427191,
    -79.479056
  )
)

# Adding the new rows to the extracted GreenPParking dataset
data <- rbind(data, new_rows)

# Creating the 'Convert' column
data$Convert <- ifelse(data$address %in% new_rows$address, 1, 0)

write.csv(data, "GreenPParking.csv",row.names = FALSE)

In [252]:
# Graphical View of Parking Data
# Convert Lat/Lng to address
geo_rev_data<-data %>%
  tidygeocoder::reverse_geocode(
    lat=lat,
    long=lng,
    method="osm")

# # Plot map
# map<-data %>%
#   st_as_sf(
#     coords=c("lng","lat"),
#     crs=4326
#   )

# map %>% mapview()



Passing 228 coordinates to the Nominatim single coordinate geocoder

Query completed in: 237 seconds

[1m[22mNew names:
[36m•[39m `address` -> `address...1`
[36m•[39m `address` -> `address...5`


In [None]:
# Combine Parking Data and Clean Traffic Data to prepare input data for Time Series Model to predict traffic / EV charging demand

# loop through low to high year number
# filter each year, to list of unique intersections with traffic volume
# run knn algo against parking lot, find k nearst intersection
# use knn nn.index to find the k intersection vol, get average, creating a vector
# add vector to parking data as new column with year and volume
# next loop for the next year

#Output TS_Input

K <- 5 # Parameter used in knn calculation. 
All_park <- data %>%
    select(one_of(c("lat","lng"))) # location coordinates of all surface car parks. 

TS_Input <- data # TS_Input is the input data for the time series of traffic volume forecast. 

for (i in traffic_years) {
    # print(paste("This is year" ,i))
    year_traffic<- filter(CleanTraffic, year == i) # get the traffic volume, intersection for each year.
    All_int <-select(year_traffic,one_of(c("lat","lng"))) # extract just the coordinates for K nearest neighter calculation
    knn_dist <- get.knnx(All_int, All_park, k=K, algorithm="kd_tree") # get the K nearest intersection to the parking lot and their index list
    
    get.mean <- function(x) { # custom function defined to calculate the average of the K nearest intersection volume
    mean(slice(year_traffic, c(x))$AvgTotal)
    }

    TS_Input[paste0("NearbyTraffic_",i)]  <- apply(knn_dist$nn.index,1,get.mean) # calculate the avaerage per row of indices

}

In [None]:
# Model 1 - Time Series Forecast

# forecast traffic for 2024 given the TS_Input
# apply % of EV vehicle multiplier for 2022 and predicted year 2024 to get amount of estimated EV traffic.
# data frame that has the original location, EV traffic for 2022, and predicted EV traffic for 2024


library(forecast)
library(purrr)

# Time series model and Forecast for 2024
arima_models <- apply(TS_Input[, -c(1,2,3,4,5,6,7,8)], 1, auto.arima)
model_forecasts_2024 <- lapply(arima_models, function(x) forecast(x, h = 1))

TS_Output_2024 <- map_dbl(model_forecasts_2024, "mean")
TS_Output_2024 <- round(TS_Output_2024)


# Format output for 2024
# Data frame that contains the original location, lat, and lng columns,
# % of EV traffic for 2022 and % of EV traffic for predicted 2024

head(TS_Output_2024)

# % of EV vehicle multiplier for 2022
mulitplier_2022 = .03

# % of EV Vehicle multiplier for predicted year 
multiplier_predicted = .0533

TS_Output_df = data.frame(TS_Input$address,TS_Input$lat,TS_Input$lng, TS_Input$NearbyTraffic_2022,TS_Output_2024)
#head(TS_Output_df)

# Apply % of EV Vehicle multiplier to 2022 and predicted year 2024, present a new columns to Df
traffic_2022 <- data.frame(TS_Output_df$TS_Input.NearbyTraffic_2022)
traffic_2024 <-data.frame(TS_Output_df$TS_Output_2024)

TS_Output_df$EV_nearbyTraffic_2022 <- apply(traffic_2022, 1, function(x) round(x * mulitplier_2022))
TS_Output_df$EV_nearbyTraffic_2024 <- apply(traffic_2024, 1, function(x) round(x * multiplier_predicted))


# Final Output
head(TS_Output_df)

In [None]:
# Model 2 - SVM

library(dplyr)
library(FNN)  # For nearest neighbor calculations

# Read data
parking_data <- read.csv("GreenPParking.csv")
business_data <- read.csv("business.csv")

K <- 8  # Number of nearest businesses to consider

for (i in 1:nrow(parking_data)) {
    parking_spot <- parking_data[i, c("lat", "lng")]  # Coordinates of the parking spot
    nearest_biz_indices <- get.knnx(business_data[, c("lat", "long")], parking_spot, k = K, algorithm = "kd_tree")$nn.index
    nearest_biz_distances <- get.knnx(business_data[, c("lat", "long")], parking_spot, k = K, algorithm = "kd_tree")$nn.dist
    
    # Calculate the number of nearest businesses and their distances
    num_nearest_biz <- length(nearest_biz_indices)
    avg_distance <- mean(nearest_biz_distances)
    
    nearest_biz_customers <- sapply(nearest_biz_indices, function(idx) {
  mean(business_data[idx, "qCustomer"])
})
    nearest_biz_time_spent <- sapply(nearest_biz_indices, function(idx) {
  mean(business_data[idx, "tCustomer"])
})

    overall_mean_customers <- mean(nearest_biz_customers)
    overall_mean_time_spent <- mean(nearest_biz_time_spent)


    # Add the results to the parking_data dataframe
    parking_data[i, paste0("nearest_biz_count_", K)] <- num_nearest_biz
    parking_data[i, paste0("avg_distance_to_biz_", K)] <- avg_distance
    parking_data[i, paste0("avg_customers_nearby_", K)] <- mean(overall_mean_customers)
    parking_data[i, paste0("avg_time_spent_nearby_", K)] <- mean(overall_mean_time_spent)
}

# Add EV Traffic data
parking_data$traffic_volume=TS_Output_df$EV_nearbyTraffic_2022	

parking_data <- parking_data %>%
  group_by(address) %>%
  summarise (distance = mean(avg_distance_to_biz_8), n_business = mean(nearest_biz_count_8),n_customers = mean(avg_customers_nearby_8), time_spent= mean(avg_time_spent_nearby_8), traffic_volume = traffic_volume, convert= mean(Convert)) 

# demand = 0.1
# filtered_result_df3 <- filtered_result_df3 %>%
#   mutate(biz_demand = demand*interaction_term)

In [None]:
# # Normalize predictors
parking_data$distance <- scale(parking_data$distance)
parking_data$traffic_volume <- scale(parking_data$traffic_volume)
# filtered_result_df3$interaction_term <- scale(filtered_result_df3$interaction_term)
# filtered_result_df3$biz_demand<- scale(filtered_result_df3$biz_demand)
parking_data$time_spent<- scale(parking_data$time_spent)
# parking_data$n_business<- scale(parking_data$n_business)
parking_data$n_customers<- scale(parking_data$n_customers)

In [None]:
library(e1071)  # for svm

set.seed(123)  # for reproducibility
train_indices <- sample(1:nrow(filtered_result_df3), 0.7 * nrow(parking_data))  # 70% for training
train_data <- parking_data[train_indices, ]
test_data <- parking_data[-train_indices, ]

# Create a formula including predictor columns
formula <- as.formula("convert ~ distance + traffic_volume ")

# Create the SVM model with the formula
svm_model <- svm(formula, data = train_data, kernel = "linear")

# Get the coefficients (weights) of the model
weights <- coef(svm_model)#[[1]]



In [216]:
# Coefficients from the SVM model
coefficients <- coef(svm_model)[-1]  # Exclude the intercept

# Select the columns for which to calculate the score
cols <- c("distance", "traffic_volume")

# Calculate the score for each row
parking_data$score <- rowSums(parking_data[cols] * coefficients)

# View the results
head(parking_data)


address,distance,n_business,n_customers,time_spent,traffic_volume,convert,score
<chr>,"<dbl[,1]>",<dbl>,"<dbl[,1]>","<dbl[,1]>","<dbl[,1]>",<dbl>,<dbl>
1 Brimley Road (Bluffer's Park),2.601484,8,-0.7518021,-0.5965834,5.5784382,0,-0.004953064
1 Shortt Street,-0.5157704,8,1.576868,-0.2924004,-0.5027482,0,-2.986566e-05
10 Empress Avenue,0.864064,8,-0.9750993,1.8004656,-0.5952377,0,-0.0001627783
10 Harlandale Avenue,0.6937224,8,-0.9750993,1.8004656,-0.3986975,0,8.650913e-06
10 Kingsdale Avenue,0.8854988,8,-0.9750993,1.8004656,-0.5952377,0,-0.0001757574
100 Grangeway Avenue,2.874718,8,-0.528505,-0.4476033,0.4683918,0,9.802883e-05


In [217]:
library(dplyr)

# Sort the rows in descending order according to the score
sorted_result <- parking_data %>%
  arrange(desc(score))

# View the sorted result
head(sorted_result,12)


address,distance,n_business,n_customers,time_spent,traffic_volume,convert,score
<chr>,"<dbl[,1]>",<dbl>,"<dbl[,1]>","<dbl[,1]>","<dbl[,1]>",<dbl>,<dbl>
34 Hanna Avenue,-1.0346215,8,0.9388762,-1.04234472,-1.1501749,0,0.001322926
1155 King Street West,-1.0532615,8,1.6087676,-1.27027712,-1.1270525,0,0.001320212
11 Kenwood Avenue,-1.0008281,8,0.1413864,0.89949268,-1.1732973,0,0.001316465
300 Remembrance Drive,-0.8817283,8,0.6517799,0.66271405,-1.0114406,0,0.001146342
"745 Ossington Avenue, 16 Carling Avenue",-1.1404472,8,0.7474786,-0.41833177,-0.7455332,0,0.001141989
9 Salem Avenue,-1.1605139,8,0.6198803,0.66847008,-0.7108496,0,0.001133138
18 Ossington Avenue,-1.1443049,8,0.332784,0.04912894,-0.6992884,0,0.001116323
789 St. Clair Avenue West,-0.9164157,8,0.4922819,1.35045623,-0.8727063,0,0.00108334
545 Lake Shore Boulevard West,-0.8845014,8,0.7155791,-0.09504839,-0.8611451,0,0.001057015
800 Fleet Street,-0.9713289,8,0.6517799,0.66271405,-0.7108496,0,0.001018584


In [253]:

# # Plot selected rows on map
# map_df <- selected_rows %>%
#   st_as_sf() %>%
#   st_set_crs(4326) %>%
#   fortify()   # Convert the 'sf' object to a format suitable for use with 'ggplot2'

# # Display the spatial object using 'mapview'
# mapview(map_df)




In [254]:
# write.csv(selected_rows, "Parking.csv", row.names = FALSE)


In [255]:
# # Add lat, lng and remove geometry column
# selected_rows1<-selected_rows
# coords <- st_coordinates(selected_rows1$geometry)
# selected_rows1 <- cbind(selected_rows1, lng = coords[, "X"], lat = coords[, "Y"])
# parking_data <- st_drop_geometry(selected_rows1)
# write.csv(parking_data, "Parking_data.csv", row.names = FALSE)


In [256]:
# Get Data - Intersection
# output Data Description:


In [257]:
# Get Data - Business
# output Data Description: 

In [258]:
# Define Region of Interest - Boundary
## coordinates manually looked up from location dataset
# 1406	5370	DUPONT ST AT OSSINGTON AVE (PX 842)	-79.429019	43.670031996501194
# 251	4180	DUPONT ST AT SPADINA RD (PX 840)	-79.407122	43.67485699954096
# 1885	5864	COLLEGE ST AT OSSINGTON AVE (PX 829)	-79.422705	43.65439999619167
# 241	4170	COLLEGE ST AT SPADINA AVE (PX 279)	-79.400048	43.65794800150128

# Input
# Output

boundary <- location %>%
  select(location_id, location, lng, lat) %>%
  filter(location_id %in% list(5370, 4180, 5864, 4170)) # boundary intersection ID

lng_min <- min(boundary$lng) # west most value since it's negative
lng_max <- max(boundary$lng) # east most value
lat_min <- min(boundary$lat) # south most value
lat_max <- max(boundary$lat) # north most value


In [260]:
#write.csv(TS_Input, "TS_Input.csv", row.names = FALSE)


In [261]:
# # Convert the data frame 'TS_Input' to an 'sf' object, specifying the columns containing longitude and latitude as coordinates, and set the CRS
# TS_Input_sf <- st_as_sf(TS_Input, coords = c("lng", "lat"), crs = 4326)

# # Find the indices of the rows in 'data_sf' that fall within the polygon
# t_indices <- st_within(TS_Input_sf, polygon)

# # Convert the indices to a data frame
# t_indices_df<-data.frame(t_indices)

# t_selected_rows <- TS_Input_sf[match(t_indices_df$row.id, seq_len(nrow(TS_Input))), ]

In [262]:
# print(head(t_selected_rows))

Unnamed: 0_level_0,TS_Input.address,TS_Input.lat,TS_Input.lng,TS_Input.NearbyTraffic_2022,TS_Output_2024,EV_nearbyTraffic_2022,EV_nearbyTraffic_2024
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,13 Isabella Street,43.66758,-79.38471,4286.44,4539,129,242
2,15 Wellesley Street East,43.66484,-79.38359,4408.74,5241,132,279
3,30 Alvin Avenue,43.68919,-79.3927,4058.117,4657,122,248
4,716 Pape Avenue,43.67971,-79.34538,4576.333,2576,137,137
5,351 Keele Street,43.66458,-79.46399,6034.167,4717,181,251
6,385 Pacific Avenue,43.66474,-79.46812,7425.817,5924,223,316


Unnamed: 0_level_0,address,lat,lng,Convert,nearest_biz_count_8,avg_distance_to_biz_8,avg_customers_nearby_8,avg_time_spent_nearby_8,traffic_volume
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>
1,13 Isabella Street,43.66758,-79.38471,0,8,0.005030687,27.0,0.7047446,129
2,15 Wellesley Street East,43.66484,-79.38359,0,8,0.004699575,21.75,0.746713,132
3,30 Alvin Avenue,43.68919,-79.3927,0,8,0.012040063,24.625,1.1426329,122
4,716 Pape Avenue,43.67971,-79.34538,0,8,0.045221435,23.875,0.6687038,137
5,351 Keele Street,43.66458,-79.46399,0,8,0.02183017,32.25,0.7426011,181
6,385 Pacific Avenue,43.66474,-79.46812,0,8,0.025832027,32.25,0.7426011,223


In [266]:
# ################ SVM Model ######################
# # Find nearest businesses to each parking spot. 
# # Merge GreenPParking, business and TS_Output_df. Result: Dataframe of address,	lat, lng,	Convert,	nearest_business,	traffic_volume
# # Make SVM model (with distance + time_spent + traffic_volume + interaction_term + n_business + biz_demand as predictors ) 



# # Read data
# parking<-read.csv("GreenPParking.csv")
# business<-read.csv("business.csv")

# # Convert the business dataset to a spatial object
# business_sf <- st_as_sf(business, coords = c("long", "lat"), crs = 4326)

# # Convert the parking dataset to a spatial object
# parking_sf <- st_as_sf(parking, coords = c("lng", "lat"), crs = 4326)

# # Perform a spatial join to find the nearest business for each parking space
# nearest_business <- st_nearest_feature(parking_sf, business_sf)
# nearest_points <-st_nearest_points(parking_sf,business_sf)

# # Add the nearest business information to the parking dataset
# parking_data_with_nearest_business <- cbind(parking, nearest_business)
# #parking_data_with_nearest_business$traffic_volume= sample(2:221,nrow(parking), replace=F)  # To be replaced with traffic data
# parking_data_with_nearest_business$traffic_volume=TS_Output_df$EV_nearbyTraffic_2022	


# # Function to calculate angular distance between two points on Earth
# haversine_distance <- function(lon1, lat1, lon2, lat2) {
#   R <- 6371 # Earth radius in km

#   dlat <- (lat2 - lat1) * pi / 180
#   dlon <- (lon2 - lon1) * pi / 180
#   a <- sin(dlat/2)^2 + cos(lat1 * pi / 180) * cos(lat2 * pi / 180) * sin(dlon/2)^2
#   c <- 2 * atan2(sqrt(a), sqrt(1 - a))
#   distance <- R * c
#   return(distance) # Distance in km
# }

In [267]:
# ## Calculate distance between each parking spot and each business

# # Create an empty list to store the results
# result_list <- list()

# for (i in 1:nrow(parking_data)){
  
#   # Create a temporary data frame to store the results for this parking spot
#   temp_df <- data.frame(
#     address = character(),
#     lat = double(),
#     lng = double(),
#     Convert = integer(),
#     distance = double(),
#     #capacity = integer(),
#     traffic_volume = double(),
#     #rate_half_hr= double(),
#     n_business = double(),
#     n_customers =double(),
#     time_spent = double(),
#     category = character(),
#     Operating.name = character(),
#     interaction_term = double()
    
# )
  
#   for( j in 1:nrow(business)){
#     lon1 <- parking_data_with_nearest_business[i,"lng"]
#     lat1 <- parking_data_with_nearest_business[i,"lat"]
#     lon2 <- business[j, "long"]
#     lat2 <- business[j, "lat"]
 

#     # # Calculate distance
#     # distance <- st_distance(st_point(c(lon1, lat1)), st_point(c(lon2, lat2)))
    
#     # Store the results in the temporary data frame
#     temp_df[nrow(temp_df) + 1, ] <- list(
#       address = parking_data_with_nearest_business[i,"address"],
#       lat = parking_data_with_nearest_business[i,"lat"],
#       lng = parking_data_with_nearest_business[i,"lng"],
#       Convert = parking_data_with_nearest_business[i,"Convert"],
#       distance = distance,
#       #capacity = parking_data_with_nearest_business[i,"capacity"],
#       traffic_volume = parking_data_with_nearest_business[i,"traffic_volume"],
#       n_business = parking_data_with_nearest_business[i,"nearest_business"],
#       #rate_half_hr = parking_data_with_nearest_business[i,"rate_half_hr"],
#       n_customers = business[j,"qCustomer"],
#       time_spent = business[j,"tCustomer"],
#       category = business[j,"Category"],
#       Operating.name = business[j,"Operating.Name"],
#       interaction_term = business[j,"qCustomer"] * business[j,"tCustomer"]
#       #Convert = ifelse(is.na(parking_data_with_nearest_business[i,"Convert"]), 0, parking_data_with_nearest_business[i,"Convert"])
#     )
#   }
  
#   # Append the temporary data frame to the result list
#   result_list[[i]] <- temp_df
# }

# # Combine all the results into a single data frame
# result_df <- do.call(rbind, result_list)



In [268]:
# filtered_result_df2 <- filter(result_df, result_df$distance <= 0.250)
# filtered_result_df2 <- filtered_result_df2 %>%
#   group_by(Operating.name) %>% 
#   slice(which.min(distance))



In [269]:
# head(filtered_result_df2)

In [270]:
# filtered_result_df3 <- parking_data %>%
#   group_by(address) %>%
#   summarise (distance = mean(avg_distance_to_biz_8, n_business =nearest_biz_count_8, nCustomer =avg_customers_nearby_8, traffic_volume = sum(traffic_volume), convert= mean(Convert)) 

# # demand = 0.1
# # filtered_result_df3 <- filtered_result_df3 %>%
# #   mutate(biz_demand = demand*interaction_term)

"[1m[22mReturning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
[36mℹ[39m Please use `reframe()` instead.
[36mℹ[39m When switching from `summarise()` to `reframe()`, remember that `reframe()`
  always returns an ungrouped data frame and adjust accordingly."
[1m[22m`summarise()` has grouped output by 'address'. You can override using the
`.groups` argument.


In [134]:
# # # Normalize predictors
# filtered_result_df3$distance <- scale(filtered_result_df3$distance)
# filtered_result_df3$traffic_volume <- scale(filtered_result_df3$traffic_volume)
# # filtered_result_df3$interaction_term <- scale(filtered_result_df3$interaction_term)
# # filtered_result_df3$biz_demand<- scale(filtered_result_df3$biz_demand)
# # filtered_result_df3$time_spent<- scale(filtered_result_df3$time_spent)
# filtered_result_df3$n_business<- scale(filtered_result_df3$n_business)

In [189]:
head(parking_data)

address,distance,n_business,n_customers,time_spent,traffic_volume,convert
<chr>,"<dbl[,1]>",<dbl>,"<dbl[,1]>","<dbl[,1]>","<dbl[,1]>",<dbl>
1 Brimley Road (Bluffer's Park),2.601484,8,-0.7518021,-0.5965834,5.5784382,0
1 Shortt Street,-0.5157704,8,1.576868,-0.2924004,-0.5027482,0
10 Empress Avenue,0.864064,8,-0.9750993,1.8004656,-0.5952377,0
10 Harlandale Avenue,0.6937224,8,-0.9750993,1.8004656,-0.3986975,0
10 Kingsdale Avenue,0.8854988,8,-0.9750993,1.8004656,-0.5952377,0
100 Grangeway Avenue,2.874718,8,-0.528505,-0.4476033,0.4683918,0


In [215]:
print(weights)

   (Intercept)       distance traffic_volume 
 -9.789742e-02  -6.055148e-04   2.932265e-05 


In [204]:
coef(svm_model)

In [74]:
# library(dplyr)

# # Get the rows with the 12 highest scores
# top_12_rows <- filtered_result_df3 %>%
#   top_n(12, score)

# # View the top 12 rows
# top_12_rows


In [75]:

# data <- filtered_result_df3[, c("distance", "traffic_volume", "biz_demand", "convert")]
# set.seed(123)  # for reproducibility
# train_indices <- sample(1:nrow(data), 0.7 * nrow(data))  # 70% for training
# train_data <- data[train_indices, ]
# test_data <- data[-train_indices, ]
# library(e1071)  # for svm
# svm_model <- svm(convert ~ ., data = train_data, kernel = "linear")
# predictions <- predict(svm_model, newdata = test_data)
# accuracy <- mean(predictions == test_data$convert)
# print(accuracy)

In [None]:
# ## Calculate distance between each parking spot and each business
# suppressWarnings({

# ## Calculate distance between each parking spot and each business

# # Create an empty list to store the results
# result_list <- list()

# for (i in 1:nrow(parking_data_with_nearest_business)){
  
#   # Create a temporary data frame to store the results for this parking spot
#   temp_df <- data.frame(
#     address = character(),
#     lat = double(),
#     lng = double(),
#     distance = double(),
#     #capacity = integer(),
#     traffic_volume = integer(),
#     #rate_half_hr= double(),
#     #n_business = integer(),
#     n_customers =integer(),
#     time_spent = double(),
#     category = character(),
#     Operating.name = character(),
#     interaction_term = integer()
# )
  
#   for( j in 1:nrow(business)){
#     lon1 <- parking_data_with_nearest_business[i,"lng"]
#     lat1 <- parking_data_with_nearest_business[i,"lat"]
#     lon2 <- business[j, "long"]
#     lat2 <- business[j, "lat"]
 

#     # Calculate distance
#     distance <- st_distance(st_point(c(lon1, lat1)), st_point(c(lon2, lat2)))
    
#     # Store the results in the temporary data frame
#     temp_df[nrow(temp_df) + 1, ] <- list(
#       address = parking_data_with_nearest_business[i,"address"],
#       lat = parking_data_with_nearest_business[i,"lat"],
#       lng = parking_data_with_nearest_business[i,"lng"],
#       distance = distance,
#       #capacity = parking_data_with_nearest_business[i,"capacity"],
#       traffic_volume = parking_data_with_nearest_business[i,"traffic_volume"],
#       #n_business = parking_data_with_nearest_business[i,"nearest_business"],
#       #rate_half_hr = parking_data_with_nearest_business[i,"rate_half_hr"],
#       n_customers = business[j,"qCustomer"],
#       time_spent = business[j,"tCustomer"],
#       category = business[j,"Category"],
#       Operating.name = business[j,"Operating.Name"],
#       interaction_term = business[j,"qCustomer"] * business[j,"tCustomer"]
#     )
#   }
  
#   # Append the temporary data frame to the result list
#   result_list[[i]] <- temp_df
# }

# # Combine all the results into a single data frame
# result_df <- do.call(rbind, result_list)

# ###############
# filtered_result_df2 <- filter(result_df, result_df$distance <= 0.5)
# filtered_result_df2 <- filtered_result_df2 %>%
#   group_by(Operating.name) %>% 
#   slice(which.min(distance))

# filtered_result_df3 <- filtered_result_df2 %>%
#   group_by(address) %>%
#   summarise (distance = mean(distance), capacity =mean(capacity), n_business =n(),n_customers = sum(n_customers), time_spent= sum(time_spent), traffic_volume = sum(traffic_volume), interaction_term = sum(interaction_term)) 

# demand = 0.1
# filtered_result_df3 <- filtered_result_df3 %>%
#   mutate(biz_demand = demand*interaction_term)
  

# model1b <- lm(capacity ~ distance + n_customers + time_spent + interaction_term, data = filtered_result_df3)
# summary(model1b)

# model1c <- lm(capacity ~ distance + traffic_volume + n_customers + time_spent + interaction_term, data = filtered_result_df3)
# summary(model1c)

# model2a <- lm(capacity ~ distance  + interaction_term, data = filtered_result_df3)
# summary(model2a)

# model2b <- lm(capacity ~ distance + interaction_term, data = filtered_result_df3)
# summary(model2b)

# model2c <- lm(capacity ~ interaction_term, data = filtered_result_df3)
# summary(model2c)

     
# })


In [None]:
# filtered_result_df2 <- filter(result_df, result_df$distance <= 0.5)
# filtered_result_df2 <- filtered_result_df2 %>%
#   group_by(Operating.name) %>% 
#   slice(which.min(distance))

# filtered_result_df3 <- filtered_result_df2 %>%
#   group_by(address) %>%
#   summarise (distance = mean(distance),rate_half_hr = mean(rate_half_hr), capacity =mean(capacity), n_business =n(),n_customers = sum(n_customers), time_spent= sum(time_spent), traffic_volume = sum(traffic_volume), interaction_term = sum(interaction_term)) 

# demand = 0.1
# filtered_result_df3 <- filtered_result_df3 %>%
#   mutate(biz_demand = demand*interaction_term)
  

# model1b <- lm(capacity ~ distance + n_customers + time_spent + interaction_term, data = filtered_result_df3)
# summary(model1b)

# model1c <- lm(capacity ~ distance + traffic_volume + n_customers + time_spent + interaction_term, data = filtered_result_df3)
# summary(model1c)

# model2a <- lm(capacity ~ distance  + rate_half_hr + interaction_term, data = filtered_result_df3)
# summary(model2a)

# model2b <- lm(capacity ~ distance + interaction_term, data = filtered_result_df3)
# summary(model2b)

# model2c <- lm(capacity ~ interaction_term, data = filtered_result_df3)
# summary(model2c)


# #######################

# # Filter rows according to radius

# filtered_result_df <- result_df[result_df$distance <= 0.50, ]
# a=filtered_result_df %>% count(address)


# filtered_result_df <- filtered_result_df %>%
#   left_join(a, by = "address")

# # Select desired columns and rename the "n" column
# filtered_result_df <- filtered_result_df %>%
#   select(address, lat,lng, distance,rate_half_hr, n_businesses = n, capacity, n_customers, time_spent, traffic_volume, interaction_term) 


In [None]:
# filtered_result_df2 <- filtered_result_df %>%
#   group_by(address, lat, lng) %>%
#   summarise(sum_n_customers = sum(n_customers),
#             sum_time_spent = sum(time_spent),
#             sum_traffic_volume =sum(traffic_volume)) %>%
#   select(address, lat, lng, sum_n_customers, sum_time_spent, sum_traffic_volume)

# print(head(filtered_result_df2))

In [None]:
# # Normalize predictors
# filtered_result_df$traffic_volume <- scale(filtered_result_df$traffic_volume)
# filtered_result_df$distance <- scale(filtered_result_df$distance)
# filtered_result_df$n_customers <- scale(filtered_result_df$n_customers)
# filtered_result_df$time_spent <- scale(filtered_result_df$time_spent)
# filtered_result_df$interaction_term <- scale(filtered_result_df$interaction_term)
# filtered_result_df$rate_half_hr <- scale(filtered_result_df$rate_half_hr)




In [None]:

# model1 <- glm(family=gaussian, capacity ~ traffic_volume  + distance + n_customers + time_spent + interaction_term  + rate_half_hr, data = filtered_result_df)

# summary(model1)

# model2 <- glm(capacity ~ traffic_volume  + distance  + rate_half_hr + interaction_term, data = filtered_result_df)
# summary(model2)

In [None]:

# # ----------------------------For 0.4 * existing traffic_volume -----------------------------------

# all_results <- data.frame()

# x_percent=0.4
# new_traffic_scenario = filtered_result_df$traffic_volume * x_percent
# newdata = data.frame(traffic_volume = new_traffic_scenario,
#                      address=filtered_result_df$address,
#                      distance = filtered_result_df$distance,
#                      n_customers = filtered_result_df$n_customers,
#                      time_spent = filtered_result_df$time_spent,
#                      interaction_term = filtered_result_df$interaction_term,
#                      rate_half_hr= filtered_result_df$rate_half_hr)

# # Make predictions
# predicted_capacity <- predict(model2, newdata)

# # Determine conversion needs
# capacity_needed <- ifelse(predicted_capacity > 1, ceiling(predicted_capacity), 0)

# # Store results for this scenario 
# results <- data.frame(filtered_result_df$address, filtered_result_df$lat, filtered_result_df$lng, filtered_result_df$capacity,capacity_needed, filtered_result_df$n_businesses, filtered_result_df$n_customers, filtered_result_df$time_spent)


# # ----------------------------For different % of traffic_volumes -----------------------------------

# # Iterate for different EV adoption rates
# x_percent_values <- seq(from = 0.1, to = 1, by = 0.4)  
# all_results <- data.frame()

# for (x_percent in x_percent_values) {
#   # Create new traffic scenario
#   new_traffic_scenario = result_df$traffic_volume * x_percent
#   newdata = data.frame(traffic_volume = new_traffic_scenario,
#                      address=filtered_result_df$address,
#                      distance = filtered_result_df$distance,
#                      n_customers = filtered_result_df$n_customers,
#                      time_spent = filtered_result_df$time_spent,
#                      interaction_term = filtered_result_df$interaction_term,
#                      rate_half_hr= filtered_result_df$rate_half_hr)

#   # Make predictions
#   predicted_capacity <- predict(model1, newdata)
#   # Determine conversion needs
#   capacity_needed <- ifelse(predicted_capacity > 1, ceiling(predicted_capacity), 0)

#   # Store results for this scenario 
#   results <- data.frame(data.frame(filtered_result_df$address, filtered_result_df$lat, filtered_result_df$lng, filtered_result_df$capacity,capacity_needed, filtered_result_df$n_businesses, filtered_result_df$n_customers, filtered_result_df$time_spent))
#   all_results <- rbind(all_results, results)
# }

# # Sort combined results
# all_results <- arrange(all_results, x_percent)

# # Create table
# library(kableExtra)  
# table <- kable(all_results, caption = "Conversion Needs by EV Adoption Rate")
# print(table)




In [None]:
# # Scoring function
# calculate_score <- function(traffic_volume, capacity, rate_half_hr, nearest_businesses, customers, c_time, distance) {
#   # Define weights for each factor
#   weights <- c(0.2, 0.1, 0.2, 0.2, 0.1, 0.1, 0.1)
  
#   # Normalize each factor
#   normalized_traffic <- (traffic_volume - min(traffic_volume)) / (max(traffic_volume) - min(traffic_volume))
#   normalized_capacity <- (capacity - min(capacity)) / (max(capacity) - min(capacity))
#   normalized_rate <- (rate_half_hr - min(rate_half_hr)) / (max(rate_half_hr) - min(rate_half_hr))
#   normalized_n_businesses <- (nearest_businesses - min(nearest_businesses)) / (max(nearest_businesses) - min(nearest_businesses))
#   normalized_customers <-(customers - min(customers)) / (max(customers) - min(customers))
#   normalized_time <- (c_time - min(c_time)) / (max(c_time) - min(c_time))
#   normalized_distance <- (distance - min(distance)) / (max(distance) - min(distance))

#   # Calculate the score
#   score <- weights[1] * normalized_traffic +
#     weights[2] * normalized_capacity +
#     weights[3] * normalized_distance +
#     weights[4] * normalized_n_businesses + 
#     weights[5] * (normalized_customers * normalized_time) +
#     weights[6] * (normalized_rate)

#   return(abs(score))
# }

# # Calculate score for each parking spot
# filtered_result_df$score <- calculate_score(filtered_result_df$traffic_volume, filtered_result_df$capacity, filtered_result_df$rate_half_hr, filtered_result_df$n_businesses, filtered_result_df$n_customers, filtered_result_df$time_spent,filtered_result_df$distance)
# # Rank the parking spots based on the score
# ranked_data <- filtered_result_df[order(-filtered_result_df$score),]

# # Find highest scored parking spot
# highest_score_index <- which.max(ranked_data$score)
# highest_score_parking_spot <- filtered_result_df[highest_score_index, ]

# # Print the result
# print(paste0("Address of parking spot: ",highest_score_parking_spot$address))
# print(paste0("Latitude of parking spot: ",highest_score_parking_spot$lat))
# print(paste0("Longitude of parking spot: ",highest_score_parking_spot$lng))


In [None]:
# Result and Discussion


In [None]:
# # Define the polygon coordinates
# polygon_coords <- matrix(c(
#   -79.4289964889767, 43.6700360241176, -79.4226792245877, 43.6543887000655, -79.4000484228339, 43.657948514946, -79.4070875693025, 43.6748646586934,-79.4289964889767, 43.6700360241176  # Repeat first point to close the polygon exactly
# ), ncol = 2, byrow = TRUE)

# # Create a polygon geometry
# polygon <- st_polygon(list(polygon_coords))

# # Convert the polygon object to an 'sf' object with a specified CRS (Coordinate Reference System)
# polygon <- st_sfc(polygon, crs = 4326)

# # Convert the data frame 'data' to an 'sf' object, specifying the columns containing longitude and latitude as coordinates, and set the CRS
# data_sf <- st_as_sf(data, coords = c("lng", "lat"), crs = 4326)

# # Find the indices of the rows in 'data_sf' that fall within the polygon
# indices <- st_within(data_sf, polygon)

# # Convert the indices to a data frame
# indices_df<-data.frame(indices)

# selected_rows <- data_sf[match(indices_df$row.id, seq_len(nrow(data))), ]
