# Scraping the motor vehicles currently licensed data from the Stats NZ Infoshare website

This page is dynamically generated, so __rvest::read_html__ alone does not load the page completely. To work around this, we use the __RSelenium__ library. The scraping process is therefore divided into two steps:
 - Automate/simulate opening the page in a browser using the tools provided by __RSelenium__
 - Scrape the required contents from the dynamically loaded page as usual using the __rvest__ tools

In [1]:
library(tidyverse)
library(dplyr)
library(rvest)

── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.8     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1
── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: 'rvest'


The following object is masked from 'package:readr':

    guess_encoding




In [2]:
library(RSelenium)

In [3]:
#install.packages("janitor")
library(janitor)


Attaching package: 'janitor'


The following objects are masked from 'package:stats':

    chisq.test, fisher.test




## Step 1: Browser automation
Automation and loading the dynamic page

#### Starting Selenium server and browser 

In [17]:
# Launch a Selenium server together with a Chrome browser session.
# NOTE(review): `chromever` is pinned to one chromedriver build — presumably it
# has to match the locally installed Chrome; confirm before re-running elsewhere.
rD <- rsDriver(
  browser = "chrome",
  chromever = "106.0.5249.21",
  port = 4444L,
  verbose = FALSE
)

#### Navigating to the Stats NZ Infoshare page

In [18]:
# Grab the browser client created by rsDriver(); the following cells drive the
# page through `remDr`.
remDr <- rD$client

# The Infoshare tree nodes and option lists are generated dynamically, so let
# findElement() keep polling for up to 10 seconds instead of failing
# immediately when a node has not been rendered yet.
remDr$setImplicitWaitTimeout(milliseconds = 10000)

# Open the Infoshare landing page.
remDr$navigate("https://infoshare.stats.govt.nz")

#### Getting contents of the \<table>
Navigate to the target node in the tree by expanding it step by step. Each node is dynamically generated and therefore cannot be accessed directly.

In [19]:
# Expand the 'Industry sectors' entry of the browse tree (tree link ...Nodest5).
elem <- remDr$findElement(
  using = "css selector",
  value = "a#ctl00_MainContent_tvBrowseNodest5"
)
elem$clickElement()

In [20]:
# Expand the 'Transport' entry underneath it (tree link ...Nodest25).
elem <- remDr$findElement(
  using = "css selector",
  value = "a#ctl00_MainContent_tvBrowseNodest25"
)
elem$clickElement()

In [21]:
# Open the 'Motor vehicles currently licensed by type
# (Qrtly-Mar/Jun/Sep/Dec)' table entry (tree link ...Nodest30).
elem <- remDr$findElement(
  using = "css selector",
  value = "a#ctl00_MainContent_tvBrowseNodest30"
)
elem$clickElement()

In [24]:
# Pick the 24th option of the first variable list, i.e. 'Total all vehicles',
# which becomes the data column.
elem <- remDr$findElement(
  using = "xpath",
  value = "//select[@id = 'ctl00_MainContent_ctl02_lbVariableOptions']/option[24]"
)
elem$clickElement()

In [26]:
# Pick the 3rd option of the second variable list, i.e. 'total both periods'.
elem <- remDr$findElement(
  using = "xpath",
  value = "//select[@id = 'ctl00_MainContent_ctl04_lbVariableOptions']/option[3]"
)
elem$clickElement()

In [27]:
# Click the 'Select All' label of the time list so that every available period
# is requested (the resulting series starts in 1988).
elem <- remDr$findElement(
  using = "css selector",
  value = "span#ctl00_MainContent_ctl07_lblSelectAll"
)
elem$clickElement()

In [28]:
# Submit the selection by pressing the 'Go' button.
elem <- remDr$findElement(
  using = "css selector",
  value = "input#ctl00_MainContent_btnGo"
)
elem$clickElement()

### Saving the page source containing the table into a variable for later use

In [29]:
# Look up the parent element of the resulting table, then keep the first
# element of the page source in `web_page_src` for parsing with rvest.
# NOTE(review): `elem` is not used afterwards; the lookup presumably only
# confirms that the result table has been rendered — confirm.
elem <- remDr$findElement(
  using = "css selector",
  value = "table.pxtableParent"
)
web_page_src <- remDr$getPageSource()[[1]]

### Stopping the driver and closing the browser

In [62]:
# Release the Selenium resources: close the browser session first, then shut
# down the Selenium server started by rsDriver(). Previously only the server
# was stopped, so the browser session was never closed explicitly.
rD[["client"]]$close()
rD[["server"]]$stop()

## Step 2: Scraping the page

In [30]:
# Parse the captured page source into an HTML document.
web_page <- read_html(web_page_src)

# Collect every <table class="pxtable"> node found in the document.
statsnz_data_html <- html_nodes(web_page, "table.pxtable")

# Display the second match; this is the node that gets converted to a tibble.
statsnz_data_html[[2]]

{html_node}
<table class="pxtable" cellpadding="0" cellspacing="0" border="0">
[1] <tbody>\n<tr>\n<td class="headfirst" colspan="1" rowspan="2"> </td>\n     ...

#### Extracting table and saving it in tibble

In [42]:
# Convert the second pxtable node into a tibble and promote its first row to
# column names (row_to_names() is provided by janitor).
table_node_table <- row_to_names(
  html_table(statsnz_data_html[[2]]),
  row_number = 1
)

# Drop the first remaining row as well.
# NOTE(review): presumably the second line of the two-row table header — confirm.
table_node_table <- table_node_table[-1, ]

In [43]:
# Preview the beginning and the end of the scraped table.
table_node_table %>% head()
table_node_table %>% tail()

Unnamed: 0_level_0,Total All Vehicles
<chr>,<chr>
1988Q1,2179005
1988Q2,2147112
1988Q3,2166716
1988Q4,2173005
1989Q1,2217259
1989Q2,2188108


Unnamed: 0_level_0,Total All Vehicles
<chr>,<chr>
2021Q2,4409033
2021Q3,4383197
2021Q4,4497463
2022Q1,4502027
2022Q2,4452608
2022Q3,4442030


#### Saving dataframe into csv file

In [18]:
#table_node_table %>%
#    write.csv('data/tourism_international_visitors_arriavals_1921Q1-2022Q2.csv')

## Step 3: Wrangling the data and tidying it up as per requirement

In [54]:
# Give the two scraped columns descriptive snake_case names.
names(table_node_table)[1:2] <- c("date", "total_all_vehicles")

### Convert the counts column into numeric format 

In [55]:
# Copy the renamed table into a new variable; the type conversion and the date
# reformatting in the following cells are applied to this copy.
table_node_table_df <- table_node_table

In [56]:
# Remove any commas from the `total_all_vehicles` strings and convert the
# column to numeric.
table_node_table_df$total_all_vehicles <- as.numeric(
  gsub(",", "", table_node_table_df$total_all_vehicles)
)

# Preview the converted table.
head(table_node_table_df)

date,total_all_vehicles
<chr>,<dbl>
1988Q1,2179005
1988Q2,2147112
1988Q3,2166716
1988Q4,2173005
1989Q1,2217259
1989Q2,2188108


### Changing the format of the data in the date column from 2022Q2 to 2022-06

In [57]:
# Split `date` (e.g. "1988Q1") at the "Q" into separate `Year` and `Quarter`
# columns; the result is kept in `df`.
df <- separate(
  table_node_table_df,
  col = date,
  into = c("Year", "Quarter"),
  sep = "Q"
)

In [58]:
# Map each quarter number to its closing month (1 -> 3, 2 -> 6, 3 -> 9,
# 4 -> 12) and store it as a zero-padded two-character string ("03", "06",
# "09", "12"). str_pad() always returns a character vector and keeps NA as NA,
# whereas the previous ifelse() returned a numeric vector whenever no month
# was below 10.
df$Quarter <- str_pad(as.numeric(df$Quarter) * 3, width = 2, pad = "0")

head(df)

Year,Quarter,total_all_vehicles
<chr>,<chr>,<dbl>
1988,3,2179005
1988,6,2147112
1988,9,2166716
1988,12,2173005
1989,3,2217259
1989,6,2188108


In [59]:
# Rebuild `date` as "<Year>-<Quarter>" from the split columns held in `df`.
table_node_table_df[["date"]] <- paste(df[["Year"]], df[["Quarter"]], sep = "-")

In [60]:
# Preview the table with the reformatted `date` column.
head(table_node_table_df)

date,total_all_vehicles
<chr>,<dbl>
1988-03,2179005
1988-06,2147112
1988-09,2166716
1988-12,2173005
1989-03,2217259
1989-06,2188108


### saving the dataframe into csv file

In [61]:
# Make sure the output folder exists; write.csv() cannot create it and fails
# when it is missing.
dir.create("data", showWarnings = FALSE, recursive = TRUE)

# Write the cleaned table (reformatted `date`, numeric counts) to CSV.
# NOTE(review): the file name says 1921-2022 although the scraped series starts
# at 1988Q1; the name is left unchanged so existing consumers still find it.
table_node_table_df %>%
    write.csv('data/transport_final_clean_vehicle_licensed_1921-2022.csv')

#### End of file