# Scraping the International visitor arrival data from Stats NZ website

This page is generated dynamically, so __rvest::read_html__ alone does not load it completely. To work around this we use the __RSelenium__ library. The scraping process is therefore divided into two steps:
 - Automate/simulate opening the page in a browser using the tools provided by __RSelenium__
 - Scrape the required contents from the dynamically loaded page as usual using the __rvest__ tools

In [3]:
library(tidyverse)
library(dplyr)
library(rvest)

── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.8     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1
── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: 'rvest'


The following object is masked from 'package:readr':

    guess_encoding




In [4]:
library(RSelenium)

In [5]:
#install.packages("janitor")
library(janitor)


Attaching package: 'janitor'


The following objects are masked from 'package:stats':

    chisq.test, fisher.test




## Step 1: Browser automation
Automation and loading the dynamic page

#### Starting Selenium server and browser 

In [7]:
# Launch a Selenium server together with a Chrome browser session.
# The chromedriver version is pinned; it presumably has to match the locally
# installed Chrome - verify before running on another machine.
rD <- rsDriver(
  verbose = FALSE,
  port = 4444L,
  browser = "chrome",
  chromever = "106.0.5249.21"
)

#### Navigating to Statistics New Zealand's Infoshare page

In [8]:
# Handle to the browser session that rsDriver() opened.
remDr <- rD$client

# The nodes clicked in the following steps are generated dynamically, so they
# may not exist yet at the moment findElement() is called. With an implicit
# wait, each lookup keeps polling for up to 10 seconds before it fails, instead
# of erroring immediately when the cells are run back to back.
remDr$setTimeout(type = "implicit", milliseconds = 10000)

# Open the Infoshare landing page.
remDr$navigate("https://infoshare.stats.govt.nz")

#### Getting contents of the \<table>
We reach the target node by expanding the tree one level at a time. Each node is generated dynamically and therefore cannot be accessed directly until its parent has been expanded.

In [9]:
# Expand the "Tourism" node of the browse tree.
tourism_node <- remDr$findElement(
  using = "css selector",
  value = "a#ctl00_MainContent_tvBrowseNodest9"
)
tourism_node$clickElement()

In [11]:
# Expand the "International travel and migration" node.
travel_migration_node <- remDr$findElement(
  using = "css selector",
  value = "a#ctl00_MainContent_tvBrowseNodest11"
)
travel_migration_node$clickElement()

In [12]:
# Open the "visitor arrival totals (qrtly-mar/jun/sep/dec)" node.
visitor_totals_node <- remDr$findElement(
  using = "css selector",
  value = "a#ctl00_MainContent_tvBrowseNodest520"
)
visitor_totals_node$clickElement()

In [13]:
# Press "select all" in the count type section.
count_type_select_all <- remDr$findElement(
  using = "css selector",
  value = "span#ctl00_MainContent_ctl02_lblSelectAll"
)
count_type_select_all$clickElement()

In [14]:
# Press "select all" in the time section.
time_select_all <- remDr$findElement(
  using = "css selector",
  value = "span#ctl00_MainContent_ctl04_lblSelectAll"
)
time_select_all$clickElement()

In [15]:
# Submit the selection by pressing the "Go" button.
go_button <- remDr$findElement(
  using = "css selector",
  value = "input#ctl00_MainContent_btnGo"
)
go_button$clickElement()

### Saving the Table source into an element for later use.

In [16]:
# Look up the resulting table first; the element itself is not used afterwards,
# but the lookup fails if the table is not on the page.
result_table <- remDr$findElement(
  using = "css selector",
  value = "table.pxtableParent"
)

# Keep the full HTML of the loaded page so it can be parsed after the browser
# has been shut down. getPageSource() returns a list; its first element is used.
page_source <- remDr$getPageSource()
web_page_src <- page_source[[1]]

### Stopping the driver and closing the browser

In [17]:
# Release the Selenium resources.
# Close the browser session first: stopping only the server does not close the
# client session opened through remDr.
remDr$close()

# Then shut down the Selenium server process started by rsDriver().
rD[["server"]]$stop()

## Step 2: Scraping the page

In [18]:
# Parse the page source captured from the browser into an HTML document.
web_page <- read_html(web_page_src)

# Collect every <table class="pxtable"> node on the page, then display the
# second match to inspect it.
statsnz_data_html <- html_nodes(web_page, "table.pxtable")
statsnz_data_html[[2]]

{html_node}
<table class="pxtable" cellpadding="0" cellspacing="0" border="0">
[1] <tbody>\n<tr>\n<td class="headfirst" colspan="1" rowspan="1"> </td>\n     ...

#### Extracting the table and saving it in a tibble

In [19]:
# Convert the second "pxtable" node into a data frame, then use
# janitor::row_to_names() to turn its first row into the column header.
table_node_table <- row_to_names(
  html_table(statsnz_data_html[[2]]),
  row_number = 1
)

In [20]:
# Preview the first and the last rows of the scraped table.
table_node_table %>% head()
table_node_table %>% tail()

Unnamed: 0_level_0,Actual Counts,Seasonally Adjusted,Counts Derived from a Sample
<chr>,<chr>,<chr>,<chr>
1921Q2,2934,..,2934
1921Q3,1808,..,1808
1921Q4,3783,..,3783
1922Q1,3801,..,3801
1922Q2,1964,..,1964
1922Q3,1565,..,1565


Unnamed: 0_level_0,Actual Counts,Seasonally Adjusted,Counts Derived from a Sample
<chr>,<chr>,<chr>,<chr>
2021Q1,15384,15384,15384
2021Q2,141126,141126,141126
2021Q3,35195,35195,35195
2021Q4,15157,15157,15157
2022Q1,37892,37892,37892
2022Q2,221706,221706,221706


#### Saving dataframe into csv file

In [18]:
#table_node_table %>%
#    write.csv('data/tourism_international_visitors_arriavals_1921Q1-2022Q2.csv')

## Step 3: Wrangling the data and tidying it up as per requirement

In [54]:
# Rename the first two columns (period label and actual counts) in a single
# assignment.
names(table_node_table)[1:2] <- c("date", "Actual_counts")

### Get rid of extra columns

In [56]:
# Keep only the period label and the actual counts; the remaining columns are
# dropped.
int_visitors_arrivals_df <- table_node_table[c("date", "Actual_counts")]

### Convert the counts column into numeric format 

In [57]:
# Turn the 'Actual_counts' column from text into numbers: remove any commas
# (presumably thousands separators) first, then parse the result.
int_visitors_arrivals_df$Actual_counts <- as.numeric(
  gsub(",", "", int_visitors_arrivals_df$Actual_counts)
)

head(int_visitors_arrivals_df)

date,Actual_counts
<chr>,<dbl>
1921Q2,2934
1921Q3,1808
1921Q4,3783
1922Q1,3801
1922Q2,1964
1922Q3,1565


### Changing the format of the date column from 2022Q2 to 2022-06 (year and last month of the quarter)

In [59]:
# Split a label such as "1921Q2" at the letter "Q" into a "Year" column and a
# "Quarter" column (both character).
df <- separate(
  int_visitors_arrivals_df,
  col = date,
  into = c("Year", "Quarter"),
  sep = "Q"
)

In [60]:
# Replace each quarter number (1-4) by the two-digit number of the month that
# closes the quarter: 1 -> "03", 2 -> "06", 3 -> "09", 4 -> "12".
quarter_end_month <- as.integer(df$Quarter) * 3L

# sprintf() always returns a zero-padded character vector, whereas
# ifelse(x < 10, paste0("0", x), x) mixes character and numeric branches and
# therefore returns a type that depends on the data.
df$Quarter <- sprintf("%02d", quarter_end_month)

# sprintf() renders NA as the string "NA"; keep missing quarters missing.
df$Quarter[is.na(quarter_end_month)] <- NA_character_

head(df)

Year,Quarter,Actual_counts
<chr>,<chr>,<dbl>
1921,6,2934
1921,9,1808
1921,12,3783
1922,3,3801
1922,6,1964
1922,9,1565


In [61]:
# Rebuild the date label as "<year>-<month>", e.g. "1921-06".
int_visitors_arrivals_df$date <- paste(df$Year, df$Quarter, sep = "-")

In [62]:
# Preview the first rows of the cleaned data.
head(int_visitors_arrivals_df)

date,Actual_counts
<chr>,<dbl>
1921-06,2934
1921-09,1808
1921-12,3783
1922-03,3801
1922-06,1964
1922-09,1565


### saving the dataframe into csv file

In [63]:
# Write the cleaned data to disk. The 'data' directory must already exist.
# NOTE(review): write.csv() also writes the row names as an unnamed first
# column by default - confirm that downstream readers expect it.
write.csv(
  int_visitors_arrivals_df,
  'data/tourism_final_clean_arriavals_1921Q1-2022Q2.csv'
)

#### End of file