# Scraping the International visitor arrival data from Stats NZ website

This page is dynamically generated, so __rvest::read_html__ alone does not load it completely. To work around this, we use the __RSelenium__ library. The scraping process is therefore divided into two steps:
 - Automate/simulate opening the page in a browser using the tools provided by __RSelenium__
 - Scrape the required contents from the dynamically loaded page as usual using __rvest__ tools

In [1]:
library(rvest)

In [2]:
library(RSelenium)

## Step 1: Browser automation
Automation and loading the dynamic page

#### Starting Selenium server and browser 

In [5]:
# Start a Selenium server together with a Chrome browser session.
# NOTE(review): chromever pins the ChromeDriver version; presumably it has to
# match the locally installed Chrome - confirm before re-running.
rD <- rsDriver(
  browser = "chrome",
  chromever = "106.0.5249.21",
  port = 4444L,
  verbose = FALSE
)

#### Navigating to Statistics New Zealand's Infoshare page

In [6]:
# Take the browser client out of the driver object and open the Infoshare site.
remDr <- rD[["client"]]
remDr$navigate("https://infoshare.stats.govt.nz")

#### Getting contents of \<table>  

In [7]:
# Locate the 'Tourism' node link by its CSS selector and click it.
elem <- remDr$findElement(
  using = "css selector",
  value = "a#ctl00_MainContent_tvBrowseNodest9"
)
elem$clickElement()

In [8]:
# Locate the 'International travel and migration' node link and click it.
elem <- remDr$findElement(
  using = "css selector",
  value = "a#ctl00_MainContent_tvBrowseNodest11"
)
elem$clickElement()

In [9]:
# Locate the 'visitor arrival totals (qrtly-mar/jun/sep/dec)' node link and
# click it.
elem <- remDr$findElement(
  using = "css selector",
  value = "a#ctl00_MainContent_tvBrowseNodest520"
)
elem$clickElement()

In [10]:
# Click 'select all' in the Count type section.
elem <- remDr$findElement(
  using = "css selector",
  value = "span#ctl00_MainContent_ctl02_lblSelectAll"
)
elem$clickElement()

In [11]:
# Click 'select all' in the time section.
elem <- remDr$findElement(
  using = "css selector",
  value = "span#ctl00_MainContent_ctl04_lblSelectAll"
)
elem$clickElement()

In [12]:
# Press the 'Go' button.
elem <- remDr$findElement(
  using = "css selector",
  value = "input#ctl00_MainContent_btnGo"
)
elem$clickElement()

In [13]:
# Look up the resulting table on the page, then keep the page source in a
# variable for the scraping step.
# NOTE(review): elem is not used afterwards; presumably the lookup only serves
# as a check that the result table is present - confirm.
elem <- remDr$findElement(
  using = "css selector",
  value = "table.pxtableParent"
)
web_page_src <- remDr$getPageSource()[[1]]

In [14]:
# Release the Selenium resources: close the browser session first, then stop
# the Selenium server. The close is wrapped in tryCatch so that a failure to
# close the browser (e.g. the window was already closed by hand) is reported
# as a warning and the server is still stopped.
tryCatch(
  rD[["client"]]$close(),
  error = function(e) {
    warning(
      "Could not close the browser session: ", conditionMessage(e),
      call. = FALSE
    )
  }
)
rD[["server"]]$stop()

## Step 2: Scraping the page

In [17]:
# Parse the captured page source into an HTML document.
web_page <- read_html(web_page_src)

# Collect all nodes matching "table.pxtable" and display the second match.
statsnz_data_html <- html_nodes(web_page, "table.pxtable")
statsnz_data_html[[2]]

{html_node}
<table class="pxtable" cellpadding="0" cellspacing="0" border="0">
[1] <tbody>\n<tr>\n<td class="headfirst" colspan="1" rowspan="1"> </td>\n     ...

#### Extracting the table and saving it in a tibble

In [19]:
# Convert the second matched table node into a table object, then preview its
# first and last rows.
table_node_table <- statsnz_data_html[[2]] %>%
  html_table()
head(table_node_table)
tail(table_node_table)

X1,X2,X3,X4
<chr>,<chr>,<chr>,<chr>
,Actual Counts,Seasonally Adjusted,Counts Derived from a Sample
1921Q2,2934,..,2934
1921Q3,1808,..,1808
1921Q4,3783,..,3783
1922Q1,3801,..,3801
1922Q2,1964,..,1964


X1,X2,X3,X4
<chr>,<chr>,<chr>,<chr>
2021Q1,15384,15384,15384
2021Q2,141126,141126,141126
2021Q3,35195,35195,35195
2021Q4,15157,15157,15157
2022Q1,37892,37892,37892
2022Q2,221706,221706,221706


#### Saving the data frame to a CSV file

In [20]:
# Write the extracted table to a CSV file under data/.
write.csv(
  table_node_table,
  "data/tourism_international_visitors_arriavals_1921Q1-2022Q2.csv"
)

#### Releasing the port used by rsDriver