# Scraping the total imports (overseas cargo) data from the Stats NZ Infoshare website

This page is dynamically generated, so __rvest::read_html__ alone does not load the page completely. To work around this we use the __RSelenium__ library. The scraping process is therefore divided into two steps:
 - Automate/simulate opening the page in a browser using the tools provided by __RSelenium__
 - Scrape the required contents from the dynamically loaded page as usual using __rvest__ tools

In [1]:
library(tidyverse)
library(dplyr)
library(rvest)

── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.8     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1
── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: 'rvest'


The following object is masked from 'package:readr':

    guess_encoding




In [2]:
library(RSelenium)

In [3]:
#install.packages("janitor")
library(janitor)


Attaching package: 'janitor'


The following objects are masked from 'package:stats':

    chisq.test, fisher.test




## Step 1: Browser automation
Automation and loading the dynamic page

#### Starting Selenium server and browser 

In [4]:
# Start a Selenium server plus a Chrome client on port 4444; the Chrome
# driver version is pinned explicitly.
rD <- rsDriver(
  port = 4444L,
  browser = "chrome",
  chromever = "106.0.5249.21",
  verbose = FALSE
)

#### Navigating to the Statistics New Zealand Infoshare page

In [5]:
# Take the browser client from the driver object and open the Infoshare site
remDr <- rD[["client"]]
remDr$navigate("https://infoshare.stats.govt.nz")

#### Getting contents of the \<table>
Navigate to the target node in the tree by expanding it. Each node is dynamically generated and therefore cannot be accessed directly.

In [11]:
# Expand the 'Imports & Exports' node of the browse tree
elem <- remDr$findElement(
  using = "css selector",
  value = "a#ctl00_MainContent_tvBrowseNodest4"
)
elem$clickElement()

In [12]:
# Expand the 'Overseas cargo statistics' node
elem <- remDr$findElement(
  using = "css selector",
  value = "a#ctl00_MainContent_tvBrowseNodest8"
)
elem$clickElement()

In [13]:
# Open the 'Total imports by NZ (qrtly-mar/jun/sep/dec)' node
elem <- remDr$findElement(
  using = "css selector",
  value = "a#ctl00_MainContent_tvBrowseNodest23"
)
elem$clickElement()

In [14]:
# NZ port section: click the last <option> of the ctl02 list box
elem <- remDr$findElement(
  using = "xpath",
  value = "//select[@id = 'ctl00_MainContent_ctl02_lbVariableOptions']/option[last()]"
)
elem$clickElement()

In [15]:
# Observation section: click the last <option> of the ctl04 list box
# (the XPath picks the final entry; it does not select every entry)
elem <- remDr$findElement(
  using = "xpath",
  value = "//select[@id = 'ctl00_MainContent_ctl04_lbVariableOptions']/option[last()]"
)
elem$clickElement()

In [16]:
# Time section: click the 'select all' control
elem <- remDr$findElement(
  using = "css selector",
  value = "span#ctl00_MainContent_ctl07_lblSelectAll"
)
elem$clickElement()

In [17]:
# Submit the selection by clicking the 'Go' button
elem <- remDr$findElement(
  using = "css selector",
  value = "input#ctl00_MainContent_btnGo"
)
elem$clickElement()

### Saving the page source (including the result table) into a variable for later use.

In [18]:
# Locate the result table's wrapper element, then capture the HTML source of
# the whole page as a single string
elem <- remDr$findElement(
  using = "css selector",
  value = "table.pxtableParent"
)
page_source_list <- remDr$getPageSource()
web_page_src <- page_source_list[[1]]

### Stopping the driver and closing the browser

In [19]:
# Release the Selenium resources.
# First close the client's current browser session; stopping the server alone
# does not end that session explicitly. Then stop the Selenium server process.
remDr$close()
rD[["server"]]$stop()

## Step 2: Scraping the page

In [30]:
# Parse the captured page source and collect every <table class="pxtable"> node
web_page <- read_html(web_page_src)
statsnz_data_html <- html_nodes(web_page, "table.pxtable")

# Display the second matched table node
statsnz_data_html[[2]]

{html_node}
<table class="pxtable" cellpadding="0" cellspacing="0" border="0">
[1] <tbody>\n<tr>\n<td class="headfirst" colspan="1" rowspan="2"> </td>\n     ...

#### Extracting table and saving it in tibble

In [31]:
# Convert the second pxtable node into a table, then promote its first row to
# the column names with janitor's row_to_names()
table_node_table <- row_to_names(
  html_table(statsnz_data_html[[2]]),
  row_number = 1
)

In [32]:
# Preview the first and the last rows of the extracted table
table_node_table %>% head()
table_node_table %>% tail()

Unnamed: 0_level_0,Total All Cargo
<chr>,<chr>
,Gross weight (tonnes)
1988Q1,2036304
1988Q2,1695013
1988Q3,1612744
1988Q4,1618301
1989Q1,1961851


Unnamed: 0_level_0,Total All Cargo
<chr>,<chr>
2021Q2,6041585
2021Q3,6713726
2021Q4,6720260
2022Q1,5644293
2022Q2,5800983
2022Q3,5739492


#### Saving dataframe into csv file

In [33]:
#table_node_table %>%
#    write.csv('data/tourism_international_visitors_arriavals_1921Q1-2022Q2.csv')

## Step 3: Wrangling the data and tidying it up as per requirement

In [35]:
# Give the first two columns descriptive names: the period label and the
# gross import weight
names(table_node_table)[1:2] <- c('date', 'Total_Gross_Imports_in_tonnes')

In [36]:
# The source table has a two-level header. The second level (the units label)
# is imported as the first data row, so drop that row while copying the table.
int_total_imports_df <- table_node_table[-1, ]

### Convert the counts column into numeric format 

In [37]:
# Remove any comma separators from the total-imports column and convert the
# values to numeric
int_total_imports_df$Total_Gross_Imports_in_tonnes <- as.numeric(
  gsub(",", "", int_total_imports_df$Total_Gross_Imports_in_tonnes)
)

head(int_total_imports_df)

date,Total_Gross_Imports_in_tonnes
<chr>,<dbl>
1988Q1,2036304
1988Q2,1695013
1988Q3,1612744
1988Q4,1618301
1989Q1,1961851
1989Q2,1945879


### Changing the format of the data in the date column from 2022Q2 to 2022-06 (year and quarter-end month)

In [38]:
# Split period labels such as "1988Q1" at the "Q" into Year and Quarter columns
df <- separate(
  int_total_imports_df,
  col = date,
  into = c("Year", "Quarter"),
  sep = "Q"
)

In [39]:
# Map each quarter number to its closing month (1 -> 3, 2 -> 6, 3 -> 9, 4 -> 12)
quarter_end_month <- 3 * as.numeric(df$Quarter)

# Prefix single-digit months with "0" so that e.g. 3 becomes "03"
df$Quarter <- ifelse(
  quarter_end_month < 10,
  paste0("0", quarter_end_month),
  quarter_end_month
)

head(df)

Year,Quarter,Total_Gross_Imports_in_tonnes
<chr>,<chr>,<dbl>
1988,3,2036304
1988,6,1695013
1988,9,1612744
1988,12,1618301
1989,3,1961851
1989,6,1945879


In [40]:
# Rebuild the date column as "<year>-<month>" from the split columns
year_part <- df$Year
month_part <- df$Quarter
int_total_imports_df$date <- paste0(year_part, "-", month_part)

In [41]:
# Preview the first rows of the tidied data
head(int_total_imports_df)

date,Total_Gross_Imports_in_tonnes
<chr>,<dbl>
1988-03,2036304
1988-06,1695013
1988-09,1612744
1988-12,1618301
1989-03,1961851
1989-06,1945879


### saving the dataframe into csv file

In [42]:
# Write the tidied data to a CSV file under data/
write.csv(
  int_total_imports_df,
  file = 'data/total_imports_final_clean_1988Q1-2022Q3.csv'
)

#### End of file