# Explore the time-series of metrics in the pre-test period

IMPORTANT: This notebook uses an R kernel.

#### How to set up Jupyter Notebook for R

These instructions assume that you already have a working Python environment for your local copy of this project's repository, and that Jupyter Notebook is already installed in that environment and can be launched from your Terminal.

1. Install R 

   If not already installed, see https://cloud.r-project.org/index.html
   

2. Install R kernel for Jupyter Notebook

    In your Terminal (note: not in RStudio, not in the R GUI):
    
    - launch R by entering `R` on the command line.

    - You should now be using R from your Terminal. Run the following commands:
    ```
    install.packages('IRkernel')
    IRkernel::installspec()
    ```

    Done! You can now quit R by entering `q()`.

If you now launch Jupyter Notebook, you'll have the option to choose `R` as kernel.


### Setting things up

In [None]:
# Install the packages this notebook needs, if they aren't already available.
# This can take a minute or two.
packages <- c("bigrquery", "tidyverse", "plotly")
missing_packages <- setdiff(packages, rownames(installed.packages()))

# Only call install.packages() when at least one package is actually missing,
# so re-running this cell never triggers an install with an empty package list.
if (length(missing_packages) > 0) {
  install.packages(missing_packages, quiet = TRUE)
}

In [None]:
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(bigrquery))
suppressPackageStartupMessages(library(tidyverse))

In [None]:
# Authenticate with BigQuery using a service-account JSON key.
# Set the BIGQUERY_SERVICE_ACCOUNT_PATH environment variable to
# /path/to/your/service-account.json to avoid editing this cell; when the
# variable is not set, the path below is used.
credentials_path <- Sys.getenv(
  "BIGQUERY_SERVICE_ACCOUNT_PATH",
  unset = "/Users/alessiatosi/Secrets/govuk-bigquery-analytics-service-credentials.json"
)
bq_auth(path = credentials_path)

In [None]:
# Set the width and height used for plots rendered in this notebook.
options(
  repr.plot.width = 15,
  repr.plot.height = 8
)

In [None]:
# Custom ggplot2 theme added to the plots in this notebook: bold centred
# title, black axis text and axis lines, no panel background, and a
# horizontal legend placed at the bottom.
theme_custom <- theme(
  # Title and subtitle
  plot.title = element_text(face = "bold", hjust = 0.5, size = 18),
  plot.subtitle = element_text(size = 14),
  # Axis titles
  axis.title.x = element_text(size = 16, hjust = 0.5, vjust = 0.2),
  axis.title.y = element_text(size = 16, hjust = 0.5, vjust = 0.2),
  # Axis tick labels and axis lines
  axis.text.x = element_text(colour = "black", size = 12),
  axis.text.y = element_text(colour = "black", size = 12),
  axis.line = element_line(colour = "black"),
  # Panel
  panel.background = element_blank(),
  # Legend
  legend.position = "bottom",
  legend.direction = "horizontal"
)

### Get the data

In [None]:
# Project ID handed to bq_project_query() as its first argument (per the
# bigrquery documentation, this is the project the query is billed to).
# Replace it with your own project ID if needed, e.g.:
# billing <- "govuk-xgov"
project <- "govuk-bigquery-analytics"

# Pull every column of the pre-test dataset.
sql <- "SELECT * FROM `govuk-bigquery-analytics.datascience.related_links_20211023_20211119_pre_test_data`"

# Run the query, then download its result table into `tb`.
tb <- bq_table_download(bq_project_query(project, sql))

In [None]:
# Display the downloaded table to inspect its contents.
tb

### Data pre-processing

In [None]:
# Convert the `date` column from its "%Y%m%d" text form (e.g. "20211023")
# into a proper Date vector.
tb[["date"]] <- as.Date(as.character(tb[["date"]]), format = "%Y%m%d")

In [None]:
# Order the rows chronologically (ascending `date`).
tb <- dplyr::arrange(tb, date)

### Plotting

In [None]:
plot_timeseries <- function(data, ts_var = "", title = "", x_title = "",
                            vline_date = as.Date("2021-11-11")) {
    #' Plot one metric against `date` as points joined by a line, with a
    #' LOESS smoother (red, with confidence band) and an optional dashed
    #' blue vertical reference line.
    #'
    #'@param data (data.frame) : dataset; must contain a `date` column and the
    #'  column named by `ts_var`
    #'@param ts_var (character string) : name of the variable containing the time-series data
    #'@param title (character string) : plot title
    #'@param x_title (character string) : label of the y-axis, i.e. the axis
    #'  showing `ts_var` (the argument name is kept for backward compatibility)
    #'@param vline_date (Date) : date at which the vertical reference line is
    #'  drawn; use NULL to omit the line
    #'@return time-series plot (a ggplot object)

    if (!"date" %in% colnames(data)) {
        stop(paste0("column `date` is missing from dataset"))
    }
    # Fail early with a readable message instead of a cryptic error raised
    # while building the aesthetic mapping.
    if (!is.character(ts_var) || length(ts_var) != 1 || is.na(ts_var)) {
        stop("`ts_var` must be a single character string")
    }
    if (!ts_var %in% colnames(data)) {
        stop(paste0("column `", ts_var, "` is missing from dataset"))
    }

    # `.data[[ts_var]]` looks the column up by its string name.
    # NOTE: `size` is kept on geom_line() for compatibility with older ggplot2;
    # ggplot2 >= 3.4.0 prefers `linewidth` for line widths.
    p <- ggplot2::ggplot(data, aes(date, .data[[ts_var]])) +
        geom_point(size = 2) +
        geom_line(size = 1) +
        geom_smooth(method = "loess", formula = y ~ x, colour = "red", se = TRUE)

    # Pass the date as a fixed `xintercept` so a single line is drawn, rather
    # than one identical line per row of `data`.
    if (!is.null(vline_date)) {
        p <- p + geom_vline(xintercept = vline_date, colour = "blue", linetype = 2)
    }

    p +
        labs(
            title = title,
            subtitle = "Pre-intervention time series") +
        ylab(x_title) +
        theme_custom
}

In [None]:
# Time series of the `pc_visitors_used_rl` metric.
plot_timeseries(
  data = tb,
  ts_var = "pc_visitors_used_rl",
  title = "Percentage of visitors who clicked on at least 1 related link",
  x_title = "% of visitors"
)

In [None]:
# Time series of the `pc_visitors_that_clicked_navigation` metric.
plot_timeseries(
  data = tb,
  ts_var = "pc_visitors_that_clicked_navigation",
  title = "Percentage of visitors who clicked on a navigation element",
  x_title = "% of visitors"
)

In [None]:
# Time series of the `pc_visitors_2_or_more_rl` metric.
plot_timeseries(
  data = tb,
  ts_var = "pc_visitors_2_or_more_rl",
  title = "Percentage of visitors who clicked 2 or more related links",
  x_title = "% of visitors"
)

In [None]:
# Time series of the `pc_visitors_that_used_search` metric.
plot_timeseries(
  data = tb,
  ts_var = "pc_visitors_that_used_search",
  title = "Percentage of visitors who used internal search",
  x_title = "% of visitors"
)