# R, BIGQUERY, CLOUD STORAGE SETUP

## Install and import necessary R libraries for this notebook, set options

In [None]:
# Attach every package this notebook uses; suppressMessages() hides the
# packages' startup messages so the cell output stays clean.
suppressMessages({
  # Install packages that might not be installed by default
  # install.packages('plotly')
  # install.packages('furrr')
  # install.packages('tictoc')

  # Data wrangling, string templating, parallel mapping, and timing
  library(tidyverse)
  library(glue)
  library(future)
  library(furrr)
  library(tictoc)
    
  # BigQuery client (bq_auth, bq_project_query, bq_table_download)
  library(bigrquery)
    
  # Interactive tables and plots
  library(DT)
  library(plotly)
})

# Do not truncate printed tibbles by width, so every column is shown.
options(tibble.width = Inf)

## Enter Google Cloud/BigQuery Project ID in cell below, authenticate BigQuery

In [None]:
# ENTER YOUR PROJECT ID HERE
# This project ID is passed to every bq_project_query() call below.
PROJECT_ID <- "gcp-data-science-demo"

# Authenticate bigrquery, requesting the out-of-band flow via `use_oob`.
bq_auth(use_oob = TRUE)

# PICK 2 ATHLETES OF INTEREST AND LOOK AT DAILY WIKIPEDIA VIEWS

## Enter names of 2 athletes of interest in cell below

In [None]:
# ENTER 2 ATHLETES' NAMES HERE
ATHLETE_1_NAME <- 'LeBron James'
ATHLETE_2_NAME <- 'Patrick Mahomes'

# Format athlete names as a SQL list of lower-case Wikipedia page titles.
#
# Args:
#   athlete_names: character vector of display names, e.g. "LeBron James".
#
# Returns:
#   A single string such as "('lebron_james', 'patrick_mahomes')", ready to be
#   placed after `IN` in a SQL WHERE clause.
format_wiki_page_titles <- function(athlete_names) {
  # Replace EVERY space, not only the first one, so that multi-word names
  # such as "Odell Beckham Jr." become "odell_beckham_jr.".
  page_titles <- gsub(" ", "_", tolower(athlete_names), fixed = TRUE)

  # Escape backslashes first and then single quotes, so a name such as
  # "Shaquille O'Neal" cannot terminate the quoted SQL string literal early.
  page_titles <- gsub("\\", "\\\\", page_titles, fixed = TRUE)
  page_titles <- gsub("'", "\\'", page_titles, fixed = TRUE)

  paste0("(", paste0("'", page_titles, "'", collapse = ", "), ")")
}

athlete_wiki_page_titles <- format_wiki_page_titles(
  c(ATHLETE_1_NAME, ATHLETE_2_NAME)
)

print(athlete_wiki_page_titles)

## Create SQL query to get specific Wikipedia page views using text formatting

In [None]:
# Query template: daily 2020 page views (dates in Los Angeles time) for the
# English desktop and mobile wikis. The single `%s` placeholder receives the
# parenthesized list of page titles.
sql_query_with_names_param <- "
    SELECT
      DATE(datehour, 'America/Los_Angeles') AS date,
      LOWER(title) AS page_title,
      SUM(views) AS views

    FROM 
      `bigquery-public-data.wikipedia.pageviews_2020`

    WHERE 
      DATE(datehour, 'America/Los_Angeles') >= '2020-01-01' AND
      DATE(datehour, 'America/Los_Angeles') <= '2020-12-31' AND
      LOWER(title) IN %s AND
      wiki IN ('en', 'en.m')

    GROUP BY
      date, page_title

    ORDER BY
      date DESC, views DESC
    "

# Substitute the page-title list into the template.
sql_query_with_names <- sprintf(
  fmt = sql_query_with_names_param,
  athlete_wiki_page_titles
)

# Show the final SQL text so it can be inspected before it is run.
cat(sql_query_with_names)

## Look at page views data with both static and interactive tables

In [None]:
# Run the query in the chosen project, then download the result table.
athlete_views <- bq_table_download(
  bq_project_query(x = PROJECT_ID, query = sql_query_with_names)
)

# Static preview of the first 10 rows.
head(athlete_views, 10)

In [None]:
# Interactive table of the same page-view data.
DT::datatable(data = athlete_views)

## Create interactive plot of the 2 athletes' daily views

In [None]:
# Static ggplot of daily page views, one colored series per page title.
# The stray trailing comma after `aes(...)` was removed: it passed an empty
# argument into ggplot()'s `...`, which only works as long as those dots are
# never evaluated.
athlete_views_by_date_plot <- ggplot(
  data = athlete_views,
  mapping = aes(
    x = date,
    y = views,
    color = page_title
  )
) +
  # One tick per month on the date axis, labelled like "Jan 01".
  scale_x_date(date_breaks = 'month', date_labels = '%b %d') +
  # Two manual colors, one for each of the two page titles.
  scale_color_manual(values = c('#552583', '#E31837')) +
  geom_point() +
  geom_path() +
  ggtitle(
    label = paste0(
      '2020 Wikipedia Page Views by Date for ',
      paste0(c(ATHLETE_1_NAME, ATHLETE_2_NAME), collapse = ', ')
    )
  ) +
  # Tilt the date labels so adjacent months do not overlap.
  theme(
    axis.text.x = element_text(angle = 45)
  )

# Convert the static plot into an interactive plotly widget.
interactive_athlete_views_by_date_plot <- ggplotly(athlete_views_by_date_plot)

interactive_athlete_views_by_date_plot

### Output single plot to Cloud Storage using system commands

In [None]:
# ENTER CLOUD STORAGE BUCKET AND DESIRED INTERACTIVE PLOT OUTPUT FILE NAME
CLOUD_STORAGE_BUCKET <- 'r-demos'
PLOT_OUTPUT_FILENAME <- "athlete_wiki_views_by_date.html"

# Write the interactive plot to a single self-contained HTML file.
htmlwidgets::saveWidget(
  widget = interactive_athlete_views_by_date_plot,
  file = PLOT_OUTPUT_FILENAME,
  selfcontained = TRUE
)

cloud_storage_bucket_url <- paste0("gs://", CLOUD_STORAGE_BUCKET, "/")

# shQuote() keeps file or bucket names containing spaces or shell
# metacharacters from being split or interpreted by the shell.
cloud_storage_upload_command <- paste(
  "gsutil cp",
  shQuote(PLOT_OUTPUT_FILENAME),
  shQuote(cloud_storage_bucket_url)
)

# system() returns the command's exit status; report a failed upload instead
# of silently carrying on to the listing step.
upload_exit_status <- system(cloud_storage_upload_command)
if (upload_exit_status != 0L) {
  warning(
    "Upload command failed with exit status ", upload_exit_status, ": ",
    cloud_storage_upload_command,
    call. = FALSE
  )
}

cloud_storage_check_command <- paste(
  "gsutil ls -l",
  shQuote(cloud_storage_bucket_url)
)

# List the bucket contents and return the listing lines as the cell output.
system(cloud_storage_check_command, intern = TRUE)

# BONUS: EXTEND DATA GATHERING TO MANY MORE ATHLETES

## Set up for parallel processing in following steps

In [None]:
# Have future take its core count from the `mc.cores` option, and set that
# option to 64.
options(
  future.availableCores.methods = "mc.cores",
  mc.cores = 64
)

# Run futures in background R sessions.
plan(multisession)

## Read in CSV of top athletes according to YouGov

In [None]:
# Read the athlete list and derive each athlete's Wikipedia page title:
# the lower-cased name with spaces turned into underscores, plus an optional
# "_<wikipedia_page_add>" suffix when that column is not NA.
top_athletes <- read_csv("top_athletes_by_yougov.csv") %>%
  mutate(
    page_title = athlete_name %>%
      tolower() %>%
      # str_replace_all(), not str_replace(): every space must become an
      # underscore, otherwise names with three or more words keep a space
      # and never match a page title.
      str_replace_all(" ", "_") %>%
      paste0(.,
        ifelse(is.na(wikipedia_page_add), "",
          paste0('_', wikipedia_page_add))
        )
    ) %>%
  print(n = 25)

## Create template of SQL query to be used to read in single athlete, single year page views

In [None]:
# SQL template for one athlete's daily page views in one year. The `{year}`
# and `{page_title}` placeholders are filled in later by glue(). `{year}`
# selects both the yearly pageviews table and the date range; dates are
# computed in the America/Los_Angeles time zone and only the 'en' and 'en.m'
# wikis are counted.
wiki_views_query_template <- "
    SELECT
      DATE(datehour, 'America/Los_Angeles') AS date,
      LOWER(title) AS page_title,
      SUM(views) AS views

    FROM 
      `bigquery-public-data.wikipedia.pageviews_{year}`

    WHERE 
      (DATE(datehour, 'America/Los_Angeles') BETWEEN 
        '{year}-01-01' AND '{year}-12-31') AND
      LOWER(title) = '{page_title}' AND
      wiki IN ('en', 'en.m')

    GROUP BY
      date, page_title
  "

## Enter years of interest, read in page view data for many athletes in that span (this may take several minutes, depending on years)

In [None]:
# ENTER START AND END YEARS FOR PAGE VIEW DATA GATHERING
START_YEAR <- 2017
END_YEAR <- 2020

tic("Reading in Wikipedia Page View Data for Many Athletes Across Multiple Years")

many_athlete_multi_year_page_views <- top_athletes %>%
  crossing(
    year = START_YEAR:END_YEAR
    ) %>%
  mutate(
    wiki_views_query_text = glue(wiki_views_query_template)
    ) %>%
  mutate(
    wiki_views = future_pmap(
      list(query = wiki_views_query_text),
      ~bq_table_download(bq_project_query(x = PROJECT_ID, query = .))
     )
    ) %>%
  print()

toc()

## Get unnested version of athlete daily page views data, add daily ranking (among this set)

In [None]:
# Flatten the nested per-athlete, per-year tables into one row per
# (date, athlete), then rank athletes within each date by views
# (rank 1 = most views that day).
athlete_page_views_unnest_with_ranks <- many_athlete_multi_year_page_views %>%
  select(year, athlete_name, wiki_views) %>%
  unnest(wiki_views) %>%
  select(date, athlete_name, views) %>%
  group_by(date) %>%
  mutate(
    # min_rank() gives tied athletes the same, smallest rank. Base rank()
    # averages ties (two athletes tied for first would both get 1.5), which
    # would leave such a day without any rank-1 athlete.
    date_view_rank = min_rank(desc(views))
  ) %>%
  ungroup() %>%
  arrange(desc(date), date_view_rank)

DT::datatable(athlete_page_views_unnest_with_ranks)

## Count # of days each athlete ranked #1 (among this set) in Wikipedia page views in span, look at those w/ most #1 days

In [None]:
# For each athlete, count the days on which their daily view rank was 1,
# then list athletes from most to fewest such days.
athletes_with_most_days_no1_in_views <- athlete_page_views_unnest_with_ranks %>%
  group_by(athlete_name) %>%
  summarise(num_days_no1 = sum(as.numeric(date_view_rank == 1))) %>%
  ungroup() %>%
  arrange(desc(num_days_no1)) %>%
  print(n = 25)