In [0]:
## Importing packages
library(tidyverse) # metapackage with lots of helpful functions
library(magrittr) # for advanced piping, eg. %<>% (pipe-and-assign)
library(lubridate) # for working with dates and times

# Load Data

In [0]:
# Read the trips table with readr's read_csv() (attached through tidyverse).
# Unlike base R's read.csv() it returns a tibble and keeps text columns as
# character instead of converting them to factors, which avoids trouble when
# joining tables later on. glimpse() then prints a compact column overview.
trips <- read_csv(
  "https://github.com/SDS-AAU/M1-2019/raw/master/data/trips.csv"
)
glimpse(trips)

In [0]:
# Read the people table and print a compact column overview
people <- read_csv(
  "https://github.com/SDS-AAU/M1-2019/raw/master/data/people.csv"
)
glimpse(people)

In [0]:
# Read the country lookup table and print a compact column overview
country <- read_csv(
  "https://github.com/SDS-AAU/M1-2019/raw/master/data/countrylist.csv"
)
glimpse(country)

# 1. Preprocessing

In [0]:
## 1.a
# Convert the start and end columns to Date objects with lubridate's
# as_date() (the lubridate counterpart of base R's as.Date()).
trips <- trips %>%
  mutate(
    date_start = as_date(date_start),
    date_end = as_date(date_end)
  )

In [0]:
# Preview the first rows to check the converted date columns
head(trips)

In [0]:
## 1.b
# Trip length: end date minus start date
trips <- mutate(trips, duration = date_end - date_start)

In [0]:
# Preview the first rows including the new duration column
head(trips)

In [0]:
## 1.c
# Filter extreme duration observations
# First a visual check: the histogram of trip durations shows which values
# could be considered extreme
ggplot(trips, aes(x = duration)) +
  geom_histogram()

In [0]:
# Same visual check for the distribution of trip start dates
ggplot(trips, aes(x = date_start)) +
  geom_histogram()

In [0]:
# Percentile rank of every trip's start and end date via percent_rank().
# The dates are converted to plain numbers before being ranked.
trips <- trips %>%
  mutate(
    pct_date_start = percent_rank(as.numeric(date_start)),
    pct_date_end = percent_rank(as.numeric(date_end))
  )

In [0]:
# Preview the first rows including the percentile columns
head(trips)

In [0]:
# Keep only trips with a positive duration whose start and end dates both fall
# between the 1st and the 97th percentile (bounds inclusive), then store the
# duration additionally as a plain number in dur_num
trips <- trips %>%
  filter(
    duration > 0,
    between(pct_date_start, 0.01, 0.97),
    between(pct_date_end, 0.01, 0.97)
  ) %>%
  mutate(dur_num = as.numeric(duration))

In [0]:
## 1.d
# Join with the country list: every trip keeps its row and gets the country
# columns whose alpha_2 code equals the trip's country_code
trips <- left_join(trips, country, by = c("country_code" = "alpha_2"))

In [0]:
# Preview the first rows after the country join
head(trips)

# 2: People

In [0]:
## 2.a
# People with a high school diploma
# Missing education values are replaced by an empty string ("") first, so
# str_detect() yields FALSE rather than NA for them
people <- people %>%
  mutate(HS = str_detect(replace_na(education_raw, ""), "High School"))

In [0]:
# Number of people with and without a high school diploma
count(people, HS)
# seems to be 321

In [0]:
## 2.b
# Software developers with a master's degree
# Step 1: flag everyone whose education text mentions "Master"
# (missing education counts as FALSE)
people <- people %>%
  mutate(MS = str_detect(replace_na(education_raw, ""), "Master"))

In [0]:
# Step 2: among people whose work description mentions "Software Dev",
# count those with and without a master's degree
people %>%
  filter(str_detect(work_raw, "Software Dev")) %>%
  count(MS)
# Seems to be 57

In [0]:
## 2.c
# Most popular master's graduate: sort the master's holders by follower count
# (highest first) and show the top rows
people %>%
  filter(MS) %>%
  arrange(desc(followers)) %>%
  head()
# It's the first one, @levelsio... check him out on Twitter if you like :)

# 3: Trips 

In [0]:
## 3.a
# Country with the most trips: count the rows per country with dplyr's n()
# and list the countries with the highest counts first
trips %>%
  group_by(country) %>%
  summarise(trips = n()) %>%
  arrange(-trips) %>%
  head()

# The US it is!

In [0]:
# Note: count(sort = TRUE) is a shorter way to get the same ranking
# (the count column is then called n)
head(count(trips, country, sort = TRUE))

In [0]:
## 3.b
# Add the calendar year of the trip start using lubridate's year()
trips <- mutate(trips, year = lubridate::year(date_start))

In [0]:
# Preview the first rows including the year column
head(trips)

In [0]:
# Countries ranked by the number of trips that started in 2017
trips_2017_ranked <- function(data) {
  head(count(filter(data, year == 2017), country, sort = TRUE))
}
trips_2017_ranked(trips)
# Again the US...

In [0]:
## 3.c
# Time spent in Eastern Asia: mean trip duration per country, keeping the six
# countries with the shortest average stay and drawing them as horizontal bars
trips %>%
  filter(sub_region == "Eastern Asia") %>%
  group_by(country) %>%
  summarise(ave_dur = mean(duration, na.rm = TRUE)) %>%
  arrange(ave_dur) %>%
  head(n = 6) %>%
  ggplot(mapping = aes(x = country %>% reorder(desc(ave_dur)), y = ave_dur)) +
  geom_col() +
  coord_flip()

# North Korea it is... I am not surprised...
# Note: reorder() makes the bars follow the average duration instead of the
# alphabetical order of the country names

In [0]:
## 3.d
# Trip duration of Software Dev people
# Step 1: attach the people columns to every trip, matched on username
trips <- left_join(trips, people, by = "username")

In [0]:
# Step 2: flag trips made by people whose work description mentions
# "Software Dev" (a missing description counts as FALSE)
trips <- trips %>%
  mutate(soft_dev = str_detect(replace_na(work_raw, ""), "Software Dev"))

In [0]:
# Step 3: mean trip duration for software developers versus everyone else
summarise(
  group_by(trips, soft_dev),
  ave_dur = mean(duration, na.rm = TRUE)
)
# They actually have slightly shorter trips

In [0]:
## 3.e
# Visualize median duration

# Round every start date down to the beginning of its week with lubridate's
# floor_date(); the per-week, per-region aggregation is done in a later step
trips <- trips %>%
  mutate(date_start_week = floor_date(date_start, "week"))

In [0]:
# Preview the first rows including the week column
head(trips)

In [0]:
# Aggregate to one row per start week and region.
# Note: the column named `trips` holds the median trip duration of that
# week/region, not a number of trips.
trips_w <- summarise(
  group_by(trips, date_start_week, region),
  trips = median(duration)
)

In [0]:
 # BONUS: Some regions have no trips in a given week and therefore no row ->
# add the missing week/region combinations (with NA values) using tidyr's
# complete() function.
# The summarised table is still grouped by date_start_week, and complete()
# works within groups and cannot complete a grouping column, so the grouping
# has to be dropped first.
trips_w %<>%
  ungroup() %>%
  complete(date_start_week, region)

In [0]:
# Plot the weekly median trip duration over time with ggplot2, one colour per
# region. The plotted `trips` column contains median durations (differences of
# two dates, i.e. days), so the y axis is labelled accordingly.
trips_w %>%
  ggplot(aes(x = date_start_week, y = trips, color = region)) +
  geom_point() +
  geom_line() +
  labs(x = "Time",
       y = "Median trip duration (days)")
# Not pretty but ok...