# Cleaning the Earthquake Dataset
### Importing the Dataset:

In [None]:
library(tidyverse)

In [None]:
# Canterbury: three exports, one per date range
cant_earthquake_df_2005_to_2010 <- read_csv(
    file = "earthquakes 01012005-31122010.csv",
    show_col_types = FALSE
)
cant_earthquake_df_2011_to_2015 <- read_csv(
    file = "earthquakes 01012011-31122015.csv",
    show_col_types = FALSE
)
cant_earthquake_df_2016_to_2021 <- read_csv(
    file = "earthquakes 01012016-30092021.csv",
    show_col_types = FALSE
)

# Auckland and Northland: a single export
auk_earthquake_df_2005_to_2021 <- read_csv(
    file = "earthquakes auckland and northland.csv",
    show_col_types = FALSE
)

# Wellington-Marlborough: five exports, one per date range
# NOTE(review): the 2007-2013 and 2013-2015 file names start with
# "earthquakes earthquakes", unlike the others - confirm they match the
# files on disk.
wlg_earthquake_df_2005_to_2007 <- read_csv(
    file = "earthquakes wellington-marlborough 2005-2007.csv",
    show_col_types = FALSE
)
wlg_earthquake_df_2007_to_2013 <- read_csv(
    file = "earthquakes earthquakes wellington-marlborough 2007-2013.csv",
    show_col_types = FALSE
)
wlg_earthquake_df_2013_to_2015 <- read_csv(
    file = "earthquakes earthquakes wellington-marlborough 2013-2015.csv",
    show_col_types = FALSE
)
wlg_earthquake_df_2015_to_2017 <- read_csv(
    file = "earthquakes wellington-marlborough 2015-2017.csv",
    show_col_types = FALSE
)
wlg_earthquake_df_2017_to_2021 <- read_csv(
    file = "earthquakes wellington-marlborough 2017-2021.csv",
    show_col_types = FALSE
)

In [None]:
# Show the first rows of one export from each region
cant_earthquake_df_2005_to_2010 %>% head()
auk_earthquake_df_2005_to_2021 %>% head()
wlg_earthquake_df_2015_to_2017 %>% head()

In [None]:
# Plot the missing values in every imported export, region by region
library(visdat)

# Canterbury
cant_earthquake_df_2005_to_2010 %>% vis_miss()
cant_earthquake_df_2011_to_2015 %>% vis_miss()
cant_earthquake_df_2016_to_2021 %>% vis_miss()

# Auckland and Northland
auk_earthquake_df_2005_to_2021 %>% vis_miss()

# Wellington-Marlborough
wlg_earthquake_df_2005_to_2007 %>% vis_miss()
wlg_earthquake_df_2007_to_2013 %>% vis_miss()
wlg_earthquake_df_2013_to_2015 %>% vis_miss()
wlg_earthquake_df_2015_to_2017 %>% vis_miss()
wlg_earthquake_df_2017_to_2021 %>% vis_miss()

# The plots show that a few columns have missing data. However, these are
# not columns that we need to use, which is good news :)

In [None]:
# Inspect the column names and types of the most recent Canterbury export.
# (The imported objects carry a region prefix; there is no un-prefixed
# `earthquake_df_2016_to_2021`.)
glimpse(cant_earthquake_df_2016_to_2021)

In [None]:
# Check that the three Canterbury exports have the same column names in the
# same order. identical() returns a single TRUE/FALSE per comparison and also
# works when the two exports have a different number of columns.
identical(
    colnames(cant_earthquake_df_2005_to_2010),
    colnames(cant_earthquake_df_2011_to_2015)
)
identical(
    colnames(cant_earthquake_df_2011_to_2015),
    colnames(cant_earthquake_df_2016_to_2021)
)

### Wrangling the Dataset

We need a function that does all of the data wrangling, so that we can run it on each of our datasets instead of repeating the same steps (to save us time) :)

In [None]:
# Install lubridate only when it is not already available, so that re-running
# the notebook does not download and reinstall the package every time.
if (!requireNamespace("lubridate", quietly = TRUE)) {
    install.packages("lubridate")
}

In [None]:
# note - could potentially break this down into multiple functions
# we also may need to filter out magnitudetype Mw??

#' Reduce a raw earthquake export to its most severe earthquakes
#'
#' Keeps the columns used in the analysis, drops every event whose
#' `eventtype` is not "earthquake", adds a `severity` column and keeps only
#' the highest-severity rows.
#'
#' @param dataset A data frame with at least the columns `eventtype`,
#'   `origintime`, `longitude`, `latitude`, `magnitude` and `depth`.
#' @param n_top Number of most severe earthquakes to keep. Rows tied with the
#'   last kept rank are all retained, so the result can have more than
#'   `n_top` rows. Defaults to 1000.
#' @return The filtered data with the six selected columns plus `severity`,
#'   in the same row order as `dataset`.
wrangle_earthquake_data <- function(dataset, n_top = 1000) {
    dataset %>%
        # step one - keep only the required columns
        select(eventtype, origintime, longitude, latitude, magnitude, depth) %>%
        # step two - keep only the rows where eventtype == "earthquake"
        filter(eventtype == "earthquake") %>%
        # step three - add a severity column. For now severity is simply the
        # magnitude; a formula that also uses depth
        # (1000 * magnitude - depth) was tried and is currently disabled.
        mutate(severity = magnitude) %>%
        # step four - filter out insignificant earthquakes by keeping the
        # n_top most severe ones. Alternatives: keep the top x%, or rank
        # severity into 10 categories and keep only the top 5 categories.
        # This is the same ranking rule top_n() applies, written out because
        # top_n() is superseded.
        filter(min_rank(desc(severity)) <= n_top)

    # TODO step five - group by origintime (by date only, ignoring the time
    # of day). Not implemented yet.
}

In [None]:
# Plot magnitude against (negated) depth, coloured by severity, for one
# wrangled dataset. `wrangled_df` was not defined anywhere, so it is built
# here to let the cell run on its own.
# NOTE(review): the first Canterbury export is used as the example - swap in
# another dataset if a different one was intended.
wrangled_df <- wrangle_earthquake_data(cant_earthquake_df_2005_to_2010)

ggplot(data = wrangled_df) +
    geom_point(mapping = aes(x = magnitude, y = -depth, colour = severity))

In [45]:
# Apply wrangle_earthquake_data() to every imported export

# Canterbury
wrangled_df_cant_2005_to_2010 <-
    cant_earthquake_df_2005_to_2010 %>% wrangle_earthquake_data()
wrangled_df_cant_2011_to_2015 <-
    cant_earthquake_df_2011_to_2015 %>% wrangle_earthquake_data()
wrangled_df_cant_2016_to_2021 <-
    cant_earthquake_df_2016_to_2021 %>% wrangle_earthquake_data()

# Auckland and Northland
wrangled_df_auk_earthquake_df_2005_to_2021 <-
    auk_earthquake_df_2005_to_2021 %>% wrangle_earthquake_data()

# Wellington-Marlborough
wrangled_df_wlg_earthquake_df_2005_to_2007 <-
    wlg_earthquake_df_2005_to_2007 %>% wrangle_earthquake_data()
wrangled_df_wlg_earthquake_df_2007_to_2013 <-
    wlg_earthquake_df_2007_to_2013 %>% wrangle_earthquake_data()
wrangled_df_wlg_earthquake_df_2013_to_2015 <-
    wlg_earthquake_df_2013_to_2015 %>% wrangle_earthquake_data()
wrangled_df_wlg_earthquake_df_2015_to_2017 <-
    wlg_earthquake_df_2015_to_2017 %>% wrangle_earthquake_data()
wrangled_df_wlg_earthquake_df_2017_to_2021 <-
    wlg_earthquake_df_2017_to_2021 %>% wrangle_earthquake_data()

### Combining the datasets

In [46]:
# Stack the wrangled exports into one data frame per region

# Canterbury: three date ranges
complete_cant_earthquake_df <- rbind(
    wrangled_df_cant_2005_to_2010,
    wrangled_df_cant_2011_to_2015,
    wrangled_df_cant_2016_to_2021
)

# Auckland and Northland: a single export, so there is nothing to combine
complete_auk_earthquake_df <- wrangled_df_auk_earthquake_df_2005_to_2021

# Wellington-Marlborough: five date ranges
complete_wlg_earthquake_df <- rbind(
    wrangled_df_wlg_earthquake_df_2005_to_2007,
    wrangled_df_wlg_earthquake_df_2007_to_2013,
    wrangled_df_wlg_earthquake_df_2013_to_2015,
    wrangled_df_wlg_earthquake_df_2015_to_2017,
    wrangled_df_wlg_earthquake_df_2017_to_2021
)

In [None]:
# Save the combined Canterbury data. `complete_earthquake_df` is not defined
# anywhere; the combined Canterbury data frame is
# `complete_cant_earthquake_df`, which matches the output file name.
write.csv(
    complete_cant_earthquake_df,
    "temp_canterbury_earthquake_data.csv",
    row.names = TRUE
)