In [2]:
library(tidyverse)
library(anomalize)
library(forecast)

In [3]:
# Collect the per-patch CSV files from the sensor_data directory.
# The dot is escaped and the pattern is anchored at the end so that only names
# ending in "patch_<2-3 digits>.csv" match (not e.g. "patch_01.csv.bak").
csv_files <- list.files(
  path = "data/sensor_data",
  pattern = "patch_[0-9]{2,3}\\.csv$",
  full.names = TRUE
)

In [None]:
# Clean one patch's sensor file and save the result.
#
# Steps: read the CSV, keep `dateTime` plus every column whose name contains
# "water_content", drop rows with missing values, remove anomalies from each
# water-content column (STL decomposition + IQR rule on the remainder), merge
# the per-column results on `dateTime`, reshape to long format, and write the
# table to the same path with "sensor_data" replaced by "sensor_data_clean".
#
# Args:
#   file_path: Path to a patch CSV. It must contain a `dateTime` column and at
#     least one "water_content" column. The reshape step expects those columns
#     to be named like `water_content_<side>_<depth>_cm`.
#
# Returns:
#   The long-format table that was written (columns dateTime, side, depth, wc,
#   patch), invisibly, as returned by write_csv().
#
# Raises:
#   An error if the file has no "water_content" columns.
clean_data <- function(file_path) {
  raw <- read_csv(file_path)

  var_names <- grep("water_content", names(raw), value = TRUE)
  if (length(var_names) == 0) {
    # Without this guard the merge below yields NULL and pivot_longer() fails
    # with an unrelated-looking message.
    stop("No water_content columns found in ", file_path, call. = FALSE)
  }

  df <- raw %>%
    select(dateTime, all_of(var_names)) %>%
    drop_na()

  print(paste("After removing NA: ", nrow(df)))

  # Read the patch id from the file name only, so digits that happen to appear
  # in a directory name are never mistaken for it.
  patch <- str_extract(basename(file_path), "\\d+")

  df_list <- list()

  for (var_name in var_names) {
    # Decompose the series and flag anomalous remainders.
    # NOTE(review): numeric `frequency`/`trend` are interpreted by anomalize as
    # numbers of observations. The earlier note called these "24 hour season,
    # 3 hour trend", which 60 and 180 cannot both be at a single sampling
    # interval -- TODO confirm against the logger interval. Values unchanged.
    df_outliers <- df %>%
      time_decompose(
        .data[[var_name]],
        method = "stl",
        frequency = 60,
        trend = 180
      ) %>%
      anomalize(remainder, method = "iqr", alpha = 0.05)

    observed_cols <- grep("observed", names(df_outliers), value = TRUE)

    # Keep only non-anomalous rows, retaining dateTime and the observed value,
    # and suffix the observed column with the sensor name so the per-sensor
    # tables can be merged without name clashes.
    df_no_outliers <- df_outliers %>%
      filter(anomaly == "No") %>%
      select(dateTime, all_of(observed_cols)) %>%
      rename_with(
        ~ paste0(.x, "_", var_name),
        .cols = all_of(observed_cols)
      )

    print(paste("After outlier removal: ", nrow(df_no_outliers)))

    df_list[[var_name]] <- df_no_outliers
  }

  # Full outer merge on dateTime: a timestamp dropped as an outlier for one
  # sensor is kept (with NA) as long as another sensor still has it.
  df_combined <- Reduce(
    function(...) merge(..., by = "dateTime", all = TRUE),
    df_list
  )
  print(paste("Final data frame: ", nrow(df_combined)))

  # One row per (dateTime, side, depth); side and depth are parsed out of the
  # column names, depth as a number.
  df_combined <- df_combined %>%
    pivot_longer(
      cols = -dateTime,
      names_to = c("side", "depth"),
      names_pattern = "observed_water_content_(.*)_([0-9]+)_cm",
      names_transform = list(depth = ~ readr::parse_double(.x)),
      values_to = "wc"
    ) %>%
    mutate(patch = patch)

  # Mirror the input path under the "sensor_data_clean" directory, creating it
  # on first use so the write does not fail on a fresh checkout.
  output_file_path <- str_replace(file_path, "sensor_data", "sensor_data_clean")
  dir.create(dirname(output_file_path), recursive = TRUE, showWarnings = FALSE)

  # The path is passed positionally: the `path =` argument name is deprecated
  # in current readr in favour of `file =`.
  write_csv(df_combined, output_file_path)
}

# Clean every CSV file found above. clean_data() is called for its side effect
# (writing the cleaned file), so walk() is used instead of lapply(): it runs
# the function on each path and returns its input invisibly, which avoids
# printing the full list of cleaned tables at top level.
walk(csv_files, clean_data)
