---

**This notebook processes the `LocationOutput.csv` files generated by the Python script during the Slurm phase of the pipeline. Each `<File_ID>_LocationOutput.csv` is saved in its own nested build directory and needs to be copied into `./build/LocationOutputs/` first. Then, R tidyverse is used to process the data and perform a statistical analysis.**

---

# Setup

## R environment

In [None]:
## Load tidyverse
library('tidyverse')

In [None]:
# Shared plot styling: hidden legend, thin axis lines and ticks, small text,
# and facet strips drawn outside the axes with no background.
my.theme <- theme(
  strip.background = element_blank(),
  strip.text = element_text(size = 8),
  strip.placement = "outside",
  axis.text = element_text(size = 6),
  axis.title = element_text(size = 8),
  axis.ticks = element_line(linewidth = 0.25),
  axis.line = element_line(linewidth = 0.25),
  legend.position = "none"
)

# Session-wide geom defaults: thin lines and small white-filled circles
update_geom_defaults("line", list(linewidth = 0.25))
update_geom_defaults("point", list(fill = "white", shape = 21, size = 1.1))

## Make tables & define sample

In [None]:
# Birds to loop through in the analysis cells.
# Earlier selections, kept for reference:
#all_birds <- c("Rb318", "Rb331", "Rb334", "Rb383", "Rb384", "Rb385")
#analysis_vars <- c("Rb346", "Rb349", "Rb3830", "Rb384", "Rb385", "Rb389", "Rb390")
# "Rb318", "Rb331" and "Rb334" were removed from the analysis.
# Current selection: testing with the sample already processed
analysis_vars <- c(
  "Rb346",
  "Rb349",
  "Rb384",
  "Rb385"
)

In [None]:
# Import every LocationOutput CSV from the build folder and stack them
# into a single data frame.
folder_path <- "../build/LocationOutputs/"

# Anchor the pattern so only names ending in ".csv" match
# (an unanchored "\\.csv" would also pick up e.g. "x.csv.bak").
csv_files <- list.files(
  path = folder_path,
  pattern = "\\.csv$",
  full.names = TRUE
)

# Fail early with a clear message instead of an obscure
# "object 'File' not found" error further down.
if (length(csv_files) == 0) {
  stop("No .csv files found in ", folder_path, call. = FALSE)
}

list_of_dataframes <- lapply(csv_files, read.csv)

combined_data <- bind_rows(list_of_dataframes) %>%
  # Drop the last four characters of File
  # (presumably a file extension; TODO confirm against the CSV contents)
  mutate(File = str_sub(File, end = -5))

In [None]:
# Display the combined table for a quick visual check
combined_data

In [None]:
# Load the reference table (videos.csv)
reference_path <- "../tables/videos.csv"
reference_csv <- read.csv(file = reference_path)
# Uncomment to inspect:
#reference_csv

In [None]:
# Load the surgery reference and keep only the two columns used downstream:
# the bird identifier and what it was injected with.
virus_path <- "../tables/all-birds.csv"
virus_csv <- read.csv(virus_path) %>%
  select(bird, injected_with)
# Uncomment to inspect:
#virus_csv

In [None]:
# Build the mastersheet in one pipeline:
#   1. keep reference rows that have a matching File in the location data,
#   2. attach the injection info for each bird,
#   3. filter out outliers (Frame >= 20000 or Distance_px >= 80).
mastersheet <- reference_csv %>%
  inner_join(combined_data, by = "File") %>%
  left_join(virus_csv, by = "bird") %>%
  filter(Frame < 20000, Distance_px < 80)

# Show the resulting column names
names(mastersheet)

In [None]:
# Write the mastersheet to the output folder so it can be inspected
# by hand (sanity check).
write.csv(x = mastersheet, file = "../output/mastersheet.csv")

# Analysis

In [None]:
# Force Distance_px to 0 on every row where Frame is 0;
# all other rows keep their value.
mastersheet <- mutate(
  mastersheet,
  Distance_px = ifelse(Frame == 0, 0, Distance_px)
)

In [None]:
# For each bird in analysis_vars, draw Distance_px against Frame as a line
# plot with one facet per File, save it as a PDF and show it inline.

# Make sure the output folder exists so ggsave() does not fail
distance_dir <- "../output/distance_travelled"
dir.create(distance_dir, recursive = TRUE, showWarnings = FALSE)

# Inline figure size for the notebook; identical for every bird, so set once
options(repr.plot.width = 9, repr.plot.height = 2)

for (var_name in analysis_vars) {
  #print(paste("Processing variable:", var_name))
  # Named bird_data rather than `filter` so dplyr::filter() is not shadowed
  bird_data <- mastersheet %>%
    filter(bird == var_name, Frame < 20000, Distance_px < 80)

  # Named base_plot rather than `plot` so base plot() is not shadowed
  base_plot <- ggplot(bird_data, aes(x = Frame, y = Distance_px)) +
    geom_line() +
    ylim(0, 80) + # Keep the same scale for all plots
    facet_grid(. ~ File)

  p2 <- base_plot +
    ggtitle(paste0(var_name, " distance travelled depending on broadcast"))

  ggsave(
    file.path(distance_dir, paste0(var_name, ".pdf")),
    plot = p2,
    width = 7,
    height = 5,
    units = "in"
  )
  print(p2)
}

In [None]:
# For each bird in analysis_vars, draw the X/Y positions as points with one
# facet per File, save the figure as a PDF and show it inline.

# Make sure the output folder exists so ggsave() does not fail
path_dir <- "../output/path_travelled"
dir.create(path_dir, recursive = TRUE, showWarnings = FALSE)

# Inline figure size for the notebook; identical for every bird, so set once
options(repr.plot.width = 9, repr.plot.height = 3.5)

for (var_name in analysis_vars) {
  #print(paste("Processing variable:", var_name))
  # Named bird_data rather than `filter` so dplyr::filter() is not shadowed
  bird_data <- mastersheet %>%
    filter(bird == var_name, Frame < 20000, Distance_px < 80)
  #print(head(bird_data))

  # paste0() avoids the double space that paste() would insert in the title
  p <- ggplot(bird_data, aes(x = X, y = Y)) +
    geom_point() +
    facet_grid(. ~ File) +
    my.theme +
    ggtitle(paste0(var_name, " path depending on broadcast"))

  # Save the path plot built in this iteration (`p`)
  ggsave(
    file.path(path_dir, paste0(var_name, ".pdf")),
    plot = p,
    width = 7,
    height = 5,
    units = "in"
  )
  print(p)
}

# Statistics

In [None]:
# Mean Distance_px per bird, after applying the same Frame / Distance_px
# cut-offs as the plots. vapply() guarantees a numeric vector (one value per
# bird, named by bird) even when analysis_vars is empty, where sapply()
# would return a list.
bird_avg_distances <- vapply(
  analysis_vars,
  function(var_name) {
    filtered_data <- mastersheet %>%
      filter(bird == var_name, Frame < 20000, Distance_px < 80)

    # NaN when a bird has no rows left after filtering
    mean(filtered_data$Distance_px, na.rm = TRUE)
  },
  numeric(1)
)

# Result: a named numeric vector
print(bird_avg_distances)


## Per AAV-construct

In [None]:
# Mean Distance_px for every injected_with / bird / epoch combination,
# restricted to the birds listed in analysis_vars and to rows within the
# Frame / Distance_px cut-offs.
bird_condition_avg <- ungroup(
  mastersheet %>%
    filter(bird %in% analysis_vars) %>%
    filter(Frame < 20000, Distance_px < 80) %>%
    group_by(injected_with, bird, epoch) %>%
    summarise(Avg_Distance = mean(Distance_px, na.rm = TRUE))
)

# Second name for the same table, used by the plotting cell
avg_speeds <- bird_condition_avg

In [None]:
# Plot the per-bird average distance for each condition, coloured by construct.
# avg_speeds has the columns injected_with, bird, epoch and Avg_Distance; the
# condition is stored in `epoch` and the construct in `injected_with`, so
# those are the columns mapped here.

# Order the conditions chronologically on the x axis.
# NOTE(review): assumes epoch only takes these three values; anything else
# becomes NA. Confirm against videos.csv.
avg_speeds$epoch <- factor(
  avg_speeds$epoch,
  levels = c("pre_silence", "colony", "post_silence")
)

avg_plot <- ggplot(
  avg_speeds,
  aes(x = epoch, y = Avg_Distance, color = injected_with)
) +
  geom_point() +
  labs(x = "Condition", color = "Virus")

avg_plot