In [1]:
library("tidyverse")
library("data.table")
library("cowplot")
library("ggh4x")
library("ggrastr")

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘data.table’


The following objects are masked from ‘package:lubridate’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    y

In [3]:
# Strain display labels (names) mapped to the lower-case strain codes (values).
# The element order is also used as the factor level order for `test_strain`.
strain_labels <- c(
    iCab = "icab",
    HdrR = "hdr",
    Ho5 = "ho5",
    Kaga = "kaga",
    HNI = "hni"
)

In [4]:
# Read the per-frame HMM output and attach, to every row, the log10 of the
# mean distance of its HMM state.
df <- fread("/nfs/research/birney/users/saul/nextflow/medaka_behaviour_pilot/hmm/time_step0.08_n_states15_hmm.csv.gz")
df[, mean_dist := log10(mean(distance)), by = hmm_state]

# Renumber the states by the rank of their mean distance: build a
# one-row-per-state lookup, join it back, then overwrite the original label.
tmp <- unique(df[, .(hmm_state, mean_dist)])
tmp[, hmm_state_recoded := rank(mean_dist)]
df <- merge(df, tmp, by = c("hmm_state", "mean_dist"))
df[, hmm_state := hmm_state_recoded][, hmm_state_recoded := NULL]

# Split the underscore-delimited id into its components, keeping `id` itself.
df <- as.data.table(
    separate(
        df,
        id,
        into = c(
            "date", "time_string", "ref_strain", "test_strain",
            "tank_side", "assay", "quadrant", "fish_type"
        ),
        sep = "_",
        remove = FALSE
    )
)

# Turn test_strain into a factor with display labels, with levels ordered as
# in `strain_labels`.
df[, test_strain := test_strain |>
    fct_recode(!!!strain_labels) |>
    fct_relevel(\(lv) names(strain_labels))]
head(df)

hmm_state,mean_dist,id,date,time_string,ref_strain,test_strain,tank_side,assay,quadrant,fish_type,frame_n,time_s,distance,angle
<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<fct>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>
6,0.1005448,20190611_1331_icab_icab_R_no_q1_ref,20190611,1331,icab,iCab,R,no,q1,ref,757,25.23333,0.2083191,0.0
6,0.1005448,20190611_1331_icab_icab_R_no_q1_ref,20190611,1331,icab,iCab,R,no,q1,ref,761,25.36667,0.4425455,0.0
6,0.1005448,20190611_1331_icab_icab_R_no_q1_ref,20190611,1331,icab,iCab,R,no,q1,ref,1755,58.5,1.2756671,-0.4597774
6,0.1005448,20190611_1331_icab_icab_R_no_q1_ref,20190611,1331,icab,iCab,R,no,q1,ref,1779,59.3,1.3794776,-0.190045
6,0.1005448,20190611_1331_icab_icab_R_no_q1_ref,20190611,1331,icab,iCab,R,no,q1,ref,1781,59.36667,1.3745458,-0.3463498
6,0.1005448,20190611_1331_icab_icab_R_no_q1_ref,20190611,1331,icab,iCab,R,no,q1,ref,1787,59.56667,1.4936201,-0.3414281


In [19]:
# Kruskal-Wallis test of `f` across `test_strain` for one assay / HMM state.
#
# Args:
#   the_state: value compared against the `hmm_state` column of `data`.
#   the_assay: value compared against the `assay` column of `data`.
#   data: data.table with columns `assay`, `hmm_state`, `f` and `test_strain`.
#
# Returns:
#   A data.table with columns `assay`, `hmm_state` and `pval` (the test's
#   p-value).
run_kw <- function(the_state, the_assay, data){
    subset_dt <- data[assay == the_assay & hmm_state == the_state]
    kw <- kruskal.test(f ~ test_strain, data = subset_dt)
    data.table(
        assay = the_assay,
        hmm_state = the_state,
        pval = kw$p.value
    )
}

In [6]:
# NOTE: unlike fig7, each id is processed independently here.
# Strip the trailing fish-type suffix so the ref and test fish of one
# recording share the same id.
summ <- df[, .(
    id = str_remove(id, "_(ref|test)$"),
    test_strain,
    fish_type,
    assay,
    frame_n,
    hmm_state
)]

# One row per frame with the ref and test states side by side; keep only
# frames where both fish have a state.
summ <- dcast(
    summ,
    id + test_strain + assay + frame_n ~ fish_type,
    value.var = "hmm_state"
)
summ <- summ[!is.na(test) & !is.na(ref)]

# Frame count for every (ref, test) state pair within each id, then the
# fraction of that id's frames spent in each pair.
summ <- summ[, .N, by = .(ref, test, test_strain, assay, id)]
summ[, f := N / sum(N), by = .(test_strain, assay, id)]

# Keep only the pairs where both fish are in the same state.
# NOTE(review): a state that ref and test never share within an id produces no
# row here (rather than f = 0) -- confirm this is intended for the tests below.
summ <- summ[ref == test, .(test_strain, assay, hmm_state = test, f)]
head(summ)

test_strain,assay,hmm_state,f
<fct>,<chr>,<dbl>,<dbl>
iCab,no,1,0.035341187
iCab,no,4,0.01511447
iCab,no,7,0.005779062
iCab,no,2,0.006001334
iCab,no,3,0.006668148
iCab,no,6,0.007890642


In [20]:
# Run the Kruskal-Wallis test for every HMM state present in `summ`, first for
# the "of" assay and then for the "no" assay, and stack the results.
res <- rbindlist(
    lapply(
        c("of", "no"),
        function(assay_code) {
            rbindlist(
                lapply(
                    summ[, unique(hmm_state)],
                    run_kw,
                    the_assay = assay_code,
                    data = summ
                )
            )
        }
    )
)

In [21]:
# FDR-adjust the p-values across all tests (both assays together) and flag
# those below 0.05.
res[, q := p.adjust(pval, method = "fdr")][, sig := q < 0.05]

In [22]:
# Number of tests and range of adjusted p-values per significance flag and
# assay.
res[
    ,
    .(n = .N, min = signif(min(q), 3), max = signif(max(q), 3)),
    by = .(sig, assay)
]

sig,assay,n,min,max
<lgl>,<chr>,<int>,<dbl>,<dbl>
True,of,7,1.68e-07,0.0483
False,of,8,0.282,0.783
True,no,5,2.54e-07,0.0483
False,no,10,0.091,0.88


In [23]:
# Add a significance-symbol column derived from the adjusted p-values in `q`.
res <- as.data.table(rstatix::add_significance(res, p.col = "q"))
head(res)

assay,hmm_state,pval,q,sig,q.signif
<chr>,<dbl>,<dbl>,<dbl>,<lgl>,<chr>
of,1,5.60054e-09,1.680162e-07,True,****
of,4,1.064606e-06,7.984543e-06,True,****
of,7,0.003935143,0.01298025,True,*
of,2,0.003871338,0.01298025,True,*
of,3,0.001605148,0.00802574,True,**
of,6,0.3797154,0.5424506,False,ns


In [27]:
# Presentation table: readable assay names, the HMM state, the adjusted
# p-value rounded to 3 significant digits as text, and the significance symbol.
pretty_df <- res[, .(
    `Assay component` = ifelse(assay == "of", "Open field", "Novel object"),
    `HMM State` = hmm_state,
    `p-value (FDR-adjusted)` = format(signif(q, 3), scientific = TRUE),
    `Significance` = q.signif
)]

pretty_df

Assay component,HMM State,p-value (FDR-adjusted),Significance
<chr>,<dbl>,<chr>,<chr>
Open field,1,1.68e-07,****
Open field,4,7.98e-06,****
Open field,7,0.013,*
Open field,2,0.013,*
Open field,3,0.00803,**
Open field,6,0.542,ns
Open field,5,0.685,ns
Open field,8,0.282,ns
Open field,9,0.369,ns
Open field,12,0.307,ns


In [28]:
# Write the formatted table to tableS3.csv in the working directory.
fwrite(x = pretty_df, file = "tableS3.csv")