In [None]:
library(tidyverse)

In [None]:
# Load the phase I cfDNA parameter space and rename the patient key
# from `pt_id` to `patient_id`.
paramspace <- read.csv("../data/metadata/paramspace_cfDNA_phaseI.csv") %>%
  rename(patient_id = pt_id)
head(paramspace)

In [None]:
# Derive `sampleID` from the first "_"-separated piece of `cfDNA_folder`.
# The folder name is cut into at most three pieces; pieces two and three are
# extracted too, but only as scratch columns that are dropped again together
# with the list column holding the pieces.
paramspace <- paramspace %>%
  mutate(
    tmp_chunks = stringr::str_split(cfDNA_folder, stringr::fixed("_"), n = 3),
    sampleID = map_chr(tmp_chunks, 1),
    sub_value = map_chr(tmp_chunks, 2),
    sub_val = map_chr(tmp_chunks, 3)
  ) %>%
  select(-tmp_chunks, -sub_value, -sub_val)

In [None]:
# Read the formatted sample time point table.
sample_timepoint <- read.csv(
  file = "../data/metadata/clin_data/sample_timepoint_formatted.csv"
)

In [None]:
head(sample_timepoint)
# Keep only phase I rows and the two columns needed for the join below.
sample_timepoint_days <- sample_timepoint %>%
  filter(phase == "phaseI") %>%
  select(sampleID, sample_timepoint_days_since_OP)
# Show any samples whose time point is missing.
filter(sample_timepoint_days, is.na(sample_timepoint_days_since_OP))
dim(sample_timepoint_days)
head(sample_timepoint_days)

In [None]:
# Make the join key a character vector, then attach the sampling day of each
# sample to the parameter space (all paramspace rows are kept).
paramspace <- paramspace %>%
  mutate(sampleID = as.character(sampleID))
res <- paramspace %>%
  left_join(sample_timepoint_days, by = "sampleID")
head(res)
dim(res)

In [None]:
# Read the formatted clinical data and keep the relapse and adjuvant chemo
# timing columns alongside the patient key.
clinical_data <- read.csv(
  file = "../data/metadata/clin_data/clinical_data_formatted.csv"
)
clinical_data_relapse <- select(
  clinical_data,
  patient_id,
  time_to_relapse_days,
  adjuvant_chemo_start_days,
  adjuvant_chemo_end_days
)
head(clinical_data_relapse)

In [None]:
# Attach the per-patient relapse/chemo timing to every sample row.
res <- res %>%
  left_join(clinical_data_relapse, by = "patient_id")
head(res)
dim(res)

In [None]:
# Positive/negative samples, version 1.
#
# Labels written to `pos_neg_samples`:
#   0  - negative sample (patient is healthy)
#   1  - positive sample (patient is sick)
#   NA - sample left unlabelled
#
# Rules applied per patient, with samples ordered by days since OP:
#   * every patient: the earliest sample is labelled 1
#   * relapse patients: the latest sample taken on or before
#     `time_to_relapse_days` is also labelled 1
#   * all other patients: the two latest samples are labelled 0
res$sample_timepoint_days_since_OP <- as.numeric(res$sample_timepoint_days_since_OP)

# Each patient list is taken from the first row of a header-less CSV.
phaseIpt <- read.csv("../phaseI_pt.csv", header = FALSE)
phaseIpt <- as.character(unlist(c(phaseIpt[1, ])))

phaseIpt_R <- read.csv("../phaseI_pt_R.csv", header = FALSE)
phaseIpt_R <- as.character(unlist(c(phaseIpt_R[1, ])))

phase_I_pts <- phaseIpt
relapse_pt <- phaseIpt_R

# Collect the per-patient tables in a pre-sized list and bind them once at the
# end instead of growing a data frame with rbind() on every iteration.
res_by_patient <- vector("list", length(phase_I_pts))

for (i in seq_along(phase_I_pts)) {
  ind <- phase_I_pts[[i]]
  print(ind)
  res_ind <- res %>%
    filter(patient_id == ind) %>%
    arrange(sample_timepoint_days_since_OP)
  n_samples <- nrow(res_ind)

  # A listed patient without any sample rows used to abort the whole loop with
  # "invalid 'times' argument"; report it and carry on instead.
  if (n_samples == 0) {
    warning("No samples found for patient ", ind, "; skipped", call. = FALSE)
    next
  }

  labels <- rep(NA_real_, n_samples)
  labels[1] <- 1

  if (ind %in% relapse_pt) {
    timepoints <- res_ind$sample_timepoint_days_since_OP
    # which() drops NA comparisons, so samples with a missing time point or a
    # missing relapse time are never picked.
    before_relapse <- which(timepoints <= res_ind$time_to_relapse_days)
    if (length(before_relapse) > 0) {
      last_timepoint <- max(timepoints[before_relapse])
      # All samples sharing that last time point are labelled (ties included).
      last_before_relapse <- which(timepoints == last_timepoint)
      print(last_timepoint)
      print(last_before_relapse)
      labels[last_before_relapse] <- 1
    } else {
      warning(
        "Relapse patient ", ind,
        " has no sample on or before time_to_relapse_days",
        call. = FALSE
      )
    }
  } else {
    if (n_samples < 3) {
      warning(
        "Patient ", ind, " has only ", n_samples,
        " sample(s); fewer than two negatives labelled",
        call. = FALSE
      )
    }
    # Up to two trailing samples become negatives, never the first sample.
    n_negative <- min(2, n_samples - 1)
    labels[n_samples - seq_len(n_negative) + 1] <- 0
  }

  res_ind$pos_neg_samples <- labels
  res_by_patient[[i]] <- res_ind
}

# Skipped patients leave NULL entries, which bind_rows() ignores.
res_new <- bind_rows(res_by_patient)

head(res_new)
nrow(res_new %>% filter(pos_neg_samples == 1))
nrow(res_new %>% filter(pos_neg_samples == 0))

In [None]:
# Reduce the version 1 table to the sample key and its label.
res_new_min <- select(res_new, sampleID, pos_neg_samples)
head(res_new_min)

In [None]:
# Save the version 1 labels (no row-name column in the output).
write.csv(
  x = res_new_min,
  file = "../data/metadata/pos_neg_samples_phaseI.csv",
  row.names = FALSE
)

### Positive/negative samples version 2

#### Positives: samples from relapse patients, from the second post-op sample until they relapse (up to the last sample taken before the relapse)
#### Negatives: samples from no_relapse patients, from the second post-op sample until the end

In [None]:
# Positive/negative samples, version 2.
#
# Rules applied per patient, with samples ordered by days since OP:
#   * the first two samples of every patient stay NA
#   * relapse patients: samples from the third one up to and including the
#     last sample taken on or before `time_to_relapse_days` are labelled 1;
#     any later samples are labelled 0
#   * all other patients: every sample from the third one onwards is 0
#
# The previous implementation set rows 3:idx to 1 and then idx:nrow to 0, so
# the last pre-relapse sample (idx) was immediately overwritten with 0. It
# also counted backwards (3:2, 3:1) when idx < 3 and failed when no sample
# preceded the relapse. Position masks avoid all three problems.
#
# NOTE(review): samples taken after the relapse are still labelled 0, as in
# the original code; confirm this is intended rather than leaving them NA.
phase_I_pts <- phaseIpt
relapse_pt <- phaseIpt_R

# Collect the per-patient tables in a pre-sized list and bind them once at the
# end instead of growing a data frame with rbind() on every iteration.
res_by_patient_v2 <- vector("list", length(phase_I_pts))

for (i in seq_along(phase_I_pts)) {
  ind <- phase_I_pts[[i]]
  print(ind)
  res_ind <- res %>%
    filter(patient_id == ind) %>%
    arrange(sample_timepoint_days_since_OP)
  n_samples <- nrow(res_ind)

  if (n_samples == 0) {
    warning("No samples found for patient ", ind, "; skipped", call. = FALSE)
    next
  }

  labels <- rep(NA_real_, n_samples)
  position <- seq_len(n_samples)
  # Only samples from the third position onwards are ever labelled.
  labelled_range <- position >= 3

  if (ind %in% relapse_pt) {
    timepoints <- res_ind$sample_timepoint_days_since_OP
    # which() drops NA comparisons, so samples with a missing time point or a
    # missing relapse time are never picked.
    before_relapse <- which(timepoints <= res_ind$time_to_relapse_days)
    if (length(before_relapse) > 0) {
      last_timepoint <- max(timepoints[before_relapse])
      # With tied time points the last tied row closes the positive range.
      last_before_relapse <- max(which(timepoints == last_timepoint))
      print(last_timepoint)
      print(last_before_relapse)
      labels[labelled_range & position <= last_before_relapse] <- 1
      labels[labelled_range & position > last_before_relapse] <- 0
    } else {
      # Without a usable pre-relapse sample nothing can be labelled safely.
      warning(
        "Relapse patient ", ind,
        " has no sample on or before time_to_relapse_days; left unlabelled",
        call. = FALSE
      )
    }
  } else {
    labels[labelled_range] <- 0
  }

  res_ind$pos_neg_samples <- labels
  res_by_patient_v2[[i]] <- res_ind
}

# Skipped patients leave NULL entries, which bind_rows() ignores.
res_new_v2 <- bind_rows(res_by_patient_v2)

head(res_new_v2)
nrow(res_new_v2 %>% filter(pos_neg_samples == 1))
nrow(res_new_v2 %>% filter(pos_neg_samples == 0))

In [None]:
# Display the full version 2 table for visual inspection of the labels.
res_new_v2

In [None]:
# Reduce the version 2 table to the sample key and its label.
res_new_v2_min <- select(res_new_v2, sampleID, pos_neg_samples)
head(res_new_v2_min)

In [None]:
# Save the version 2 labels (no row-name column in the output).
write.csv(
  x = res_new_v2_min,
  file = "../data/metadata/pos_neg_samples_phaseI_V2.csv",
  row.names = FALSE
)