In [1]:
# Climb four directory levels up from the current working directory and
# source the treasure_map.R script found under mission_control/ there.
wd <- Reduce(function(path, step) dirname(path), seq_len(4), getwd())
source(paste0(wd, "/mission_control/treasure_map.R"))
library(tidyverse)

# Raw input and cleaned output folders for the val_hebron clinical data.
# NOTE(review): E_DIR is not defined in this notebook; presumably it is
# created by treasure_map.R - confirm.
I_DIR <- paste0(E_DIR, "/val_hebron/clinical/raw/")
O_DIR <- paste0(E_DIR, "/val_hebron/clinical/clean/")

Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.2     ✔ dplyr   1.0.6
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.1
“package ‘forcats’ was built under R version 3.6.3”── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


### 0 - Read the data

In [2]:
# Read the raw CSV: semicolon-separated, first line of the file skipped,
# text columns kept as character rather than factor.
a <- read.csv(
    file = paste0(I_DIR, "V_Hebron_pulmon.csv"),
    sep = ";",
    skip = 1,
    stringsAsFactors = FALSE
)

### 1 - Clean the data

#### Helpers

In [3]:
# Lookup from (lower-case) drug-name fragment to checkpoint-inhibitor class.
# The names are used as search patterns against free-text treatment fields;
# the values are the target class: "pd" (PD-1), "pdl" (PD-L1), "ctla" (CTLA-4).
# "atezoliumab" is a misspelling of atezolizumab; presumably it is kept on
# purpose so that misspelled entries in the raw data are still matched.
trt_map <- list(
    "atezolizumab" = "pdl",
    "atezoliumab" = "pdl",
    "avelumab" = "pdl",
    "anti-pdl" = "pdl",
    "bintrafusp" = "pdl",
    # durvalumab is an anti-PD-L1 antibody, so it belongs with the "pdl" group
    "durvalumab" = "pdl",
    "pembrolizumab" = "pd",
    "nivolumab" = "pd",
    "ipilimumab" = "ctla",
    "cemiplimab" = "pd"
)
# Return the position of the last element of `i` that equals 1, as a
# character string ("1", "2", ...).
#
# i: numeric vector of per-treatment-line match counts, one element per line
#    in chronological order (the notebook passes six).
#
# Returns NA_character_ when no element equals 1. Returning a value in every
# case (instead of falling off the end and yielding NULL) keeps
# apply()/vapply() results a plain character vector. NA elements are ignored.
#
# NOTE(review): only counts of exactly 1 qualify, so a line whose text matched
# two drug names (count 2) is skipped - confirm that is intended.
last_treatment <- function(i){
    hits <- which(i == 1)
    if( length(hits) == 0 ){
        return(NA_character_)
    }
    as.character(max(hits))
}
# Reshape `df` so that the columns of treatment line `i` appear under fixed,
# line-independent names.
#
# df: data frame holding the shared columns plus per-line treatment columns.
# i:  treatment line identifier; columns whose name contains it are selected.
#
# The shared part drops every column whose name contains "TREAT" or "X"
# (contains() ignores case by default). The line part must resolve to exactly
# five columns, which are renamed in order to treatment / start / stop /
# stop reason / recist. Both parts are then bound side by side.
maker <- function( df, i ){
    line_tag <- as.character(i)
    shared_cols <- select(df, -contains("TREAT"), -contains("X"))
    line_cols <- select(df, contains(line_tag), -contains("X"))
    names(line_cols) <- c("treatment", "trt_start_date", "trt_stop_date", "trt_stop_reason", "recist")
    cbind(shared_cols, line_cols)
}
# Count how many drug-name patterns from `trt_map` occur in the treatment
# text `i` (compared in lower case). Each name is used as a grepl() pattern.
matcher <- function( i ) {
    label <- tolower(i)
    hit_flags <- sapply(names(trt_map), function(drug) grepl(drug, label))
    sum(hit_flags)
}

# Flag (1/0) whether the match counts in `i` add up to more than one.
pre_treated <- function(i) {
    total_hits <- sum(i)
    ifelse(total_hits > 1, 1, 0)
}

# Prefix the third "/"-separated field of a date string with `k`, e.g.
# pinch("3/14/55", "19") gives "3/14/1955".
#
# i: a single character date string.
# k: prefix to put in front of the third field (the century, "19" or "20").
#
# Values that are NA or have fewer than three "/"-separated fields (such as
# "" or "UNK") are returned unchanged. Rewriting them would produce strings
# like "NA/NA/19NA" that no longer look like missing-value markers to later
# parsing steps.
pinch <- function( i, k ){
    if( is.na(i) ){
        return(i)
    }
    pieces <- strsplit(i, "/")[[1]]
    if( length(pieces) < 3 ){
        return(i)
    }
    pieces[3] <- paste0(k, pieces[3])
    paste( pieces, collapse = "/")
}
# Parse a single date string into a Date.
#
# i: one character value. NA, "UNK" and "" are treated as missing.
#
# Accepted layouts, tried in order:
#   "%m/%d/%Y"  e.g. "05/17/2019"
#   "%d-%b-%y"  e.g. "10-apr-21"
#   "%d%b%Y"    e.g. "10apr2021" (replaces the former one-off special case
#               for that literal value)
# Month abbreviations (%b) are matched according to the current locale.
#
# Returns a Date; missing inputs give a Date NA so the return type is the
# same on every path. A string matching none of the layouts still raises the
# as.Date() error, so bad values are not silently turned into NA.
nice_date <- function(i) {
    formats <- c("%m/%d/%Y", "%d-%b-%y", "%d%b%Y")
    if( is.na(i) || i %in% c("UNK", "") ){
        return(as.Date(NA))
    }
    as.Date(i, tryFormats = formats)
}

#### Create index of last applied immunotherapy
- Derive which treatment line was the checkpoint inhibitor (CPI)
- Derive pre-treatment field

In [4]:
# Blank the fourth-line treatment text for patient 20860412 so that it is not
# counted as a match below.
a[a$NHC == "20860412", "TREATMENT.4"] <- ""
tmp <- a %>% select(contains("TREATMENT"))

# For each of the six treatment lines, count how many known drug names occur
# in the free-text treatment field (idx1 ... idx6). Note the fourth column is
# named "TREATMENT.4", unlike the others.
treatment_cols <- c("TREATMENT1", "TREATMENT2", "TREATMENT3", "TREATMENT.4", "TREATMENT5", "TREATMENT6")
for( k in seq_along(treatment_cols) ){
    tmp[[paste0("idx", k)]] <- unlist(lapply( tmp[[treatment_cols[k]]], matcher))
}

# Index (as a string) of the last line flagged by last_treatment(), one value
# per patient. vapply() guarantees a plain character vector: a row for which
# last_treatment() yields nothing becomes NA instead of turning the whole
# column into a list, which would break the later `trts == i` filter.
idx <- as.matrix(tmp %>% select(contains("idx")))
a$trts <- vapply(seq_len(nrow(idx)), function(r) {
    line <- last_treatment(idx[r, ])
    if( is.null(line) ) NA_character_ else line
}, character(1))
#a$pretreat <- apply(tmp %>% select(contains("idx")), 1, pre_treated)

#### Re-format DF to highlight most recent ICI treatment

In [5]:
# For each treatment line "1".."6", take the patients whose selected line is
# that one, reshape them with maker(), and stack the pieces row-wise in line
# order, starting from an empty data frame.
b <- Reduce(
    function(stacked, trt_line) {
        rbind(stacked, maker( a %>% filter(trts == trt_line), trt_line ))
    },
    as.character(seq(6)),
    data.frame()
)

#### Dates
- Compute age, OS days, and the interval between biopsy and treatment start

In [6]:
# Prefix the year field: "19" for dates of birth, "20" for biopsy dates.
b$Date.Of.Birth <- unlist(lapply( as.character(b$Date.Of.Birth), function(x) pinch(x, "19") ))
b$BIOPSY.DATE <- unlist(lapply( as.character(b$BIOPSY.DATE), function(x) pinch(x, "20") ))

# Parse each date column into a companion "<name>_ct" column. unlist() drops
# the Date class, so the *_ct columns hold plain numbers (days since
# 1970-01-01).
for( date_col in c("Date.Of.Birth", "BIOPSY.DATE", "trt_start_date", "DATE.OF.DEATH") ){
    raw_values <- as.character(b[, date_col])
    b[, paste0(date_col, "_ct")] <- unlist(lapply( raw_values, nice_date ))
}

In [7]:
# Patients without a usable death date get a placeholder date 100 days after
# the latest observed death. The check is done on the parsed column: raw
# values such as "" or "UNK" are not NA as text but do parse to NA, and would
# otherwise leave os_days missing for those patients.
# NOTE(review): the +100 day offset is an arbitrary censoring convention, not
# a real follow-up date - confirm this is the intended definition.
b$DATE.OF.DEATH_ct <- ifelse(
    is.na(b$DATE.OF.DEATH_ct),
    max(b$DATE.OF.DEATH_ct, na.rm = TRUE) + 100,
    b$DATE.OF.DEATH_ct
)
# Age in whole years at treatment start (365-day years).
b$clinical_age <- round((b$trt_start_date_ct - b$Date.Of.Birth_ct)/365)
# Days from treatment start to death (or to the placeholder date).
b$os_days <- b$DATE.OF.DEATH_ct - b$trt_start_date_ct
# Days between biopsy and treatment start (positive = biopsy came first).
b$biopsy_vs_trt <- b$trt_start_date_ct - b$BIOPSY.DATE_ct

In [8]:
# Drop every column whose name contains "date" (contains() ignores case by
# default, so this covers the raw date columns and the *_ct helpers alike).
c <- select(b, -contains("date"))

#### Formats 

In [9]:
# Keep and rename the columns that make up the cleaned clinical table.
d <- c %>% transmute(
    patient_id = NHC,
    clinical_age,
    clinical_gender = GENDER,
    clinical_biopsy_location = tolower(BIOPSY.LOCATION),
    clinical_tumor_location = tolower(PRIMARY.TUMOR.LOCATION),
    trts,
    clinical_recist = recist,
    os_event = LIVE.STATUS,
    os_days,
    treatment
)

# Pre-treated = the selected line is not the first one. trts holds the line
# index as text, so convert it before comparing; comparing the string with a
# number would fall back to string ordering.
d$clinical_pretreat <- ifelse(as.integer(d$trts) > 1, 1, 0)

# Best overall response: 1 for complete/partial response, 0 otherwise.
d$bor <- ifelse(tolower(d$clinical_recist) %in% c("cr", "pr"),1,0)

# Recode live status: "ALIVE" -> 0, "EXITUS" -> 1, anything else (including
# missing values) -> NA. A named lookup is vectorised and does not fail on NA
# the way a scalar `if` does.
os_codes <- c("ALIVE" = 0, "EXITUS" = 1)
d$os_event <- unname(os_codes[as.character(d$os_event)])

d <- d %>% select(-trts)
# NOTE(review): the mechanism is fixed to "pd" for every row, regardless of
# which drug was matched - confirm this is intended.
d$clinical_mechanism <- "pd"
# Keep only patients with a recorded RECIST response.
d <- d %>% filter(!is.na(clinical_recist));

### 2 - Send it!

In [10]:
# Write the cleaned table to the output folder.
saveRDS(object = d, file = paste0(O_DIR, "lung.Rds"))