In [1]:
# Project root: the directory four levels above the current working directory.
wd <- Reduce(function(path, level) dirname(path), seq_len(4), getwd())

# Load shared project definitions, then the tidyverse used by every cell below.
source(file.path(wd, "mission_control/treasure_map.R"))
library(tidyverse)

# Folders for the raw input and the cleaned output of this cohort.
# E_DIR is not defined in this file; presumably it is set by the sourced
# treasure_map.R -- confirm.
I_DIR <- paste0(E_DIR, "/val_hebron/clinical/", "raw/")
O_DIR <- paste0(E_DIR, "/val_hebron/clinical/", "clean/")

Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.2     ✔ dplyr   1.0.6
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.1
“package ‘forcats’ was built under R version 3.6.3”── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


### 0 - Read raw data

In [2]:
# Load the semicolon-separated patient listing; text columns stay character.
# The arguments below are the read.csv() defaults spelled out explicitly.
a <- read.table(
  file = paste0(I_DIR, "llistat_pacients_mol_profiled_bladder.csv"),
  header = TRUE,
  sep = ";",
  quote = "\"",
  dec = ".",
  fill = TRUE,
  comment.char = "",
  stringsAsFactors = FALSE
)

### 1 - Cleaning

In [3]:
# Keep only the columns used downstream, renaming the headers of the raw export
# to the English snake_case names used in the rest of the notebook.
b <- a %>%
  select(
    patient_id = NHC,
    birth_date = Fecha.Nacimiento,
    biopsy_date = Fecha.biopsia,
    biopsy_location = Localización.biopsia,
    tumor_location = Localización.primario.Tumor,
    tumor_type = Tipo.de.tumor.primario,
    trt1_start_date = Fecha.inicio.Tto,
    trt1_end_date = Fecha.fin.Tto,
    trt1_stop_reason = Suspensión.tratamiento,
    trt1 = Esquema.Tto,
    recist = RECIST,
    recist_date = Fecha.RECIST,
    last_date = Fecha.último.seguimiento...defunción,
    os_event = Status
  ) %>%
  # Drop rows whose RECIST date field contains the text "general".
  filter(!grepl("general", recist_date))

In [4]:
# Display the renamed and filtered table so it can be inspected in the
# notebook output.
b

patient_id,birth_date,biopsy_date,biopsy_location,tumor_location,tumor_type,trt1_start_date,trt1_end_date,trt1_stop_reason,trt1,recist,recist_date,last_date,os_event
20084415,18.04.1949,11.11.2020,hepatica,vejiga,carcinoma urotelial,03.03.2020,10.10.2020,0,atezolizumab/enfortumab vedotin,1,18.05.2020,22.01.2021,1
17327575,01.06.1950,06.08.2020,colón,vejiga,carcinoma urotelial,21.04.2020,14.07.2020,0,atezolizumab,3,07.07.2020,27.12.2020,1
11710531,07.07.1949,22.09.2016,vejiga,vejiga,carcinoma urotelial,27.08.2018,20.10.2020,2,atezolizumab/cabozantinib,0,31.12.2018,27.06.2021,1
12584309,15.08.1961,08.10.2020,adenopatía,vejiga,carcinoma urotelial,16.08.2021,13.09.2021,2,Bintrafusp,2,08.10.2021,25.10.2022,0
15775641,01.09.1948,05.08.2020,vejiga,vejiga,carcinoma urotelial,09.10.2020,01.02.2021,0,atezolizumab,3,04.02.2021,01.07.2021,1
12449265,02.06.1951,31.07.2020,vejiga,vejiga,carcinoma urotelial,16.09.2020,14.10.2020,2,Durvalumab/QT,1,13.01.2021,26.10.2022,0
14104377,14.04.1945,12.08.2020,vejiga,vejiga,carcinoma urotelial,25.05.2021,31.01.2022,2,Bintrafusp,1,22.07.2021,06.10.2022,0
15630631,28.03.1953,22.09.2022,vejiga,vejiga,carcinoma urotelial,03.11.2021,25.10.2022,3,Enfortumab/pembrolizumab,1,05.01.2022,25.10.2022,0
14866881,02.11.1945,18.05.2021,vejiga,vejiga,carcinoma urotelial,13.01.2022,25.03.2022,0,anti-PDL/CD173,3,24.03.2022,27.10.2022,1
14694847,21.01.1947,11.01.2021,ureter,vejiga,carcinoma urotelial,10.05.2022,02.08.2022,0,Avelumab,3,08.09.2022,25.10.2022,0


#### Format Dates

In [11]:
# Parse day.month.year strings into Date values.
#
# Accepts "dd.mm.yyyy" and "dd.mm.yy". The format is chosen per element from
# the number of year digits: "%Y" also accepts a two-digit year on input
# (reading "21" as the year 0021), so a "%d.%m.%Y"-first tryFormats fallback
# never reaches "%d.%m.%y".
#
# Args:
#   i: character vector (or anything coercible with as.character()).
# Returns:
#   A Date vector with one element per element of `i`; NA and empty strings
#   give NA.
# Raises:
#   An error if a non-empty value cannot be parsed with either format.
nice_date <- function(i) {
  i <- trimws(as.character(i))
  short_year <- grepl("^[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{2}$", i)

  parsed <- as.Date(i, format = "%d.%m.%Y")
  # "%y" applies strptime's century rule: 00-68 -> 20xx, 69-99 -> 19xx.
  parsed[short_year] <- as.Date(i[short_year], format = "%d.%m.%y")

  # Fail loudly on real garbage instead of returning a silent NA.
  unparsed <- !is.na(i) & nzchar(i) & is.na(parsed)
  if (any(unparsed)) {
    stop(
      "Unrecognised date value(s): ",
      paste(unique(i[unparsed]), collapse = ", "),
      call. = FALSE
    )
  }
  parsed
}

In [12]:
# For every date column of `b`, add a numeric "<name>_t" companion holding the
# parsed date as a day count, so later cells can subtract dates directly.
# Column names are matched case-insensitively on the literal text "date".
date_cols <- names(b)[grepl("date", tolower(names(b)), fixed = TRUE)]
# Skip companions created by an earlier run of this cell: they also contain
# "date", and feeding them back to nice_date() would abort the run.
date_cols <- setdiff(date_cols, paste0(date_cols, "_t"))

for (date_col in date_cols) {
  # vapply() always yields one number per row, including for a zero-row table
  # where unlist(lapply()) would return NULL and create no column.
  b[[paste0(date_col, "_t")]] <- vapply(
    as.character(b[[date_col]]),
    function(value) as.numeric(nice_date(value)),
    numeric(1),
    USE.NAMES = FALSE
  )
}

In [14]:
# Derive per-patient quantities from the "_t" date columns. Those columns are
# plain day counts, so each difference below is a number of days.
c <- mutate(
  b,
  # Years between birth and first-treatment start, rounded to the nearest year.
  # NOTE(review): this uses 365-day years and round(), not completed years --
  # confirm that is the intended definition of age.
  clinical_age = round((trt1_start_date_t - birth_date_t) / 365),
  # Days from first-treatment start to the last recorded date.
  os_days = last_date_t - trt1_start_date_t,
  # Days from first-treatment start to biopsy (negative if the biopsy came first).
  biopsy_vs_treat = biopsy_date_t - trt1_start_date_t
)

#### Maps

In [15]:
# Lookup tables used to recode the raw clinical values. They are kept as named
# lists so that `map[[key]]` returns NULL for an unknown key.

# Drug name (lower case) -> checkpoint target: "pdl" = PD-L1, "pd" = PD-1.
# Durvalumab is a PD-L1 blocking antibody, so it belongs with the "pdl" group.
trt_map <- list(
  "atezolizumab" = "pdl",
  "avelumab" = "pdl",
  "anti-pdl" = "pdl",
  "bintrafusp" = "pdl",
  "durvalumab" = "pdl",
  "pembrolizumab" = "pd"
)

# Anatomical site as written in the raw export -> English label.
location_map <- list(
  "vejiga" = "bladder",
  "colón" = "colon",
  "ureter" = "ureter",
  "adenopatía" = "adenopathy",
  "hepatica" = "liver",
  "hígado" = "liver",
  "pared abdominal" = "abdominal_wall"
)

# Numeric RECIST code (as a string) -> response label.
recist_map <- list(
  "4" = "UK",
  "3" = "PD",
  "2" = "SD",
  "1" = "PR",
  "0" = "CR"
)

# Numeric RECIST code (as a string) -> binary response flag:
# 1 for the codes labelled CR/PR above, 0 for SD/PD, missing for code 4.
# NA_real_ keeps the flag numeric even when every looked-up value is missing.
bor_map <- list(
  "4" = NA_real_,
  "3" = 0,
  "2" = 0,
  "1" = 1,
  "0" = 1
)
# Classify a treatment description by checkpoint target.
#
# Args:
#   i: character vector of treatment descriptions. Matching is case-sensitive
#      against lower-case drug names, so callers should lower-case the input.
# Returns:
#   A character vector with one element per element of `i`:
#   "pdl" when the text names a PD-L1 agent, NA for a missing treatment, and
#   "pd" for everything else.
mechanism_map <- function(i) {
  # Durvalumab is a PD-L1 blocking antibody and therefore listed here.
  pdl_agents <- c(
    "atezolizumab", "avelumab", "durvalumab", "anti-pdl", "bintrafusp"
  )
  i <- as.character(i)

  # Any non-missing regimen without a PD-L1 agent falls back to "pd".
  mechanism <- rep("pd", length(i))
  mechanism[grepl(paste(pdl_agents, collapse = "|"), i)] <- "pdl"
  # A missing treatment has no known mechanism; do not label it "pd".
  mechanism[is.na(i)] <- NA_character_
  mechanism
}

In [16]:
# Recode `keys` through a named lookup list, returning exactly one value per
# key. Unmapped or missing keys become NA instead of being dropped: a dropped
# value would shorten the result, which `$<-` either rejects or silently
# recycles onto the wrong rows.
lookup_map <- function(keys, map) {
  keys <- as.character(keys)
  values <- unlist(map)
  unmapped <- unique(keys[!is.na(keys) & !(keys %in% names(values))])
  if (length(unmapped) > 0) {
    warning(
      "No mapping for: ", paste(unmapped, collapse = ", "),
      call. = FALSE
    )
  }
  unname(values[keys])
}

# Harmonised clinical annotations, one value per patient row.
c$clinical_tumor_location <- lookup_map(c$tumor_location, location_map)
c$clinical_biopsy_location <- lookup_map(c$biopsy_location, location_map)
# mechanism_map() expects lower-case drug names.
c$clinical_mechanism <- vapply(
  tolower(c$trt1), mechanism_map, character(1), USE.NAMES = FALSE
)
c$clinical_recist <- lookup_map(c$recist, recist_map)
c$bor <- lookup_map(c$recist, bor_map)
# These two columns are filled with NA; the cells above derive no values for
# them.
c$clinical_gender <- NA
c$clinical_pretreat <- NA
c$treatment <- c$trt1

### 2 - Select output columns and save

In [17]:
# Final table: identifiers, treatment, every "clinical" column, and the
# response and survival fields, in this order.
d <- select(
  c,
  patient_id,
  treatment,
  contains("clinical"),
  bor,
  os_event,
  os_days
)

In [18]:
# Write the final table to the output folder as an RDS file.
saveRDS(object = d, file = paste0(O_DIR, "bladder.Rds"))