### Prepare Data Sets for XGBoost

In [1]:
wd <- dirname(dirname(getwd()))
source(paste0(wd,"/mission_control/treasure_map.R"))
library(fastDummies)
library(tidyverse)

Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.2     ✔ dplyr   1.0.6
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.1
“package ‘forcats’ was built under R version 3.6.3”── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


### 0 - Read CPI study data

In [2]:
# Load the CPI study table and add `tmb_bin`: a 0/1 flag that is 1 when
# exp(somatic_summary_tmbPerMb) - 1 is greater than 10 (NA input stays NA).
# NOTE(review): the exp(x) - 1 back-transform suggests the column is stored as
# log(1 + TMB per Mb) -- confirm against the upstream pipeline.
# NOTE(review): `all` shares its name with base::all(); the name is kept
# because the following cells refer to it.
all <- mutate(
  readRDS(paste0(I_DIR, "cpi_go.Rds")),
  tmb_bin = as.numeric(exp(somatic_summary_tmbPerMb) - 1 > 10)
)

In [4]:
# Outcome, tissue and filter columns that are selected from `all` into `mini`.
response_tissue <- c(
  # response label (renamed to `response` when `mini` is built)
  "Y_best_response_binary",
  # raw event indicators and follow-up times
  "Survival_pfs_event",
  "Survival_time_to_pfs_event",
  "Survival_os_event",
  "Survival_time_to_os_event",
  # signed survival times derived from the event/time pairs above
  "pfs",
  "os",
  # grouping column and the flag used to filter the response data set
  "tissue",
  "Filter_meta_responseMeasured"
)

In [5]:
# Columns grouped as clinical features; selected from `all` into `mini`.
clinical_features <- c(
  # pre-treatment related columns
  "pretreat",
  "pretreat_comp",
  "clinical_pre_treated",
  "clinical_meta_hasSystemicPreTreatment2",
  # remaining columns of this group
  "age",
  "clinical_biopsy_distal_proximal",
  "clinical_cpi_mechanism3",
  "hla_lilac_del_hla",
  "cnv_summary_wholeGenomeDuplication",
  "sv_summary_svTumorMutationalBurden",
  "purity"
)

In [6]:
# Columns grouped as genomic features; selected from `all` into `mini`.
# `tmb_bin` is the 0/1 flag derived when the data are loaded; `tcell` is also
# used (via is.na) in the group labels built by builder().
genomic_features <- c(
  "tmb_bin",
  "tmb",
  "tcell",
  "prolif",
  "tgfb",
  "pdl1"
)

In [7]:
# Columns grouped as latent features; selected from `all` into `mini`.
latent_features <- c(
  # somatic_TMB_* columns
  "somatic_TMB_clonal",
  "somatic_TMB_vhio",
  "somatic_TMB_exome",
  # isofox_gene_set_* columns
  "isofox_gene_set_t_cell_effector",
  "isofox_gene_set_prolif",
  "isofox_gene_set_Pan_TBRS",
  "isofox_gene_set_t_cell_gep_18",
  "isofox_gene_set_mariathan_Cell_cycle",
  "isofox_gene_set_mariathan_EMT2",
  "isofox_gene_set_vhio_tgfb",
  "isofox_gene_set_vhio_prolif",
  "isofox_gene_set_vhio_tcell"
)

In [8]:
# Signed survival times: the follow-up time is negated when the event
# indicator equals 0 and kept unchanged otherwise; an NA indicator yields NA.
# NOTE(review): negative-means-censored matches the label convention of
# XGBoost's survival:cox objective -- confirm that is the intended consumer.
all$pfs <- with(
  all,
  ifelse(Survival_pfs_event == 0,
         -Survival_time_to_pfs_event,
         Survival_time_to_pfs_event)
)
all$os <- with(
  all,
  ifelse(Survival_os_event == 0,
         -Survival_time_to_os_event,
         Survival_time_to_os_event)
)

# Working table: sample ids become row names, only the listed outcome and
# feature columns are kept (in this order), and the best-response column is
# renamed to `response`.
mini <- all %>%
  column_to_rownames("sampleId") %>%
  select(
    all_of(response_tissue),
    all_of(genomic_features),
    all_of(clinical_features),
    all_of(latent_features)
  ) %>%
  rename(response = Y_best_response_binary)

### 1 - Create data structures for XGBoost study 

In [9]:
# Build the three evaluation data sets (best response, PFS, OS) from `mini`.
#
# Args:
#   mini: data frame containing the columns referenced below: response, pfs,
#         os, the Survival_* event/time columns, Filter_meta_responseMeasured,
#         tissue and tcell. All other columns are passed through unchanged.
#
# Returns:
#   A named list with elements "lr", "pfs" and "os". Each element is a list:
#     df  - the prepared data frame for that outcome
#     gps - character vector with one label per row of df, formatted as
#           "<outcome>-<tissue>-<TRUE|FALSE>", where the last part is
#           is.na(tcell) and <outcome> is `response` (lr) or `event_status`
#           (pfs, os).
#           NOTE(review): presumably used downstream as stratification
#           groups -- confirm with the consumer of xg-eval-prep.Rds.
builder <- function(mini) {

  # One label per row, built with vectorised paste0(). The previous
  # apply(data.frame(...), 1, ...) coerced the frame through as.matrix(),
  # which format()s numeric columns and can pad values (" 1" next to "10").
  group_labels <- function(outcome, tissue, tcell) {
    # paste0() would turn zero-length inputs into the single string "--".
    if (length(outcome) == 0L) {
      return(character(0))
    }
    paste0(outcome, "-", tissue, "-", is.na(tcell))
  }

  # Best-response set: drop every survival column, keep only samples whose
  # response was measured, then require a non-missing response label.
  xg_lr <- mini %>%
    select(
      -pfs, -os,
      -Survival_pfs_event, -Survival_time_to_pfs_event,
      -Survival_os_event, -Survival_time_to_os_event
    ) %>%
    filter(Filter_meta_responseMeasured == "Yes") %>%
    select(-Filter_meta_responseMeasured) %>%
    drop_na(response)

  # PFS set: drop the response and OS columns, expose the PFS event/time pair
  # under generic names, and require a non-missing signed PFS time.
  xg_pfs <- mini %>%
    select(-response, -os, -Survival_os_event, -Survival_time_to_os_event) %>%
    rename(
      event_status = Survival_pfs_event,
      time_to_event = Survival_time_to_pfs_event
    ) %>%
    select(-Filter_meta_responseMeasured) %>%
    drop_na(pfs)

  # OS set: mirror image of the PFS set.
  xg_os <- mini %>%
    select(-response, -pfs, -Survival_pfs_event, -Survival_time_to_pfs_event) %>%
    rename(
      event_status = Survival_os_event,
      time_to_event = Survival_time_to_os_event
    ) %>%
    select(-Filter_meta_responseMeasured) %>%
    drop_na(os)

  ### storage closet ###
  list(
    lr = list(
      df = xg_lr,
      gps = group_labels(xg_lr$response, xg_lr$tissue, xg_lr$tcell)
    ),
    pfs = list(
      df = xg_pfs,
      gps = group_labels(xg_pfs$event_status, xg_pfs$tissue, xg_pfs$tcell)
    ),
    os = list(
      df = xg_os,
      gps = group_labels(xg_os$event_status, xg_os$tissue, xg_os$tcell)
    )
  )
}

### Run it! Voilà

In [10]:
# Build the three evaluation structures from `mini` and write the resulting
# list to "xg-eval-prep.Rds" under TMP_DIR.
saveRDS(
  object = builder(mini),
  file = paste0(TMP_DIR, "xg-eval-prep.Rds")
)