In [1]:
# Load data cleaning libraries
library(dplyr)
library(impute)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




# NCT00079274
four-arm intergroup trial
## Inclusion
KRAS mutation predicts lack of EGFR inhibitor efficacy, so they only enrolled KRAS wt patients in this trial.

## Statistical considerations

> This comparison using all 2910 patients will have 90% power to detect a hazard ratio of 1.27 comparing the two treatment arms, based on a total of 735 events.


|KRAS status | Treatment | Control |
|------------|-----------|---------|
|wt | 5-fluorouracil/Leucovorin + Cetuximab + Irinotecan (Arm B) |  5-fluorouracil/Leucovorin + Irinotecan (Arm E and F) |
|mutant | 5-fluorouracil/Leucovorin + Irinotecan (Arm C) | |
|wt | FOLFOX + Cetuximab (Arm D) | (FOLFOX) Oxaliplatin + 5-fluorouracil/Leucovorin Regimen (Arm A) |

About the arms:

| Cohort | Patient number |
|-----|----------------|
| A | 1402 |
| B | 111 |
| C | 111 |
| D | 1350 |
| E | 45 |
| F | 46 |
| G | 332 |


According to [clinicaltrials.gov](https://clinicaltrials.gov/ct2/show/NCT00079274), the primary comparison is arm A vs arm D.
For exploratory purposes, we can also compare:
- arm B vs arm E&F
- arm B vs arms E, F and C -> this should show whether KRAS mutation really is a predictor of EGFR inhibitor efficacy

## Outcome measures
- Primary: disease free survival
- Overall survival
- Progression free survival



In [1]:
# Load the three tables for NCT00079274; each one carries a mask_id column
# that is used as the join key in the following cells.

# Patient characteristics
clinical <- read.csv(
    file = "/home/alex/Documents/lab/RCT-ITE/dat/PDS/Colorec_Allianc_2004_161_NCT00079274/characteristic.csv"
)

# Time-to-event endpoints (status / time columns)
outcome <- read.csv(
    file = "/home/alex/Documents/lab/RCT-ITE/dat/PDS/Colorec_Allianc_2004_161_NCT00079274/objectives.csv"
)

# Adverse-event records (a patient can appear on several rows)
ae <- read.csv(
    file = "/home/alex/Documents/lab/RCT-ITE/dat/PDS/Colorec_Allianc_2004_161_NCT00079274/tox.csv"
)

# Preview the first rows of each table
head(clinical)
head(outcome)
head(ae)

Unnamed: 0_level_0,mask_id,ADHERENC,ARM,BWL_OBS,BWL_PERF,EXCLUDED,HISTO_G,NODES,SEX,STAGE_G,⋯,ENDATRSN,NUMCYCLE,OBJ_STAT,LOST2FUP,PS,wild,endat_time,bmi2,racecat,agecat
Unnamed: 0_level_1,<int>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<chr>,<int>,⋯,<int>,<int>,<int>,<chr>,<int>,<int>,<int>,<dbl>,<chr>,<chr>
1,1,2,A,2,2,8.0,2,2,f,2,⋯,1,12,0,,0,1,180,20.97448,oth,40-69
2,2,2,D,2,2,,2,1,m,1,⋯,1,12,8,,0,1,165,29.13717,w,40-69
3,3,2,D,2,2,,2,1,f,2,⋯,1,12,8,,0,1,181,49.27095,w,40-69
4,4,2,A,2,2,,2,2,m,3,⋯,1,12,0,,0,0,157,31.57283,w,40-69
5,5,1,D,2,2,,1,2,m,3,⋯,1,12,0,,0,1,176,16.77694,w,40-69
6,6,2,D,2,2,,2,1,f,2,⋯,1,12,0,,0,0,166,22.69562,w,40-69


Unnamed: 0_level_0,mask_id,fustat8,futime8,pgstat5,pgtime5,dfsstat5,dfstime5
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<dbl>,<int>,<dbl>
1,1,0,749,1,627,1,627
2,2,0,291,0,291,0,291
3,3,0,1065,0,1042,0,1042
4,4,0,1697,0,1697,0,1697
5,5,0,338,0,338,0,338
6,6,0,792,1,792,1,792


Unnamed: 0_level_0,mask_id,ARM,GRADE,tox
Unnamed: 0_level_1,<int>,<chr>,<int>,<chr>
1,2,D,3,Anorexia
2,5,D,3,Paresthesias
3,6,D,3,Paresthesias
4,11,D,3,Paresthesias
5,11,D,3,Stomatitis/Mucositis
6,13,D,3,Acne/Rash


In [None]:
# Build the merged patient table WYX (outcomes + characteristics + AE count).

# Number of adverse-event records per patient; the grade is not used here
ae_count <- tally(group_by(ae, mask_id), name = "ae_count")

# Attach the AE count to the characteristics; patients with no AE record get 0
WYX <- left_join(clinical, ae_count, by = "mask_id")
WYX$ae_count <- replace(WYX$ae_count, is.na(WYX$ae_count), 0)

# Outcomes on the left so that the endpoint columns come first
WYX <- left_join(outcome, WYX, by = "mask_id")

# Recode the categorical covariates as the integer codes of their factor levels
cat_col <- c("SEX", "racecat", "agecat")
WYX[cat_col] <- lapply(WYX[cat_col], function(values) as.numeric(as.factor(values)))

# Keep only patients without an exclusion code, then drop the now-constant column
not_excluded <- is.na(WYX$EXCLUDED)
WYX <- WYX[not_excluded, ]
WYX[["EXCLUDED"]] <- NULL


In [None]:
# Impute missing covariate data with KNN.
#
# Reads:  WYX (merged patient table) and ae (adverse-event records).
# Writes: informative_col, new_DRG_DTH, new_BAD_TOX, imp_X, imp_X_scaled and
#         WYX with the covariate columns replaced by their imputed versions.

informative_col <- c(
    "mask_id", "ADHERENC", "BWL_OBS", "BWL_PERF", "HISTO_G", "NODES", "SEX",
    "STAGE_G", "BAD_TOX", "DRG_DTH", "ENDATRSN", "NUMCYCLE", "PS", "wild",
    "bmi2", "racecat", "agecat"
)
imp_X <- dplyr::select(WYX, all_of(informative_col))

# Number of missing values in each column before imputation
print(colSums(is.na(imp_X)))

# DRG_DTH and BAD_TOX are replaced by values derived from the AE table:
# per-patient counts of records with GRADE >= 5 and GRADE >= 4 respectively
new_DRG_DTH <- ae %>%
    filter(GRADE >= 5) %>%
    group_by(mask_id) %>%
    tally(name = "new_DRG_DTH")
new_BAD_TOX <- ae %>%
    filter(GRADE >= 4) %>%
    group_by(mask_id) %>%
    tally(name = "new_BAD_TOX")

imp_X <- left_join(imp_X, new_DRG_DTH, by = "mask_id")
imp_X <- left_join(imp_X, new_BAD_TOX, by = "mask_id")

# Patients without a matching AE record have a count of 0, not a missing value
imp_X$new_BAD_TOX[is.na(imp_X$new_BAD_TOX)] <- 0
imp_X$new_DRG_DTH[is.na(imp_X$new_DRG_DTH)] <- 0

imp_X <- dplyr::select(imp_X, -all_of(c("DRG_DTH", "BAD_TOX")))

# Scale every covariate to [0, 1] so that all columns contribute on the same
# scale to the neighbour distance. Min and span are kept for the back-transform.
covar_X <- dplyr::select(imp_X, -"mask_id")
covar_min <- vapply(covar_X, function(x) min(x, na.rm = TRUE), numeric(1))
covar_span <- vapply(covar_X, function(x) diff(range(x, na.rm = TRUE)), numeric(1))

imp_X_scaled <- vapply(
    names(covar_X),
    function(nm) {
        centered <- covar_X[[nm]] - covar_min[[nm]]
        # A constant column has zero span; dividing would turn every observed
        # value into NaN, so such a column is left at 0 instead
        if (isTRUE(covar_span[[nm]] == 0)) {
            centered
        } else {
            centered / covar_span[[nm]]
        }
    },
    numeric(nrow(covar_X))
)

# NOTE(review): KNN imputation averages neighbours, so imputed entries of the
# integer-coded columns (e.g. SEX, racecat, agecat) may be non-integer -- confirm
# this is acceptable downstream
imp_X_scaled <- impute.knn(imp_X_scaled, k = 5)
imp_X_scaled <- as.data.frame(imp_X_scaled$data)

# Undo the [0, 1] scaling so the covariates are back in their original units
for (covar_col in colnames(imp_X_scaled)) {
    imp_X_scaled[[covar_col]] <-
        imp_X_scaled[[covar_col]] * covar_span[[covar_col]] + covar_min[[covar_col]]
}

# Number of missing values in each column after imputation
print(colSums(is.na(imp_X_scaled)))
imp_X_scaled <- cbind(mask_id = imp_X$mask_id, imp_X_scaled)

# Apply to the original dataset: swap the raw covariates for the imputed ones
WYX <- dplyr::select(WYX, -all_of(informative_col[-1]))
WYX <- left_join(WYX, imp_X_scaled, by = "mask_id")

In [None]:
# Processing outcome: turn the three time-to-event endpoints into outcome
# columns via surv_to_Y(), which is defined in the sourced helper file.
source("/home/alex/Documents/lab/RCT-ITE/bin/survival_to_Y_func.R")

# Not referenced again in this cell
methods <- c("datta", "datta.impYn", "pseudo")

# Imputed covariates (without the id column), shared by all three conversions
surv_covar <- dplyr::select(imp_X_scaled, -"mask_id")

# Times are converted from days to months by dividing by 30.417

# Disease-free survival
Y_DFS <- cbind(
    data.frame(mask_id = WYX$mask_id),
    surv_to_Y(
        time = WYX$dfstime5 / 30.417,
        event = WYX$dfsstat5,
        X = surv_covar,
        method = "all"
    )
)
colnames(Y_DFS)[2:4] <- paste0("DFS_", colnames(Y_DFS)[2:4])

# Overall survival
Y_OS <- cbind(
    data.frame(mask_id = WYX$mask_id),
    surv_to_Y(
        time = WYX$futime8 / 30.417,
        event = WYX$fustat8,
        X = surv_covar,
        method = "all"
    )
)
colnames(Y_OS)[2:4] <- paste0("OS_", colnames(Y_OS)[2:4])

# Progression-free survival
Y_PFS <- cbind(
    data.frame(mask_id = WYX$mask_id),
    surv_to_Y(
        time = WYX$pgtime5 / 30.417,
        event = WYX$pgstat5,
        X = surv_covar,
        method = "all"
    )
)
colnames(Y_PFS)[2:4] <- paste0("PFS_", colnames(Y_PFS)[2:4])

# Prepend the outcome columns; the last join ends up left-most (PFS, OS, DFS)
WYX <- left_join(Y_DFS, WYX, by = "mask_id")
WYX <- left_join(Y_OS, WYX, by = "mask_id")
WYX <- left_join(Y_PFS, WYX, by = "mask_id")

# Drop the raw survival columns and the follow-up / status bookkeeping columns
remove_censored_data <- c(
    "fustat8", "futime8", "pgstat5", "pgtime5", "dfsstat5", "dfstime5",
    "LOST2FUP", "OBJ_STAT", "endat_time"
)
WYX <- dplyr::select(WYX, -all_of(remove_censored_data))

head(WYX)

In [None]:
# Binary treatment groups datasets

# The primary comparison, arm A vs arm D: keep only those two arms and add the
# treatment indicator W (0 = arm A, 1 = arm D).
arm_AvD <- filter(WYX, ARM %in% c("A", "D"))
# Built in a single vectorised step so that W always exists as a full-length
# column, rather than being filled piecewise into a column that does not exist yet
arm_AvD$W <- as.numeric(arm_AvD$ARM == "D")

In [None]:
# Temporary assignment of treatment (W), outcome (Y) and covariates (X) for the
# arm A vs arm D comparison.
W <- arm_AvD$W

# The progression-free-survival outcome columns are prefixed "PFS_"; a
# misspelled name with `$` silently yields NULL, so the column is resolved by
# name and checked explicitly.
# NOTE(review): the exact suffix produced by surv_to_Y() is not visible here
# ("impYn" vs "imputeYn"); the pattern accepts either -- confirm.
pfs_imp_col <- grep("^PFS_datta\\.imp", colnames(arm_AvD), value = TRUE)
stopifnot(length(pfs_imp_col) == 1L)
Y <- arm_AvD[[pfs_imp_col]]

# NOTE(review): columns are selected by position; verify that 12:28 still
# covers the intended covariates whenever the column layout of WYX changes
X <- arm_AvD[, 12:28]