In [None]:
library(tidyverse)
library(glmnet)
install.packages("glmnetUtils")
library(glmnetUtils)


In [5]:



class_type <- "Healthy"
data <- readRDS("../../data/ATAC_predictions_train_20_predict_80/Full_data_ATAC_pred_lasso_formatted_standardized.rds")
sample_types <- read.table("../../data/sample_types.txt", header = F, sep = " ")
colnames(sample_types) <- c("sample", "sample_type")

fold <- readRDS("Methylation_Folds/Healthy/Fold_1_for_CV_rep_1_Healthy.rds")
cv_rep <- 1

data <- merge(data, sample_types, by="sample")
data <- data %>% dplyr::select(-sample)

if (class_type != "Healthy"){
    data <- data %>% filter(sample_type != "Healthy")
    message(unique(data$sample_type))
    data %>% group_by(sample_type) %>% summarize(n = n())
    data <-subset(data, sample_type != "Duodenal_Cancer")
    data$sample_type <- as.factor(data$sample_type)
    data = data %>% droplevels("Duodenal_Cancer")
    data <- data %>% mutate(sample_type = ifelse(sample_type == class_type, class_type, "Other"))
    } else {
    data <- data %>% mutate(sample_type = ifelse(sample_type == class_type, class_type, "Cancer"))
}

observed  <- data$sample_type

data$sample_type <- as.factor(data$sample_type)
print("Sample_type levels")
levels(data$sample_type)

print("Head of fold (train rows)")
print(fold[0:6])
        
message(class_type)
message(paste("CV repetition number: ", cv_rep, sep = ""))
set.seed(cv_rep)

    
y <- data %>% dplyr::select(sample_type) %>%  as.matrix()
X <- data %>% dplyr::select(-sample_type) %>% as.matrix() 

testdata  <- X[-fold,]
traindata <- X[fold,]
train_y <- y[fold,]

rows <- data %>% mutate(row_name = row_number()) %>% dplyr::select(row_name)
test_rows <- rows[-fold,]
print("Head of test fold (test_rows)")
print(head(test_rows))

predicted <- tibble(row_predicted = test_rows,
                    second_class_prob = rep(NA, nrow(data) - length(fold)),
                    label_pred = rep(NA, nrow(data) - length(fold)))

        
get_model_params <- function(fit) {
    alpha <- fit$alpha
    lambdaMin <- sapply(fit$modlist, `[[`, "lambda.min")
    lambdaSE <- sapply(fit$modlist, `[[`, "lambda.1se")
    error <- sapply(fit$modlist, function(mod) {min(mod$cvm)})
    best <- which.min(error)
    data.frame(alpha = alpha[best], lambdaMin = lambdaMin[best],
               lambdaSE = lambdaSE[best], error = error[best])
}
        

########### nested CV to find best alpha and lambda on train folds ###########
set.seed(0) # alpha
lasso_cva <- cva.glmnet(traindata, train_y, nfolds = 10, family = "binomial")
best_params <- get_model_params(lasso_cva)
best_alpha <- best_params$alpha
best_lambda_min <- best_params$lambdaMin

message("best_alpha")
message(best_alpha)
message("best_lambda")
message(best_lambda_min)
coef <- coef(lasso_cva, alpha = best_alpha, , s="lambda.min")
message(coef)
lasso_cva$beta
###############################################################################
#fit       <- glmnet(traindata, train_y, family = "binomial", alpha = best_alpha, lambda = best_lambda_min)
tmp       <- predict(lasso_cva, s="lambda.min", alpha = best_alpha, testdata, type = "response")
tmp_class <- predict(lasso_cva, s="lambda.min", alpha = best_alpha, testdata, type = "class")
predicted[2] <- tmp
predicted[3] <- tmp_class
   


print("Results: ")
head(predicted)

saveRDS(predicted, file = snakemake@output[["predictions"]])


[1] "Sample_type levels"


[1] "Head of fold (train rows)"
[1] 1 2 3 4 5 6


Healthy
CV repetition number: 1


[1] "Head of test fold (test_rows)"
[1]  7 12 16 38 57 74


best_alpha
0.027
best_lambda
0.0660898379791357
-24.067607686403200.0153249859165720000000000.00729855859872157000000.002475302923137520-0.02638141235168750.03784949215043320.03589593471024220000.0031330896925083900000-0.0342505226109195000000-0.063134125613103500.0253670458281225000000.00938396227855132000000000000000-0.033905810836676400000-0.0023522919791423900.0090829931324816900-0.00167689549618378000.007450635707474460-0.0015491547507194400000.0186134813952951-0.0114930107823708000-0.0033302978417252900000000-0.002411399387236120000.04715419149718110000.0323048767814802-0.04587233370562130-0.010666967825569200-0.03177093349681690.1037337290900780000000.028633967830645100000.01619515954128020000-0.01134959567845280000.011662504304482200000000-0.0378070112084198-0.053325828030302-0.02308596381769800000000.047271608468020300000-0.0105058314965225-0.03251302582548230000000-0.00124814642548409000.0411417183953909000.004626927850620060.02139682493355200.0346186226609326-0.0262180345537

[1] "Results: "


row_predicted,second_class_prob,label_pred
7,0.2133504,Cancer
12,0.1232577,Cancer
16,0.586197,Healthy
38,0.8163016,Healthy
57,0.5628691,Healthy
74,0.3299714,Cancer


ERROR: Error in saveRDS(predicted, file = snakemake@output[["predictions"]]): object 'snakemake' not found


In [8]:
IRdisplay::display(coef)

10304 x 1 sparse Matrix of class "dgCMatrix"
                        1
(Intercept) -2.406761e+01
chr10_400    .           
chr10_40     1.532499e-02
chr10_4      .           
chr10_401    .           
chr10_402    .           
chr10_403    .           
chr10_404    .           
chr10_405    .           
chr10_406    .           
chr10_407    .           
chr10_408    .           
chr10_409    7.298559e-03
chr10_410    .           
chr10_41     .           
chr10_411    .           
chr10_412    .           
chr10_413    .           
chr10_414    2.475303e-03
chr10_415    .           
chr10_416   -2.638141e-02
chr10_417    3.784949e-02
chr10_418    3.589593e-02
chr10_419    .           
chr10_420    .           
chr10_42     .           
chr10_421    3.133090e-03
chr10_422    .           
chr10_423    .           
chr10_424    .           
chr10_425    .           
chr10_426    .           
chr10_427   -3.425052e-02
chr10_428    .           
chr10_429    .           
chr10_430    .     