# Moffit Data Delivery

In [3]:
library(ggplot2)
library(tidyverse)
library(ComplexHeatmap)
library(maftools)
library(RColorBrewer)
library(readxl)
library(stats)
library(pROC)
library(stringr)
source("~/Desktop/puffin/R/helper_functions.R")
options(stringsAsFactors = FALSE)
options(repr.matrix.max.cols=75, repr.matrix.max.rows=50)
source("../fix_PIDs.R")

# set some defaults
options(stringsAsFactors = FALSE)
options(repr.matrix.max.cols=75, repr.matrix.max.rows=50)
formals(table)$useNA <- "always"
formals(write.csv)$row.names <- FALSE
formals(write.csv)$as.is <- TRUE

print(Sys.time())
print(sessionInfo())

── [1mAttaching core tidyverse packages[22m ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mpurrr    [39m 1.0.1     [32m✔[39m [34mtidyr    [39m 1.3.0
── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Loading required package: grid

ComplexHeatmap version 2.16.

[1] "2023-11-08 11:40:53 PST"
R version 4.3.0 (2023-04-21)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.5.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Los_Angeles
tzcode source: internal

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] ggsignif_0.6.4        glue_1.6.2            pROC_1.18.4          
 [4] readxl_1.4.3          RColorBrewer_1.1-3    maftools_2.16.0      
 [7] ComplexHeatmap_2.16.0 lubridate_1.9.2       forcats_1.0.0        
[10] stringr_1.5.0         dplyr_1.1.2           purrr_1.0.1          
[13] readr_2.1.4           tidyr_1.3.0           tibble_3.2.1         
[16] ti

In [17]:
# read in clinical data

## functions for harmonizing input sheets
source("../fix_PIDs.R")

fix_sampleTypes <- function(col){
    #print(table(col, useNA="always"))
    col.new <- gsub("FFPE", "Tissue", col)
    col.new <- gsub("Urine_Supernatant|Supernatant", "Urine", col.new)
    col.new <- gsub("Buffy Coat|PBMC", "Buffy_Coat", col.new)
    print(table(col.new, useNA="always"))
    return(col.new)
}
fix_StudyVisit <- function(col){
    warn_na(col)
    col.new <- sapply(str_split(col, pattern="_"), "[", 2)
    return(col.new)
}


df_clin = read.csv("./clinical_data/NMIBC_clinical_data_merged_2023-10-11.csv", as.is=T) %>%
                mutate(PatientID=as.character(PatientID))
stopifnot(all(!duplicated(df_clin$PatientID)))
print(df_clin$PatientID)
names(df_clin)
table(df_clin$disease.positive) # no clin data on validation cohort samples

 [1] "152237" "161922" "169558" "172530" "173254" "173912" "174091" "174872"
 [9] "175648" "175733" "175980" "176349" "176388" "180264" "182022" "116381"
[17] "165028" "167162" "171374" "171769" "171821" "172153" "172229" "172586"
[25] "172602" "172749" "172851" "173075" "173281" "173334" "173350" "173359"
[33] "173362" "173509" "173975" "173983" "175610" "174083" "174947" "175325"
[41] "175387" "175425" "175492" "175526" "175672" "175725" "175854" "176828"
[49] "176992" "177967" "178886" "175831" "179692" "181439" "181568" "182435"
[57] "182910" "184289" "184485" "184615" "184957" "185014" "185212" "185382"
[65] "185581" "186478" "186652" "187204" "187807" "188867" "189083" "189342"
[73] "190233" "191632" "191692" "192682" "193186"



FALSE  TRUE  <NA> 
   15    35    27 

In [5]:
# Master sample list from manifests
df_samples_all = make_names(read_excel("Manifest_data_all_10112023.xlsx", sheet="Sample information real time")) %>%
                    mutate(SampleID.short = make_SIDshort(SampleID, "remove.suffix"),
                           PatientID = as.character(fix_PIDs(PatientID)),
                      StudyVisit2 = fix_StudyVisit(StudyVisit),
                      StudyVisit = gsub(" ","", ifelse(!is.na(StudyVisit2), StudyVisit2, StudyVisit)),
                      SampleType = fix_sampleTypes(SampleType))
table(df_samples_all$SampleType, df_samples_all$StudyVisit, useNA="always")
nrow(df_samples_all) #558
stopifnot(all(!(duplicated(df_samples_all$SampleID.short))))

SampleID.short format: remove.suffix
 [1] "116381" "152237" "161922" "165028" "167162" "169558" "171374" "171769"
 [9] "171821" "172153" "172229" "172530" "172586" "172602" "172749" "172851"
[17] "173075" "173254" "173281" "173334" "173350" "173359" "173362" "173509"
[25] "173912" "173975" "173983" "174083" "174091" "174872" "174947" "175325"
[33] "175387" "175425" "175492" "175526" "175610" "175648" "175672" "175725"
[41] "175733" "175831" "175854" "175980" "176349" "176388" "176828" "176992"
[49] "177967" "178886" "179692" "180264" "181439" "181568" "182022" "182435"
[57] "182910" "184289" "184485" "184615" "184957" "185014" "185212" "185382"
[65] "185581" "186478" "186652" "187204" "187807" "188867" "189083" "189342"
[73] "190233" "191632" "191692" "192682" "193186"
col.new
        Buffycoat            Plasma            Tissue             Urine 
               73                73               129               142 
Urine Cell Pellet              <NA> 
              141            

                   
                    BenignUrothelium FollowUp1 FollowUp2 FollowUp3 Followup4
  Buffycoat                        0         0         0         0         0
  Plasma                           0         0         0         0         0
  Tissue                           9         0         0         0         0
  Urine                            0        18         9         4         1
  Urine Cell Pellet                0        18         9         4         1
  <NA>                             0         0         0         0         0
                   
                    IndexTURBT PostTURBT Pre-RepeatTURBT RepeatTURBT
  Buffycoat                  0         0              73           0
  Plasma                     0         0              73           0
  Tissue                    72         0               0          48
  Urine                      0        30              77           0
  Urine Cell Pellet          0        29              77           0
  <NA> 

In [7]:
# methylation data

methyl.data = read.csv("./Methylation_data/Nov_6_2023/PRDC-MOFFITT-NMIBC-22001_165_samples_Methylation_summary_info_11082023.csv") # missing batch 3 and 4 urine samples
methyl.data = standardize_names(methyl.data %>% mutate(SampleID=Sample.Name) %>% slice_head(n=165),
                                sid.format = "remove.suffix", input.type="samples") %>%
                select(-SampleType) %>%
                mutate(PatientID=fix_PIDs(SubjectID)) %>%
                left_join(df_samples_all %>% select(SampleID.short, SampleType), by="SampleID.short")
names(methyl.data)
#write.csv(methyl.data, "./Methylation_data/Nov_6_2023/region_summary_for_client_formatted.csv", row.names=F)
write.csv(methyl.data, glue("./data_delivery/Methylation/region_summary_for_client_formatted_{Sys.Date()}.csv"), row.names=F)



SampleID.short format: remove.suffix


Standardize.names WARN:Fill all with NA for SampleType



 [1] "116381" "152237" "161922" "165028" "167162" "169558" "171374" "171769"
 [9] "171821" "172153" "172229" "172530" "172586" "172602" "172749" "172851"
[17] "173281" "173334" "173350" "173362" "173509" "173912" "173975" "173983"
[25] "174083" "174091" "174872" "174947" "175325" "175387" "175425" "175492"
[33] "175526" "175610" "175648" "175672" "175725" "175733" "175854" "175980"
[41] "176349" "176388" "176828" "176992" "177967" "178886" "180264" "182022"
[49] "173359"


## WES+ data merged

In [35]:
## NGSQC
source("~/Desktop/puffin/R/helper_functions.R")

df1.ffpe = read.csv("./data_batch1_and_2/FFPE_WES/bg_rerun_09132023/PRDC-MOFFITT-NMIBC-22001_NGSQC_all_2023-09-13_tissue_5_updated.csv")
df1.urine = read.csv("./data_batch1_and_2/Urine-UCP/Urine_post_analysis/urine_WES_only//WOP00253_WOP00254_PRDC-MOFFITT-NMIBC-22001_NGSQC_all_2023-09-13_urine_supernatant_0.35_updated.csv")
df3.ffpe =read.csv("./data_batch3_02222023/PRDC-MOFFITT-NMIBC-22001-B_NGSQC_all_2023-07-30_ffpe_5_updated.csv")
df.combo = read.csv("./batch1_2_3_urine_WES_baselines/WOP00873_WOP00875_MCC-NMIBC_WES_urine_NGSQC_all_2023-07-30_urine_supernatant_0.35_updated.csv")
df.3.5.ffpe = read.csv("./benign_urothelium/WOP00981_PRDC-MOFFITT-NMIBC-22001_NGSQC_all_2023-09-01_ffpe_5_updated.csv")
df4.ffpe = read.csv("./Validation_cohort/Validation_FFPE/")




Unnamed: 0_level_0,SampleID,Total.reads..M.,Mapped.reads..M.,Mapping.rate....,Consensus.reads..M.,Duplication.rate....,Target.unique.fragment.coverage,Panel2.Target.unique.fragment.coverage,Target.read.coverage,Mean.target.read.coverage,Panel2.Target.read.coverage,Percentage.of.target...20..Mean.coverage,Within.2.fold.....after.dedup,Within.2.fold.....before.dedup,Panel2.Within.2.fold.....after.dedup,Panel2.Within.2.fold.....before.dedup,onTarget.rate....,onTarget.rate.ext200....,gcRatio.low,gcRatio.high,Mean.error.rate.....after.dedup,background.50..after.dedup,background.99..after.dedup,Fragment_Size_Mode,Fragment_Size_Bandwidth,Softclip.ratio,Double.strand.ratio,BC_totalCount,SNP_BC,tmsi_score,msi_status,msi_QC,Specimen_type,Gender,Panel,ProjectDir,analysis_type,lane,work_order,original_adapter,total_dna_input,total_cfdna_input,library_dna_input_ug,enriched_library_fa_or_ba_nm,qubit_ng_ul,pversion,category,sampleFolder,PatientID,externalSampleID,trialVisitNum,Duplicated,Coverage...160....,Coverage...80....,Coverage...40....,Coverage...400....,Coverage...200....,Coverage...100....,QC_Status,Action,Load,SNV_Num,Indel_Num,pTMB,pTMB_norm,pTMB.weighted,pTMB_norm.weighted,pTMB.adj,pTMB_norm.adj,maxAF_TMB,tumorFraction,tumorStatus
Unnamed: 0_level_1,<chr>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<lgl>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<int>,<chr>,<lgl>,<lgl>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<lgl>
1,P220085,260,260,99.9,127,51.0,1795,200,3200,3714,341,99.8,53.1,52.3,58.8,59.6,65.9,89.4,0.757,1.8,0.022,0.382,2.19,183.3,135,1.63,28.9,,'00011101101220010020101110102022021',0,MSS,PASS,Tissue,female,PREDICINEATLAS_V2,/prednet/data22/OutputByRun22/220907_A00934_0209_BHCV2YDMXY//dsrun1.7.0/lbwfresult1.7.0/WOP00249_PRDC-MOFFITT-NMIBC-22001,tissue_analysis,2,WOP00249_PRDC-MOFFITT-NMIBC-22001,HBC56_v2,246.25,227.83,2,30.2,8.76,1.7.0,RUO,P220085_220907,173983,06S22036059,IndexTURBT,False,,,,99.8,99.9,100,Pass,,,25,4,12,8.9365,11.541,8.5947,12,8.9365,21.0,0.348,True
2,P220076,257,256,99.8,135,47.3,1840,198,3150,3714,317,99.2,51.5,50.4,57.5,58.4,64.6,89.0,0.819,1.99,0.0394,0.41,2.44,193.7,151,1.72,25.6,,'21101000111021002011220001111021021',2,MSS,PASS,Tissue,male,PREDICINEATLAS_V2,/prednet/data22/OutputByRun22/220902_A01832_0037_BHCKWTDSX5//dsrun1.7.0/lbwfresult1.7.0/WOP00249_PRDC-MOFFITT-NMIBC-22001,tissue_analysis,3,WOP00249_PRDC-MOFFITT-NMIBC-22001,HBC24_v2,283.25,216.09,2,31.0,7.33,1.7.0,RUO,P220076_220902,173334,06S22068063,IndexTURBT,False,,,,99.4,99.9,100,Pass,,,88,0,91,67.769,83.581,62.244,91,67.769,59.8,0.748,True
3,P220070,251,251,99.8,128,49.1,1568,169,2730,3650,280,98.7,43.1,42.8,47.1,47.3,65.1,89.4,0.733,2.43,0.0262,0.345,2.33,188.5,145,1.74,32.2,,'11102001200000001010210010000112021',0,MSS,PASS,Tissue,male,PREDICINEATLAS_V2,/prednet/data22/OutputByRun22/220907_A00934_0209_BHCV2YDMXY//dsrun1.7.0/lbwfresult1.7.0/WOP00249_PRDC-MOFFITT-NMIBC-22001,tissue_analysis,1,WOP00249_PRDC-MOFFITT-NMIBC-22001,HBC46_v2,179.5,160.22,2,33.4,8.7,1.7.0,RUO,P220070_220907,172586,06S22052895,RepeatTURBT,False,,,,98.8,99.9,100,Pass,,,15,4,5,3.7236,4.4055,3.2808,5,3.7236,39.6,0.568,True
4,P220079,246,245,99.8,119,51.6,1821,206,3450,3491,355,99.9,69.0,66.2,73.6,73.3,66.0,89.2,0.837,1.18,0.0243,0.461,2.33,185.7,154,1.65,21.2,,'11102200200111201111211000001111111',0,MSS,PASS,Tissue,male,PREDICINEATLAS_V2,/prednet/data22/OutputByRun22/220907_A00934_0209_BHCV2YDMXY//dsrun1.7.0/lbwfresult1.7.0/WOP00249_PRDC-MOFFITT-NMIBC-22001,tissue_analysis,1,WOP00249_PRDC-MOFFITT-NMIBC-22001,HBC52_v2,184.5,136.75,2,14.3,4.62,1.7.0,RUO,P220079_220907,172530,06S22036103,IndexTURBT,False,,,,99.8,99.9,100,Pass,,,23,2,19,14.15,17.14,12.764,19,14.15,15.0,0.26,True
5,P220069,245,245,99.9,109,55.5,1349,180,3100,3535,339,99.8,59.2,56.5,61.5,61.0,68.7,89.4,0.935,1.61,0.0338,0.474,2.38,156.7,126,2.7,15.6,,'11102001200000001010210010000112021',2,MSS,PASS,Tissue,male,PREDICINEATLAS_V2,/prednet/data22/OutputByRun22/220907_A00934_0209_BHCV2YDMXY//dsrun1.7.0/lbwfresult1.7.0/WOP00249_PRDC-MOFFITT-NMIBC-22001,tissue_analysis,1,WOP00249_PRDC-MOFFITT-NMIBC-22001,HBC45_v2,279.0,211.04,2,13.2,3.72,1.7.0,RUO,P220069_220907,172586,06S22035975,IndexTURBT,False,,,,99.5,99.9,100,Pass,,,15,4,4,2.9788,3.4055,2.5361,4,2.9788,57.3,0.729,True
6,P220086,245,244,99.8,124,49.1,1160,134,2110,3322,226,99.1,49.3,48.7,50.8,50.6,63.4,87.6,0.842,3.43,0.0358,0.35,2.99,192.6,155,2.75,22.3,,'00011101101220010020101110102022021',1,MSS,PASS,Tissue,female,PREDICINEATLAS_V2,/prednet/data22/OutputByRun22/220907_A00934_0209_BHCV2YDMXY//dsrun1.7.0/lbwfresult1.7.0/WOP00249_PRDC-MOFFITT-NMIBC-22001,tissue_analysis,2,WOP00249_PRDC-MOFFITT-NMIBC-22001,HBC57_v2,244.75,203.0,2,20.5,6.04,1.7.0,RUO,P220086_220907,173983,06S22052900,RepeatTURBT,False,,,,98.4,99.9,100,Pass,,,22,3,10,7.4471,9.6252,7.168,10,7.4471,17.1,0.292,True


ERROR: Error in eval(expr, envir, enclos): 


In [34]:

source("~/Desktop/puffin/R/helper_functions.R")

## All Variant Marked
# df1.ffpe = read.csv("./data_batch1_and_2/FFPE_WES/bg_rerun_09132023/PRDC-MOFFITT-NMIBC-22001_Variant_all_2023-09-13_tissue_marked.csv") 
# df1.urine = read.csv("./data_batch1_and_2/Urine-UCP/Urine_post_analysis/urine_WES_only/WOP00253_WOP00254_PRDC-MOFFITT-NMIBC-22001_Variant_all_2023-09-13_urine_marked.csv")
# df3.ffpe =read.csv("./data_batch3_02222023/PRDC-MOFFITT-NMIBC-22001-B_Variant_all_2023-05-03_ffpe_marked.csv")
# df.combo = read.csv("./batch1_2_3_urine_WES_baselines/WOP00873_WOP00875_MCC-NMIBC_WES_urine_Variant_all_2023-07-26_urine_supernatant_marked.csv")
# df.3.5.ffpe = read.csv("./benign_urothelium/WOP00981_PRDC-MOFFITT-NMIBC-22001_Variant_all_2023-09-01_ffpe_marked.csv")
# df4.ffpe = read.csv("./Validation_cohort/Validation_FFPE/WOP01194_WOP01039_PRDC-MOFFITT-NMIBC-22001_Variant_all_2023-10-24_ffpe_marked.csv")

# cols = intersect(intersect(intersect(names(df4.ffpe), names(df1.urine)), names(df.combo)), names(df1.ffpe))
# df.all.marked = rbind(df1.ffpe[,cols], df1.urine[,cols])
# df.all.marked = rbind(df.all.marked, df3.ffpe[,cols])
# df.all.marked = rbind(df.all.marked, df.combo[,cols])
# df.all.marked = rbind(df.all.marked, df.3.5.ffpe[,cols])
# df.all.marked = rbind(df.all.marked, df4.ffpe[,cols])

# df_samples_all = df_samples_all %>% mutate(SampleID.short = make_SIDshort(SampleID, "strict"))

# df.all.marked_ = df.all.marked %>% select(-PatientID) %>% mutate(SampleID.short=make_SIDshort(sampleNames, "strict")) %>%
#                 left_join(df_samples_all %>% select(PatientID, StudyVisit, SampleType, SampleID.short), by="SampleID.short")
# warn_na(df.all.marked_$PatientID)
# warn_na(df.all.marked_$StudyVisit)
# warn_na(df.all.marked_$SampleType)

# df.all.marked.f = df.all.marked_ %>% filter(SampleType != "Buffy_Coat") %>% 
#                                     distinct()
nrow(df.all.marked.f)
table(df.all.marked.f$PatientID, df.all.marked.f$SampleType)

write.csv(df.all.marked.f, glue("./data_delivery/WES/PRDC-MOFFITT-NMIBC-22001_Variant_all_combined_{Sys.Date()}.csv"))

        
         Tissue Urine  <NA>
  116381  42685     0     0
  152237  56605     0     0
  161922  16096     0     0
  165028  21336     0     0
  167162  53029     0     0
  169558  16353     0     0
  171374      0 28545     0
  171769  33265 34606     0
  171821  55762 38216     0
  172229  41776 36921     0
  172530  35003     0     0
  172586  33410 34957     0
  172602  16105 28583     0
  172749  49454 35508     0
  172851  33158 35076     0
  173075  19476 36112     0
  173254      0 35884     0
  173281  27560 37554     0
  173334  40631     0     0
  173350  57523     0     0
  173359      0 33924     0
  173362  16325 35120     0
  173509  33262     0     0
  173912  48446     0     0
  173975  58235     0     0
  173983  34580     0     0
  174083  54690 36000     0
  174091  30682     0     0
  174872  29942     0     0
  174947  31342 39030     0
  175325  59089     0     0
  175387  56408     0     0
  175425  50623 35645     0
  175492  54989     0     0
  175526  5

In [28]:
#table(df.all.marked$)
head(df.all.marked %>% filter(is.na(PatientID)))
names(df.all.marked)

“number of rows of result is not a multiple of vector length (arg 2)”
“number of rows of result is not a multiple of vector length (arg 2)”
“number of rows of result is not a multiple of vector length (arg 2)”
“number of rows of result is not a multiple of vector length (arg 2)”


seqnames,start,end,width,strand,ref,alt,totalDepth,refDepth,altDepth,sampleNames,VariantFreq,SYMBOL,GeneID,Feature,HGVSc,HGVSp,Amino_acids,Codons,BIOTYPE,EXON,INTRON,Consequence,DISTANCE,Existing_variation,IMPACT,VARIANT_CLASS,CLIN_SIG,Clinvar,dbSNP,COSMIC,COSMIC.CNT,genome1000,AF,CANONICAL,filteredCnt,DSCnt,filteredDSCnt,⋯,fc.ds.ratio,CopyNumber,zScore.cnv,coVariant,ol.coVariant,coVariantFilter,sideVariant,sideVariant.AF.ratio,prevalenceInternal,filterPrevalence,filterScore,finalKeep,VariantID,sampleDir,VariantType,VariantType.old,FilterType,sampleFolder,externalSampleID,ID,highFrequent.inbatch,concordant,totalDepth.Baseline,dscnt.Baseline.filtered,altDepth.Baseline,altDepth.Baseline.filtered,AF.Baseline,odds.ratio,pvalue,MAF.diff,finalKeep.old,SampleID.short,StudyVisit.x,SampleType.x,PatientID,StudyVisit.y,SampleType.y
<chr>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<chr>,<dbl>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<int>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<lgl>,<chr>,<dbl>,<dbl>,<lgl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<lgl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>


In [None]:
## All Variant Short Marked
df1.ffpe = read.csv("./data_batch1_and_2/FFPE_WES/bg_rerun_09132023/") 
df1.urine = read.csv("./data_batch1_and_2/Urine-UCP/Urine_post_analysis/urine_WES_only/WOP00253_WOP00254_PRDC-MOFFITT-NMIBC-22001_Variant_all_2023-09-13_urine_marked.csv")
df3.ffpe =read.csv("./data_batch3_02222023/PRDC-MOFFITT-NMIBC-22001-B_Variant_all_2023-05-03_ffpe_marked.csv")
df.combo = read.csv("./batch1_2_3_urine_WES_baselines/WOP00873_WOP00875_MCC-NMIBC_WES_urine_Variant_all_2023-07-26_urine_supernatant_marked.csv")
df.3.5.ffpe = read.csv("./benign_urothelium/WOP00981_PRDC-MOFFITT-NMIBC-22001_Variant_all_2023-09-01_ffpe_marked.csv")
df4.ffpe = read.csv("./Validation_cohort/Validation_FFPE/WOP01194_WOP01039_PRDC-MOFFITT-NMIBC-22001_Variant_all_2023-10-24_ffpe_marked.csv")

## TMB Marked

## CNV

## NGSQC updated

## Fusion all

## CNV all

stop()





########
## unpatched data
# batch 1+2
all.snv_urine = read.csv("./data_batch1_and_2/Urine-UCP/Urine_post_analysis/urine_WES_only/WOP00253_WOP00254_PRDC-MOFFITT-NMIBC-22001_Variant_short_all_2023-09-13_urine_marked.csv", as.is=T)
# see also a marked file for batch 1 ucp
all.snv_tissue = read.csv(file="./data_batch1_and_2/FFPE_WES/bg_rerun_09132023/PRDC-MOFFITT-NMIBC-22001_Variant_short_all_2023-09-13_tissue_marked.csv",
                       as.is=T) 
all.snv_a = merge.combine(standardize_names(rbind.common(all.snv_urine, all.snv_tissue), sid.format="none"), df_wes_ngsqc.updated, join.type = "left", join.cols.left = "SampleID.short",
                            join.cols.right="SampleID.short", priority="right") %>%
                    mutate(PatientID = fix_PIDs(PatientID))
# batch 3 ffpe
all.ffpe_snv_b = read.csv("./data_batch3_02222023/PRDC-MOFFITT-NMIBC-22001-B_Variant_short_all_2023-05-03_ffpe_marked.csv",
                         as.is=T) %>% filter(SpecimenType=="FFPE") 
all.ffpe_snv_b = merge.combine(standardize_names(all.ffpe_snv_b, sid.format="none"), df_wes_ngsqc.updated, join.type = "left", join.cols.left = "SampleID.short",
                            join.cols.right="SampleID.short", priority="right") %>%
                mutate(SampleType=fix_sampleTypes(SampleType)) %>%
                    mutate(PatientID = fix_PIDs(PatientID))
nrow(all.ffpe_snv_b)
# additional urine WES+
all.urine_wes_snv = read.csv("./batch1_2_3_urine_WES_baselines/WOP00873_WOP00875_MCC-NMIBC_WES_urine_Variant_short_all_2023-06-29_urine_supernatant_marked.csv",
                         as.is=T)
all.urine_wes_snv = merge.combine(standardize_names(all.urine_wes_snv, sid.format="none"), df_wes_ngsqc.updated, join.type = "left", join.cols.left = "SampleID.short",
                            join.cols.right="SampleID.short", priority="right") %>%
                    mutate(SampleType=fix_sampleTypes(SampleType)) %>%
                    mutate(PatientID = fix_PIDs(PatientID))
nrow(all.urine_wes_snv)
# benign samples
all.snv_benign = read.csv("./benign_urothelium/WOP00981_PRDC-MOFFITT-NMIBC-22001_Variant_short_all_2023-09-01_ffpe_marked.csv", 
                          as.is=T)
all.snv_benign = merge.combine(standardize_names(all.snv_benign, sid.format="none"), df_wes_ngsqc.updated, join.type = "left", join.cols.left = "SampleID.short",
                            join.cols.right="SampleID.short", priority="right") %>%
                    mutate(SampleType=fix_sampleTypes(SampleType)) %>%
                    mutate(PatientID = fix_PIDs(PatientID))

all.snv = rbind.common(all.snv_a, all.ffpe_snv_b) 
all.snv = rbind.common(all.snv, all.urine_wes_snv)
all.snv = rbind.common(all.snv, all.snv_benign)
all.snv.wes =  all.snv %>%
                filter(SampleID.short %in% df_wes_ngsqc.updated$SampleID.short) %>% distinct()
nrow(all.snv.wes) #20654 all, 
all.snv.wes.somatic = all.snv.wes %>% filter(grepl("somatic", VariantType, ignore.case=T))
nrow(all.snv.wes.somatic) #9147 somatic

print("wes short som unpatched")
table(all.snv.wes.somatic$PatientID, all.snv.wes.somatic$StudyVisit)

#########
#### patched wes data
print("wes short germ/som patched")
all.snv.wes.patched = standardize_names(read.csv("hybrid_probe_mrd/NMIBC_All_baseline_variants_short_marked_patched_09-13-2023.csv", as.is=T, check.names=F),
                                        sid.format="none")
#print(unique(all.snv.wes.patched$SampleID.short[all.snv.wes.patched$SampleID.short %!in% df_wes_ngsqc.all$SampleID.short]))
stopifnot(all(all.snv.wes.patched$SampleID.short %in% df_wes_ngsqc.updated$SampleID.short))
all.snv.wes.patched =  merge.combine(all.snv.wes.patched, df_wes_ngsqc.updated, priority="right", warn=TRUE) %>%
                    mutate(PatientID = fix_PIDs(PatientID))
all.snv.wes.patched = all.snv.wes.patched %>% 
                mutate(VariantID.p = paste(PatientID, VariantID, sep=":")) %>% distinct()
nrow(all.snv.wes.patched) # 26726
table(all.snv.wes.patched$SampleType, all.snv.wes.patched$StudyVisit)
                       
# filter to variants only in the marked as somatic in the original short file of WES+ variants
print("wes short som patched")
orig.somatic.vars = all.snv.wes.patched %>% filter(grepl("somatic", VariantType, ignore.case=T) & Patched=="FALSE") %>%
                                            pull(VariantID.p)
all.snv.wes.patched.somatic = all.snv.wes.patched %>% filter(VariantID.p %in% unique(orig.somatic.vars))
table(all.snv.wes.patched.somatic$PatientID, all.snv.wes.patched.somatic$StudyVisit, useNA="always")