In [1]:
source("~/sc-online/extraGenes.R")
source("~/sc-online/myExpSetCreatorFn.R")
source("~/sc-online/utils.R")
source("~/sc-online/labelTransfer.R")

library(caret)
library("DropletUtils")
library(dplyr)
library(ggplot2)
library(grid)
library(gridExtra)
library(gtable)
library(lisi)
library(Matrix)
library(patchwork)
library(pheatmap)
library(RColorBrewer)
library(rhdf5)
library(rlang)

library(Seurat)
library(tidyr)
library(xml2)

library(viridis)
library(viridisLite)

# Gates the raw-matrix cell further down: when TRUE, each library's raw
# feature-barcode matrix is read and per-barcode column data is computed and saved.
READ_RAW_READS = TRUE

Loading required package: EnsDb.Hsapiens.v86

Loading required package: ensembldb

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Loading required package: Genomi

In [4]:
# Libraries whose Vireo output is loaded below, in processing order.
# The first 24 entries carry a '230712_' date prefix and the last 12 do not; the
# loading loop drops the first 7 characters of any entry containing '_' before
# building on-disk paths. The `batches` and `dapi_nurr` values defined later are
# positional, so they assume this exact order.
vireo_libs_dated = list(
    '230712_pCalicoPDsHSrSNiPoold230719B',
    '230712_pCalicoPDsHSrSNiPoold230719BD',
    '230712_pCalicoPDsHSrSNiPoold230719C',
    '230712_pCalicoPDsHSrSNiPoold230719CD',
    '230712_pCalicoPDsHSrSNiPoold230719D1',
    '230712_pCalicoPDsHSrSNiPoold230719D2',
    '230712_pCalicoPDsHSrSNiPoold230719DD',
    '230712_pCalicoPDsHSrSNiPoold230719E1',
    '230712_pCalicoPDsHSrSNiPoold230719E2',
    '230712_pCalicoPDsHSrSNiPoold230719ED',
    '230712_pCalicoPDsHSrSNiPoold230719F1',
    '230712_pCalicoPDsHSrSNiPoold230719F2',
    '230712_pCalicoPDsHSrSNiPoold230719FD',
    '230712_pCalicoPDsHSrSNiPoold230719G1',
    '230712_pCalicoPDsHSrSNiPoold230719G2',
    '230712_pCalicoPDsHSrSNiPoold230719GD',
    '230712_pCalicoPDsHSrSNiPoold230719I1',
    '230712_pCalicoPDsHSrSNiPoold230719I2',
    '230712_pCalicoPDsHSrSNiPoold230719ID',
    '230712_pCalicoPDsHSrSNiPoold230719J1',
    '230712_pCalicoPDsHSrSNiPoold230719J2',
    '230712_pCalicoPDsHSrSNiPoold230719JD',
    '230712_pCalicoPDsHSrSNiPoold230719K1',
    '230712_pCalicoPDsHSrSNiPoold230719K2',
    'pCalicoPDsHSrSNA8id230921A8',
    'pCalicoPDsHSrSNA9id230921A9',
    'pCalicoPDsHSrSNB8id230921B8',
    'pCalicoPDsHSrSNB9id230921B9',
    'pCalicoPDsHSrSNC8id230921C8',
    'pCalicoPDsHSrSNC9id230921C9',
    'pCalicoPDsHSrSND8id230921D8',
    'pCalicoPDsHSrSND9id230921D9',
    'pCalicoPDsHSrSNE8id230921E8',
    'pCalicoPDsHSrSNF8id230921F8',
    'pCalicoPDsHSrSNG8id230921G8',
    'pCalicoPDsHSrSNH8id230921H8'
)


# Per-library Vireo donor assignments and SingleCellExperiment objects, keyed by
# the library name with any date prefix removed.
donor_ids_list=list()
sce_list=list()

base_path = "/mnt/accessory/seq_data/calico"
donor_ids_basename = "donor_ids.tsv"
sce_basename = "data_sce.rds"

print('Reading donor_ids and sce files...')
for (vl in vireo_libs_dated){
    # Entries containing '_' carry a 7-character date prefix (e.g. '230712_')
    # that is not part of the on-disk directory name.
    if (grepl('_', vl)){
        vl = substr(vl, 8, nchar(vl))
    }

    donor_ids_path = paste(base_path, vl, donor_ids_basename, sep='/')
    sce_path = paste(base_path, vl, sce_basename, sep='/')

    # Read both files before touching either list. Registering a library only
    # when BOTH reads succeed keeps names(donor_ids_list) and names(sce_list)
    # identical, which every downstream loop over names(donor_ids_list) relies on.
    loaded = tryCatch(
        list(
            donor_ids = read.table(donor_ids_path, header = TRUE, sep = "\t"),
            sce = readRDS(sce_path)
        ),
        error = function(e) {
            message("Failed to read ", vl, ": ", conditionMessage(e))
            NULL
        }
    )
    if (is.null(loaded)) {
        next
    }

    donor_ids_list[[vl]] = loaded[['donor_ids']]
    sce_list[[vl]] = loaded[['sce']]
}



print("Reading Demuxlet donor assignments...")
demuxlet_donors_list = list()
demuxlet_no_mt_donors_list = list()

# Read one Demuxlet `.best` table. Returns NULL (after logging the path that
# actually failed) instead of raising, so one missing file does not stop the loop.
readDemuxletBest = function(path){
    tryCatch(
        read.table(path, header=TRUE),
        error = function(e) {
            message("Failed to read ", path, ": ", conditionMessage(e))
            NULL
        }
    )
}

for(name in names(donor_ids_list)){
    p=paste(base_path, name, 'demuxlet_demultiplexed.best', sep='/')
    p_no_mt=paste(base_path, name, 'demuxlet_demultiplexed_no_mt.best', sep='/')

    # The two files are read independently: a failure on one neither hides the
    # other nor gets reported under the wrong path. Assigning NULL leaves the
    # list without an entry for that library.
    demuxlet_donors_list[[name]] = readDemuxletBest(p)
    demuxlet_no_mt_donors_list[[name]] = readDemuxletBest(p_no_mt)
}


# Batch number and sort type (NURR / DAPI) per library. Both are positional:
# entry i describes vireo_libs_dated[[i]].
batches = c(rep(1, 24), rep(2, 12))
dapi_nurr = list(
    'NURR', 'DAPI', 'NURR', 'DAPI',
    'NURR', 'NURR', 'DAPI',
    'NURR', 'NURR', 'DAPI',
    'NURR', 'NURR', 'DAPI',
    'NURR', 'NURR', 'DAPI',
    'NURR', 'NURR', 'DAPI',
    'NURR', 'NURR', 'DAPI',
    'NURR', 'NURR', 
    'DAPI', 'DAPI', 'NURR',
    'NURR', 'NURR', 'DAPI',
    'DAPI', 'DAPI', 'DAPI',
    'NURR', 'NURR', 'DAPI')

# Name the entries from the FULL library list (same prefix stripping as the
# loading loop), not from names(donor_ids_list). The latter is shorter whenever
# a library failed to load, which would shift every later label onto the wrong
# library.
all_lib_names = unlist(vireo_libs_dated)
all_lib_names = ifelse(
    grepl('_', all_lib_names),
    substr(all_lib_names, 8, nchar(all_lib_names)),
    all_lib_names
)
stopifnot(
    length(batches) == length(all_lib_names),
    length(dapi_nurr) == length(all_lib_names)
)
names(batches) = all_lib_names
names(dapi_nurr) = all_lib_names

# Keep only the libraries that were actually loaded, in loading order.
batches = batches[names(donor_ids_list)]
dapi_nurr = dapi_nurr[names(donor_ids_list)]

[1] "Reading donor_ids and sce files..."


[1] "Reading Demuxlet donor assignments..."


In [5]:
# Show the named list of per-library sort labels as the cell's output.
dapi_nurr

In [None]:


# Raw (unfiltered) feature-barcode matrices and their per-barcode column data.
# Both lists are always defined because the plotting loop reads them regardless
# of READ_RAW_READS.
raw_base_path = "/mnt/accessory/seq_data/calico_raw"
raw_file_name = "raw_feature_bc_matrix.rds"
raw_col_data_file_name = 'raw_feature_bc_matrix_col_data.rds'

raw_data_list = list()
raw_col_data_list = list()

if (READ_RAW_READS) {

    print("Reading raw data...")
    for (name in names(donor_ids_list)){
        raw_file_path = paste(raw_base_path, name, raw_file_name, sep='/')

        #raw_data_dcg = sum_duplicate_rownames_of_dcg_matrix(readRDS(raw_file_path))
        raw_data_dcg = readRDS(raw_file_path)
        raw_data_list[[name]] = raw_data_dcg

        # Total counts per barcode, computed once and reused for both columns.
        n_umi = colSums(raw_data_dcg)
        cd = data.frame(
            row.names=colnames(raw_data_dcg),
            nUMI=n_umi,
            log10_nUMI = log10(n_umi + 1),
            nGene=colSums(raw_data_dcg > 0),
            # TRUE for barcodes that are also columns of the loaded SCE
            is_in_filtered=colnames(raw_data_dcg) %in% colnames(sce_list[[name]])
        )
        raw_col_data_list[[name]] = cd
        # Cache the column data so it can be reloaded without the raw matrix.
        saveRDS(cd, paste(raw_base_path, name, raw_col_data_file_name, sep='/'))
    }
} else {

    # Raw matrices are skipped; reload the column data saved by an earlier run
    # with READ_RAW_READS = TRUE. raw_data_list stays empty in this mode.
    print("Loading cached raw column data...")
    for (name in names(donor_ids_list)){
        cached_path = paste(raw_base_path, name, raw_col_data_file_name, sep='/')
        if (file.exists(cached_path)) {
            raw_col_data_list[[name]] = readRDS(cached_path)
        } else {
            message("No cached raw column data for ", name, ": ", cached_path)
        }
    }
}

# Cell-level metadata for every loaded library, with log10(x + 1) versions of
# the UMI, gene and read counts appended. Keyed by library name.
cd_list = setNames(
    lapply(names(donor_ids_list), function(lib_name){
        col_data = as.data.frame(colData(sce_list[[lib_name]]))
        col_data$log10_nUMI = log10(col_data$nUMI + 1)
        col_data$log10_nGene = log10(col_data$nGene + 1)
        col_data$log10_nRead = log10(col_data$nRead + 1)
        col_data
    }),
    names(donor_ids_list)
)

In [None]:
# TAKES 20 MINUTES
# molecule_info_list = list()

# for (name in names(donor_ids_list)){
#     molecule_info_list[[name]] = readRDS(paste(base_path, name, 'outs/molecule_info.rds', sep='/'))
# }

In [None]:
# Compare two per-cell label columns of `merged_df` and plot the resulting
# confusion matrix of raw counts via plotConfusionHeatmap().
#
# Args:
#   merged_df: data frame with one row per cell, containing `col_x` and `col_y`.
#   name: library name; only used in the default title.
#   col_x: column shown on the X axis (passed to caret as the "reference").
#   col_y: column shown on the Y axis (passed to caret as the "data").
#   non_assignable_categories: labels excluded when computing agreement on
#     cells that both methods assigned.
#   xlab, ylab: axis labels; default to `col_x` / `col_y`.
#   title: plot title; a default is built from `name`, `ylab` and `xlab`.
#     Three agreement statistics are always appended to it.
#   fig_filename: forwarded unchanged to plotConfusionHeatmap().
#
# Returns: whatever plotConfusionHeatmap() returns.
getAndPlotConfusionMatrix = function(
    merged_df,
    name,
    col_x,
    col_y,
    non_assignable_categories = c('doublet', 'unassigned'),
    xlab=NULL,
    ylab=NULL,
    title=NULL,
    fig_filename=NULL
){
    # TODO: add norm args for either cols, rows, or the whole matrix

    # Build a confusion matrix from two label vectors on ONE shared level set.
    # caret expects `data` and `reference` to be factors with the same levels.
    # Factoring each column on its own lets the level sets differ (a label used
    # by only one method), which makes confusionMatrix() either stop or coerce
    # the data onto the reference levels, silently dropping those cells.
    # sort() also discards NA, so rows missing from one method stay NA and are
    # left out of the table, as before.
    buildConfMat = function(x_labels, y_labels){
        x_labels = as.character(x_labels)
        y_labels = as.character(y_labels)
        shared_levels = sort(unique(c(x_labels, y_labels)))

        # "data" and "reference" are misnomers; we're just comparing the two methods
        # "reference" labels displayed on X-Axis, "data" on Y-Axis
        confusionMatrix(
            data=factor(y_labels, levels=shared_levels),
            reference=factor(x_labels, levels=shared_levels)
        )
    }

    conf_mat <- buildConfMat(merged_df[[col_x]], merged_df[[col_y]])

    # Raw count table. Column normalisation is currently disabled (see TODO),
    # so despite the historical variable name nothing is rescaled here.
    prop_table_col_norm <- conf_mat$table

    if (is.null(xlab)) { xlab = col_x }
    if (is.null(ylab)) { ylab = col_y }
    if (is.null(title)){
        title = paste("Confusion Matrix for library", name, '\n', ylab, "vs:", xlab, sep=' ')
    }

    # Cells for which neither method returned a non-assignable category.
    is_doubly_assignable =
        !(merged_df[[col_x]] %in% non_assignable_categories) &
        !(merged_df[[col_y]] %in% non_assignable_categories)
    merged_df_doubly_assignable = merged_df[is_doubly_assignable, ]

    conf_mat_doubly_assignable = buildConfMat(
        merged_df_doubly_assignable[[col_x]],
        merged_df_doubly_assignable[[col_y]]
    )

    title = paste0(title,                 
        '\nFrac of Cells with Same Predictions: ', round(conf_mat$overall['Accuracy'], 3),
        '\nFrac Assignable by both methods: ', round(nrow(merged_df_doubly_assignable) / nrow(merged_df), 2),
        '\nFrac of Doubly Assignable Cells with Same Predictions: ', round(conf_mat_doubly_assignable$overall['Accuracy'], 3)
    )

    # Call the function to plot the heatmap
    # the xlab MUST correspond to the reference, and the ylab to the data
    plotConfusionHeatmap(
        prop_table_col_norm, 
        xlab=xlab, 
        ylab=ylab, 
        title=title,
        fig_filename=fig_filename
    )
}

In [None]:
source("~/sc-online/plot.R")

# For every library, plot the following

# 1. Display a df of
#   (a) num cells
#   (b) Med num genes
#   (c) Med num UMIs
#   (d) Min num UMIs
#   (e) Med Reads / UMI

# 2. Knee Plot
# 3. Hist of Doublet Probabilities
# 4. Hist of nUMI colored by Assignable
# 5. Hist of Reads / UMI colored by assignable
# 6. Scatter of Pct Intronic vs nUMI, colored by assignable
# 7. Vireo / Demuxlet Confusion Matrices
# 8. Vireo / Demuxlet Confusion Matrices -- Doubly Assignable Cells Only

plot_base_dir = '/mnt/accessory/analysis/plots'
# Every figure below is written under this directory; create it up front so the
# first save does not fail on a fresh machine.
dir.create(plot_base_dir, recursive = TRUE, showWarnings = FALSE)

# Per-library QC report: summary table, knee plot, doublet-probability
# histogram, nUMI and reads/UMI histograms, intronic-fraction scatter, and the
# Vireo-vs-Demuxlet confusion matrix.
# NOTE(review): `[1:1]` restricts the loop to the first library only; this looks
# like a debugging leftover. Drop the subset to process every library.
for (name in names(donor_ids_list[1:1])){

    print(name)

    vireo_donors = donor_ids_list[[name]]
    sce = sce_list[[name]]
    demuxlet_donors = demuxlet_donors_list[[name]]
    raw_col_data = raw_col_data_list[[name]]
    #molecule_info = molecule_info_list[[name]]

    # The loading cells are best-effort, so any of these can be missing for a
    # library. Skip it with a message instead of failing deep inside a plot call.
    if (is.null(sce) || is.null(demuxlet_donors) || is.null(raw_col_data)) {
        message("Skipping ", name, ": missing SCE, Demuxlet assignments, or raw column data")
        next
    }

    df_plot_path = file.path(plot_base_dir, paste0(name, '_1_df_plot.png'))
    knee_plot_path = file.path(plot_base_dir, paste0(name, '_2_knee_plot.png'))
    doublet_prob_hist_path = file.path(plot_base_dir, paste0(name, '_3_doublet_prob_hist.png'))
    num_umi_hist_path = file.path(plot_base_dir, paste0(name, '_4_num_umi_hist.png'))
    reads_per_umi_hist_path = file.path(plot_base_dir, paste0(name, '_5_reads_per_umi_hist.png'))
    pct_intronic_vs_num_umi_path = file.path(plot_base_dir, paste0(name, '_6_pct_intronic_vs_num_umi.png'))
    vireo_confusion_matrix_path = file.path(plot_base_dir, paste0(name, '_7_vireo_demuxlet_confusion_matrix.png'))

    raw_col_data$log10_nUMI = log10(raw_col_data$nUMI + 1)

    cd = as.data.frame(colData(sce))
    cd$log10_nUMI = log10(cd$nUMI + 1)
    cd$log10_nGene = log10(cd$nGene + 1)
    cd$log10_nRead = log10(cd$nRead + 1)
    cd$prob_max = cd$prob_donor # this is just for compatibility with other functions
    cd$reads_per_umi = cd$nRead / cd$nUMI

    # Cells Vireo does not call a doublet (doublet probability below 1/3).
    cd_singlet = cd[cd$prob_doublet < (1/3), ]

    # Demuxlet encodes the call type in the first three characters of BEST.
    demuxlet_is_sng = substr(demuxlet_donors$BEST, 1, 3) == 'SNG'

    # build a df that is the merger of demuxlet and vireo, and change column names for ease of use
    # (full outer join: barcodes seen by only one method keep NA for the other)
    merged_df = merge(demuxlet_donors, vireo_donors, by.x='BARCODE', by.y='cell', all.x=TRUE, all.y=TRUE)
    rownames(merged_df) = merged_df$BARCODE

    merged_best_type = substr(merged_df$BEST, 1, 3)

    merged_df['is_singlet_vireo'] = as.numeric(merged_df$prob_doublet < (1/3))
    merged_df['is_singlet_demuxlet'] = as.numeric(merged_best_type == 'SNG')

    merged_df['prob_donor_vireo'] = merged_df$prob_max
    merged_df['prob_donor_demuxlet'] = merged_df$PRB.SNG1

    # NOTE: THERE ARE CASES WHEN VIREO'S `DONOR_ID` IS 'UNASSIGNED' EVEN WHEN `PROB_MAX` >= 0.9
    # IN CASES WITH FEWER THAN 10 SNPS. BEST_SINGLET IGNORES THIS CUTOFF
    merged_df['id_donor_vireo'] = merged_df$donor_id
    # Demuxlet label: the donor id (BEST with its 4-character type prefix removed)
    # for singlets with PRB.SNG1 >= 0.9, 'doublet' for DBL calls, otherwise
    # 'unassigned'.
    merged_df$id_donor_demuxlet = ifelse(merged_best_type == 'SNG', 
                                      ifelse(merged_df$PRB.SNG1 >= 0.9, 
                                             substr(merged_df$BEST, 5, nchar(merged_df$BEST)), 
                                             'unassigned'),
                                      ifelse(merged_best_type == 'DBL', 
                                             'doublet', 
                                             'unassigned'))

    merged_df['library'] = name

    # summary df to plot
    summary_df = data.frame(
        'Library' = name,
        'Sort' = dapi_nurr[[name]],
        'Batch' = batches[[name]],
        'Num_Cells' = nrow(cd),
        'Frac_Assignable_Vireo' = round(getFracAssignableVireo(vireo_donors), 2),
        'Frac_Assignable_Demuxlet' = round(getFracAssignableDemuxlet(demuxlet_donors), 2),
        
        'Min_nReads' = min(cd$nRead),
        'Median_nReads' = median(cd$nRead),
        'Mean_nReads' = round(mean(cd$nRead), 2),

        'Min_nUMI' = min(cd$nUMI),
        'Median_nUMI' = median(cd$nUMI),
        'Mean_nUMI' = round(mean(cd$nUMI), 2),

        'Min_nGenes' = min(cd$nGene),
        'Median_nGenes' = median(cd$nGene),
        'Mean_nGenes' = round(mean(cd$nGene), 2),

        'Min_Reads_per_UMI' = round(min(cd$reads_per_umi), 2),
        'Median_Reads_per_UMI' = round(median(cd$reads_per_umi), 2),
        'Mean_Reads_per_UMI' = round(mean(cd$reads_per_umi), 2),

        'Min_frac_intronic' = round(min(cd$pct_intronic), 2),
        'Median_frac_intronic' = round(median(cd$pct_intronic), 2),
        'Mean_frac_intronic' = round(mean(cd$pct_intronic), 2),
        'Max_frac_intronic' = round(max(cd$pct_intronic), 2),

        'Frac_Singlet_Vireo' = round(nrow(cd_singlet) / nrow(cd), 2),
        'Frac_Singlet_Demuxlet' =  round(sum(demuxlet_is_sng) / nrow(demuxlet_donors), 2),
        'Frac_Singlet_Vireo_Assignable' = round(getFracAssignableVireo(vireo_donors[vireo_donors$prob_doublet < (1/3), ]), 2),
        'Frac_Singlet_Demuxlet_Assignable' = round(getFracAssignableDemuxlet(demuxlet_donors[demuxlet_is_sng,]), 2)
    )

    # 1. display DF (transposed so each statistic becomes a row)
    plotDfSummary(
        df=as.data.frame(t(summary_df)),
        fig_filename=df_plot_path
    )

    # 2. Knee
    plotKneeSingle(
        df=raw_col_data,
        name=name,
        title=paste0(
            'Knee Plot: ', name,
            '\nNum Cells: ', sum(raw_col_data$is_in_filtered),
            '\nFraction of Cells Assignable: ', round(getFracAssignableVireo(cd), 2)),
        ylim=c(0.5, 6),
        clim=c(.15, .8),
        
        fig_filename=knee_plot_path
    )

    # 3. Doublet Prob Hist
    plotHistColorSingle(
        df=vireo_donors,
        name=name,
        plot_col='prob_doublet',
        color_col='prob_max',
        title=paste0(
            'Vireo Doublet Probability: ', name,
            "\nFrac Predicted Doublet: ", round(sum(vireo_donors$prob_doublet >= (1/3)) / nrow(vireo_donors), 2)), 
        xlim=c(0, 1),
        ylim=c(0, 1),
        bin_width=0.1,
        fig_filename=doublet_prob_hist_path
    )

    # 4. Num UMI Hist
    plotHistColorSingle(
        df=cd_singlet,
        name=name,
        plot_col='log10_nUMI',
        color_col='prob_donor',
        bin_width=0.2,
        xlim=c(2.6, 5),
        ylim=c(0, 0.5),
        title=paste0(
            'Singlet Log10(nUMI) Histogram -- ', name,
            '\nFraction of Singlets Assignable: ', round(getFracAssignableVireo(cd_singlet), 2),
            # mean is rounded so the title does not carry a long decimal tail
            '\nUMI -- Min: ', min(cd_singlet$nUMI), ' -- Median: ', median(cd_singlet$nUMI), ' -- Mean: ', round(mean(cd_singlet$nUMI), 1)),
        fig_filename=num_umi_hist_path
    )

    # 5. Reads / UMI Hist
    plotHistColorSingle(
        df=cd_singlet,
        name=name,
        plot_col='reads_per_umi',
        color_col='prob_donor',
        xlim=c(round(min(cd_singlet$reads_per_umi)), round(max(cd_singlet$reads_per_umi))),
        # roughly 20 bins across the observed range
        bin_width=round(max(cd_singlet$reads_per_umi) - min(cd_singlet$reads_per_umi)) / 20,
        ylim=c(0, 0.5),
        title=paste0(
            'Singlet nReads/nUMI Histogram -- ', name,
            '\nFraction of Singlets Assignable: ', round(getFracAssignableVireo(cd_singlet), 2),
            # these statistics describe reads per UMI, not UMI counts
            '\nReads/UMI -- Min: ', round(min(cd_singlet$reads_per_umi), 1), 
            ' -- Median: ', round(median(cd_singlet$reads_per_umi), 1),
            ' -- Mean: ', round(mean(cd_singlet$reads_per_umi), 1)),
        fig_filename=reads_per_umi_hist_path
    )

    # 6. Pct Intronic vs Num UMI Scatter
    plotScatterColorSingle(
        df=cd_singlet,
        name=name,
        x_col='log10_nUMI',
        y_col='pct_intronic',
        color_col='prob_donor',
        xlim=c(2.5, 6),
        ylim=c(0, 1),
        plot_height=8,
        plot_width=8,
        fig_filename=pct_intronic_vs_num_umi_path
    )

    # 7. Confusion Matrix (Vireo on the X axis, Demuxlet on the Y axis)
    getAndPlotConfusionMatrix(
        merged_df=merged_df,
        name=name,
        col_x='id_donor_vireo', col_y='id_donor_demuxlet',
        xlab='Vireo', ylab='Demuxlet', 
        title=paste0(
            name, ' -- Batch ', batches[[name]], ' -- Sort: ', dapi_nurr[[name]],
            '\nConfusion Matrix:', 
                '\nFrac Assignable Vireo: ', round(getFracAssignableVireo(merged_df), 2),
                ' -- Frac Assignable Demuxlet: ', round(getFracAssignableDemuxlet(merged_df), 2)
            ),
        fig_filename=vireo_confusion_matrix_path
    )
    
}
