In [None]:
source("~/sc-online/extraGenes.R")
source("~/sc-online/myExpSetCreatorFn.R")
source("~/sc-online/utils.R")
source("~/sc-online/labelTransfer.R")

library(caret)
library("DropletUtils")
library(dplyr)
library(ggplot2)
library(grid)
library(gridExtra)
library(gtable)
library(lisi)
library(Matrix)
library(patchwork)
library(pheatmap)
library(RColorBrewer)
library(rhdf5)
library(rlang)

library(Seurat)
library(tidyr)
library(xml2)

library(viridis)
library(viridisLite)

In [None]:
# ---- Run switches ----
# READ_RAW_READS:  load every library's raw count matrix and derive per-barcode stats.
# WRITE_RAW_READS: additionally save those per-barcode stats next to the raw matrix.
READ_RAW_READS = TRUE
WRITE_RAW_READS = FALSE

# ---- Paths (everything except BASE_PATH is relative to a library's own directory) ----
BASE_PATH = "/mnt/accessory/seq_data"
VIREO_DONOR_IDS_BASENAME = 'vireo_outs/donor_list/donor_ids.tsv'
CB_SCE_BASENAME = "ingested_data/cb_data_sce_FPR_0.01.rds"
DATA_ING_DIR = "ingested_data"
RAW_COUNTS_MATRIX_BASENAME = "raw_feature_bc_matrix.rds"

# Read a text file of long library names (one per line), dropping blank lines
# and surrounding whitespace.
read_lib_names = function(path) {
    lib_names = trimws(readLines(path))
    lib_names[lib_names != ""]
}

# Drop everything up to and including the first underscore of each long library
# name. Names without an underscore are returned unchanged (the previous
# strsplit/paste version turned them into "NA_<name>"). Returns a list so that
# downstream code sees the same type as before.
strip_lib_prefix = function(lib_long) {
    as.list(sub("^[^_]*_", "", lib_long))
}

# Long names, named by their short form, for each cohort.
calico_libs_long = read_lib_names("~/calico-libs-long.txt")
calico_libs = strip_lib_prefix(calico_libs_long)
names(calico_libs_long) = calico_libs

gtex_libs_long = read_lib_names("~/gtex-libs-long.txt")
gtex_libs = strip_lib_prefix(gtex_libs_long)
names(gtex_libs_long) = gtex_libs

libs_long = c(calico_libs_long, gtex_libs_long)
libs = c(calico_libs, gtex_libs)

# Per-library metadata; `dapi_nurr` maps library name -> value of the `sort` column.
lib_info = read.table("~/sc-online/notebook_data/pd/pd_lib_info_20240301.tsv", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
dapi_nurr = setNames(lib_info$sort, lib_info$library)

# `dapi_nurr[[name]]` errors for a library that is absent from lib_info, so
# surface that here rather than in the middle of the plotting loop.
libs_missing_sort = setdiff(unlist(libs), names(dapi_nurr))
if (length(libs_missing_sort) > 0) {
    warning("Libraries missing from lib_info: ", paste(libs_missing_sort, collapse = ", "))
}



In [None]:
# Load, for every library, the Vireo donor table and the filtered SCE object.
# Both lists are keyed by short library name. A library is added to BOTH lists
# or to NEITHER, so that later loops over names(donor_ids_list) can always
# index sce_list with the same name.
donor_ids_list=list()
sce_list=list()

print('Reading donor_ids and sce files...')
for (lib in libs){
    # Pick the cohort directory this library lives under.
    if (lib %in% calico_libs){
        this_base_path = paste0(BASE_PATH, "/calico")
    } else if (lib %in% gtex_libs){
        this_base_path = paste0(BASE_PATH, "/gtex")
    } else {
        stop("Unknown library: ", lib)
    }

    donor_ids_path = file.path(this_base_path, lib, VIREO_DONOR_IDS_BASENAME)
    sce_path = file.path(this_base_path, lib, CB_SCE_BASENAME)

    tryCatch({
        # Read both inputs before storing either one: if the second read fails,
        # nothing is recorded for this library.
        donor_ids = read.table(donor_ids_path, header = TRUE, sep = "\t")
        sce = readRDS(sce_path)

        donor_ids_list[[lib]] = donor_ids
        sce_list[[lib]] = sce
    }, error = function(e) {
        # Best effort: report the failure and carry on with the next library.
        message("Failed to read ", lib, ": ", e$message)
    })
}

message("Loaded ", length(donor_ids_list), " of ", length(libs), " libraries")



In [None]:
# Per-barcode stats of the raw matrix. Written when WRITE_RAW_READS is TRUE and
# read back when READ_RAW_READS is FALSE.
RAW_COL_DATA_BASENAME = 'raw_feature_bc_matrix_col_data.rds'

# Always define both lists so later cells can index them regardless of the
# READ_RAW_READS switch (raw_data_list stays empty when raw reads are skipped).
raw_data_list = list()
raw_col_data_list = list()

if (READ_RAW_READS) {
    print("Reading raw data...")
} else {
    print("Loading cached raw column data...")
}

for (name in names(donor_ids_list)){
    print(name)
    if (name %in% calico_libs){
        this_base_path = paste0(BASE_PATH, "/calico")
    } else if (name %in% gtex_libs){
        this_base_path = paste0(BASE_PATH, "/gtex")
    } else {
        stop("Unknown library: ", name)
    }

    raw_col_data_path = file.path(this_base_path, name, DATA_ING_DIR, RAW_COL_DATA_BASENAME)

    if (READ_RAW_READS) {
        raw_file_path = file.path(this_base_path, name, DATA_ING_DIR, RAW_COUNTS_MATRIX_BASENAME)

        # presumably a genes x barcodes sparse count matrix -- TODO confirm
        raw_data_dcg = readRDS(raw_file_path)
        raw_data_list[[name]] = raw_data_dcg

        # Column totals are needed twice; compute them once.
        n_umi = colSums(raw_data_dcg)
        cd = data.frame(
            row.names=colnames(raw_data_dcg),
            nUMI=n_umi,
            log10_nUMI = log10(n_umi + 1),
            nGene=colSums(raw_data_dcg > 0),
            # TRUE for barcodes that are also columns of the filtered SCE
            is_in_filtered=colnames(raw_data_dcg) %in% colnames(sce_list[[name]])
        )
        raw_col_data_list[[name]] = cd

        if(WRITE_RAW_READS){
            saveRDS(cd, raw_col_data_path)
        }
    } else if (file.exists(raw_col_data_path)) {
        # Reuse the stats saved by an earlier run with WRITE_RAW_READS = TRUE.
        raw_col_data_list[[name]] = readRDS(raw_col_data_path)
    } else {
        warning("No cached raw column data for ", name, ": ", raw_col_data_path)
    }
}

# Per-cell metadata of the filtered SCE, with log10(x + 1) versions of the
# UMI, gene and read counts added.
cd_list = list()
for (name in names(donor_ids_list)){
    cd = as.data.frame(colData(sce_list[[name]]))
    cd$log10_nUMI = log10(cd$nUMI + 1)
    cd$log10_nGene = log10(cd$nGene + 1)
    cd$log10_nRead = log10(cd$nRead + 1)
    cd_list[[name]] = cd
}


In [None]:
source("~/sc-online/plot.R")

# For every loaded library, write six figures to `plot_base_dir`:
#
# 1. Summary table: sort, number of cells, assignable / singlet fractions, and
#    min / median / mean of reads, UMIs, genes, reads per UMI and intronic fraction
# 2. Knee plot built from the raw per-barcode stats
# 3. Histogram of Vireo doublet probabilities
# 4. Histogram of log10(nUMI) for singlets (color column: prob_donor)
# 5. Histogram of reads per UMI for singlets (color column: prob_donor)
# 6. Scatter of pct_intronic vs log10(nUMI) for singlets (color column: prob_donor)

plot_base_dir = '/mnt/accessory/analysis/plots/pd/20240304'
# Make sure the output directory exists before any figure is written.
dir.create(plot_base_dir, recursive = TRUE, showWarnings = FALSE)

# A cell counts as a singlet when its Vireo doublet probability is below this value.
singlet_max_prob_doublet = 1/3

for (name in names(donor_ids_list)){

    print(name)

    vireo_donors = donor_ids_list[[name]]
    sce = sce_list[[name]]
    raw_data = raw_data_list[[name]]
    raw_col_data = raw_col_data_list[[name]]

    df_plot_path = file.path(plot_base_dir, paste0(name, '_01_df_plot.png'))
    knee_plot_path = file.path(plot_base_dir, paste0(name, '_02_knee_plot.png'))
    doublet_prob_hist_path = file.path(plot_base_dir, paste0(name, '_03_doublet_prob_hist.png'))
    num_umi_hist_path = file.path(plot_base_dir, paste0(name, '_04_num_umi_hist.png'))
    reads_per_umi_hist_path = file.path(plot_base_dir, paste0(name, '_05_reads_per_umi_hist.png'))
    pct_intronic_vs_num_umi_path = file.path(plot_base_dir, paste0(name, '_06_pct_intronic_vs_num_umi.png'))

    raw_col_data$log10_nUMI = log10(raw_col_data$nUMI + 1)
    cd = as.data.frame(colData(sce))
    cd$log10_nUMI = log10(cd$nUMI + 1)
    cd$log10_nGene = log10(cd$nGene + 1)
    cd$log10_nRead = log10(cd$nRead + 1)
    cd$prob_max = cd$prob_donor # this is just for compatibility with other functions
    cd$reads_per_umi = cd$nRead / cd$nUMI

    cd_singlet = cd[cd$prob_doublet < singlet_max_prob_doublet, ]

    # Key the Vireo table by cell barcode and add convenience columns.
    rownames(vireo_donors) = vireo_donors$cell

    vireo_donors['is_singlet_vireo'] = as.numeric(vireo_donors$prob_doublet < singlet_max_prob_doublet)
    vireo_donors['prob_donor_vireo'] = vireo_donors$prob_max

    # NOTE: there are cases when Vireo's `donor_id` is 'unassigned' even when
    # `prob_max` >= 0.9, in cases with fewer than 10 SNPs.
    vireo_donors['id_donor_vireo'] = vireo_donors$donor_id
    vireo_donors['library'] = name

    # Computed once; shown in the titles of plots 4 and 5.
    frac_singlet_assignable = round(getFracAssignableVireo(cd_singlet), 2)

    # summary df to plot
    summary_df = data.frame(
        'Library' = name,
        'Sort' = dapi_nurr[[name]],
        'Num_Cells' = nrow(cd),
        'Frac_Assignable_Vireo' = round(getFracAssignableVireo(vireo_donors), 2),

        'Min_nReads' = min(cd$nRead),
        'Median_nReads' = median(cd$nRead),
        'Mean_nReads' = round(mean(cd$nRead), 2),

        'Min_nUMI' = min(cd$nUMI),
        'Median_nUMI' = median(cd$nUMI),
        'Mean_nUMI' = round(mean(cd$nUMI), 2),

        'Min_nGenes' = min(cd$nGene),
        'Median_nGenes' = median(cd$nGene),
        'Mean_nGenes' = round(mean(cd$nGene), 2),

        'Min_Reads_per_UMI' = round(min(cd$reads_per_umi), 2),
        'Median_Reads_per_UMI' = round(median(cd$reads_per_umi), 2),
        'Mean_Reads_per_UMI' = round(mean(cd$reads_per_umi), 2),

        'Min_frac_intronic' = round(min(cd$pct_intronic), 2),
        'Median_frac_intronic' = round(median(cd$pct_intronic), 2),
        'Mean_frac_intronic' = round(mean(cd$pct_intronic), 2),
        'Max_frac_intronic' = round(max(cd$pct_intronic), 2),

        'Frac_Singlet_Vireo' = round(nrow(cd_singlet) / nrow(cd), 2),
        'Frac_Singlet_Vireo_Assignable' = round(getFracAssignableVireo(vireo_donors[vireo_donors$prob_doublet < singlet_max_prob_doublet, ]), 2)
    )

    # 1. display DF (transposed so each statistic is a row)
    plotDfSummary(
        df=as.data.frame(t(summary_df)),
        fig_filename=df_plot_path
    )

    # 2. Knee
    plotKneeSingle(
        df=raw_col_data,
        name=name,
        title=paste0(
            'Knee Plot: ', name,
            '\nNum Cells: ', sum(raw_col_data$is_in_filtered),
            '\nFraction of Cells Assignable: ', round(getFracAssignableVireo(cd), 2)),
        ylim=c(0.5, 6),
        clim=c(.15, .8),

        fig_filename=knee_plot_path
    )

    # 3. Doublet Prob Hist
    plotHistColorSingle(
        df=vireo_donors,
        name=name,
        plot_col='prob_doublet',
        color_col='prob_max',
        title=paste0(
            'Vireo Doublet Probability: ', name,
            "\nFrac Predicted Doublet: ", round(sum(vireo_donors$prob_doublet >= singlet_max_prob_doublet) / nrow(vireo_donors), 2)),
        xlim=c(0, 1),
        ylim=c(0, 1),
        bin_width=0.1,
        fig_filename=doublet_prob_hist_path
    )

    # 4. Num UMI Hist
    plotHistColorSingle(
        df=cd_singlet,
        name=name,
        plot_col='log10_nUMI',
        color_col='prob_donor',
        bin_width=0.2,
        xlim=c(2.6, 5),
        ylim=c(0, 0.5),
        title=paste0(
            'Singlet Log10(nUMI) Histogram -- ', name,
            '\nFraction of Singlets Assignable: ', frac_singlet_assignable,
            # the mean is rounded so the title does not carry a long decimal tail
            '\nUMI -- Min: ', min(cd_singlet$nUMI), ' -- Median: ', median(cd_singlet$nUMI), ' -- Mean: ', round(mean(cd_singlet$nUMI), 2)),
        fig_filename=num_umi_hist_path
    )

    # 5. Reads / UMI Hist
    # floor/ceiling keep the smallest and largest values inside the axis
    # (round() could cut them off); the bin width is 1/20 of the axis range and
    # can never be zero.
    reads_per_umi_xlim = c(floor(min(cd_singlet$reads_per_umi)), ceiling(max(cd_singlet$reads_per_umi)))
    reads_per_umi_bin_width = max(diff(reads_per_umi_xlim), 1) / 20
    plotHistColorSingle(
        df=cd_singlet,
        name=name,
        plot_col='reads_per_umi',
        color_col='prob_donor',
        xlim=reads_per_umi_xlim,
        bin_width=reads_per_umi_bin_width,
        ylim=c(0, 0.5),
        title=paste0(
            'Singlet nReads/nUMI Histogram -- ', name,
            '\nFraction of Singlets Assignable: ', frac_singlet_assignable,
            '\nReads/UMI -- Min: ', round(min(cd_singlet$reads_per_umi), 1),
            ' -- Median: ', round(median(cd_singlet$reads_per_umi), 1),
            ' -- Mean: ', round(mean(cd_singlet$reads_per_umi), 1)),
        fig_filename=reads_per_umi_hist_path
    )

    # 6. Pct Intronic vs Num UMI Scatter
    plotScatterColorSingle(
        df=cd_singlet,
        name=name,
        x_col='log10_nUMI',
        y_col='pct_intronic',
        color_col='prob_donor',
        xlim=c(2.5, 6),
        ylim=c(0, 1),
        plot_height=8,
        plot_width=8,
        fig_filename=pct_intronic_vs_num_umi_path
    )

}



In [None]:
# Debug: show `name`, i.e. the library left over from the most recently run
# `for (name in ...)` loop (the last library, if the loop ran to completion).
name

In [None]:
# Debug: list the short GTEx library names derived in the config cell.
gtex_libs

In [None]:
# Debug: preview the first few entries of the combined (Calico + GTEx) short library names.
head(libs)

In [None]:
# Debug: preview the Vireo donor table loaded for one specific library
# (NULL if that library is not in donor_ids_list).
head(donor_ids_list[["pCalicoPDsHSrSNSN_VTAiPoold230719G1"]])

In [None]:
# Debug: show the library name left over from the most recently run loop.
name

In [None]:
# NOTE(review): `faulty` is first assigned in a later cell of this notebook, so
# on a fresh kernel run top-to-bottom this cell fails with "object 'faulty' not
# found". It only works after that later cell has been executed.
faulty

In [None]:
# Collect the libraries whose filtered-SCE metadata holds no non-missing
# `prob_max` value at all.
# NOTE(review): if colData() has no `prob_max` column, `cd$prob_max` is NULL and
# the library is flagged too (vacuously) -- confirm that this is intended.
faulty = c()
for (name in names(donor_ids_list)){
    cd = as.data.frame(colData(sce_list[[name]]))
    has_prob_max_value = any(!is.na(cd$prob_max))
    if (!has_prob_max_value){
        faulty = c(faulty, name)
    }
}

In [None]:
# Number of libraries flagged by the `faulty` scan.
length(faulty)

In [None]:
# Names of the libraries flagged by the `faulty` scan.
faulty

In [None]:
# Debug: for the library currently held in `name`, take the filtered-SCE
# metadata and the Vireo donor table and preview both.
test_cd = cd_list[[name]]
test_v = donor_ids_list[[name]]
head(test_cd)
head(test_v)

# Key the Vireo table by its `cell` column, then reorder its rows to follow the
# row order of test_cd. Rows of test_cd with no match become all-NA rows.
rownames(test_v) = test_v$cell
vireo_row_for_cd = match(rownames(test_cd), rownames(test_v))
test_v = test_v[vireo_row_for_cd, ]
head(test_v)

In [None]:
# Debug: print the full `prob_max` column of the aligned Vireo table
# (one value per row of test_v; wrap in head() to shorten the output).
test_v$prob_max

In [None]:
# Does Vireo's `prob_max` equal the SCE's `prob_donor` for at least one cell?
# `some()` is not a base R function; `any()` is the base equivalent. Rows that
# could not be compared (NA on either side) are ignored via na.rm.
any(test_v$prob_max == test_cd$prob_donor, na.rm = TRUE)

In [None]:
# Debug: list the columns of `cd`, the per-cell metadata frame left over from
# the most recently run loop.
colnames(cd)

In [None]:
# Debug: preview `vireo_donors`, the Vireo table (with the added convenience
# columns) left over from the last iteration of the plotting loop.
head(vireo_donors)

In [None]:
# Debug: print the `prob_singlet` column of `cd` (NULL if `cd` has no such column).
cd$prob_singlet