In [None]:
my_conda_initialize
conda activate r4_2

In [None]:


library(Seurat)
library(anndata)



##################################################
### query data:

# Input AnnData files, keyed by the sample name used for the resulting
# Seurat objects.
f_gex <- list(
merged = '/lustre1/project/stg_00090/scatac_benchmark/full_5_cellranger/rna_qc/merged__adata.h5'
)

# One preprocessed Seurat object per entry of f_gex (preallocated so the list
# is not grown inside the loop).
query.list <- vector("list", length(f_gex))
for (i in seq_along(f_gex)) {
    data <- read_h5ad(f_gex[[i]])
    # data$X is transposed before being passed as counts; presumably the
    # AnnData matrix is cells x genes while Seurat expects genes x cells.
    # NOTE(review): as.data.frame() densifies X -- confirm this is acceptable
    # memory-wise for large inputs.
    data <- CreateSeuratObject(counts = t(as.data.frame(data$X)), meta.data = data$obs)
    data <- NormalizeData(data)
    all.genes <- rownames(data)
    # Scale every gene, not only the variable features selected below.
    data <- ScaleData(data, features = all.genes)
    query.list[[i]] <- FindVariableFeatures(data, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
}


names(query.list) <- names(f_gex)

##################################################
# transfer

# Reference object whose CellType metadata column supplies the labels that
# are transferred onto each query object.
pbmc.integrated <- readRDS('../0_resources/seurat_references/pbmc_ssc_mat__integrated.rds')

# seq_along() keeps the loop a no-op when query.list is empty
# (1:length(x) would iterate over c(1, 0)).
for (i in seq_along(query.list)) {
    # Anchors are computed against the reference's existing "pca" reduction.
    transfer.anchors <- FindTransferAnchors(reference=pbmc.integrated, query=query.list[[i]], dims=1:30, reference.reduction="pca")
    predictions <- TransferData(anchorset=transfer.anchors, refdata=pbmc.integrated$CellType, dims=1:30)
    # The prediction table is stored as extra per-cell metadata; downstream
    # cells read predicted.id and the prediction.score.* columns from it.
    query.list[[i]] <- AddMetaData(query.list[[i]], metadata = predictions)
}

In [None]:
# Per-row gap between the top prediction score and the next-best one.
#
# x: data frame with a `prediction.score.max` column and one or more further
#    columns whose names contain "prediction.score" (one per candidate label).
#
# Returns a numeric vector with one entry per row of `x`: the second-smallest
# value of (prediction.score.max - per-label score). The smallest value is
# expected to be 0 (the winning label against itself), so the second one is
# the distance to the runner-up. Entries are NA when fewer than two per-label
# scores are available; a zero-row input yields numeric(0).
get_difference_to_next_prediction_score <- function(x) {
    # Match the column prefix literally ('.' is not a wildcard here) and
    # exclude the max column by name instead of assuming it is the last one.
    score_cols <- grep('prediction.score', colnames(x), fixed = TRUE, value = TRUE)
    score_cols <- setdiff(score_cols, 'prediction.score.max')

    # Extract the score columns once so rows can be indexed cheaply.
    scores <- as.matrix(x[, score_cols, drop = FALSE])
    top_score <- x$prediction.score.max

    vapply(
        seq_len(nrow(x)),
        function(i) sort(top_score[i] - as.numeric(scores[i, ]))[2],
        numeric(1)
    )
}

# Minimum top prediction score, and minimum gap to the runner-up label, for a
# cell to count as confidently annotated.
pred_thr <- 0.7
diff_thr <- 0.1
for (i in seq_along(query.list)) {
    md <- query.list[[i]]@meta.data
    md$diff_to_next_pred_score <- get_difference_to_next_prediction_score(md)
    query.list[[i]]@meta.data <- md

    # Both filters use >= so that the individual counts, the combined count
    # and the printed labels all describe the same comparison.
    pass_pred <- md$prediction.score.max >= pred_thr
    pass_diff <- md$diff_to_next_pred_score >= diff_thr
    pf <- pass_pred & pass_diff

    # Printed per sample: total cells | cells passing the score threshold |
    # cells passing the gap threshold | cells passing both, and their fraction.
    cat(names(query.list)[i], ": ",
        length(pass_pred), " | ",
        sum(pass_pred), ">= ", pred_thr, " | ",
        sum(pass_diff), ">= ", diff_thr, " | both:",
        sum(pf), " ", sum(pf) / length(pf), "\n")
}

CNA_10xmultiome_1 :  6664  |  2154 >  0.7  |  5351 >  0.1  | both: 2154   0.3232293
CNA_10xmultiome_2 :  6661  |  505 >  0.7  |  3180 >  0.1  | both: 505   0.07581444
SAN_10xmultiome_1 :  3587  |  341 >  0.7  |  2330 >  0.1  | both: 341   0.09506551
SAN_10xmultiome_2 :  3587  |  341 >  0.7  |  2330 >  0.1  | both: 341   0.09506551
VIB_10xmultiome_1 :  1346  |  124 >  0.7  |  1179 >  0.1  | both: 124   0.09212481
VIB_10xmultiome_1 :  1346  |  124 >  0.7  |  1179 >  0.1  | both: 124   0.09212481

In [None]:
# Build one annotation table per query object: barcode, sample name,
# predicted cell type, its score, and a thresholded "high confidence" label.
cell.annot <- vector("list", length(query.list))
for (i in seq_along(query.list)) {
    md <- query.list[[i]]@meta.data

    tmp <- data.frame(
          composite_sample_id = paste0(rownames(md),'___',names(query.list)[i]),
          barcode = rownames(md),
          sample_id = names(query.list)[i],
          # as.character() guards against a factor column, for which the
          # 'Unknown' assignment below would produce NA.
          cell_type = as.character(md$predicted.id),
          cell_type_pred_score = md$prediction.score.max
          )
    # Cells whose top score is below pred_thr are relabelled 'Unknown'.
    tmp$cell_type_hiconf_70 <- tmp$cell_type
    tmp$cell_type_hiconf_70[tmp$cell_type_pred_score<pred_thr] <- 'Unknown'

    cell.annot[[i]] <- tmp
}

# Stack the per-sample tables and write them as one tab-separated file.
# do.call() is required here: rbind(cell.annot) would bind the list itself
# rather than the data frames it contains.
write.table(
            do.call(rbind, cell.annot),
            file='rna_qc/merged__seurat_annotations.txt',
            sep='\t', row.names=FALSE, quote=FALSE
            )

# 10x filtered cells

In [None]:


library(Seurat)
library(anndata)



##################################################
### query data:

# Input AnnData files, keyed by the sample name used for the resulting
# Seurat objects.
f_gex <- list(
merged = '/lustre1/project/stg_00090/scatac_benchmark/full_5_cellranger/rna_qc/10xmerged__adata.raw.h5'
)

# One preprocessed Seurat object per entry of f_gex (preallocated so the list
# is not grown inside the loop).
query.list <- vector("list", length(f_gex))
for (i in seq_along(f_gex)) {
    data <- read_h5ad(f_gex[[i]])
    # data$X is transposed before being passed as counts; presumably the
    # AnnData matrix is cells x genes while Seurat expects genes x cells.
    # NOTE(review): as.data.frame() densifies X -- confirm this is acceptable
    # memory-wise for large inputs.
    data <- CreateSeuratObject(counts = t(as.data.frame(data$X)), meta.data = data$obs)
    data <- NormalizeData(data)
    all.genes <- rownames(data)
    # Scale every gene, not only the variable features selected below.
    data <- ScaleData(data, features = all.genes)
    query.list[[i]] <- FindVariableFeatures(data, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
}


names(query.list) <- names(f_gex)

##################################################
# transfer

# Reference object whose CellType metadata column supplies the labels that
# are transferred onto each query object.
pbmc.integrated <- readRDS('../0_resources/seurat_references/pbmc_ssc_mat__integrated.rds')

# seq_along() keeps the loop a no-op when query.list is empty
# (1:length(x) would iterate over c(1, 0)).
for (i in seq_along(query.list)) {
    # Anchors are computed against the reference's existing "pca" reduction.
    transfer.anchors <- FindTransferAnchors(reference=pbmc.integrated, query=query.list[[i]], dims=1:30, reference.reduction="pca")
    predictions <- TransferData(anchorset=transfer.anchors, refdata=pbmc.integrated$CellType, dims=1:30)
    # The prediction table is stored as extra per-cell metadata; downstream
    # cells read predicted.id and the prediction.score.* columns from it.
    query.list[[i]] <- AddMetaData(query.list[[i]], metadata = predictions)
}

In [None]:
# Per-row gap between the top prediction score and the next-best one.
#
# x: data frame with a `prediction.score.max` column and one or more further
#    columns whose names contain "prediction.score" (one per candidate label).
#
# Returns a numeric vector with one entry per row of `x`: the second-smallest
# value of (prediction.score.max - per-label score). The smallest value is
# expected to be 0 (the winning label against itself), so the second one is
# the distance to the runner-up. Entries are NA when fewer than two per-label
# scores are available; a zero-row input yields numeric(0).
get_difference_to_next_prediction_score <- function(x) {
    # Match the column prefix literally ('.' is not a wildcard here) and
    # exclude the max column by name instead of assuming it is the last one.
    score_cols <- grep('prediction.score', colnames(x), fixed = TRUE, value = TRUE)
    score_cols <- setdiff(score_cols, 'prediction.score.max')

    # Extract the score columns once so rows can be indexed cheaply.
    scores <- as.matrix(x[, score_cols, drop = FALSE])
    top_score <- x$prediction.score.max

    vapply(
        seq_len(nrow(x)),
        function(i) sort(top_score[i] - as.numeric(scores[i, ]))[2],
        numeric(1)
    )
}

# Minimum top prediction score, and minimum gap to the runner-up label, for a
# cell to count as confidently annotated.
pred_thr <- 0.7
diff_thr <- 0.1
for (i in seq_along(query.list)) {
    md <- query.list[[i]]@meta.data
    md$diff_to_next_pred_score <- get_difference_to_next_prediction_score(md)
    query.list[[i]]@meta.data <- md

    # Both filters use >= so that the individual counts, the combined count
    # and the printed labels all describe the same comparison.
    pass_pred <- md$prediction.score.max >= pred_thr
    pass_diff <- md$diff_to_next_pred_score >= diff_thr
    pf <- pass_pred & pass_diff

    # Printed per sample: total cells | cells passing the score threshold |
    # cells passing the gap threshold | cells passing both, and their fraction.
    cat(names(query.list)[i], ": ",
        length(pass_pred), " | ",
        sum(pass_pred), ">= ", pred_thr, " | ",
        sum(pass_diff), ">= ", diff_thr, " | both:",
        sum(pf), " ", sum(pf) / length(pf), "\n")
}

CNA_10xmultiome_1 :  6664  |  2154 >  0.7  |  5351 >  0.1  | both: 2154   0.3232293
CNA_10xmultiome_2 :  6661  |  505 >  0.7  |  3180 >  0.1  | both: 505   0.07581444
SAN_10xmultiome_1 :  3587  |  341 >  0.7  |  2330 >  0.1  | both: 341   0.09506551
SAN_10xmultiome_2 :  3587  |  341 >  0.7  |  2330 >  0.1  | both: 341   0.09506551
VIB_10xmultiome_1 :  1346  |  124 >  0.7  |  1179 >  0.1  | both: 124   0.09212481
VIB_10xmultiome_1 :  1346  |  124 >  0.7  |  1179 >  0.1  | both: 124   0.09212481

In [None]:
# Build one annotation table per query object: barcode, sample name,
# predicted cell type, its score, and a thresholded "high confidence" label.
cell.annot <- vector("list", length(query.list))
for (i in seq_along(query.list)) {
    md <- query.list[[i]]@meta.data

    tmp <- data.frame(
          composite_sample_id = paste0(rownames(md),'___',names(query.list)[i]),
          barcode = rownames(md),
          sample_id = names(query.list)[i],
          # as.character() guards against a factor column, for which the
          # 'Unknown' assignment below would produce NA.
          cell_type = as.character(md$predicted.id),
          cell_type_pred_score = md$prediction.score.max
          )
    # Cells whose top score is below pred_thr are relabelled 'Unknown'.
    tmp$cell_type_hiconf_70 <- tmp$cell_type
    tmp$cell_type_hiconf_70[tmp$cell_type_pred_score<pred_thr] <- 'Unknown'

    cell.annot[[i]] <- tmp
}

# Stack every per-sample table and write them as one tab-separated file.
# Writing only cell.annot[[1]] would silently drop all but the first sample
# as soon as more than one input is configured.
write.table(
            do.call(rbind, cell.annot),
            file='rna_qc/10xmerged__seurat_annotations.txt',
            sep='\t', row.names=FALSE, quote=FALSE
            )

# filtered and merged cells

In [None]:


library(Seurat)
library(anndata)



##################################################
### query data:

# Input AnnData files, keyed by the sample name used for the resulting
# Seurat objects.
f_gex <- list(
merged = '/lustre1/project/stg_00090/scatac_benchmark/full_5_cellranger/rna_qc/filteredandmerged__adata.raw.h5'
)

# One preprocessed Seurat object per entry of f_gex (preallocated so the list
# is not grown inside the loop).
query.list <- vector("list", length(f_gex))
for (i in seq_along(f_gex)) {
    data <- read_h5ad(f_gex[[i]])
    # data$X is transposed before being passed as counts; presumably the
    # AnnData matrix is cells x genes while Seurat expects genes x cells.
    # NOTE(review): as.data.frame() densifies X -- confirm this is acceptable
    # memory-wise for large inputs.
    data <- CreateSeuratObject(counts = t(as.data.frame(data$X)), meta.data = data$obs)
    data <- NormalizeData(data)
    all.genes <- rownames(data)
    # Scale every gene, not only the variable features selected below.
    data <- ScaleData(data, features = all.genes)
    query.list[[i]] <- FindVariableFeatures(data, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
}


names(query.list) <- names(f_gex)

##################################################
# transfer

# Reference object whose CellType metadata column supplies the labels that
# are transferred onto each query object.
pbmc.integrated <- readRDS('../0_resources/seurat_references/pbmc_ssc_mat__integrated.rds')

# seq_along() keeps the loop a no-op when query.list is empty
# (1:length(x) would iterate over c(1, 0)).
for (i in seq_along(query.list)) {
    # Anchors are computed against the reference's existing "pca" reduction.
    transfer.anchors <- FindTransferAnchors(reference=pbmc.integrated, query=query.list[[i]], dims=1:30, reference.reduction="pca")
    predictions <- TransferData(anchorset=transfer.anchors, refdata=pbmc.integrated$CellType, dims=1:30)
    # The prediction table is stored as extra per-cell metadata; downstream
    # cells read predicted.id and the prediction.score.* columns from it.
    query.list[[i]] <- AddMetaData(query.list[[i]], metadata = predictions)
}

In [None]:
# Per-row gap between the top prediction score and the next-best one.
#
# x: data frame with a `prediction.score.max` column and one or more further
#    columns whose names contain "prediction.score" (one per candidate label).
#
# Returns a numeric vector with one entry per row of `x`: the second-smallest
# value of (prediction.score.max - per-label score). The smallest value is
# expected to be 0 (the winning label against itself), so the second one is
# the distance to the runner-up. Entries are NA when fewer than two per-label
# scores are available; a zero-row input yields numeric(0).
get_difference_to_next_prediction_score <- function(x) {
    # Match the column prefix literally ('.' is not a wildcard here) and
    # exclude the max column by name instead of assuming it is the last one.
    score_cols <- grep('prediction.score', colnames(x), fixed = TRUE, value = TRUE)
    score_cols <- setdiff(score_cols, 'prediction.score.max')

    # Extract the score columns once so rows can be indexed cheaply.
    scores <- as.matrix(x[, score_cols, drop = FALSE])
    top_score <- x$prediction.score.max

    vapply(
        seq_len(nrow(x)),
        function(i) sort(top_score[i] - as.numeric(scores[i, ]))[2],
        numeric(1)
    )
}

# Minimum top prediction score, and minimum gap to the runner-up label, for a
# cell to count as confidently annotated.
pred_thr <- 0.7
diff_thr <- 0.1
for (i in seq_along(query.list)) {
    md <- query.list[[i]]@meta.data
    md$diff_to_next_pred_score <- get_difference_to_next_prediction_score(md)
    query.list[[i]]@meta.data <- md

    # Both filters use >= so that the individual counts, the combined count
    # and the printed labels all describe the same comparison.
    pass_pred <- md$prediction.score.max >= pred_thr
    pass_diff <- md$diff_to_next_pred_score >= diff_thr
    pf <- pass_pred & pass_diff

    # Printed per sample: total cells | cells passing the score threshold |
    # cells passing the gap threshold | cells passing both, and their fraction.
    cat(names(query.list)[i], ": ",
        length(pass_pred), " | ",
        sum(pass_pred), ">= ", pred_thr, " | ",
        sum(pass_diff), ">= ", diff_thr, " | both:",
        sum(pf), " ", sum(pf) / length(pf), "\n")
}

CNA_10xmultiome_1 :  6664  |  2154 >  0.7  |  5351 >  0.1  | both: 2154   0.3232293
CNA_10xmultiome_2 :  6661  |  505 >  0.7  |  3180 >  0.1  | both: 505   0.07581444
SAN_10xmultiome_1 :  3587  |  341 >  0.7  |  2330 >  0.1  | both: 341   0.09506551
SAN_10xmultiome_2 :  3587  |  341 >  0.7  |  2330 >  0.1  | both: 341   0.09506551
VIB_10xmultiome_1 :  1346  |  124 >  0.7  |  1179 >  0.1  | both: 124   0.09212481
VIB_10xmultiome_1 :  1346  |  124 >  0.7  |  1179 >  0.1  | both: 124   0.09212481

In [None]:
# Build one annotation table per query object: barcode, sample name,
# predicted cell type, its score, and a thresholded "high confidence" label.
cell.annot <- vector("list", length(query.list))
for (i in seq_along(query.list)) {
    md <- query.list[[i]]@meta.data

    tmp <- data.frame(
          composite_sample_id = paste0(rownames(md),'___',names(query.list)[i]),
          barcode = rownames(md),
          sample_id = names(query.list)[i],
          # as.character() guards against a factor column, for which the
          # 'Unknown' assignment below would produce NA.
          cell_type = as.character(md$predicted.id),
          cell_type_pred_score = md$prediction.score.max
          )
    # Cells whose top score is below pred_thr are relabelled 'Unknown'.
    tmp$cell_type_hiconf_70 <- tmp$cell_type
    tmp$cell_type_hiconf_70[tmp$cell_type_pred_score<pred_thr] <- 'Unknown'

    cell.annot[[i]] <- tmp
}

# Stack every per-sample table and write them as one tab-separated file.
# Writing only cell.annot[[1]] would silently drop all but the first sample
# as soon as more than one input is configured.
write.table(
            do.call(rbind, cell.annot),
            file='rna_qc/filteredandmerged__seurat_annotations.txt',
            sep='\t', row.names=FALSE, quote=FALSE
            )