In [2]:
library(ggplot2) ; library(glmnet) ; library(Biobase)
library(reshape2) ; library(ggpubr); library(SingleCellExperiment)
library(ComplexHeatmap) ; library(circlize) ; library(RColorBrewer)
library(cocor) ; library(ggpmisc) ; library(Seurat)

# load CLIMB and MUSIC
#library(MuSiC) ; library(climb) ; library(BayesPrism)

### Functions 

In [3]:
# METRICS FUNCTIONS
# Coerce `x` to numeric by going through its character representation
# (so a factor is converted by its labels rather than its integer codes).
num <- function(x) {
    as.numeric(as.character(x))
}
# Turn a matrix-like object into a plain vector: it is first converted with
# as.matrix() and the result is then unrolled with as.vector().
flatten <- function(x) {
    as.vector(as.matrix(x))
}
# Sanitise a character vector of labels (e.g. CD34+, CD34- populations).
# The substitutions are applied in order:
#   1. a trailing '-' becomes 'minus'
#   2. every '+' becomes 'plus'
#   3. every space becomes '.'
#   4. every character that is neither alphanumeric nor a space is removed
#      (this also removes the dots inserted in step 3)
# Returns the cleaned vector.
reformat_strings <- function(vector_string) {
    substitutions <- list(
        c('\\-$', 'minus'),
        c('\\+', 'plus'),
        c('\\ ', '\\.'),
        c('[^[:alnum:] ]', '')
    )
    for (rule in substitutions) {
        vector_string <- gsub(rule[1], rule[2], vector_string)
    }
    vector_string
}
# Clean cell-type labels with reformat_strings() and return them as a factor.
reformat_celltypes <- function(celltype_labels) {
    cleaned_labels <- reformat_strings(as.vector(celltype_labels))
    factor(cleaned_labels)
}

### Load raw data from Khaliq (CRC sc dataset) - keep only cells from tumor samples

In [3]:
# Read the comma-separated raw UMI count table of the Khaliq CRC dataset.
khaliq.mat <- read.csv(
    'data/pseudobulks_climb/raw/crc_khaliq/GSE200997_GEO_processed_CRC_10X_raw_UMI_count_matrix.csv',
    sep = ','
)

In [6]:
# Use the values of the first column as row names.
rownames(khaliq.mat) <- khaliq.mat[, 1]

In [7]:
# Use the values of the first row as column names.
colnames(khaliq.mat) <- khaliq.mat[1, ]

In [8]:
# Drop the first column, then the first row, of the count table.
khaliq.mat <- khaliq.mat[, -1]
khaliq.mat <- khaliq.mat[-1, ]

In [9]:
head(khaliq.mat)

Unnamed: 0_level_0,B_cac10_AAACCTGAGTCAATAG,B_cac10_AAACCTGCACAGCCCA,B_cac10_AAACCTGCACTTCGAA,B_cac10_AAACCTGGTAATTGGA,B_cac10_AAACCTGGTACGAAAT,B_cac10_AAACCTGGTGAAAGAG,B_cac10_AAACCTGTCACGATGT,B_cac10_AAACCTGTCTGCGACG,B_cac10_AAACGGGAGTTGTCGT,B_cac10_AAACGGGCAGCGTTCG,⋯,T_cac9_TTTCCTCTCCGCATCT,T_cac9_TTTCCTCTCTATGTGG,T_cac9_TTTGCGCAGGCTAGCA,T_cac9_TTTGCGCTCGCTAGCG,T_cac9_TTTGGTTCAAATACAG,T_cac9_TTTGGTTCAACACGCC,T_cac9_TTTGGTTCATAGACTC,T_cac9_TTTGGTTCATCTATGG,T_cac9_TTTGGTTTCAATCTCT,T_cac9_TTTGTCAGTCCAACTA
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
AL627309.1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AL669831.5,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
FAM87B,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
LINC00115,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
FAM41C,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AL645608.3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [14]:
# Read the per-cell annotation table of the Khaliq CRC dataset.
khaliq.meta <- read.csv(
    'data/pseudobulks_climb/raw/crc_khaliq/GSE200997_GEO_processed_CRC_10X_cell_annotation.csv'
)

In [20]:
all(khaliq.meta$X == colnames(khaliq.mat))

In [23]:
# Logical mask of the cells whose Condition is 'Tumor'.
sel_cancer <- khaliq.meta$Condition == 'Tumor'

In [23]:
# Keep only the columns (cells) selected by sel_cancer.
khaliq.mat <- khaliq.mat[, sel_cancer]

In [24]:
# Keep the matching annotation rows.
khaliq.meta <- khaliq.meta[sel_cancer, ]

In [32]:
# Wrap the count table (converted to a matrix) in an ExpressionSet.
khaliq.sc.es <- ExpressionSet(as.matrix(khaliq.mat))

In [35]:
# Attach the `samples` annotation column as SubjectName.
khaliq.sc.es$SubjectName <- khaliq.meta$samples

In [26]:
# Store the `prediction` annotation column under both cell-type fields.
khaliq.sc.es$cellType_original <- khaliq.meta$prediction
khaliq.sc.es$cellType <- khaliq.meta$prediction

In [1]:
# Reload the ExpressionSet from its RDS file.
khaliq.sc.es <- readRDS(file = 'data/pseudobulks_climb/raw/khaliq_sc_es.RDS')

In [5]:
# Convert the expression matrix to numeric (double) storage.
# The coercion is done in place with storage.mode<- instead of rebuilding the
# matrix with matrix(as.numeric(...)): the rebuilt matrix had no dimnames, so
# the gene (row) and cell (column) names were lost before the values were
# written back into the ExpressionSet. Values that cannot be parsed as numbers
# still become NA with a warning, as with as.numeric().
mat_2_num <- exprs(khaliq.sc.es)
storage.mode(mat_2_num) <- 'double'

In [6]:
# Write the numeric matrix back as the expression data of the ExpressionSet.
exprs(khaliq.sc.es) <- mat_2_num

In [8]:
# Check that we don't have cells with 0 counts
sum(colSums(exprs(khaliq.sc.es)) <= 0)

In [12]:
# Check if we have no genes with 0 counts
sum(rowSums(exprs(khaliq.sc.es)) <= 0)

In [10]:
# Keep only the genes whose total count over all cells is above zero.
khaliq.sc.es <- khaliq.sc.es[rowSums(exprs(khaliq.sc.es)) > 0, ]

In [11]:
# Persist the ExpressionSet.
saveRDS(khaliq.sc.es, file = 'data/pseudobulks_climb/raw/khaliq_sc_es.RDS')

## Load GBM datasets

In [6]:
# Load the non-malignant cells object and preview it.
non_malignant_cells <- readRDS(
    file = 'data/pseudobulks_climb/raw/gbm_10x_smartseq2/non_malignant_cells.rds'
)
head(non_malignant_cells)

In [7]:
# Load the malignant cells object and preview it.
malignant_cells <- readRDS(
    file = 'data/pseudobulks_climb/raw/gbm_10x_smartseq2/malignant_cells.rds'
)
head(malignant_cells)

In [8]:
head(read.csv('data/pseudobulks_climb/raw/gbm_10x_smartseq2/GSE131928_RAW/GSM3828673_10X_GBM_IDHwt_processed_TPM.tsv', sep='\t'))

In [13]:
# NOTE(review): `cells_ids` is not defined in any cell shown before this one - confirm where it comes from.
# Number of distinct prefixes left once everything from the first '.' onwards is removed from each id.
length(unique(gsub('\\..*','',cells_ids)))

In [14]:
# NOTE(review): `cells_ids` is not defined in any cell shown before this one - confirm where it comes from.
# Distinct prefixes left once everything from the first '.' onwards is removed from each id.
unique(gsub('\\..*','',cells_ids))

## Load Melanoma datasets

In [2]:
# Read the melanoma count table, using column 'X' as row names.
rawcounts.mel <- read.csv(
    'data/pseudobulks_climb/raw/melanoma_jerby_tirosh/GSE115978_counts.csv',
    row.names = 'X'
)

In [3]:
# Read the melanoma cell annotations and preview them.
metadata.mel <- read.csv(
    'data/pseudobulks_climb/raw/melanoma_jerby_tirosh/GSE115978_cell.annotations.csv'
)
head(metadata.mel)

Unnamed: 0_level_0,cells,samples,cell.types,treatment.group,Cohort,no.of.genes,no.of.reads
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>
1,cy78_CD45_neg_1_B04_S496_comb,Mel78,Mal,post.treatment,Tirosh,8258,357919
2,cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb,Mel79,Mal,treatment.naive,Tirosh,2047,5727
3,CY88_5_B10_S694_comb,Mel88,Mal,post.treatment,Tirosh,5375,139218
4,cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_F07_S67_comb,Mel79,Mal,treatment.naive,Tirosh,5648,73996
5,cy78_CD45_neg_3_H06_S762_comb,Mel78,Mal,post.treatment,Tirosh,7409,380341
6,cy79_p1_CD45_neg_PDL1_pos_AS_C1_R1_G01_S73_comb,Mel79,Mal,treatment.naive,Tirosh,6988,92485


In [4]:
all(colnames(rawcounts.mel) == metadata.mel$cells)

In [5]:
unique(metadata.mel$Cohort)

In [6]:
unique(metadata.mel$cell.types)

In [10]:
# Discard the cells annotated with cell type '?', in both the counts and the metadata.
sel_cells <- metadata.mel$cell.types != '?'
rawcounts.mel <- rawcounts.mel[, sel_cells]
metadata.mel <- metadata.mel[sel_cells, ]

In [11]:
# Counts of the cells belonging to the 'Tirosh' cohort, followed by their dimensions.
rawcounts.tirosh <- rawcounts.mel[, metadata.mel$Cohort == 'Tirosh']
dim(rawcounts.tirosh)

In [12]:
# Annotation rows of the 'Tirosh' cohort.
metadata.tirosh <- metadata.mel[metadata.mel$Cohort == 'Tirosh', ]

In [15]:
all(colnames(rawcounts.tirosh) == metadata.tirosh$cells)

In [25]:
# Drop the genes whose counts sum to zero across the Tirosh cells.
rawcounts.tirosh <- rawcounts.tirosh[rowSums(rawcounts.tirosh) != 0, ]

In [35]:
# Build the Tirosh ExpressionSet and attach the cell-type and sample annotations, then print it.
tirosh.es <- ExpressionSet(as.matrix(rawcounts.tirosh))
tirosh.es$cellType_original <- metadata.tirosh$cell.types
tirosh.es$SubjectName <- metadata.tirosh$samples
tirosh.es

ExpressionSet (storageMode: lockedEnvironment)
assayData: 22631 features, 3998 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: cy78_CD45_neg_1_B04_S496_comb
    cy79_p4_CD45_neg_PDL1_neg_E11_S1115_comb ...
    CY75_1_CD45_CD8_8__S351_comb_BCD8 (3998 total)
  varLabels: cellType_original SubjectName
  varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
Annotation:  

In [44]:
# Strip every underscore from the cell names, because Seurat does not like them.
colnames(tirosh.es) <- gsub('_', '', colnames(tirosh.es))

In [50]:
# Remove the dots from the cell-type labels and list the resulting labels.
tirosh.es$cellType_original <- gsub('\\.', '', tirosh.es$cellType_original)
unique(tirosh.es$cellType_original)

In [73]:
# Gene names with '_' and '-' stripped. Every gene whose stripped name occurs
# more than once is discarded (all copies, not only the repeats); the number
# of discarded genes is reported before the subsetting.
reformat_gene_names <- gsub("-", "", gsub("_", "", rownames(tirosh.es)))
sel_no_dup_genes <- !(reformat_gene_names %in% reformat_gene_names[duplicated(reformat_gene_names)])
sum(!sel_no_dup_genes)
tirosh.es <- tirosh.es[sel_no_dup_genes, ]

In [74]:
# Persist the Tirosh ExpressionSet.
saveRDS(tirosh.es, file = 'data/pseudobulks_climb/raw/MEL_tirosh_allGenes_sc_es.RDS')

In [13]:
# Counts of the cells belonging to the 'New' cohort, followed by their dimensions.
rawcounts.jerby <- rawcounts.mel[, metadata.mel$Cohort == 'New']
dim(rawcounts.jerby)

In [31]:
# Drop the genes whose counts sum to zero across the Jerby cells.
rawcounts.jerby <- rawcounts.jerby[rowSums(rawcounts.jerby) != 0, ]

In [32]:
# Annotation rows of the 'New' cohort.
metadata.jerby <- metadata.mel[metadata.mel$Cohort == 'New', ]

In [33]:
all(colnames(rawcounts.jerby) == metadata.jerby$cells)

In [37]:
# Build the Jerby ExpressionSet and attach the cell-type and sample annotations, then print it.
jerby.es <- ExpressionSet(as.matrix(rawcounts.jerby))
jerby.es$cellType_original <- metadata.jerby$cell.types
jerby.es$SubjectName <- metadata.jerby$samples
jerby.es

ExpressionSet (storageMode: lockedEnvironment)
assayData: 22637 features, 2881 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: merck_cd45pos_PL3_S292 merck_cd45pos_PL3_S337 ...
    cy121.1_CD45pos_S328 (2881 total)
  varLabels: cellType_original SubjectName
  varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
Annotation:  

In [46]:
# Strip every underscore from the cell names, because Seurat does not like them.
colnames(jerby.es) <- gsub('_', '', colnames(jerby.es))

In [52]:
# Remove the dots from the cell-type labels and list the resulting labels.
jerby.es$cellType_original <- gsub('\\.', '', jerby.es$cellType_original)
unique(jerby.es$cellType_original)

In [71]:
# Gene names with '_' and '-' stripped. Every gene whose stripped name occurs
# more than once is discarded (all copies, not only the repeats).
reformat_gene_names <- gsub("-", "", gsub("_", "", rownames(jerby.es)))
sel_no_dup_genes <- !(reformat_gene_names %in% reformat_gene_names[duplicated(reformat_gene_names)])
jerby.es <- jerby.es[sel_no_dup_genes, ]

ExpressionSet (storageMode: lockedEnvironment)
assayData: 22633 features, 2881 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: merckcd45posPL3S292 merckcd45posPL3S337 ...
    cy121.1CD45posS328 (2881 total)
  varLabels: cellType_original SubjectName
  varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
Annotation:  

In [72]:
# Persist the Jerby ExpressionSet.
saveRDS(jerby.es, file = 'data/pseudobulks_climb/raw/MEL_jerby_allGenes_sc_es.RDS')

## Breast cancer datasets 

In [1]:
library(data.table)

In [20]:
# Read the Gray breast raw count table with data.table::fread().
gray_raw_counts <- fread('data/pseudobulks_climb/raw/breast_gray/rawData.csv')

In [21]:
head(gray_raw_counts)

V1,Human.BRCA1.B_AAACCTGAGAATCTCC,Human.BRCA1.B_AAACCTGAGACTAGGC,Human.BRCA1.B_AAACCTGAGCCAGTTT,Human.BRCA1.B_AAACCTGAGCGACGTA,Human.BRCA1.B_AAACCTGAGCGATTCT,Human.BRCA1.B_AAACCTGAGGGCACTA,Human.BRCA1.B_AAACCTGAGGTTACCT,Human.BRCA1.B_AAACCTGAGTACATGA,Human.BRCA1.B_AAACCTGAGTAGGCCA,⋯,Human.WT.D_TTTGGTTGTTAAGATG,Human.WT.D_TTTGGTTGTTGTGGAG,Human.WT.D_TTTGGTTTCGCAAACT,Human.WT.D_TTTGTCAAGAAGAAGC,Human.WT.D_TTTGTCAAGCTCCTCT,Human.WT.D_TTTGTCACAGTCACTA,Human.WT.D_TTTGTCAGTCTACCTC,Human.WT.D_TTTGTCAGTTACTGAC,Human.WT.D_TTTGTCATCACAGTAC,Human.WT.D_TTTGTCATCCAGATCA
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
AL627309.1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AL669831.5,0,0,0,0,0,0,1,1,0,⋯,0,0,0,0,0,0,0,0,0,0
FAM87B,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
LINC00115,0,0,0,0,0,0,0,0,0,⋯,1,0,0,0,1,0,0,0,0,0
FAM41C,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AL645608.3,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [22]:
# Gene names are held in column V1 of the count table; preview them.
gene_names <- as.character(gray_raw_counts$V1)
head(gene_names)

In [23]:
# Drop the first column (the gene names saved in gene_names) from the count table.
gray_raw_counts <- gray_raw_counts[, -1]

In [24]:
# Read the tab-separated cell metadata and drop its first row.
# NOTE(review): the first row is presumably a non-data descriptor line - confirm against the file.
metadata <- read.csv('data/pseudobulks_climb/raw/breast_gray/metadataInfo.txt', sep = '\t')
metadata <- metadata[-1, ]

In [25]:
head(metadata)

Unnamed: 0_level_0,NAME,biosample_id,cell_subtypes,donor_id,atlas_id,age,genotype,parity,menopause_stage,breast_cancer_history,⋯,race_ontology_label,disease,disease__ontology_label,library_preparation_protocol,library_preparation_protocol__ontology_label,sex,species,species__ontology_label,organ,organ__ontology_label
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
2,Human-BRCA1-B_AAACCTGAGAATCTCC,5. Vascular and lymphatic,14. VL2 Vascular endothelial,Human-BRCA1-B,PM-D,53,BRCA1,G2P1,Surgical menopause,Yes,⋯,White,PATO_0000461,normal,EFO:0009900,10x 5' v2,female,NCBITaxon_9606,Homo sapiens,UBERON:0000310,breast
3,Human-BRCA1-B_AAACCTGAGACTAGGC,5. Vascular and lymphatic,14. VL2 Vascular endothelial,Human-BRCA1-B,PM-D,53,BRCA1,G2P1,Surgical menopause,Yes,⋯,White,PATO_0000461,normal,EFO:0009900,10x 5' v2,female,NCBITaxon_9606,Homo sapiens,UBERON:0000310,breast
4,Human-BRCA1-B_AAACCTGAGCCAGTTT,2. HS,3. HSa,Human-BRCA1-B,PM-D,53,BRCA1,G2P1,Surgical menopause,Yes,⋯,White,PATO_0000461,normal,EFO:0009900,10x 5' v2,female,NCBITaxon_9606,Homo sapiens,UBERON:0000310,breast
5,Human-BRCA1-B_AAACCTGAGCGACGTA,3. BA,7. BAb,Human-BRCA1-B,PM-D,53,BRCA1,G2P1,Surgical menopause,Yes,⋯,White,PATO_0000461,normal,EFO:0009900,10x 5' v2,female,NCBITaxon_9606,Homo sapiens,UBERON:0000310,breast
6,Human-BRCA1-B_AAACCTGAGCGATTCT,5. Vascular and lymphatic,15. VL3 Pericyte,Human-BRCA1-B,PM-D,53,BRCA1,G2P1,Surgical menopause,Yes,⋯,White,PATO_0000461,normal,EFO:0009900,10x 5' v2,female,NCBITaxon_9606,Homo sapiens,UBERON:0000310,breast
7,Human-BRCA1-B_AAACCTGAGGGCACTA,1. AV,1. BL,Human-BRCA1-B,PM-D,53,BRCA1,G2P1,Surgical menopause,Yes,⋯,White,PATO_0000461,normal,EFO:0009900,10x 5' v2,female,NCBITaxon_9606,Homo sapiens,UBERON:0000310,breast


In [26]:
head(colnames(gray_raw_counts))

In [27]:
# Replace every '-' in the cell names with '.'.
metadata$NAME <- gsub('-', '\\.', metadata$NAME)

In [28]:
all(metadata$NAME == colnames(gray_raw_counts))

In [29]:
unique(gsub('.*\\. ', '', metadata$cell_subtypes))

In [30]:
# Strip the leading numbering (everything up to the last '. ') from the subtype labels, then clean them.
celltype_names <- reformat_celltypes(gsub('.*\\. ', '', metadata$cell_subtypes))

In [31]:
# Clean the donor ids with the same helper used for the cell-type labels.
sample_names <- reformat_celltypes(metadata$donor_id)

In [32]:
# Wrap the count table (converted to a matrix) in an ExpressionSet.
gray.sc.es <- ExpressionSet(as.matrix(gray_raw_counts))

In [33]:
# Set the gene names as row names of the ExpressionSet and preview them.
rownames(gray.sc.es) <- gene_names
head(rownames(gray.sc.es))

In [34]:
# Attach the cell-type labels (under both fields) and the sample names as factors.
gray.sc.es$cellType_original <- factor(celltype_names)
gray.sc.es$cellType <- factor(celltype_names)
gray.sc.es$SubjectName <- factor(sample_names)

In [35]:
# Persist the Gray ExpressionSet.
saveRDS(gray.sc.es, file = 'data/pseudobulks_climb/raw/BREAST_gray_sc_es.RDS')

## Breast Wu

In [5]:
# Load the Wu breast count data from its 10X output directory.
breast.wu.srt <- Read10X('data/pseudobulks_climb/raw/breast_wu/BrCa_Atlas_Count_out')

In [9]:
gc()

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,limit (Mb),max used,(Mb).2
Ncells,7975572,426,12276455,655.7,,11710372,625.5
Vcells,3256349215,24844,7479939617,57067.5,102400.0,6234621398,47566.4


In [10]:
# Convert the counts to a dense matrix and wrap them in an ExpressionSet
# (the recorded run warned about a 22.2 GiB sparse->dense allocation here).
breast.wu.es <- ExpressionSet(as.matrix(breast.wu.srt))

“sparse->dense coercion: allocating vector of size 22.2 GiB”


In [11]:
breast.wu.es

ExpressionSet (storageMode: lockedEnvironment)
assayData: 29733 features, 100064 samples 
  element names: exprs 
protocolData: none
phenoData: none
featureData: none
experimentData: use 'experimentData(object)'
Annotation:  

In [14]:
# Read the cell metadata and drop its first row.
# NOTE(review): the first row is presumably a non-data descriptor line - confirm against the file.
metadata <- read.csv('data/pseudobulks_climb/raw/breast_wu/Whole_miniatlas_meta_format.csv')
metadata <- metadata[-1, ]

In [15]:
all(metadata$NAME == colnames(breast.wu.es))

In [24]:
# Clean the `celltype_minor` labels.
celltype_names <- reformat_celltypes(metadata$celltype_minor)

In [27]:
# Subject identifiers are taken as-is from the Patient column.
subject_names <- metadata$Patient

In [28]:
# Attach the cell-type labels (under both fields) and the subject names as factors.
breast.wu.es$cellType <- factor(celltype_names)
breast.wu.es$cellType_original <- factor(celltype_names)
breast.wu.es$SubjectName <- factor(subject_names)

In [29]:
# Persist the Wu ExpressionSet.
saveRDS(breast.wu.es, file = 'data/pseudobulks_climb/raw/BREAST_wu_sc_es.RDS')

## GBM dataset Neftel et al.

The dataset has Smart-seq2 (SS2) and 10X data from the same context. Unfortunately, cell-type labels are provided only for the SS2 data. We will thus only do SS2->10X cross-dataset analysis.

In [7]:
# Read the tab-separated Smart-seq2 TPM table, using the GENE column as row names.
ss2_mat <- read.csv(
    'data/pseudobulks_climb/raw/gbm_neftel_10x_SS2/GSE131928_RAW/GSM3828672_Smartseq2_GBM_IDHwt_processed_TPM.tsv',
    sep = '\t',
    row.names = 'GENE'
)

In [15]:
# Clean the cell names with the same helper used for cell-type labels.
colnames(ss2_mat) <- reformat_celltypes(colnames(ss2_mat))

In [13]:
# Read the tab-separated SS2 metadata and drop its first row.
# NOTE(review): the first row is presumably a non-data descriptor line - confirm against the file.
ss2_metadata <- read.csv(
    'data/pseudobulks_climb/raw/gbm_neftel_10x_SS2/IDHwt.GBM.Metadata.SS2.txt',
    sep = '\t'
)
ss2_metadata <- ss2_metadata[-1, ]

In [16]:
# Cleaned copy of the cell names, produced by the same helper applied to the matrix column names.
ss2_metadata$NAME_ <- reformat_celltypes(ss2_metadata$NAME)

In [20]:
# Index the metadata rows by the cleaned cell names.
rownames(ss2_metadata) <- ss2_metadata$NAME_

In [22]:
# Select the metadata rows by name, in the column order of the expression table.
ss2_metadata <- ss2_metadata[colnames(ss2_mat), ]

In [23]:
all(colnames(ss2_mat) == ss2_metadata$NAME_)

In [58]:
# Columns 8 to 13 of the metadata as a matrix.
mat_cancer_assignment <- as.matrix(ss2_metadata[, 8:13])

In [60]:
# Candidate labels are the column names of that matrix.
cancer_cell_types <- colnames(mat_cancer_assignment)

In [61]:
# For every row of columns 8 to 13 of ss2_metadata, pick the entry of
# cancer_cell_types at the position of the highest value (first one on ties).
# which.max() replaces grep(max(...), ...): grep() used the maximum as a
# regular expression matched against the text form of the values, so '.'
# acted as a wildcard and a value could match as a substring of another one,
# selecting the wrong column.
# Rows containing a missing value yield NA, as before (max() propagated the NA).
# The anonymous function is also no longer assigned to a global `x`.
res <- apply(as.matrix(ss2_metadata[, 8:13]), 1, function(scores) {
    scores <- num(scores)
    if (anyNA(scores)) {
        return(NA_character_)
    }
    cancer_cell_types[which.max(scores)]
})

In [63]:
# Store the per-cell label computed in `res`.
ss2_metadata$cancer_type <- res

In [65]:
# Start the final cell-type column from the CellAssignment column.
ss2_metadata$celltype <- ss2_metadata$CellAssignment

In [67]:
# Replace the 'Malignant' label with the per-cell value of cancer_type.
# The mask is built with %in% because `==` yields NA for cells whose label is
# NA, and R rejects NA in a logical subscript when more than one value is
# assigned ("NAs are not allowed in subscripted assignments").
is_malignant <- ss2_metadata$celltype %in% 'Malignant'
ss2_metadata$celltype[is_malignant] <- ss2_metadata$cancer_type[is_malignant]

In [71]:
# Cells left without a label are marked 'Unknown'.
ss2_metadata$celltype[is.na(ss2_metadata$celltype)] <- 'Unknown'

In [72]:
unique(ss2_metadata$celltype)

In [74]:
unique(ss2_metadata$Sample)

In [75]:
# Wrap the SS2 expression table (converted to a matrix) in an ExpressionSet.
neftel.ss2.es <- ExpressionSet(as.matrix(ss2_mat))

In [76]:
# Attach the final cell-type labels, as factors, under both fields.
neftel.ss2.es$cellType <- factor(ss2_metadata$celltype)
neftel.ss2.es$cellType_original <- factor(ss2_metadata$celltype)

In [77]:
# Attach the Sample column, as a factor, as SubjectName.
neftel.ss2.es$SubjectName <- factor(ss2_metadata$Sample)

In [83]:
# Persist the SS2 ExpressionSet.
saveRDS(neftel.ss2.es, file = 'data/pseudobulks_climb/raw/GBM_neftelSS2_sc_es.RDS')

#### Load 10X data

For this dataset we unfortunately do not have cell-type labels; the corresponding authors could not find them. We will thus only generate an ExpressionSet object with SubjectName. It will be used for the SS2->10X analysis.

In [84]:
# Read the tab-separated 10X TPM table, using the GENE column as row names.
TENx_mat <- read.csv(
    'data/pseudobulks_climb/raw/gbm_neftel_10x_SS2/GSE131928_RAW/GSM3828673_10X_GBM_IDHwt_processed_TPM.tsv',
    sep = '\t',
    row.names = 'GENE'
)

In [85]:
head(colnames(TENx_mat))

In [89]:
# Sample name = the part of each column name before its first underscore.
sample_names <- gsub('_.*', '', colnames(TENx_mat))

In [91]:
# Wrap the 10X expression table (converted to a matrix) in an ExpressionSet.
neftel.10x.es <- ExpressionSet(as.matrix(TENx_mat))

In [92]:
# Attach the sample names as SubjectName.
neftel.10x.es$SubjectName <- sample_names

In [94]:
neftel.10x.es

ExpressionSet (storageMode: lockedEnvironment)
assayData: 30314 features, 16201 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: X102_1 X102_2 ... X126_2_245 (16201 total)
  varLabels: SubjectName
  varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
Annotation:  

In [93]:
# Persist the 10X ExpressionSet.
saveRDS(neftel.10x.es, file = 'data/pseudobulks_climb/raw/GBM_neftel10X_sc_es.RDS')