## Run inferCNV on epi cells


In [1]:
library(tidyverse)
library(ggplot2)
library(dplyr)
library(patchwork)
library(cowplot)

library(rhdf5)

library(infercnv)

library(Seurat)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.2     [32m✔[39m [34mpurrr  [39m 1.0.1
[32m✔[39m [34mtibble [39m 3.2.1     [32m✔[39m [34mdplyr  [39m 1.1.2
[32m✔[39m [34mtidyr  [39m 1.3.0     [32m✔[39m [34mstringr[39m 1.5.0
[32m✔[39m [34mreadr  [39m 2.1.3     [32m✔[39m [34mforcats[39m 1.0.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
“package ‘patchwork’ was built under R version 4.1.3”

Attaching package: ‘cowplot’


The following object is masked from ‘package:patchwork’:

    align_plots


Attaching SeuratObject



In [2]:
source('function_single_cell_conversion.R')

In [31]:
# Record the R version, platform and attached package versions for this run.
sessionInfo()

R version 4.1.2 (2021-11-01)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Rocky Linux 8.7 (Green Obsidian)

Matrix products: default
BLAS/LAPACK: /fast/work/users/twei_m/miniconda/envs/sc/lib/libopenblasp-r0.3.18.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] SeuratObject_4.1.3 Seurat_4.3.0       infercnv_1.10.1    rhdf5_2.38.1      
 [5] cowplot_1.1.1      patchwork_1.1.2    forcats_1.0.0      stringr_1.5.0     
 [9] dplyr_1.1.2        purrr_1.0.1        readr_2.1.3        tidyr_1.3.0       
[13] tibble_3.2.1       ggplot2_3.4.2    

In [3]:
# Show the working directory; the relative paths used in this notebook are
# resolved against it.
getwd()

In [9]:
# Open the HDF5 file so that its `obs` group can be read directly with rhdf5.
adata_h5 <- H5Fopen(
  "../datasets_new_preprocessing/202305_CB_all_cells.h5"
)

In [6]:
# Load the dataset through the project helper read_scanpy_h5() (presumably
# defined in function_single_cell_conversion.R -- confirm). PCA, UMAP and the
# CB counts are requested; the raw counts are not.
adata_all <- read_scanpy_h5(
  "../datasets_new_preprocessing/202305_CB_all_cells.h5",
  if_pca = TRUE,
  if_umap = TRUE,
  if_CB_counts = TRUE,
  if_raw_counts = FALSE
)

“Keys should be one or more alphanumeric characters followed by an underscore, setting key from raw_counts_ to rawcounts_”


In [97]:
# List the assays stored in the loaded object.
Assays(adata_all)

In [None]:
# Persist the count matrix of the `raw_counts` assay for all cells.
saveRDS(
  object = adata_all@assays$raw_counts@counts,
  file = "../datasets_new_preprocessing/all_cell_CB_counts.rds"
)

In [98]:
# Attach the `celltype_1a` label from the HDF5 file to every cell.
# The column is stored as a categorical: integer `codes` that index into the
# `categories` vector.
cat <- as.vector(adata_h5$obs$'celltype_1a'$categories)
cat_obs <- data.frame(
  cat_name = local({
    # Codes are 0-based, R indexing is 1-based, so shift by one (vectorised;
    # no per-element apply() needed).
    idx <- as.vector(adata_h5$obs$'celltype_1a'$codes) + 1L
    # A negative code marks a missing category. Left as index 0 it would be
    # dropped silently, leaving fewer labels than cells; map it to NA instead.
    idx[which(idx < 1L)] <- NA
    cat[idx]
  }),
  row.names = adata_h5$obs$`_index`
)

adata_all <- AddMetaData(
  object = adata_all,
  metadata = cat_obs,
  col.name = "celltype_1a"
)

In [99]:
# Attach the `sample` label from the HDF5 file to every cell.
# The column is stored as a categorical: integer `codes` that index into the
# `categories` vector.
cat <- as.vector(adata_h5$obs$'sample'$categories)
cat_obs <- data.frame(
  cat_name = local({
    # Codes are 0-based, R indexing is 1-based, so shift by one (vectorised;
    # no per-element apply() needed).
    idx <- as.vector(adata_h5$obs$'sample'$codes) + 1L
    # A negative code marks a missing category. Left as index 0 it would be
    # dropped silently, leaving fewer labels than cells; map it to NA instead.
    idx[which(idx < 1L)] <- NA
    cat[idx]
  }),
  row.names = adata_h5$obs$`_index`
)

adata_all <- AddMetaData(
  object = adata_all,
  metadata = cat_obs,
  col.name = "sample"
)

In [100]:
# Keep only the cells whose `celltype_1a` label is "epi".
adata_epi <- subset(adata_all, subset = celltype_1a == "epi")

In [None]:
# Persist the count matrix of the `raw_counts` assay for the epi subset.
saveRDS(
  object = adata_epi@assays$raw_counts@counts,
  file = "../datasets_new_preprocessing/all_epi_cell_CB_counts.rds"
)

In [7]:
# Close all HDF5 handles that are still open in this R session.
h5closeAll()

In [None]:
# Treat "epi" cells from tumour samples as malignant: relabel them as
# "malignant_<sample>" so inferCNV clusters them per sample. Every other label
# is left unchanged.
# NOTE(review): grepl("t", sample) matches a "t" anywhere in the sample name --
# confirm that only tumour sample names contain it.
adata_all@meta.data <- adata_all@meta.data %>%
  mutate(
    celltype_1a = case_when(
      celltype_1a == "epi" & grepl("t", sample) ~ paste0("malignant_", sample),
      TRUE ~ celltype_1a
    )
  )

In [None]:
# Export the per-cell `celltype_1a` annotation of all cells, keeping the cell
# names as row names. Separator, quoting and header use write.table() defaults.
write.table(
  x = adata_all@meta.data["celltype_1a"],
  file = "../datasets_new_preprocessing/all_cell_anno.txt",
  row.names = TRUE
)

In [None]:
# Remove the all-cell object from the workspace; the cells below work on
# adata_epi only.
rm(adata_all)

In [104]:
# Treat "epi" cells from tumour samples as malignant: relabel them as
# "malignant_<sample>" so inferCNV clusters them per sample. Every other label
# is left unchanged.
# NOTE(review): grepl("t", sample) matches a "t" anywhere in the sample name --
# confirm that only tumour sample names contain it.
adata_epi@meta.data <- adata_epi@meta.data %>%
  mutate(
    celltype_1a = case_when(
      celltype_1a == "epi" & grepl("t", sample) ~ paste0("malignant_", sample),
      TRUE ~ celltype_1a
    )
  )

In [None]:
# Export the per-cell `celltype_1a` annotation of the epi subset, keeping the
# cell names as row names. Separator, quoting and header use write.table()
# defaults.
write.table(
  x = adata_epi@meta.data["celltype_1a"],
  file = "../datasets_new_preprocessing/all_epi_cell_anno.txt",
  row.names = TRUE
)

In [12]:
# Build the inferCNV object from the epi count matrix. Cell annotations are
# passed as a one-column data frame (`celltype_1a`), and cells still labelled
# "epi" serve as the reference group.
infercnv_obj <- CreateInfercnvObject(
  raw_counts_matrix = adata_epi@assays$raw_counts@counts,
  annotations_file = adata_epi@meta.data["celltype_1a"],
  delim = "\t",
  gene_order_file = "../datasets/gencode_v21_gen_pos.complete.txt",
  ref_group_names = c("epi")
)

INFO [2023-05-22 15:03:52] Parsing gene order file: ../datasets/gencode_v21_gen_pos.complete.txt
INFO [2023-05-22 15:03:52] ::order_reduce:Start.
INFO [2023-05-22 15:03:55] .order_reduce(): expr and order match.
INFO [2023-05-22 15:04:13] ::process_data:order_reduce:Reduction from positional data, new dimensions (r,c) = 29667,75097 Total=1011772672 Min=0 Max=52153.
INFO [2023-05-22 15:04:15] num genes removed taking into account provided gene ordering list: 9018 = 30.397411265042% removed.
INFO [2023-05-22 15:04:15] -filtering out cells < 100 or > Inf, removing 0 % of cells
INFO [2023-05-22 15:04:29] validating infercnv_obj


In [13]:
# Run the inferCNV pipeline to expose the CNV signal.
# inferCNV checks `out_dir` for saved results and resumes from them, so clear
# that directory before starting a fresh run.
infercnv_obj <- infercnv::run(
  infercnv_obj,
  cutoff = 0.1, # use 1 for smart-seq, 0.1 for 10x-genomics
  out_dir = "../datasets_new_preprocessing/inferCNV/", # auto-created for outputs
  cluster_by_groups = TRUE, # cluster
  denoise = FALSE,
  HMM = FALSE,
  num_threads = 16
)

INFO [2023-05-22 15:04:29] ::process_data:Start
INFO [2023-05-22 15:04:29] Checking for saved results.
INFO [2023-05-22 15:04:29] 

	STEP 1: incoming data

INFO [2023-05-22 15:07:52] 

	STEP 02: Removing lowly expressed genes

INFO [2023-05-22 15:07:52] ::above_min_mean_expr_cutoff:Start
INFO [2023-05-22 15:08:00] Removing 11842 genes from matrix as below mean expr threshold: 0.1
INFO [2023-05-22 15:08:07] validating infercnv_obj
INFO [2023-05-22 15:08:07] There are 8807 genes and 75097 cells remaining in the expr matrix.


“sparse->dense coercion: allocating vector of size 4.9 GiB”


INFO [2023-05-22 15:08:52] no genes removed due to min cells/gene filter
INFO [2023-05-22 15:12:08] 

	STEP 03: normalization by sequencing depth

INFO [2023-05-22 15:12:08] normalizing counts matrix by depth


“sparse->dense coercion: allocating vector of size 4.9 GiB”


INFO [2023-05-22 15:12:41] Computed total sum normalization factor as median libsize: 6779.000000
INFO [2023-05-22 15:12:41] Adding h-spike
INFO [2023-05-22 15:12:41] -hspike modeling of imm


“sparse->dense coercion: allocating vector of size 2.2 GiB”
“sparse->dense coercion: allocating vector of size 2.2 GiB”
“sparse->dense coercion: allocating vector of size 2.2 GiB”
“sparse->dense coercion: allocating vector of size 2.2 GiB”


INFO [2023-05-22 15:16:40] -hspike modeling of str


“sparse->dense coercion: allocating vector of size 2.2 GiB”
“sparse->dense coercion: allocating vector of size 2.2 GiB”
“sparse->dense coercion: allocating vector of size 2.2 GiB”
“sparse->dense coercion: allocating vector of size 2.2 GiB”


INFO [2023-05-22 15:20:35] validating infercnv_obj
INFO [2023-05-22 15:20:35] normalizing counts matrix by depth
INFO [2023-05-22 15:20:35] Using specified normalization factor: 6779.000000
INFO [2023-05-22 15:23:41] 

	STEP 04: log transformation of data

INFO [2023-05-22 15:23:41] transforming log2xplus1()


“sparse->dense coercion: allocating vector of size 4.9 GiB”


# Persist the expression matrix held in the inferCNV object's `expr.data` slot.
saveRDS(
  object = infercnv_obj@expr.data,
  file = "../datasets_new_preprocessing/inferCNV/inferCNV_expression_data.rds"
)