In [None]:
library(tidyverse)
library(ggplot2)
library(dplyr)
library(patchwork)
library(cowplot)

library(rhdf5)

library(infercnv)

In [None]:
library(Seurat)

In [None]:
library(Matrix)

In [None]:
source('function_single_cell_conversion.R')

#### h5

In [None]:
# Print the working directory; the relative paths below are resolved against it.
getwd()

In [None]:
# Open an HDF5 handle on the exported file (released by h5closeAll() at the
# very end of the notebook).
adata <- H5Fopen("../datasets_new_preprocessing/202305_CB_epi_cells.h5")

In [None]:
# Read the same file through the project helper read_scanpy_h5()
# (presumably defined in function_single_cell_conversion.R and returning a
# Seurat object, since Assays() is called on the result -- TODO confirm).
sobj <- read_scanpy_h5(
  '../datasets_new_preprocessing/202305_CB_epi_cells.h5',
  if_pca = TRUE,
  if_umap = TRUE,
  if_CB_counts = TRUE,
  if_raw_counts = FALSE
)

In [None]:
Assays(sobj)

In [None]:
sobj

#### inferCNV

In [None]:
# Final inferCNV object as written by the inferCNV run.
cnv_obj <- readRDS('../datasets_new_preprocessing/inferCNV/run.final.infercnv_obj')

In [None]:
# long format: one row per (gene, cell) pair with its inferCNV expression level
infercnv_expression <- gather(
  tibble::rownames_to_column(as.data.frame(cnv_obj@expr.data), var = 'gene'),
  key = 'cell',
  value = 'expression',
  -gene
)

In [None]:
dim(infercnv_expression)

In [None]:
head(infercnv_expression)

In [None]:
# Gene position table stored in the inferCNV object; row names become a `gene` column.
gene_loc_table <- tibble::rownames_to_column(
  as.data.frame(cnv_obj@gene_order),
  var = 'gene'
)

In [None]:
dim(gene_loc_table)

In [None]:
head(gene_loc_table)

### tree
using Florian's code: https://itbgit.biologie.hu-berlin.de/uhlitz/sccrc/-/blob/master/_src/infercnv.R

seu_epi_final <- read_rds("../datasets/anno/CRC/3p/seu_epi_final.rds")

In [None]:
# I think pid = patient id
# The ids are attached to the per-sample trees by position (see setNames()
# below), so they must be listed in the same order as the trees in the
# dendrogram file.
pids <- c(
  'p020t', 'p021t', 'p007t', 'p008t', 'p009t1', 'p009t2',
  'p013t', 'p014t', 'p016t', 'p026t', 'p035t'
)

In [None]:
dendro_lines <- read_lines("../datasets_new_preprocessing/inferCNV/infercnv.observations_dendrogram.txt")

# One Newick line per sample is expected. A different count would silently
# mislabel (or NA-label) trees in setNames(), so fail early instead.
stopifnot(length(dendro_lines) == length(pids))

#### the order of samples in `pids` follows the tree order in the file, not alphabetical order
lapply(dendro_lines, function(x) phylogram::read.dendrogram(text = x)) %>%
  lapply(function(x) enframe(dendextend::cutree(x, k = 2), "cell_id", "clone"))

In [None]:
dendro_list <- lapply(dendro_lines, function(x) phylogram::read.dendrogram(text = x)) %>%
  setNames(pids)

# Sanity check of the positional naming: every leaf of a tree should start
# with the sample id assigned to that tree.
# NOTE(review): assumes cell ids are prefixed with the sample id, as the
# "p0" / "^p0[0-9][0-9]t" patterns used in this notebook suggest -- confirm.
leaf_prefix_ok <- vapply(
  seq_along(pids),
  function(i) all(startsWith(labels(dendro_list[[i]]), pids[i])),
  logical(1)
)
if (!all(leaf_prefix_ok)) {
  warning(
    "Dendrogram leaves do not start with the assigned sample id for: ",
    paste(pids[!leaf_prefix_ok], collapse = ", "),
    call. = FALSE
  )
}

# Leaf labels in file order: split each Newick line on its structural
# characters and keep the tokens that look like cell ids.
cell_order <- lapply(dendro_lines, function(x) {
  tokens <- unlist(str_split(x, "\\(|:|,|\\)"))
  tokens[str_detect(tokens, "p0")]
}) %>% unlist

In [None]:
# reference cells
# Space-separated annotation table; cell ids are read as row names.
epi_anno <- read.csv2(
  '../datasets_new_preprocessing/all_epi_cell_anno.txt',
  sep = ' '
)

In [None]:
# Keep the cells whose id contains an "n", label them as clone "Normal" and
# take the part of the id before the first ":" as the sample name.
# NOTE(review): the "n" match is unanchored; presumably it targets the
# normal-sample suffix of the id prefix -- confirm no other id contains "n".
epi_anno_normal <- tibble::rownames_to_column(epi_anno, var = 'cell_id') %>%
  filter(str_detect(cell_id, 'n')) %>%
  mutate(
    clone = 'Normal',
    sample = gsub(':.*', '', cell_id)
  )
epi_anno_normal

In [None]:
# cut the tree and put the reference cells back
# Each sample's dendrogram is cut into two clusters ("1"/"2"); the leaf labels
# get "-" replaced by ":" and the normal reference cells are appended with
# clone "Normal".
clone_table <- dendro_list %>%
  lapply(function(tree) {
    enframe(dendextend::cutree(tree, k = 2), "cell_id", "clone")
  }) %>%
  bind_rows(.id = 'sample') %>%
  mutate(clone = as.character(clone)) %>%
  mutate(cell_id = str_replace_all(cell_id, '-', ':')) %>%
  bind_rows(select(epi_anno_normal, sample, cell_id, clone))


In [None]:
head(clone_table)

In [None]:
table(clone_table$sample, clone_table$clone)

In [None]:
# make a df to avoid subscript out of bounds issue with Florian's code...
# (genes in rows, cells in columns, as in cnv_obj@expr.data)
expr_all <- as.data.frame(cnv_obj@expr.data)

In [None]:
# Export the same values in Matrix Market format.
sparse_expr <- Matrix(as.matrix(expr_all), sparse = TRUE)

writeMM(
  obj = sparse_expr,
  file = "../datasets_new_preprocessing/inferCNV/inferCNV_expr.mtx"
)

In [None]:
head(expr_all)

In [None]:
dim(expr_all)

In [None]:
# How many inferCNV cells have an entry in clone_table?
(colnames(expr_all) %in% clone_table$cell_id) %>% table()

In [None]:
# Distribution of the first ten cells (columns)
summary(expr_all[1:10])

In [None]:
# mean of reference cells
# Per normal sample: absolute inferCNV value averaged over all genes and over
# the sample's cells. `drop = FALSE` keeps the selection a data frame even
# when a group holds a single cell; colSums() needs two dimensions.
mean_ref_per_sample <- clone_table %>%
  filter(clone == 'Normal') %>%
  group_by(sample) %>%
  summarise(
    cna_score = mean(colSums(abs(expr_all[, cell_id, drop = FALSE])) / nrow(expr_all))
  )
mean_ref_per_sample

In [None]:
mean(mean_ref_per_sample$cna_score)

In [None]:
sd(mean_ref_per_sample$cna_score)

In [None]:
max(mean_ref_per_sample$cna_score)

In [None]:
# 99.7, 3 sd away
mean(mean_ref_per_sample$cna_score) + 3 * sd(mean_ref_per_sample$cna_score)

In [None]:
# check p020t
clone_table %>%
  filter(sample == 'p020t') %>%
  group_by(clone) %>%
  summarise(
    cna_score = mean(colSums(abs(expr_all[, cell_id, drop = FALSE])) / nrow(expr_all))
  )


In [None]:
# check p020t genes in chr17 and chr4
chr17_genes <- gene_loc_table[which(gene_loc_table$chr == 17), ]
chr4_genes <- gene_loc_table[which(gene_loc_table$chr == 4), ]

In [None]:
# Same score, restricted to the chr17 genes
clone_table %>%
  filter(sample == 'p020t') %>%
  group_by(clone) %>%
  summarise(
    cna_score = mean(
      colSums(abs(expr_all[chr17_genes$gene, cell_id, drop = FALSE])) / nrow(chr17_genes)
    )
  )

In [None]:
# Same score, restricted to the chr4 genes
clone_table %>%
  filter(sample == 'p020t') %>%
  group_by(clone) %>%
  summarise(
    cna_score = mean(
      colSums(abs(expr_all[chr4_genes$gene, cell_id, drop = FALSE])) / nrow(chr4_genes)
    )
  )

### Florian's original code
with his last two lines removed

In [None]:
# Calculate the CNA score per (sample, clone): the mean absolute inferCNV value
# of the clone's cells (averaged over genes and cells), divided by the average
# of that quantity over all normal samples. A tumour clone is called "CNA"
# when its score exceeds the highest normalised score of any normal sample,
# otherwise "CNN".
clone_score <- clone_table %>%
  group_by(sample, clone) %>%
  summarise(
    # drop = FALSE keeps single-cell groups two-dimensional for colSums()
    cna_score = mean(colSums(abs(expr_all[, cell_id, drop = FALSE])) / nrow(expr_all)), # why abs
    .groups = "drop"
  ) %>%
  # list the groups clone by clone, then by sample, before joining the cells back
  arrange(clone, sample) %>%
  mutate(
    mean_n = mean(cna_score[clone == "Normal"], na.rm = TRUE),
    cna_score = cna_score / mean_n, # why divide not subtract
    # computed on the already normalised scores
    max_n = max(cna_score[clone == "Normal"], na.rm = TRUE),
    # use case_when to keep Normal
    cna_clone = case_when(
      clone == 'Normal' ~ 'Normal',
      cna_score > max_n ~ 'CNA',
      .default = 'CNN'
    )
  ) %>%
  select(sample, cna_score, cna_clone, clone) %>%
  # right join can keep the normal sample and cells
  right_join(clone_table, by = c("clone", "sample"))

In [None]:
# Smallest value after shifting the data by -1
min(expr_all - 1)

In [None]:
# Diagnostic table: un-normalised per-clone scores next to the mean and
# maximum over the normal samples (printed only, not stored).
clone_table %>%
  group_by(sample, clone) %>%
  do(cna_score = mean(colSums(abs(expr_all[, .$cell_id])) / nrow(expr_all))) %>% # why abs
  ungroup() %>%
  unnest(cna_score) %>%
  spread(clone, cna_score) %>%
  mutate(mean_n = mean(Normal, na.rm = TRUE)) %>%
  gather(clone, cna_score, -mean_n, -sample) %>%
  # mutate(cna_score = cna_score/mean_n) %>% # why divide not subtract
  mutate(
    max_n = max(cna_score[clone == "Normal"], na.rm = TRUE),
    # use case_when to keep Normal
    # cna_clone = ifelse(cna_score > max_n, "CNA", "CNN")
    cna_clone = case_when(
      clone == 'Normal' ~ 'Normal',
      cna_score > max_n ~ 'CNA',
      .default = 'CNN'
    )
  ) %>%
  select(sample, cna_score, cna_clone, clone, max_n, mean_n) %>%
  arrange(sample)

# Persist the per-cell clone scores
write_tsv(
  clone_score,
  "../datasets_new_preprocessing/inferCNV/infercnv_clone_scores.tsv"
)

### My modification 

In [None]:
# abs() only makes sense for data centred at 0, so keep Florian's concept but:
#   1. shift the inferCNV values by -1 so that "no change" sits at 0
#   2. average the absolute values per clone ("1", "2", "Normal")
#   3. take the mean and SD of the per-sample Normal scores
#   4. call a tumour clone "CNA" when its score lies more than 3 * SD above
#      the Normal mean, otherwise "CNN"

# move the center to 0
expr_all_m1 <- expr_all - 1

clone_score_new <- clone_table %>%
  group_by(sample, clone) %>%
  summarise(
    # drop = FALSE keeps single-cell groups two-dimensional for colSums()
    cna_score = mean(
      colSums(abs(expr_all_m1[, cell_id, drop = FALSE])) / nrow(expr_all_m1)
    ),
    .groups = "drop"
  ) %>%
  # list the groups clone by clone, then by sample, before joining the cells back
  arrange(clone, sample) %>%
  mutate(
    mean_n = mean(cna_score[clone == "Normal"], na.rm = TRUE),
    sd_n = sd(cna_score[clone == "Normal"], na.rm = TRUE),
    cna_clone = case_when(
      clone == 'Normal' ~ 'normal sample',
      cna_score - mean_n > 3 * sd_n ~ 'CNA',
      .default = 'CNN'
    )
  ) %>%
  select(sample, cna_score, cna_clone, clone) %>%
  # right join can keep the normal sample and cells
  right_join(clone_table, by = c("clone", "sample"))

In [None]:
# Number of cells per sample and clone
clone_table %>%
  group_by(sample, clone) %>%
  summarise(n = n()) %>%
  arrange(sample)

In [None]:
expr_all_m1 <- expr_all - 1

# Diagnostic table for the 3-SD rule: per-clone scores on the shifted data
# next to the mean and maximum over the normal samples (printed only).
clone_table %>%
  group_by(sample, clone) %>%
  do(cna_score = mean(colSums(abs(expr_all_m1[, .$cell_id])) / nrow(expr_all_m1))) %>%
  ungroup() %>%
  unnest(cna_score) %>%
  spread(clone, cna_score) %>%
  mutate(
    mean_n = mean(Normal, na.rm = TRUE),
    max_n = max(Normal, na.rm = TRUE),
    sd_n = sd(Normal, na.rm = TRUE)
  ) %>%
  gather(clone, cna_score, -mean_n, -sample, -max_n, -sd_n) %>%
  mutate(
    cna_clone = case_when(
      clone == 'Normal' ~ 'normal sample',
      cna_score - mean_n > 3 * sd_n ~ 'CNA',
      .default = 'CNN'
    )
  ) %>%
  select(sample, cna_score, cna_clone, clone, mean_n, max_n) %>%
  arrange(sample)

In [None]:
clone_score_new

# Persist the per-cell clone scores of the 3-SD variant
write_tsv(
  clone_score_new,
  "../datasets_new_preprocessing/inferCNV/202306_infercnv_clone_scores_new.tsv"
)

#### How about using a statistical test on the distributions?

In [None]:
# take reference cells as background distribution and test if clone 1,2 have a different mean?

In [None]:
# Per-cell mean inferCNV value (over all genes) of every normal reference cell
ref_id <- filter(clone_table, clone == "Normal")
ref_scores <- list(colSums(expr_all[, ref_id$cell_id]) / nrow(expr_all))

In [None]:
names(ref_scores) <- "all_n"

In [None]:
# Cells of patient p007: tumour clone 1, tumour clone 2, matched normal
p007t1_id <- filter(clone_table, sample == "p007t", clone == "1")
p007t2_id <- filter(clone_table, sample == "p007t", clone == "2")
p007n_id <- filter(clone_table, sample == "p007n", clone == "Normal")

In [None]:
# Per-cell mean inferCNV value for each of the three groups
p007_scores <- list(
  colSums(expr_all[, p007t1_id$cell_id]) / nrow(expr_all),
  colSums(expr_all[, p007t2_id$cell_id]) / nrow(expr_all),
  colSums(expr_all[, p007n_id$cell_id]) / nrow(expr_all)
)

In [None]:
names(p007_scores) <- c("t1", "t2", "n")

In [None]:
lapply(p007_scores, length)

In [None]:
# Normality checks
shapiro.test(p007_scores[["t1"]])

In [None]:
shapiro.test(p007_scores[["t2"]])

In [None]:
shapiro.test(p007_scores[["n"]])

In [None]:
# Rank-based comparisons: against the matched normal, then against all normals
wilcox.test(p007_scores[["t1"]], p007_scores[["n"]], alternative = "two.sided")

In [None]:
wilcox.test(p007_scores[["t2"]], p007_scores[["n"]], alternative = "two.sided")

In [None]:
wilcox.test(p007_scores[["t1"]], ref_scores[["all_n"]], alternative = "two.sided")

In [None]:
wilcox.test(p007_scores[["t2"]], ref_scores[["all_n"]], alternative = "two.sided")

In [None]:
wilcox.test(p007_scores[["n"]], ref_scores[["all_n"]], alternative = "two.sided")

In [None]:
# Unpaired t-tests for the same comparisons
t.test(p007_scores[["t1"]], p007_scores[["n"]], alternative = "two.sided", paired = FALSE)

In [None]:
t.test(p007_scores[["t1"]], ref_scores[["all_n"]], alternative = "two.sided", paired = FALSE)

In [None]:
t.test(p007_scores[["t2"]], p007_scores[["n"]], alternative = "two.sided", paired = FALSE)

In [None]:
t.test(p007_scores[["t2"]], ref_scores[["all_n"]], alternative = "two.sided", paired = FALSE)

In [None]:
t.test(p007_scores[["n"]], ref_scores[["all_n"]], alternative = "two.sided", paired = FALSE)

#### p020

In [None]:
# Cells of patient p020: tumour clone 1, tumour clone 2, matched normal
p020t1_id <- filter(clone_table, sample == "p020t", clone == "1")
p020t2_id <- filter(clone_table, sample == "p020t", clone == "2")
p020n_id <- filter(clone_table, sample == "p020n", clone == "Normal")

In [None]:
# Per-cell mean inferCNV value (over all genes) for each of the three groups
p020_scores <- list(
  colSums(expr_all[, p020t1_id$cell_id]) / nrow(expr_all),
  colSums(expr_all[, p020t2_id$cell_id]) / nrow(expr_all),
  colSums(expr_all[, p020n_id$cell_id]) / nrow(expr_all)
)

In [None]:
names(p020_scores) <- c("t1", "t2", "n")

In [None]:
lapply(p020_scores, length)

In [None]:
# Normality checks
shapiro.test(p020_scores[["t1"]])

In [None]:
shapiro.test(p020_scores[["t2"]])

In [None]:
shapiro.test(p020_scores[["n"]])

In [None]:
# Rank-based comparisons: against the matched normal, then against all normals
wilcox.test(p020_scores[["t1"]], p020_scores[["n"]], alternative = "two.sided")

In [None]:
wilcox.test(p020_scores[["t2"]], p020_scores[["n"]], alternative = "two.sided")

In [None]:
wilcox.test(p020_scores[["t1"]], ref_scores[["all_n"]], alternative = "two.sided")

In [None]:
wilcox.test(p020_scores[["t2"]], ref_scores[["all_n"]], alternative = "two.sided")

In [None]:
wilcox.test(p020_scores[["n"]], ref_scores[["all_n"]], alternative = "two.sided")

In [None]:
# Unpaired t-tests against the matched normal
t.test(p020_scores[["t1"]], p020_scores[["n"]], alternative = "two.sided", paired = FALSE)

In [None]:
t.test(p020_scores[["t2"]], p020_scores[["n"]], alternative = "two.sided", paired = FALSE)

### 20230607: this script can stop here for now

In [None]:
# Deliberate hard stop: everything below this point is unfinished plotting code.
# An explicit stop() halts a "run all" here without making the file unparsable.
stop("Script intentionally stops here (2023-06-07); the cells below are unfinished.", call. = FALSE)

In [None]:
# Line segments of every per-sample dendrogram, stacked into one table with
# the sample id in `sample` and a constant facet label in `helper_var_d`.
segments_tbl <- dendro_list %>%
  lapply(function(tree) {
    ggdendro::dendro_data(tree) %>%
      ggdendro::segment() %>%
      mutate(helper_var_d = "Dendrogram")
  }) %>%
  bind_rows(.id = "sample")


In [None]:
head(segments_tbl)

In [None]:
# Per-cell annotation of the tumour cells, with cell ids ordered like the
# dendrogram leaves and constant facet labels for the annotation strips.
# NOTE(review): `seu_epi_final` is not created by any code cell visible in
# this notebook -- load it before running this cell.
# NOTE(review): ids are rewritten ":" -> "_" here; confirm this matches the
# id format stored in `cell_order`, otherwise cell_id becomes NA.
anno_data <- FetchData(
  seu_epi_final,
  c("cell_type_epi_custom", "source_id", "sample_origin", "cell_id")
) %>%
  as_tibble() %>%
  mutate(cell_id = str_replace_all(cell_id, ":", "_")) %>%
  mutate(cell_id = ordered(cell_id, levels = cell_order)) %>%
  filter(sample_origin == "Tumor") %>%
  mutate(
    helper_var_p = " Patient",
    helper_var_c = " Cell type"
  )



In [None]:
head(anno_data)

In [None]:
# Free the large Seurat object once the annotation has been extracted
rm(seu_epi_final)

In [None]:
# Gene position table (header-less TSV): gene name, chromosome, start, end
gof <- read_tsv(
  "../datasets/gencode_v21_gen_pos.complete.txt",
  col_names = c("gene", "chr", "start", "end")
)

In [None]:
# Display order of the chromosomes: autosomes 1-22, then M, X, Y
chr_order <- c(as.character(1:22), 'M', 'X', 'Y')

In [None]:
## joining data
# Long table for the heatmap: one row per (gene, tumour cell) with genomic
# position, cell annotation, the value clamped to [0.85, 1.15] and the
# within-chromosome rank of the gene start used as x position.
plot_data <- cnv_obj@expr.data[, str_detect(colnames(cnv_obj@expr.data), "^p0[0-9][0-9]t")] %>%
  as.data.frame() %>%
  as_tibble(rownames = "gene") %>%
  gather(cell_id, value, -gene) %>%
  left_join(gof, by = "gene") %>%
  arrange(factor(chr, levels = chr_order)) %>% # chromosomes are not in karyotype order otherwise
  mutate(cell_id = ordered(cell_id, levels = cell_order)) %>%
  left_join(anno_data, by = "cell_id") %>%
  mutate(
    cell_id = ordered(cell_id, levels = cell_order),
    value_cut = pmin(pmax(value, 0.85), 1.15)
  ) %>%
  group_by(chr) %>%
  mutate(rank = rank(start)) %>%
  ungroup()


In [None]:
head(plot_data)

In [None]:
dim(plot_data)

In [None]:
# Fix the facet order of the chromosomes
plot_data$chr <- factor(plot_data$chr, levels = chr_order)


In [None]:
# inferCNV heatmap: cells in rows (one facet row per source_id), genes in
# columns (one facet column per chromosome), colour centred at 1.
plot_cnv <- ggplot(plot_data) +
  geom_tile(aes(x = rank, y = cell_id, fill = value_cut)) +
  facet_grid(source_id ~ chr, scales = "free", space = "free") +
  scale_fill_gradient2(
    midpoint = 1,
    low = scales::muted("blue"),
    high = scales::muted("red")
  ) +
  labs(fill = "inferCNV\nexpression") +
  theme_void() +
  theme(
    panel.spacing.x = unit(0, "npc"),
    panel.spacing.y = unit(0.003, "npc"),
    strip.text.x = element_text(angle = 90, hjust = 0, vjust = 0.5),
    strip.text.y = element_blank()
  )
plot_cnv

In [None]:
# One colour per level of the custom epithelial cell type annotation.
# NOTE(review): assumes cell_type_epi_custom is a factor with exactly 17
# levels, matching the 17 colours -- confirm.
cell_type_colors <- setNames(
  c(
    '#999900', '#d6d680', '#336600', '#7c9d5c', '#CCD9BF', '#660099',
    '#990099', '#CC80CC', '#00CCFF', '#0072b1', '#6b9fcb', '#c3c3e0',
    '#AA0000', '#FF0000', '#FF0099', '#EB9999', '#FF9900'
  ),
  levels(anno_data$cell_type_epi_custom)
)

In [None]:
# Annotation strip: one tile per cell, coloured by cell type.
plot_anno_cell <- ggplot(anno_data) +
  geom_tile(aes(helper_var_c, cell_id, fill = cell_type_epi_custom)) +
  theme_void() +
  facet_grid(source_id ~ helper_var_c, scales = "free", space = "free") +
  scale_fill_manual(values = cell_type_colors) +
  # "none" hides the legend; passing FALSE/F to guides() is deprecated
  guides(fill = "none") +
  theme(
    panel.spacing.x = unit(0, "npc"),
    panel.spacing.y = unit(0.003, "npc"),
    strip.text.x = element_text(angle = 90, hjust = 0, vjust = 0.5),
    strip.text.y = element_blank()
  )
plot_anno_cell

In [None]:
# Shared named colour palette, looked up by value in scale_fill_manual():
# patients, CMS classes, cell-cycle phases, sample origin and CNA calls.
cols.use <- c(
  # patients (several entries reuse the Okabe-Ito palette)
  p001 = "#E69F00", # palette_OkabeIto[1]
  p007 = "#E69F00", # palette_OkabeIto[1]
  p008 = "#56B4E9", # palette_OkabeIto[2]
  p009 = "#009E73", # palette_OkabeIto[3]
  p012 = "#ff0198",
  p013 = "#0072B2", # palette_OkabeIto[5]
  p014 = "#D55E00", # palette_OkabeIto[6]
  p016 = "#CC79A7", # palette_OkabeIto[7]
  p017 = "#451077",
  p020 = "#cccc80",
  p021 = "#FF0000",
  p025 = "#f6cbcc",
  p026 = "#F0E442",
  # CMS classes and mixed calls
  CMS1 = "#eba83a",
  CMS2 = "#027eb5",
  CMS3 = "#d684ae",
  CMS4 = "#00a881",
  `CMS1,CMS2` = "#779378",
  `CMS1,CMS3` = "#E19674",
  `CMS1,CMS4` = "#76A85E",
  `CMS2,CMS3` = "#6C81B2",
  `CMS2,CMS4` = "#01939B",
  # cell-cycle phases
  G1 = "#94b6d2",
  S = "#dc4040",
  G2M = "#7aa77f",
  `NA` = "grey",
  # sample origin
  Normal = "steelblue",
  Tumor = "red",
  PDO = "steelblue",
  # clone calls
  CNA = "red",
  CNN = "grey"
)



In [None]:
# Annotation strip: one tile per cell, coloured by patient (source_id).
# NOTE(review): source_id values without an entry in cols.use are drawn in
# the NA colour -- confirm every patient in the data has a colour.
plot_anno_patient <- ggplot(anno_data) +
  geom_tile(aes(helper_var_p, cell_id, fill = source_id)) +
  theme_void() +
  facet_grid(source_id ~ helper_var_p, scales = "free", space = "free") +
  scale_fill_manual(values = cols.use) +
  # "none" hides the legend; passing FALSE/F to guides() is deprecated
  guides(fill = "none") +
  theme(
    panel.spacing.x = unit(0, "npc"),
    panel.spacing.y = unit(0.003, "npc"),
    strip.text.x = element_text(angle = 90, hjust = 0, vjust = 0.5),
    strip.text.y = element_blank()
  )
plot_anno_patient

In [None]:
# Annotation strip: one tile per tumour cell, coloured by its CNA/CNN call.
# NOTE(review): `clone_scores` is not created anywhere in the visible
# notebook (only `clone_score` and `clone_score_new` are), and those tables
# carry neither `helper_var_c` nor `source_id`. Build the plotting table
# (scores joined with the cell annotation) before running this cell.
plot_anno_clone <- ggplot(filter(clone_scores, clone != "Normal")) +
  geom_tile(aes(helper_var_c, cell_id, fill = cna_clone)) +
  theme_void() +
  facet_grid(source_id ~ helper_var_c, scales = "free", space = "free") +
  scale_fill_manual(values = cols.use) +
  # "none" hides the legend; passing FALSE/F to guides() is deprecated
  guides(fill = "none") +
  theme(
    panel.spacing.x = unit(0, "npc"),
    panel.spacing.y = unit(0.003, "npc"),
    strip.text.x = element_text(angle = 90, hjust = 0, vjust = 0.5),
    strip.text.y = element_blank()
  )
plot_anno_clone

In [None]:
# Dendrograms drawn sideways (root on the left), one facet row per tree.
# segments_tbl identifies the tree in its `sample` column (set by
# bind_rows(.id = "sample")), so that is the row facet variable.
# NOTE(review): the annotation and CNV panels facet their rows on
# `source_id`; confirm the rows line up once the panels are combined.
plot_dendro <- ggplot(segments_tbl) +
  # linewidth replaces the deprecated `size` argument for line geoms
  geom_segment(aes(x = -y, y = x, xend = -yend, yend = xend), linewidth = 0.5) +
  theme_void() +
  facet_grid(sample ~ helper_var_d, scales = "free", space = "free") +
  theme(
    panel.spacing = unit(0, "npc"),
    panel.spacing.x = unit(0, "npc"),
    panel.spacing.y = unit(0.003, "npc"),
    strip.text = element_blank()
  ) +
  scale_y_continuous(expand = c(0, 0))
plot_dendro

In [None]:
# Dendrogram plus the three annotation strips, side by side
pg_anno <- plot_grid(
  plot_dendro, plot_anno_patient, plot_anno_clone, plot_anno_cell,
  nrow = 1,
  align = "h",
  rel_widths = c(0.625, 0.125, 0.125, 0.125)
)
pg_anno

In [None]:
# Full figure: the same panels followed by the inferCNV heatmap
pg <- plot_grid(
  plot_dendro, plot_anno_patient, plot_anno_clone, plot_anno_cell, plot_cnv,
  nrow = 1,
  align = "h",
  rel_widths = c(0.1, 0.02, 0.02, 0.02, 0.84)
)
pg

# Release every HDF5 handle that is still open
h5closeAll()