In [63]:
suppressPackageStartupMessages({
    library(scater)
    library(scran)
    library(tidyverse)
    library(ComplexHeatmap)
    library(logger)
})

#  Read SCE data

In [64]:
# Load the serialized expression object (presumably a SingleCellExperiment,
# since colData() is called on it further down -- TODO confirm).
sce <- readRDS(
    file = "../../data/expression/sce/sce_Smartseq2_scHCC-CD45_featureCounts_qc_clustered.rds"
)


# Read integration sites

In [65]:
# Return field `i` of each ":"-separated breakpoint string in `x`.
# vapply() guarantees a character vector (sapply() would return a list for
# zero-length input); `[[` still errors if a string has fewer than `i` fields.
bp_field <- function(x, i) {
    vapply(strsplit(x, ":", fixed = TRUE), `[[`, character(1), i)
}

# Read the fusion table and derive, for every record, the human-side and
# HBV-side breakpoint (chromosome, coordinate, site type, gene) plus the cell id.
intSiteHBV <- readr::read_tsv("../../data/intSites_HBV/arriba_fusions_CD45N_20200901.tsv") %>%
    # Drop rows whose `group` value contains "group" (presumably header lines
    # repeated when per-cell tables were concatenated -- TODO confirm).
    dplyr::filter(!grepl("group", group)) %>%
    # Qualified on purpose: Bioconductor packages also export a `rename`.
    dplyr::rename(gene1 = `#gene1`) %>%
    #--- Split "contig:position" breakpoints into chromosome and coordinate
    dplyr::mutate(chr1 = bp_field(breakpoint1, 1),
                  chr2 = bp_field(breakpoint2, 1),
                  bp_coord1 = as.numeric(bp_field(breakpoint1, 2)),
                  bp_coord2 = as.numeric(bp_field(breakpoint2, 2))) %>%
    # All columns are read as character, so the read counts need converting.
    dplyr::mutate(split_reads1 = as.numeric(split_reads1),
                  split_reads2 = as.numeric(split_reads2)) %>%
    dplyr::mutate(spanning_counts = split_reads1 + split_reads2) %>%
    #--- Decide once which breakpoint is the viral one; side 1 is treated as
    #    HBV when chr1 matches "HBV", otherwise side 2 is.
    dplyr::mutate(.hbvFirst = grepl("HBV", chr1)) %>%
    dplyr::mutate(chrHuman = ifelse(.hbvFirst, chr2, chr1),
                  coordHuman = ifelse(.hbvFirst, bp_coord2, bp_coord1),
                  chrVir = "HBV",
                  coordVir = ifelse(.hbvFirst, bp_coord1, bp_coord2)) %>%
    dplyr::mutate(siteTypeHuman = ifelse(.hbvFirst, site2, site1)) %>%
    # NOTE(review): any contig outside 1-22 and X becomes NA here; confirm this
    # is intended, because chrHuman is later used as GRanges seqnames.
    dplyr::mutate(chrHuman = factor(chrHuman, levels = c(1:22, "X"))) %>%
    #--- Annotating integration-site genes
    dplyr::mutate(geneHuman = ifelse(.hbvFirst, gene2, gene1)) %>%
    # Remove the temporary helper column so the output columns are unchanged.
    dplyr::select(-.hbvFirst) %>%
    # Strip the sample prefix and file suffix from `group` to get the cell id.
    dplyr::mutate(cell.id = gsub("PD10_ZZM_|_arriba_output.tsv", "", group))
 

Parsed with column specification:
cols(
  .default = col_character()
)

See spec(...) for full column specifications.



# Integrate donor metadata

In [66]:
# Attach per-cell annotations from the SCE column data to every
# integration-site record.
meta <- as.data.frame(colData(sce))
# The join key is stated explicitly so the join cannot silently pick up other
# shared column names (and no "Joining, by = ..." message is emitted).
intSiteHBV <- intSiteHBV %>%
    dplyr::left_join(
        meta[, c("cell.id", "donor", "tissue", "Sample", "leiden_global_celltype")],
        by = "cell.id"
    )
# Keep only records whose donor is known after the join.
intSiteHBV <- intSiteHBV[!is.na(intSiteHBV$donor), ]

Joining, by = "cell.id"



# Merge Flanking sites

In [67]:
#--- Flank each site by 1000 bp on either side and merge overlapping windows
#    into site clusters
suppressPackageStartupMessages(library(GenomicAlignments))
flank_bp <- 1000
# The message now reports the width that is actually used below.
log_info(paste0("Extending sites by ", flank_bp, "bp on each side"))
humanSites <- GRanges(
    seqnames = intSiteHBV$chrHuman,
    ranges = IRanges(intSiteHBV$coordHuman, width = 1),
    spanning_counts = intSiteHBV$spanning_counts
)
#shift(humanSites, 5)
# both = TRUE makes the window straddle the site: flank_bp positions on each side.
gr <- flank(humanSites, flank_bp, both = TRUE)
# Merge overlapping windows; every merged range is one site cluster.
regions <- GenomicRanges::reduce(gr)
#--- match site and regions
ov <- findOverlaps(gr, regions)
# The positional assignments below require exactly one hit per window, in
# input order; fail loudly instead of misaligning clusters if that ever breaks.
stopifnot(identical(queryHits(ov), seq_along(gr)))
sites.split <- split(queryHits(ov), subjectHits(ov))
aa <- as.data.frame(regions[subjectHits(ov)])
# Cluster label is the merged region written as "chr:start-end".
humanSites$siteClusters <- paste0(aa$seqnames, ":", aa$start, "-", aa$end)

log_info("Merging...")
intSiteHBV$siteClusters <- humanSites$siteClusters
# Exact human breakpoint written as "chr:coordinate".
intSiteHBV$siteHuman <- paste0(intSiteHBV$chrHuman, ":", intSiteHBV$coordHuman)

INFO [2020-10-02 21:07:41] Extending sites 100bp
INFO [2020-10-02 21:07:41] Merging...


In [68]:
# Print the per-record human breakpoint labels for a quick visual check.
intSiteHBV[["siteHuman"]]

In [58]:
# Choose exactly one representative breakpoint per site cluster: the position
# supported by the most records. top_n(1) kept every tied position, which
# yields several rows per cluster and duplicates records when this table is
# joined back on siteClusters; ties are now broken deterministically by
# siteHuman and the result is ungrouped.
# NOTE(review): `n` counts rows of intSiteHBV per position; confirm that one
# row corresponds to one cell before reading it as a cell count.
represent_sites <- intSiteHBV %>%
    dplyr::count(siteClusters, siteHuman) %>%
    dplyr::group_by(siteClusters) %>%
    dplyr::arrange(dplyr::desc(n), siteHuman, .by_group = TRUE) %>%
    dplyr::slice(1) %>%
    dplyr::ungroup() %>%
    dplyr::rename(representSites = siteHuman,
                  representCellsInGroup = n)

Selecting by n



In [60]:
# Add the representative site of each cluster to every record; the key is
# explicit so no other shared column can silently become part of the join.
intSiteHBV <- dplyr::left_join(intSiteHBV, represent_sites, by = "siteClusters")

Joining, by = "siteClusters"



In [61]:
# Write the annotated integration sites as TSV. The output file is passed
# positionally because readr 1.4.0 deprecated `path =` in favour of `file =`;
# the positional form works with both argument names.
readr::write_tsv(intSiteHBV, "./data/HBV_intSites.tsv")