In [None]:
# See also cbtn-gsea.ipynb for DSBR sets

# TODO: convert cbtn-gsea DSBR sets to R code.

# Load expression data

In [None]:
# Dependencies
Sys.setenv(LANGUAGE = "en") # set language to "ja" if you prefer

suppressWarnings(library(magrittr))
suppressWarnings(library(tibble))
suppressWarnings(library(repr))
suppressWarnings(library(stringr))
suppressWarnings(library(dplyr))

sessionInfo()

In [None]:
# imports from external file
gex <- new.env()
source("gex.r", local = gex)

In [None]:
# Location of the RSEM expected-count table (tab-separated).
path_to_counts <- "../data/cloud/gex/pbta-rsem-genes.expected_count.tsv"

# Only the row identifiers are needed: read the table with the
# Gene_or_Transcript_ID column as row names and keep just those names.
genes <- rownames(
    read.csv(
        path_to_counts,
        sep = "\t",
        row.names = "Gene_or_Transcript_ID",
        check.names = FALSE
    )
)

# Strip a leading "ENSG<digits>.<digits>_" prefix from each identifier,
# leaving whatever follows the underscore (presumably the gene symbol).
genes <- sub("^ENSG\\d*\\.\\d*_", "", genes)

# Replication-dependent histones

In [None]:
# Histone gene sets, using the nomenclature of:
#   Seal, R.L., Denny, P., Bruford, E.A. et al. A standardized nomenclature for
#   mammalian histone genes. Epigenetics & Chromatin 15, 34 (2022).
#   https://doi.org/10.1186/s13072-022-00467-2
# Changes in v2: H1-1, H1-2, H1-3, H1-4, H1-5, H1-6 and H4-16 moved to rdh.

# Histone pseudogenes.
H_pseudogenes <- c(
    "H1-9P", "H1-12P",
    "H2AC2P", "H2AC3P", "H2AC5P", "H2AC9P", "H2AC10P", "H2AQ1P", "H2AL1MP",
    "H2BC2P", "H2BC16P", "H2BC19P", "H2BC20P", "H2BU2P", "H2BL1P", # H2BU2P = H2BC27P
    "H2BW3P", # H2BW4P is left out: it and its aliases are not in the gex dataset
    "H3C5P", "H3C9P", "H3P16", "H3P44",
    "H4C10P",
    "H2AZP7", "H2AZ2P1", "H2AZP2", "H2AZP5", "H2BP2", "H2BP3", "H2BP9", "H4P1",
    "H2BP1", "H2AL1QP", "H3P18", "H3P21", "H3P3", "H3P1", "H3P4",
    "H3P37", "H3P31", "H3P14", "H3P47", "H3P6", "H3P39", "H3P13",
    "H2AZP1", "H2AZP3", "H3P28", "H3P27", "H2ACP1", "H2ACP2", "H3P26", "H2BP7",
    "H3P43", "H2BP6", "H3P45", "H3P9", "H3P30", "H3P29", "H3P10", "H3P11",
    "H3P12", "H3P5", "H2AZP6", "H3P32", "H2BP8", "H3P36", "H3P7", "H3P2",
    "H3P17", "H3P20", "H3P23", "H3P22", "H3P15", "H3P24", "H3P34", "H2AZP4",
    "H3P33", "H3P35", "H3P40", "H3P38", "H3P41", "H3P8", "H3P19", "H3P42",
    "H3P25", "H2BP5", "H3P46"
)

# H1 family.
H1 <- c("H1-0", "H1-7", "H1-8", "H1-10")
H1_clustered <- c("H1-1", "H1-2", "H1-3", "H1-4", "H1-5", "H1-6")

# H2A family.
H2A_clustered <- c(
    "H2AC1", "H2AC4", "H2AC6", "H2AC7", "H2AC8", "H2AC11", "H2AC12",
    "H2AC13", "H2AC14", "H2AC15", "H2AC16", "H2AC17", "H2AC18", "H2AC19",
    "H2AC20", "H2AC21",
    "H2AW" # H2AW = H2AC25
)
H2A <- c(
    "H2AZ1", "H2AZ2", "MACROH2A1", "MACROH2A2", "H2AX", "H2AJ",
    "H2AB1", "H2AB2", "H2AB3", "H2AP", "H2AL3"
)

# H2B family.
H2B_clustered <- c(
    "H2BC1", "H2BC3", "H2BC4", "H2BC5", "H2BC6", "H2BC7", "H2BC8", "H2BC9",
    "H2BC10", "H2BC11", "H2BC12", "H2BC13", "H2BC14", "H2BC15", "H2BC17",
    "H2BC18", "H2BC21",
    "H2BU1" # H2BU1 = H2BC26
)
# H2BS1 = H2BC12L; H2BE1 = H2BK1; H2BN1 and aliases not in gex dataset
H2B <- c("H2BE1", "H2BW1", "H2BW2", "H2BS1")

# H3 family.
H3_clustered <- c(
    "H3C1", "H3C2", "H3C3", "H3C4", "H3C6", "H3C7", "H3C8", "H3C10", "H3C11",
    "H3C12", "H3C13", "H3C14", "H3C15",
    "H3-4" # H3-4 has a stem loop and is in a cluster
)
H3 <- c("H3-3A", "H3-3B", "H3-5", "H3-2", "H3Y1", "H3Y2", "CENPA") # H3-2 = H3-7

# H4 family.
H4_clustered <- c(
    "H4C1", "H4C2", "H4C3", "H4C4", "H4C5", "H4C6", "H4C7", "H4C8", "H4C9",
    "H4C11", "H4C12", "H4C13", "H4C14", "H4C15",
    # H4-16 = H4C16 is outside the clusters but has a stem-loop and is
    # cell-cycle regulated.
    "H4-16"
)
H4 <- c() # no non-clustered H4 genes are listed

# Combined sets: replication-dependent (clustered), the remaining histones,
# and everything including pseudogenes.
rdh <- c(H1_clustered, H2A_clustered, H2B_clustered, H3_clustered, H4_clustered)
nrdh <- c(H1, H2A, H2B, H3, H4)
all_histones <- c(rdh, nrdh, H_pseudogenes)

In [None]:
# gene set validation checks
# Summarise the size of each top-level histone set.
paste("replication-dependent histone count:", length(rdh))
paste("replication-independent histone count:", length(nrdh))
paste("histone pseudogene count:", length(H_pseudogenes))

# all genes in my sets should be in my gene expression dataset
# requires global variable genes
# Report whether every gene in `set` is present in the expression dataset.
#
# Arguments:
#   name          label printed next to the result
#   set           character vector of gene identifiers to look up
#   gene_universe identifiers to check against; defaults to the global `genes`
#
# Prints "<name> ... <TRUE|FALSE>" and, when genes are missing, one message
# per missing gene. Returns TRUE if nothing is missing, FALSE otherwise.
check_set <- function(name, set, gene_universe = genes) {
    # Work from the `set` argument itself, not from a variable that happens
    # to exist in the calling scope.
    pass <- all(set %in% gene_universe)
    print(paste(name, '...', pass))
    if (!pass) {
        missing_genes <- setdiff(set, gene_universe)
        print(paste("  missing", missing_genes, "from gene expression data"))
    }
    pass
}
# Histone classes to validate, keyed by a short label:
# "ri" = non-clustered (replication-independent) sets,
# "rd" = clustered (replication-dependent) sets, "HP" = pseudogenes.
vectors <- list(
    "H1ri" = H1,
    "H2Ari" = H2A,
    "H2Bri" = H2B,
    "H3ri" = H3,
    "H4ri" = H4,
    "H1rd" = H1_clustered,
    "H2Ard" = H2A_clustered,
    "H2Brd" = H2B_clustered,
    "H3rd" = H3_clustered,
    "H4rd" = H4_clustered, # must be the H4 set, not a second copy of H3
    "HP" = H_pseudogenes
)

print("Checking each histone class for genes not in expression dataset...")

# Collect the labels of every class that has at least one gene missing from
# the expression data.
flags <- c()
for (name in names(vectors)) {
    v <- vectors[[name]]
    pass <- check_set(name, v)
    if (!pass) {
        flags <- c(flags, name)
    }
}

print("Checking for possible unannotated histone genes in expression dataset...")

# Names that start with H1-H4 or HIST look like histone genes.
regex <- "^H(1|2|3|4)|^HIST"
# Genes that match the pattern but are known not to be histones.
not_histones <- c("H19")

# Keep pattern matches that are neither already annotated nor known
# false positives.
candidates <- setdiff(
    grep(regex, genes, value = TRUE),
    c(all_histones, not_histones)
)
print(candidates)

In [None]:
# Write the three top-level histone sets to a .gmt file using the helper
# loaded from gex.r (its exact output format is defined there).
filepath <- "out/histone-sets.gmt"
set_list <- list(
    replication_dependent_histones = rdh,
    replication_independent_histones = nrdh,
    histone_pseudogenes = H_pseudogenes
)
gex$write_gene_sets(filepath, set_list)

# HOX sets

In [None]:
# HOX genes grouped by cluster (A-D).
hoxa <- c(
    "HOXA1", "HOXA2", "HOXA3", "HOXA4", "HOXA5", "HOXA6", "HOXA7",
    "HOXA9", "HOXA10", "HOXA11", "HOXA13"
)
hoxb <- c(
    "HOXB1", "HOXB2", "HOXB3", "HOXB4", "HOXB5", "HOXB6", "HOXB7",
    "HOXB8", "HOXB9", "HOXB13"
)
hoxc <- c(
    "HOXC4", "HOXC5", "HOXC6", "HOXC8", "HOXC9", "HOXC10", "HOXC11",
    "HOXC12", "HOXC13"
)
hoxd <- c(
    "HOXD1", "HOXD3", "HOXD4", "HOXD8", "HOXD9", "HOXD10", "HOXD11",
    "HOXD12", "HOXD13"
)

# One set per cluster plus a combined "HOX" set, written with the helper
# loaded from gex.r.
filepath <- "out/hox-sets.gmt"
set_list <- list(
    "HOXA@" = hoxa,
    "HOXB@" = hoxb,
    "HOXC@" = hoxc,
    "HOXD@" = hoxd,
    "HOX" = c(hoxa, hoxb, hoxc, hoxd)
)
gex$write_gene_sets(filepath, set_list)