### load

In [2]:
## input: 1. summarizedExperiement(SE) obj for chromVAR 2. Jaspar matrix 
## output: 1. motif x cell (z score) 2. plot: ranked           
source("./libs.R")

In [3]:
##------------------------------------------------------------
## inputs
##------------------------------------------------------------

input.chromVar.res.list <- readRDS(file = "../dat/output.jaspar.dev.res.Rdata")
input.chromVar.jaspar.z <- assays(input.chromVar.res.list$dev)$z
input.umap.res <- fread("../dat/1908/Islet_123.MNN_corrected.cluster_labels.filt.txt", 
    header = T) %>% separate(cluster, into = c("cell_type_overall", "subtype"), remove = F)


“Expected 2 pieces. Missing pieces filled with `NA` in 600 rows [5, 18, 19, 50, 81, 114, 128, 147, 169, 175, 176, 190, 209, 250, 260, 345, 353, 376, 389, 413, ...].”

In [4]:

input.chromVar.jaspar.z <- assays(input.chromVar.res.list$dev)$z

table(input.umap.res %>% filter(barcodes %in% colnames(input.chromVar.jaspar.z)[-1]) %>% 
    pull(cell_type_overall))

table(input.umap.res %>% pull(cell_type_overall))


      alpha        beta       delta endothelial    exocrine       gamma 
       5535        7108         709         136         113         205 
      glial      immune    stellate 
         34          58         134 


      alpha        beta       delta endothelial    exocrine       gamma 
       5594        7170         718         157         131         206 
      glial      immune    stellate 
         39          71         153 

###  T test (one vs. other)

In [17]:
input.chromVar.jaspar.z <- data.table(assays(input.chromVar.res.list$dev)$z, keep.rownames = T)

# aggregate data --------------------------------------------------------------
# melt
input.chromVar.jaspar.z.agg <- melt(input.chromVar.jaspar.z, id = "rn", variable.name = "barcodes", 
    value.name = "zval")

# add celltype
input.chromVar.jaspar.z.agg <- merge(input.chromVar.jaspar.z.agg, input.umap.res)

table(input.chromVar.jaspar.z.agg%>%pull(cell_type_overall))


      alpha        beta       delta endothelial    exocrine       gamma 
    2136510     2744074      273674       52496       43618       79130 
      glial      immune    stellate 
      13124       22388       51724 

In [18]:
input.chromVar.jaspar.z.agg <- input.chromVar.jaspar.z.agg %>%select(rn,zval,cell_type_overall)
range(input.chromVar.jaspar.z.agg$zval)
sum(!complete.cases(input.chromVar.jaspar.z.agg))
dim(input.chromVar.jaspar.z.agg)
head(input.chromVar.jaspar.z.agg, 1)
table(input.chromVar.jaspar.z.agg %>% pull(cell_type_overall))

rn,zval,cell_type_overall
<chr>,<dbl>,<chr>
MA0025.1_NFIL3,-0.3430543,beta



      alpha        beta       delta endothelial    exocrine       gamma 
    2136510     2744074      273674       52496       43618       79130 
      glial      immune    stellate 
      13124       22388       51724 

In [19]:
(celltype.test.all<- input.chromVar.jaspar.z.agg%>%pull(cell_type_overall)%>%unique())
test.motifs <- input.chromVar.jaspar.z.agg%>%pull(rn)%>%unique()
length(test.motifs)


In [20]:
require(parallel)


ttest.res.ct <- do.call(rbind, lapply(celltype.test.all, function(ntest) {
    celltype.test <- c(ntest, "other")
    ttest.res <- do.call(rbind, mclapply(test.motifs, function(motif) {
        pd <- input.chromVar.jaspar.z.agg %>% filter(rn == motif) %>% mutate(cell_type_overall = ifelse(cell_type_overall == 
            ntest, ntest, "other"))
        test.res <- t.test(pd %>% filter(cell_type_overall == celltype.test[1]) %>% 
            select(zval), pd %>% filter(cell_type_overall == celltype.test[2]) %>% 
            select(zval))
        (data.frame(motif = motif, mean_x = test.res$estimate[1], mean_y = test.res$estimate[2], 
            pval = test.res$p.value/2))
    }, mc.cores = 10)) %>% mutate(test = paste0(ntest, ".vs.other"))
})) %>% group_by(test) %>% mutate(FDR = p.adjust(pval, "BH"), padj = p.adjust(pval, 
    "bonferroni"))
head(ttest.res.ct, 1)

motif,mean_x,mean_y,pval,test,FDR,padj
<fct>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
MA0025.1_NFIL3,0.203819,-0.1937263,9.648261000000001e-99,beta.vs.other,3.687355e-98,3.724229e-96


In [22]:
head(ttest.res.ct, 1)%>% separate(test,into = c("x","y"),sep = ".vs.",remove = F)
ttest.res.ct<- ttest.res.ct%>% separate(test,into = c("x","y"),sep = ".vs.",remove = F)

motif,mean_x,mean_y,pval,test,x,y,FDR,padj
<fct>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>
MA0025.1_NFIL3,0.203819,-0.1937263,9.648261000000001e-99,beta.vs.other,beta,other,3.687355e-98,3.724229e-96


### Motif db

In [25]:
ttest.res.ct%>%head(1)%>%separate(motif,into=c('jaspar.id','motif'),sep = '_')


jaspar.id,motif,mean_x,mean_y,pval,test,x,y,FDR,padj
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>
MA0025.1,NFIL3,0.203819,-0.1937263,9.648261000000001e-99,beta.vs.other,beta,other,3.687355e-98,3.724229e-96


In [125]:
tfclass.db.dic <- readRDS("~/github/atacMotif/db/dic_jaspar_tfclass.rds")
tfclass.db <- readRDS('~/github/atacMotif/db/tfclass.rds')
#str(tfclass.db)

In [128]:
tfclass.db$merge%>%filter(genus.name=="NR2F1")
tfclass.db.dic$merge%>%filter(genus.name=="NR2F1")

genus.id,genus.name,tf.symbol,tf.id,subfamily.id,subfamily.name,subfamily.seq,family.id,family.name
<chr>,<chr>,<fct>,<chr>,<chr>,<chr>,<chr>,<chr>,<fct>


In [129]:
str(tfclass.db)
str(tfclass.db.dic)

List of 7
 $ subfamily :'data.frame':	337 obs. of  3 variables:
  ..$ id  : chr [1:337] "1.1.1.1" "1.1.1.2" "1.1.1.3" "1.1.2.1" ...
  ..$ name: chr [1:337] "Jun" "NFE2" "ATF2" "Fos" ...
  ..$ seq : chr [1:337] "TGAGTCA" "GCTGAGTCA" "TGACGTCA" "TGAGTCA" ...
 $ family    :'data.frame':	110 obs. of  2 variables:
  ..$ id  : chr [1:110] "0.0.1" "0.0.2" "0.0.3" "0.0.4" ...
  ..$ name: chr [1:110] "NULP1" "PHF5" "RFXANK" "RFXAP" ...
 $ genus     :'data.frame':	1453 obs. of  2 variables:
  ..$ id  : chr [1:1453] "3.5.1.1.2" "3.1.8.1.1" "3.1.8.1.2" "2.3.2.4.9" ...
  ..$ name: chr [1:1453] "A-Myb (MYBL1)" "ADNP1" "ADNP2" "AEBP2" ...
 $ genus.dup :'data.frame':	4 obs. of  2 variables:
  ..$ id  : chr [1:4] "3.5.1.3.4" "3.5.1.3.5" "3.5.1.3.6" "3.5.1.3.7"
  ..$ name: chr [1:4] "MTA1" "MTA2" "MTA3" "RERE"
 $ merge     :'data.frame':	1475 obs. of  9 variables:
  ..$ genus.id      : chr [1:1475] "3.1.8.1.1" "3.1.8.1.2" "2.3.2.4.9" "1.2.5.1.1" ...
  ..$ genus.name    : chr [1:1475] "ADNP1" "ADNP2" "AE

In [146]:
data.frame(motif = "NR2F1", subfamily.id = "2.1.3.5", stringsAsFactors = F) %>% left_join(tfclass.db.dic$merged %>% 
    select(subfamily.id, family.id, family.name)%>%unique)

Joining, by = "subfamily.id"


motif,subfamily.id,family.id,family.name
<chr>,<chr>,<chr>,<fct>
NR2F1,2.1.3.5,2.1.3,RXR-related receptors


In [155]:
ttest.res.ct %>% dim
ttest.res.2 <- ttest.res.ct %>% separate(motif, into = c("jaspar.id", "motif"), sep = "_") %>% 
    left_join(tfclass.db.dic$merged %>% select(family.id, family.name, jaspar.name)%>%unique, 
        by = c(motif = "jaspar.name"))
sum(is.na(ttest.res.2$family.id))
tmp <- ttest.res.2[is.na(ttest.res.2$family.id), ]
ttest.res.2[is.na(ttest.res.2$family.id), ] %>% head(2)
ttest.res.2 %>% dim
tmp %>% ungroup %>% select(jaspar.id, motif) %>% unique

## rescue NR2F1
## https://github.com/epigen-UCSD/atacMotif/blob/master/db/rescue_Jaspar.txt
data.frame(motif = "NR2F1", subfamily.id = "2.1.3.5", stringsAsFactors = F) %>% left_join(tfclass.db.dic$merged %>% 
    select(subfamily.id, family.id, family.name)%>%unique)
ttest.res.2[ttest.res.2$motif=='NR2F1',c("family.id","family.name")] <- data.frame(motif = "NR2F1", subfamily.id = "2.1.3.5", stringsAsFactors = F) %>% left_join(tfclass.db.dic$merged %>% 
    select(subfamily.id, family.id, family.name)%>%unique)%>%select(family.id,family.name)
ttest.res.2[ttest.res.2$motif=='NR2F1',]%>%head(1)

jaspar.id,motif,mean_x,mean_y,pval,test,x,y,FDR,padj,family.id,family.name
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<fct>
MA0637.1,CENPB,-0.06876892,0.03029314,6.315258e-09,beta.vs.other,beta,other,8.929266e-09,2.43769e-06,,
MA0017.2,NR2F1,-0.10012182,0.10325331,4.524602e-28,beta.vs.other,beta,other,8.776364000000001e-28,1.746496e-25,,


jaspar.id,motif
<chr>,<chr>
MA0637.1,CENPB
MA0017.2,NR2F1


Joining, by = "subfamily.id"


motif,subfamily.id,family.id,family.name
<chr>,<chr>,<chr>,<fct>
NR2F1,2.1.3.5,2.1.3,RXR-related receptors


Joining, by = "subfamily.id"


jaspar.id,motif,mean_x,mean_y,pval,test,x,y,FDR,padj,family.id,family.name
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<fct>
MA0017.2,NR2F1,-0.1001218,0.1032533,4.524602e-28,beta.vs.other,beta,other,8.776364000000001e-28,1.746496e-25,2.1.3,RXR-related receptors


In [156]:
ttest.res.2 %>% dim
ttest.res.2 %>% filter(is.na(family.id)) %>% ungroup %>% select(motif, family.id) %>% 
    unique
ttest.res.2 <- ttest.res.2 %>% mutate(class.id = sub(".[0-9]+$", "", family.id)) %>% 
    left_join(tfclass.db$class %>% select(-about), by = c(class.id = "id"))%>% rename(class.name = name) 
ttest.res.2 %>% dim
ttest.res.2 %>% head(1)
ttest.res.2 %>% filter(is.na(class.id)) %>% ungroup %>% select(motif, family.id, 
    class.id) %>% unique

motif,family.id
<chr>,<chr>
CENPB,


jaspar.id,motif,mean_x,mean_y,pval,test,x,y,FDR,padj,family.id,family.name,class.id,class.name
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<fct>,<chr>,<chr>
MA0025.1,NFIL3,0.203819,-0.1937263,9.648261000000001e-99,beta.vs.other,beta,other,3.687355e-98,3.724229e-96,1.1.8,CEBP-related,1.1,Basic leucine zipper factors (bZIP)


motif,family.id,class.id
<chr>,<chr>,<chr>
CENPB,,


In [157]:
ttest.res.2 %>% dim
ttest.res.2 <- ttest.res.2 %>% mutate(superclass.id = sub(".[0-9]+$", "", class.id)) %>% 
    left_join(tfclass.db$superclass %>% select(-about), by = c(superclass.id = "id")) %>% 
    rename(superclass.name = "name")
ttest.res.2 %>% dim
ttest.res.2 %>% head(1)
ttest.res.2 %>% filter(is.na(superclass.id)) %>% ungroup %>% select(motif, family.id, 
    class.id,superclass.id) %>% unique

jaspar.id,motif,mean_x,mean_y,pval,test,x,y,FDR,padj,family.id,family.name,class.id,class.name,superclass.id,superclass.name
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<fct>,<chr>,<chr>,<chr>,<chr>
MA0025.1,NFIL3,0.203819,-0.1937263,9.648261000000001e-99,beta.vs.other,beta,other,3.687355e-98,3.724229e-96,1.1.8,CEBP-related,1.1,Basic leucine zipper factors (bZIP),1,Basic domains


motif,family.id,class.id,superclass.id
<chr>,<chr>,<chr>,<chr>
CENPB,,,


In [161]:
(ttest.res.2 %>% head(1) %>% mutate(enrichedIn = ifelse(mean_x > mean_y, x, y)))[, 
    c(1:8, 17, 9:16)]
ttest.res.2 <- (ttest.res.2 %>% mutate(enrichedIn = ifelse(mean_x > mean_y, x, y)))[, 
    c(1:8, 17, 9:16)]

jaspar.id,motif,mean_x,mean_y,pval,test,x,y,enrichedIn,FDR,padj,family.id,family.name,class.id,class.name,superclass.id,superclass.name
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<fct>,<chr>,<chr>,<chr>,<chr>
MA0025.1,NFIL3,0.203819,-0.1937263,9.648261000000001e-99,beta.vs.other,beta,other,beta,3.687355e-98,3.724229e-96,1.1.8,CEBP-related,1.1,Basic leucine zipper factors (bZIP),1,Basic domains


In [163]:
ttest.res.list <- lapply(paste0(celltype.test.all, ".vs.other"), function(ntest) {
    ttest.res.2 %>% filter(test == ntest) %>% group_by(test) %>% arrange(desc(mean_x))
    
})
names(ttest.res.list) <- paste0(celltype.test.all, ".vs.other")
require(writexl)
write_xlsx(ttest.res.list, "~/Dropbox (UCSD_Epigenomics)/workReports/2019-10_islet_rev/fig1E_one_vs_other.xlsx")
fwrite(ttest.res.2 %>% group_by(test) %>% arrange(desc(mean_x)), "~/Dropbox (UCSD_Epigenomics)/workReports/2019-10_islet_rev/fig1E_one_vs_other.csv")