In [1]:
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(BSgenome.Hsapiens.UCSC.hg38)
library(org.Hs.eg.db)
source('./sim-develop/src/mapsequence.r')

Loading required package: GenomicFeatures
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, cbind, colMeans, colnames,
    colSums, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, lengths, Map, mapply, match,
    mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which, which.max, which.min

Loading required package: S4Vectors
Loading 

In [2]:
# Gene-level ranges from the hg38 knownGene TxDb; the result carries a
# gene_id metadata column (see the printed preview below).
genesR <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
# Preview the first few gene ranges.
head(genesR)

GRanges object with 6 ranges and 1 metadata column:
            seqnames                 ranges strand |     gene_id
               <Rle>              <IRanges>  <Rle> | <character>
          1    chr19 [ 58345178,  58362751]      - |           1
         10     chr8 [ 18391245,  18401218]      + |          10
        100    chr20 [ 44619522,  44651742]      - |         100
       1000    chr18 [ 27950966,  28177446]      - |        1000
  100009613    chr11 [ 70072434,  70075433]      - |   100009613
  100009676     chr3 [101676475, 101679217]      + |   100009676
  -------
  seqinfo: 455 sequences (1 circular) from hg38 genome

In [3]:
# Rebuild the gene ranges from sequence name, start and end only; neither the
# strand nor the gene_id metadata of genesR is passed to the constructor.
gr <- GRanges(
    seqnames = seqnames(genesR),
    ranges = IRanges(start = start(genesR), end = end(genesR))
)

In [4]:
# Pull the genomic sequence of every range in gr out of the Hsapiens
# BSgenome object; the result has one sequence per gene, in the order of gr.
genesS <- getSeq(Hsapiens, gr)

In [5]:
# Per-gene site counts: one row per sequence in genesS and one column for each
# of the four index vectors returned by mapsequence() (ict, icl, iot, iol, in
# that order). Each cell holds the number of non-NA entries of that vector.
gene_counts <- matrix(0, nrow = length(genesS), ncol = 4) # Allocate memory
# seq_along() gives an empty loop for empty input, whereas 1:length(genesS)
# would iterate over c(1, 0) and fail on the first lookup.
for (i in seq_along(genesS)) {
    tmp <- mapsequence(genesS[[i]])
    gene_counts[i, ] <- c(
        sum(!is.na(tmp$ict)),
        sum(!is.na(tmp$icl)),
        sum(!is.na(tmp$iot)),
        sum(!is.na(tmp$iol))
    )
}

In [6]:
# Label every row with the gene_id of the corresponding range in genesR
# (rows of gene_counts follow the order of genesS, which was built from genesR).
rownames(gene_counts) <- genesR$gene_id

In [7]:
# Preview the counts, still keyed by gene id.
head(gene_counts)

0,1,2,3,4
1,159,138,406,1023
10,120,224,523,1560
100,231,242,604,2323
1000,2701,5215,11812,31968
100009613,7,22,59,240
100009676,17,45,68,253


In [8]:
# Bimap from gene ids to gene symbols, taken from org.Hs.eg.db.
x <- org.Hs.egSYMBOL

# Translate gene ids into gene symbols via the bimap `x`.
#
# ids: vector of gene ids (character, or anything as.character() can convert).
#
# Returns a list with exactly one entry per element of `ids`, in input order:
# the symbol when the id is non-zero and is a mapped key of `x`, otherwise the
# string "NA". Entries that were translated are named by their id; the others
# have an empty name.
#
# The earlier implementation counted an id of 0 twice (once as "zero" and once
# as "not mapped") and appended all "NA"s after the mapped symbols, so the
# result could be longer than `ids` and out of order.
xf <- function(ids) {
    ids <- as.character(ids)
    # Translatable = non-zero and present among the mapped keys of x.
    known <- !is.na(ids) & ids != "0" & ids %in% mappedkeys(x)
    out <- as.list(rep("NA", length(ids)))
    if (any(known)) {
        # Query each distinct key once, then index the result by name so the
        # output order follows `ids` regardless of the order the bimap returns.
        mapped <- as.list(x[unique(ids[known])])
        out[known] <- mapped[ids[known]]
    }
    names(out) <- ifelse(known, ids, "")
    out
}

In [9]:
# Look up the symbol for each row name (gene id) of gene_counts, one xf() call
# per id; the result is a list with one element per row.
test <- lapply(rownames(gene_counts), xf)

In [10]:
# Re-extract the elements of `test` by name.
# NOTE(review): `test` comes from lapply() over rownames(gene_counts), which
# presumably is an unnamed character vector, so names(test) would be NULL and
# test2 an empty list. test2 is not used in any later visible cell -- confirm
# whether this cell can be dropped.
test2<-lapply(names(test), function(x) test[[x]])


In [22]:
# Replace the gene-id row names with the looked-up symbols. This requires
# unlist(test) to have exactly nrow(gene_counts) elements, i.e. every xf()
# call must have produced a single entry.
rownames(gene_counts) <- unlist(test)

In [23]:
# Preview the counts, now keyed by gene symbol.
head(gene_counts)

0,1,2,3,4
A1BG,159,138,406,1023
NAT2,120,224,523,1560
ADA,231,242,604,2323
CDH2,2701,5215,11812,31968
ANO1-AS2,7,22,59,240
ZBTB11-AS1,17,45,68,253


In [24]:
# Name the four count columns; they were filled in the order
# ict, icl, iot, iol of the mapsequence() result.
colnames(gene_counts) <- c("CT", "CL", "OT", "OL")

In [25]:
# Preview the counts with named columns.
head(gene_counts)

Unnamed: 0,CT,CL,OT,OL
A1BG,159,138,406,1023
NAT2,120,224,523,1560
ADA,231,242,604,2323
CDH2,2701,5215,11812,31968
ANO1-AS2,7,22,59,240
ZBTB11-AS1,17,45,68,253


In [69]:
# Convert the count matrix into a data.table, keeping the row names (gene
# symbols) as the first column. The call is namespace-qualified because no
# visible cell attaches the data.table package.
gene_counts <- data.table::data.table(gene_counts, keep.rownames = TRUE)
# Rename the row-name column (the first column) to "Sym".
names(gene_counts)[1] <- "Sym"
# Add the length of each extracted gene sequence.
gene_counts$Len <- width(genesS)
head(gene_counts)

Sym,CT,CL,OT,OL,Len
A1BG,159,138,406,1023,17574
NAT2,120,224,523,1560,9974
ADA,231,242,604,2323,32221
CDH2,2701,5215,11812,31968,226481
ANO1-AS2,7,22,59,240,3000
ZBTB11-AS1,17,45,68,253,2743


In [70]:
# Persist the final table as an .rda file in the working directory.
save(gene_counts, file = "./gene_site_counts.rda")