/
bed2gene.r
72 lines (58 loc) · 2.66 KB
/
bed2gene.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
source('library.r')
library(ChIPpeakAnno)
library(biomaRt)
library(org.Mm.eg.db)
library(doMC)
ncore = multicore:::detectCores()
registerDoMC(cores = ncore)
## If help is needed
# vignette('ChIPpeakAnno')
## Folder containing the bed files
loc <- "bed"
## reading all the bed files
# output: a list of data.frame
bedFiles <- readBed(loc)
## transform bed files to IRanges objects
# output: a list of IRanges objects
peaks <- lapply(bedFiles, BED2RangedData)
## Loading the human annotations
# TSS annotation for human sapiens (NCBI36) obtained from biomaRt
# see vignette('ChIPpeakAnno') for info
## The default annotaion from ChIPpeakAnno are mm9 not mm10
# BUILDING THE MM10 PACKAGE
library(biomaRt)
# ensembl=useMart("ensembl")
# listDatasets(ensembl)
ensembl = useMart("ensembl", dataset="mmusculus_gene_ensembl")
# listFilters(ensembl)
# listAttributes(ensembl)
TSS.mouse.GRCm38 <- getAnnotation(ensembl, featureType = "TSS")
## Feature association are made using TSS: means using start of feature when feature is on plus strand
# and using end of feature when feature is on minus strand. Here the features are genes.
# parallel version
annotatedPeakList <- mclapply(peaks, function(x){annotatePeakInBatch(x, AnnotationData=TSS.mouse.GRCm38, output="both")}, mc.cores = ncore)
# not parallel version
# annotatedPeakList <- lapply(peaks, function(x){annotatePeakInBatch(x,AnnotationData=TSS.mouse.GRCm38, output="both")})
## These matrices needs annotation with geneIDs, symbol and long gene name
# There is 2 ways to do that through biomaRt or the BioConductor annotation package.
# BIOMART annotated more transcripts but with non-entrez gene ID. See test below.
## BIOMART
annotatedPeakList_mart <- mclapply(annotatedPeakList, biomaRtAnnot, mc.cores = ncore)
## Transform to data.frame before export to file
annotatedPeakList_martDF <- mclapply(annotatedPeakList_mart, as.data.frame, mc.cores = ncore)
# suppress line without feature name (aka gene symbol)
for(i in 1:length(annotatedPeakList_martDF)){
theNA <- which(is.na(annotatedPeakList_martDF[[i]]$feature))
if(length(theNA)>0){
annotatedPeakList_martDF[[i]] <- annotatedPeakList_martDF[[i]][-theNA,]
}
}
## adding missing annotation when possible using BioConductor org.Mm.egSYMBOL2EG
## parallel version
# annotatedPeakList_martDF <- mclapply(annotatedPeakList_martDF, addingMissingAnnot, mc.cores = ncore)
## not parallel version
annotatedPeakList_martDF <- lapply(annotatedPeakList_martDF, addingMissingAnnot)
## CLEAN THE ENTREZGENE ID WITH TWO IDS
annotatedPeakList_martDF <- mclapply(annotatedPeakList_martDF, cleanEG, mc.cores = ncore)
## WRITE FILE FOR IMPORT
write.lists(annotatedPeakList_martDF, file="annotatedPeakList.csv")