-
Notifications
You must be signed in to change notification settings - Fork 0
/
gtf_to_exons_vb.R
38 lines (28 loc) · 1.3 KB
/
gtf_to_exons_vb.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/env Rscript
require(dplyr)
require(data.table)
args=commandArgs(trailingOnly = T)
if (length(args)<2) {
stop("Usage is: ./gtf_to_exons.R input.gtf.gz output.txt.gz")
}
cat("Reading in ",args[1],"\n")
gtf=fread(paste("zcat <", args[1]), data.table = F, col.names=c("chr","source","feature","start","end","a","strand","b","dat"))
cat("Processing...\n")
gtf = gtf %>% filter( feature=="exon" )
# Get gene_name
gn_where=regexpr("gene_name \"[^ ]+\"" , gtf$dat) # find gene_names in dat
gn_where=gn_where + 11 # ignore "gene_name" label
attr(gn_where,"match.length")=attr(gn_where,"match.length") - 11- 1 # cutoff trailing quote mark
gtf$gene_name=regmatches(gtf$dat, gn_where )
# Get Ensembl_ID
gid_where=regexpr("gene_id \"[^ ]+\"" , gtf$dat) # find gene_ids in dat
gid_where=gid_where + 9 # ignore "gene_name" label
attr(gid_where,"match.length")=attr(gid_where,"match.length") - 9- 1 # cutoff trailing quote mark
gtf$gene_id=regmatches(gtf$dat, gid_where )
#gtf$gene=foreach(s=strsplit(gtf$dat," "), .combine=c) %dopar% { s[which(s=="gene_name")+1] }
#gtf$gene=substr(gtf$gene, 1, nchar(gtf$gene)-1)
gtf = gtf %>% select( chr, start, end, strand, gene_name, gene_id ) %>% distinct()
cat("Saving exons to ",args[2],"\n")
gz=gzfile(args[2],"w")
write.table(gtf, gz, row.names = F, quote=F, sep="\t")
close(gz)