-
Notifications
You must be signed in to change notification settings - Fork 0
/
leafcutter_ds.R
134 lines (105 loc) · 7.58 KB
/
leafcutter_ds.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env Rscript
library(optparse)
library(leafcutter)
library(dplyr)
arguments <- parse_args(OptionParser(usage = "%prog [options] counts_file groups_file", description="LeafCutter differential splicing command line tool. Required inputs:\n <counts_file>: Intron usage counts file. Must be .txt or .txt.gz, output from clustering pipeline.\n <groups_file>: Two+K column file: 1. sample names (must match column names in counts_file), 2. groups (currently only two groups, i.e. pairwise, supported. Some samples in counts_file can be missing from this file, in which case they will not be included in the analysis. Additional columns can be used to specify confounders, e.g. batch/sex/age. Numeric columns will be treated as continuous, so use e.g. batch1, batch2, batch3 rather than 1, 2, 3 if you a categorical variable.",option_list=list(
make_option(c("-o","--output_prefix"), default = "leafcutter_ds", help="The prefix for the two output files, <prefix>_cluster_significance.txt (containing test status, log likelihood ratio, degree of freedom, and p-value for each cluster) and <prefix>_effect_sizes.txt (containing the effect sizes for each intron) [default %default]"),
make_option(c("-s","--max_cluster_size"), default=Inf, help="Don't test clusters with more introns than this [default %default]"),
make_option(c("-i","--min_samples_per_intron"), default=5, help="Ignore introns used (i.e. at least one supporting read) in fewer than n samples [default %default]") ,
make_option(c("-g","--min_samples_per_group"), default=3, help="Require this many samples in each group to have at least min_coverage reads [default %default]"),
make_option(c("-c","--min_coverage"), default=20, help="Require min_samples_per_group samples in each group to have at least this many reads [default %default]"),
make_option(c("-t","--timeout"), default=30, help="Maximum time (in seconds) allowed for a single optimization run [default %default]"),
make_option(c("-p","--num_threads"), default=1, help="Number of threads to use [default %default]"),
make_option(c("-e","--exon_file"), default=NULL, help="File defining known exons, example in data/gencode19_exons.txt.gz. Columns should be chr, start, end, strand, gene_name. Optional, only just to label the clusters."),
make_option(c("--init"), default="smart", help="One of 'smart' (default) or 'random'."),
make_option(c("--seed"), default=12345, help="Random seed if using random initialization."))),
positional_arguments = 2)
opt=arguments$opt
counts_file=arguments$args[1]
groups_file=arguments$args[2]
cat("Loading counts from",counts_file,"\n")
if (!file.exists(counts_file)) stop("File ",counts_file," does not exist")
counts=read.table(counts_file, header=T, check.names = F)
cat("Loading metadata from",groups_file,"\n")
if (!file.exists(groups_file)) stop("File ",groups_file," does not exist")
meta=read.table(groups_file, header=F, stringsAsFactors = F)
colnames(meta)[1:2]=c("sample","group")
counts=counts[,meta$sample]
group_names=unique(meta$group) # keep order from groups_file unless numeric
if (is.numeric(meta$group)) group_names=sort(group_names)
meta$group=factor(meta$group, group_names)
stopifnot(length(group_names)==2)
cat("Encoding as",group_names[1],"=0,",group_names[2],"=1\n")
numeric_x=as.numeric(meta$group)-1
confounders=NULL
if (ncol(meta)>2) {
confounders=meta[,3:ncol(meta),drop=F]
# scale continuous confounders
for (i in seq_len(ncol(confounders)))
if (is.numeric(confounders[,i]))
confounders[,i]=scale(confounders[,i])
# convert factors to one-of-K encoding
confounders=model.matrix( ~., data=confounders )
confounders=confounders[,2:ncol(confounders),drop=F] # remove intercept
}
minimum_group_size=min(sum(numeric_x==0),sum(numeric_x==1))
if (minimum_group_size < opt$min_samples_per_intron)
stop("The number of samples in the smallest group is less than min_samples_per_intron, which means no clusters are testable. You can reduce min_samples_per_intron using the -i option, but note that we have only carefully checked the calibration of leafcutter p-values down to n=4 samples per group.")
if (minimum_group_size < opt$min_samples_per_group)
stop("The number of samples in the smallest group is less than min_samples_per_group, which means no clusters are testable. You can reduce min_samples_per_intron using the -g option, but note that we have only carefully checked the calibration of leafcutter p-values down to n=4 samples per group.")
require(doMC)
registerDoMC(opt$num_threads)
cat("Settings:\n")
print(opt)
cat("Running differential splicing analysis...\n")
results <- differential_splicing(counts, numeric_x, confounders=confounders, max_cluster_size=opt$max_cluster_size, min_samples_per_intron=opt$min_samples_per_intron, min_samples_per_group=opt$min_samples_per_group, min_coverage=opt$min_coverage, timeout=opt$timeout, init=opt$init, seed=opt$seed )
cat("Saving results...\n")
# Make cluster table
cluster_table = cluster_results_table(results)
cluster_table$cluster = add_chr(cluster_table$cluster)
map_clusters_to_genes=function(intron_meta, exons_table) {
gene_df=foreach (chr=sort(unique(intron_meta$chr)), .combine=rbind) %dopar% {
intron_chr=intron_meta[ intron_meta$chr==chr, ]
exons_chr=exons_table[exons_table$chr==chr, ]
exons_chr$temp=exons_chr$start
intron_chr$temp=intron_chr$end
three_prime_matches=inner_join( intron_chr, exons_chr, by="temp")
exons_chr$temp=exons_chr$end
intron_chr$temp=intron_chr$start
five_prime_matches=inner_join( intron_chr, exons_chr, by="temp")
all_matches=rbind(three_prime_matches, five_prime_matches)[ , c("clu", "gene_name", "gene_id")]
all_matches=all_matches[!duplicated(all_matches),]
if (nrow(all_matches)==0) return(NULL)
all_matches$clu=paste(chr,all_matches$clu,sep=':')
all_matches
}
clu_df=gene_df %>% group_by(clu) %>% summarize(genes=paste(gene_name, gene_id, collapse = ","))
class(clu_df)="data.frame"
clu_df
}
# Add gene names to clusters if an exon file is available
if (!is.null(opt$exon_file)) {
cat("Loading exons from",opt$exon_file,"\n")
if (file.exists(opt$exon_file)) {
tryCatch( {
exons_table = read.table(opt$exon_file, header=T, stringsAsFactors = F)
intron_meta = get_intron_meta(rownames(counts))
exons_table$chr = add_chr(exons_table$chr)
intron_meta$chr = add_chr(intron_meta$chr)
clu_gene_map = map_clusters_to_genes(intron_meta, exons_table)
cluster_table = merge(cluster_table, clu_gene_map, by.x="cluster", by.y="clu", all.x=TRUE)
}, error=function(err) warning(as.character(err)) ) # ignore errors here
} else warning("File ",opt$exon_file," does not exist")
} else cat("No exon_file provided.\n")
write.table( cluster_table, paste0(opt$output_prefix,"_cluster_significance.txt"), quote=F, sep="\t", row.names = F)
# Write effect size table
effect_size_table = leaf_cutter_effect_sizes(results)
colnames(effect_size_table)[3:4] = group_names
effect_size_table$intron = add_chr(effect_size_table$intron)
write.table( effect_size_table, paste0(opt$output_prefix,"_effect_sizes.txt"), quote=F, col.names = T, row.names = F, sep="\t")
# Save RData image
# save.image(paste0(opt$output_prefix,"_cluster_significance.RData"));
cat("All done, exiting\n")
home_dir <- getwd()
writeLines(capture.output(sessionInfo()), paste0(home_dir, "/leafcutter_run_session_info.", format(Sys.time(), "%Y%m%d.%H%M"), ".txt"))
# Based on: http://stackoverflow.com/questions/21967254/how-to-write-a-reader-friendly-sessioninfo-to-text-file