Skip to content

Commit

Permalink
Add more upstream segments
Browse files (browse the repository at this point in the history)
  • Loading branch information
Jason W. Bacon committed Apr 30, 2022
1 parent d9c3290 commit 047b55d
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 11 deletions.
15 changes: 10 additions & 5 deletions Test/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,33 +24,38 @@ printf "\n1-base overlaps:\n\n"
test-overlaps.tsv
../filter-overlaps test-overlaps.tsv test-filtered.tsv \
five_prime_utr three_prime_utr intron exon \
upstream1000 upstream10000 upstream100000 upstream-beyond
upstream1000 upstream10000 upstream100000 upstream200000 upstream300000 \
upstream400000 upstream500000 upstream600000 upstream700000 upstream800000 upstream-beyond

printf "\n20%% peak overlaps:\n\n"
../peak-classifier --min-peak-overlap 0.2 test.bed.xz \
$gff test-peak-20-overlaps.tsv
../filter-overlaps test-peak-20-overlaps.tsv test-peak-20-filtered.tsv \
five_prime_utr three_prime_utr intron exon \
upstream1000 upstream10000 upstream100000 upstream-beyond
upstream1000 upstream10000 upstream100000 upstream200000 upstream300000 \
upstream400000 upstream500000 upstream600000 upstream700000 upstream800000 upstream-beyond

printf "\n20%% GFF feature overlaps:\n\n"
../peak-classifier --min-gff-overlap 0.2 test.bed.xz \
$gff test-gff-20-overlaps.tsv
../filter-overlaps test-gff-20-overlaps.tsv test-gff-20-filtered.tsv \
five_prime_utr three_prime_utr intron exon \
upstream1000 upstream10000 upstream100000 upstream-beyond
upstream1000 upstream10000 upstream100000 upstream200000 upstream300000 \
upstream400000 upstream500000 upstream600000 upstream700000 upstream800000 upstream-beyond

# "Either" mode: supply BOTH the peak-side and the GFF-side thresholds, then
# let --min-either-overlap combine them.  The peak threshold uses the same
# --min-peak-overlap flag as the 20% peak-overlap run earlier in this script;
# passing --min-gff-overlap twice would leave the peak threshold unset.
# NOTE(review): presumably --min-either-overlap accepts a hit when either
# fraction is met -- confirm against peak-classifier's option handling.
printf "\n20%% either peak or GFF feature overlaps:\n\n"
../peak-classifier --min-peak-overlap 0.2 --min-gff-overlap 0.2 \
--min-either-overlap test.bed.xz \
$gff test-either-20-overlaps.tsv
../filter-overlaps test-either-20-overlaps.tsv test-either-20-filtered.tsv \
five_prime_utr three_prime_utr intron exon \
upstream1000 upstream10000 upstream100000 upstream-beyond
upstream1000 upstream10000 upstream100000 upstream200000 upstream300000 \
upstream400000 upstream500000 upstream600000 upstream700000 upstream800000 upstream-beyond

printf "\nMidpoints only:\n\n"
../peak-classifier --midpoints test.bed.xz $gff \
test-midpoint-overlaps.tsv
../filter-overlaps test-midpoint-overlaps.tsv test-filtered.tsv \
five_prime_utr three_prime_utr intron exon \
upstream1000 upstream10000 upstream100000 upstream-beyond
upstream1000 upstream10000 upstream100000 upstream200000 upstream300000 \
upstream400000 upstream500000 upstream600000 upstream700000 upstream800000 upstream-beyond
4 changes: 2 additions & 2 deletions filter-overlaps.c
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ int filter_overlaps(const char *overlaps_file, const char *output_file,

printf("Total unique peaks: %zu\n", unique_peaks);
for (c = 0; features[c] != NULL; ++c)
printf("Overlaps with %-20s: %7zu (%2zu%%)\n", features[c],
feature_overlaps[c], 100 * feature_overlaps[c] / unique_peaks);
printf("Overlaps with %-20s: %7zu (%3.1f%%)\n", features[c],
feature_overlaps[c], 100.0 * feature_overlaps[c] / unique_peaks);
return EX_OK;
}

Expand Down
15 changes: 11 additions & 4 deletions peak-classifier.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ int main(int argc,char *argv[])
*gff_stream,
*intersect_pipe;
// Default, override with --upstream-boundaries
char *upstream_boundaries = "1000,10000,100000",
char *upstream_boundaries = "1000,10000,100000,200000,300000,400000,500000,600000,700000,800000",
*p,
cmd[PEAK_CMD_MAX + 1],
*redirect_overwrite,
Expand All @@ -46,7 +46,8 @@ int main(int argc,char *argv[])
*end,
*gff_stem,
augmented_filename[PATH_MAX + 1],
sorted_filename[PATH_MAX + 1];
sorted_filename[PATH_MAX + 1],
*sort;
bool midpoints_only = false;
bl_bed_t bed_feature;
struct stat file_info;
Expand Down Expand Up @@ -150,9 +151,15 @@ int main(int argc,char *argv[])
else
{
// LC_ALL=C makes sort assume 1 byte/char, which improves speed
// gsort is faster than other implementations, so use it if
// available
if ( system("which gsort") == 0 )
sort = "gsort";
else
sort = "sort";
snprintf(cmd, PEAK_CMD_MAX, "env LC_ALL=C grep -v '^#' %s | "
"sort -n -k 1 -k 2 -k 3 > %s\n",
augmented_filename, sorted_filename);
"%s -n -k 1 -k 2 -k 3 > %s\n",
augmented_filename, sort, sorted_filename);
fputs("Sorting...\n", stderr);
if ( (status = system(cmd)) != 0 )
{
Expand Down

0 comments on commit 047b55d

Please sign in to comment.