From 293069b41f18c3b9b7bafa3007dc4b3c1df6f782 Mon Sep 17 00:00:00 2001 From: Orion Buske Date: Sat, 8 Nov 2014 16:16:01 -0500 Subject: [PATCH] Adding script for capture-c probe filtering --- src/capture_c/filter_probe_repeats.sh | 70 +++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 src/capture_c/filter_probe_repeats.sh diff --git a/src/capture_c/filter_probe_repeats.sh b/src/capture_c/filter_probe_repeats.sh new file mode 100644 index 0000000..e4cc193 --- /dev/null +++ b/src/capture_c/filter_probe_repeats.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +set -eu +set -o pipefail + +# Kmer size for repeat identification +kmer=40 +# Maximum kmer mappings across the genome +kmer_max_n=5 +# Maximum bp overlap between a probe and repetitive regions +max_overlap=50 +window=$(python -c "print(${kmer}/2)") + +region=region.bed +candidates=probes.bed +mappability="wgEncodeCrgMapabilityAlign${kmer}mer.region.wig" +repeats="${kmer}mer.max${kmer_max_n}.bed" +overlap="probes.max${kmer_max_n}.overlap" +probes="probes.${kmer}mer.max${kmer_max_n}.${max_overlap}bp.txt" +bedgraph=$(basename "${probes}" .txt).bedGraph + +# Select region of interest +echo "chr2 136540000 136650000" > "${region}" + +# Generate probe file +if [[ ! -e "${candidates}" ]]; then + cat < "${repeats}" + +# Measure coverage with calculated probes +bedtools coverage -b "${candidates}" -a "${repeats}" > "${overlap}" + +# Identify non-repetitive probes +awk -v max_bp="${max_overlap}" '$6 < max_bp' "${overlap}" \ + | sort -k 2,2n \ + | cut -f -4 \ + > "${probes}" + +# Make bedGraph file for Genome Browser upload +name="${kmer}mer max${kmer_max_n} ${max_overlap}bp" +echo "track type=bedGraph name='${name}' description='${name}'" > "${bedgraph}" +cut -f -3 "${probes}" \ + | bedtools coverage -d -b "${region}" -a stdin \ + | awk 'BEGIN{OFS="\t"}{print $1, $2+$4-1, $2+$4, $5}' \ + >> "${bedgraph}"