-
Notifications
You must be signed in to change notification settings - Fork 0
/
nodeSeqs.sh
158 lines (136 loc) · 4.79 KB
/
nodeSeqs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/bin/bash
# https://github.com/acvill/nodeSeqs
# https://bioinformatics.stackexchange.com/questions/18476
# defaults
outdir=$(pwd)
context=100
kmer=55
mincov=10
# usage statement
usage() {
printf "Usage: $0 -g <file> -d <integer> [-o,-c,-k,-m,-t]
-h print this usage and exit
-g (REQUIRED) GFA file
-d (REQUIRED) minimum number of links (degree) needed to pull a segment (node)
-o output path (default: pwd)
-c nucleotide sequence context to extract from segment ends (default: 100)
-k k-mer size used in assembly (default for metaSPAdes: 55)
-m minimum coverage filter for segments with degree >= d (default: 10)
-t keep all temporary files (exclude -t to delete temp folder)
"
1>&2
exit 1
}
# get inputs
while getopts "hg:d:o:c:k:m:t" opt; do
case ${opt} in
g) gfa=${OPTARG};;
d) degree=${OPTARG};;
o) outdir=${OPTARG};;
c) context=${OPTARG};;
k) kmer=${OPTARG};;
m) mincov=${OPTARG};;
t) keeptemp=1;;
h) usage
exit 0;;
*) usage
exit 0;;
esac
done
if [[ -z ${gfa} ]] || \
[[ -z ${degree} ]]; then
usage
exit 0
else
printf "\n######################\n"
printf "## input GFA file -> ${gfa}\n"
printf "## output directory -> ${outdir}\n"
printf "## degree cutoff -> ${degree} links\n"
printf "## sequence context -> ${context} nt\n"
printf "## k-mer length -> ${kmer}\n"
printf "## minimum coverage -> ${mincov}\n"
printf "######################\n\n"
fi
if [[ ! -d ${outdir} ]]; then
printf "creating ${outdir}\n" >> ${log}
mkdir -p ${outdir}
fi
# make tmp directory and initialize log file
tmp_dir=$(mktemp -d -t temp_XXXXX -p ${outdir})
log=${tmp_dir}/log.txt
printf "writing intermediate files to ${tmp_dir}\n" >> ${log}
# count occurrence of segment-orientation pairs in link lines
## FROM segment
grep -P "^L\t" ${gfa} | \
cut -f2,3 \
> ${tmp_dir}/seg_from.txt
## TO segment
grep -P "^L\t" ${gfa} | \
cut -f4,5 \
> ${tmp_dir}/seg_to.txt
## merge
cat ${tmp_dir}/seg_from.txt ${tmp_dir}/seg_to.txt | \
sort | uniq -c | sort -nr | \
sed 's/^\s*//' | sed 's/\s/\t/g' | \
awk -v d=${degree} '($1 >= d)' \
> ${tmp_dir}/segments.txt
# Progress bar function from https://stackoverflow.com/a/52581824/7976890
###
PROGRESS_BAR_WIDTH=30 # progress bar length in characters
draw_progress_bar() {
# Arguments: current value, max value, unit of measurement (optional)
local __value=$1
local __max=$2
local __unit=${3:-""} # if unit is not supplied, do not display it
# Calculate percentage
if (( $__max < 1 )); then __max=1; fi # anti zero division protection
local __percentage=$(( 100 - ($__max*100 - $__value*100) / $__max ))
# Rescale the bar according to the progress bar width
local __num_bar=$(( $__percentage * $PROGRESS_BAR_WIDTH / 100 ))
# Draw progress bar
printf "["
for b in $(seq 1 $__num_bar); do printf "#"; done
for s in $(seq 1 $(( $PROGRESS_BAR_WIDTH - $__num_bar ))); do printf " "; done
printf "] $__percentage%% ($__value / $__max $__unit)\r"
}
###
# for each high-degree segment, pull sequence from segment lines
totsegs=$(wc -l < ${tmp_dir}/segments.txt)
line=0
while read segment; do
## separate fields for each segment line
deg=$(echo "$segment" | awk -F$'\t' '{print $1}')
seg=$(echo "$segment" | awk -F$'\t' '{print $2}')
ori=$(echo "$segment" | awk -F$'\t' '{print $3}')
sqn=$(grep -P "^S\t${seg}\t" ${gfa} | cut -f3)
kci=$(grep -P "^S\t${seg}\t" ${gfa} | awk '{for(i=4;i<=NF;i++){if($i~/^KC/){a=$i}} print a}' | sed 's/KC:i://')
sqL=$(printf ${sqn} | wc -c)
cov=$(echo "scale=2; ${kci} / (${sqL} - ${kmer} + 1)" | bc)
## extract segment sequence given orientation and context
## add sequence and header to output fasta if coverage >= mincov
if (( $(echo "${cov} < ${mincov}" | bc -l) )); then
printf "Segment ${seg} has coverage below the set threshold (${cov} < ${mincov})\n" >> ${log}
elif [[ ${sqL} -le ${context} ]]; then
tsq=${sqn}
header=">segment${seg}_degree=${deg}_${ori}_cov=${cov}"
printf "${header}\n${tsq}\n" >> ${outdir}/segments.fasta
printf "Segment ${seg} is <= context length (${sqL} <= ${context}) -- exporting full segment\n" >> ${log}
elif [[ ${ori} = "+" ]]; then
tsq=${sqn:0:${context}}
header=">segment${seg}_degree=${deg}_${ori}_cov=${cov}"
printf "${header}\n${tsq}\n" >> ${outdir}/segments.fasta
elif [[ ${ori} = "-" ]]; then
tsq=${sqn: -${context}}
header=">segment${seg}_degree=${deg}_${ori}_cov=${cov}"
printf "${header}\n${tsq}\n" >> ${outdir}/segments.fasta
else
printf "ERROR: link orientation not found for segment ${seg}, aborting\n" | tee -a ${log}
exit 1
fi
((line+=1))
draw_progress_bar ${line} ${totsegs} "segments"
done < ${tmp_dir}/segments.txt
if [[ ${keeptemp} -ne 1 ]]; then
rm -rf ${tmp_dir}
fi
printf "\nDone!\n"