/
cds.bash
executable file
·79 lines (72 loc) · 2.28 KB
/
cds.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/bin/bash
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES, $DATASET
set -e
SCRIPT="cds"
# shellcheck source=scripts/miga.bash
. "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/06.cds"
# Initialize
miga date > "$DATASET.start"
# Gunzip (if necessary)
if [[ -e "../05.assembly/$DATASET.LargeContigs.fna.gz" \
&& ! -e "../05.assembly/$DATASET.LargeContigs.fna" ]] ; then
gzip -d "../05.assembly/$DATASET.LargeContigs.fna.gz"
miga add_result -P "$PROJECT" -D "$DATASET" -r assembly -f
fi
# Run Prodigal
TYPE=$(miga ls -P "$PROJECT" -D "$DATASET" -m type | cut -f 2)
case "$TYPE" in
metagenome|virome|plasmid)
prodigal -a "${DATASET}.faa" -d "${DATASET}.fna" -o "${DATASET}.gff3" \
-f gff -q -i "../05.assembly/${DATASET}.LargeContigs.fna" -p meta
;;
*)
P_LEN=0
BEST_CT=0
echo "# Codon table selection:" > "${DATASET}.ct.t"
for ct in 11 4 ; do
prodigal -a "${DATASET}.faa.$ct" -d "${DATASET}.fna.$ct" \
-o "${DATASET}.gff3.$ct" -f gff -q -p single -g "$ct" \
-i "../05.assembly/${DATASET}.LargeContigs.fna"
C_LEN=$(grep -v '^>' "${DATASET}.faa.$ct" \
| perl -pe 's/[^A-Z]//ig' | wc -c | awk '{print $1}')
echo "# codon table $ct total length: $C_LEN aa" \
>> "${DATASET}.ct.t"
if [[ $C_LEN -gt $(($P_LEN * 11 / 10)) ]] ; then
for x in faa fna gff3 ; do
mv "${DATASET}.$x.$ct" "${DATASET}.$x"
done
P_LEN=$C_LEN
BEST_CT=$ct
else
rm "$DATASET".*."$ct"
fi
done
echo "Selected codon table: $BEST_CT"
;;
esac
# Clean Prodigal noisy deflines
for i in faa fna ; do
perl -pe 's/>.*ID=([^;]+);.*/>gene_$1/' "$DATASET.$i" > "$DATASET.$i.t"
mv "$DATASET.$i.t" "$DATASET.$i"
done
perl -pe 's/ID=([0-9]+_[0-9]+);/ID=gene_$1;/' "$DATASET.gff3" \
> "$DATASET.gff3.t"
mv "$DATASET.gff3.t" "$DATASET.gff3"
if [[ -e "${DATASET}.ct.t" ]] ; then
cat "${DATASET}.ct.t" >> "${DATASET}.gff3"
rm "${DATASET}.ct.t"
fi
# Gzip
for ext in gff3 faa fna ; do
[[ -e "$DATASET.$ext" ]] && gzip -9 -f "$DATASET.$ext"
done
# Finalize
miga date > "${DATASET}.done"
cat <<VERSIONS \
| miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f --stdin-versions
=> MiGA
$(miga --version)
=> Prodigal
$(prodigal -v 2>&1 | grep . | perl -pe 's/^Prodigal //')
VERSIONS