Skip to content

Commit

Permalink
Add normalize_gisaid_fasta.sh for issue nextstrain#53
Browse files Browse the repository at this point in the history
  • Loading branch information
Brian Pardy committed Feb 25, 2020
1 parent b5ef0fc commit d3c90c7
Showing 1 changed file with 35 additions and 0 deletions.
35 changes: 35 additions & 0 deletions scripts/normalize_gisaid_fasta.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
#
# normalize_gisaid_fasta.sh - normalize a GISAID SARS-CoV-2 FASTA file.
#
# Usage:
#   normalize_gisaid_fasta.sh <input.fasta> <output.fasta> [min_length]
#
# Arguments:
#   $1  input FASTA file (must be readable)
#   $2  output FASTA file (created or overwritten)
#   $3  optional minimum length, a non-negative integer (default: 15000)
#
# Processing steps, in order:
#   * Remove a leading 'BetaCoV/' (matched case-insensitively) from
#     sequence names
#   * Remove embedded spaces (Hong Kong sequences)
#   * Remove the trailing |EPI_ISL_id|datestamp from sequence names
#   * Remove records that are not longer than the minimum length
#   * Eliminate duplicate records (keep only the first seen)
#
# Exit status:
#   0 on success; non-zero when an argument is invalid, a pipeline stage
#   fails, or no record survives the filters.
set -euo pipefail

readonly DEFAULT_MIN_LENGTH=15000

# Print a diagnostic on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Copy stdin to stdout, dropping empty lines and lines made only of '>'
# (both are by-products of the awk stages that split records on '>').
# Returns grep's status: 1 means nothing was left to write.
strip_empty_lines() {
  local status=0
  grep -v '^>*$' || status=$?
  if ((status == 1)); then
    printf '%s\n' "$0: no sequences passed the filters" >&2
  fi
  return "$status"
}

main() {
  local input=${1:-}
  local output=${2:-}
  local min_length=${3:-}

  if [[ ! -r "$input" ]]; then
    die "$0: input $input not found"
  fi

  if [[ -z "$output" ]]; then
    die "usage: $0 <input.fasta> <output.fasta> [min_length]"
  fi

  if [[ -z "$min_length" ]]; then
    echo "Using default minimum length of ${DEFAULT_MIN_LENGTH}"
    min_length=$DEFAULT_MIN_LENGTH
  fi

  # The value ends up in an awk numeric comparison; reject anything else so
  # it cannot silently turn into a string comparison.
  if [[ ! "$min_length" =~ ^[0-9]+$ ]]; then
    die "$0: minimum length must be a non-negative integer, got '$min_length'"
  fi

  echo "Normalizing GISAID file $input to $output (min length $min_length)"

  # NOTE: the length test measures the whole '>'-delimited record (name line,
  # sequence lines and their newlines), and the comparison is strict, so a
  # record whose length equals min_length is dropped.
  # NOTE: duplicates are detected by record name (the first line of each
  # record), not by sequence content.
  # 'pipeline || exit' propagates the failing stage's status explicitly.
  sed -e 's/^>BetaCoV\//>/gi' -e 's/ //g' -e 's/|.*$//' < "$input" |
    awk -v min="$min_length" \
      'BEGIN { RS = ">"; FS = "\n" } length > min { print ">" $0 }' |
    awk 'BEGIN { RS = ">"; FS = "\n" } !seen[$1]++ { print ">" $0 }' |
    strip_empty_lines > "$output" || exit
}

main "$@"

0 comments on commit d3c90c7

Please sign in to comment.