This is the main documentation for the software.
if [ -z "${CSC_LOGLVL-}" ]; then
CSC_LOGLVL="INFO"
fi
fancy_message() (
if [ -z "${1}" ] || [ -z "${2}" ]; then
return
fi
RED="\e[31m"
GREEN="\e[32m"
YELLOW="\e[33m"
MAGENTA="\e[35m"
RESET="\e[0m"
MESSAGE_TYPE=""
MESSAGE=""
MESSAGE_TYPE="${1}"
MESSAGE="${2}"
case ${MESSAGE_TYPE} in
info) printf " [${GREEN}+${RESET}] %s\n" "${MESSAGE}" ;;
progress) printf " [${GREEN}+${RESET}] %s" "${MESSAGE}" ;;
recommend) printf " [${MAGENTA}!${RESET}] %s\n" "${MESSAGE}" ;;
warn) printf " [${YELLOW}*${RESET}] WARNING! %s\n" "${MESSAGE}" ;;
error) printf " [${RED}!${RESET}] ERROR! %s\n" "${MESSAGE}" ;;
fatal)
printf " [${RED}!${RESET}] ERROR! %s\n" "${MESSAGE}"
exit 1
;;
*) printf " [?] UNKNOWN: %s\n" "${MESSAGE}" ;;
esac
)
get_log_level() {
lvl="$1"
case $lvl in
debug | DEBUG | d | D)
lvl="0"
;;
info | INFO | I | i)
lvl="1"
;;
warning | warn | WARNING | WARN | W | w)
lvl="2"
;;
error | err | ERROR | ERR | E | e)
lvl="3"
;;
esac
echo $lvl
}
LOGLVL=$(get_log_level $CSC_LOGLVL)
# if [ "$LOGLVL" = 0 ]; then set -xv; fi
log() {
level=$1
message=$2
loglvl=$(get_log_level "$level")
if [ "$loglvl" -ge "$LOGLVL" ]; then
case $loglvl in
0 | debug)
fancy_message "info" "$level $message"
;;
1 | info)
fancy_message "info" "$level $message"
;;
2 | warn)
fancy_message "warn" "$level $message"
;;
3 | err)
fancy_message "error" "$level $message"
;;
esac
fi
}
. ./logger.sh
mkdir -p data
cd data
# HGNC approved symbols
wget -O- -q http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt | gzip -c > hgnc.gz
# map synonyms and previous symbols to current symbols
OTHERSYMBOLS="
prev
alias
"
echo "$OTHERSYMBOLS" | sed '/^$/d' | while read -r other;
do
current=$(zcat -f hgnc.gz | sed 1q | tr $'\t' '\n' | nl | grep -m1 "symbol" | cut -f1 | tr -d " ")
other_col=$(zcat -f hgnc.gz | sed 1q | tr $'\t' '\n' | nl | grep "$other"_symbol | cut -f1 | tr -d " ")
zcat -f hgnc.gz | tr -d '"' | awk -F "\t" -v current=$current -v other_col=$other_col '{split($other_col, others,"|"); for (o in others) {printf "%s\t%s\n", others[o], $current}}' | gzip -c > $other.gz
done
# HGNC withdrawn symbols
wget -O- -q http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/withdrawn.txt | gzip -c > withdrawn.gz
# Entrez gene symbols
wget -O- -q https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz > entrez.gz
# Annotation files, download and parse
annotations="
RefSeq GRCh37 latest https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gff.gz
RefSeq GRCh38 latest https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gff.gz
RefSeq T2T latest https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/110/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz
Ensembl GRCh37 release97 https://ftp.ensembl.org/pub/grch37/release-97/gff3/homo_sapiens/Homo_sapiens.GRCh37.87.gff3.gz
Ensembl GRCh37 latest https://ftp.ensembl.org/pub/grch37/release-109/gff3/homo_sapiens/Homo_sapiens.GRCh37.87.gff3.gz
Ensembl GRCh38 latest https://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.109.chr_patch_hapl_scaff.gff3.gz
Ensembl T2T latest https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/2022_07/Homo_sapiens-GCA_009914755.4-2022_07-genes.gff3.gz
"
echo "$annotations" | sed '/^$/d' | while read -r source assembly version url; do
log "INFO" "Downloading $source annotation for $version version $assembly assembly.";
log "INFO" "Downloading from $url";
case "$source" in
"RefSeq")
wget -q -O- "$url" \
| zcat \
| awk -F"\t" \
'
/^#!/ {print}
/^##/ {next}
$3~/gene/ {
# sub(/^NC_[0]+/, "chr");
# sub(/^chr23/, "chrX"); sub(/^chr24/, "chrY");
# split($1,chrom,".");
split($9,info,"gene=");
split(info[2],gene,";");
# printf "%s\t%s\t%s\t%s\n", chrom[1], $4, $5, gene[1]}
printf "%s\t%s\t%s\t%s\n", $1, $4, $5, gene[1]}' |
gzip -c > $source.$assembly.$version.bed.gz &&
log "INFO" "BED file created at $source.$assembly.$version.bed.gz" ||
log "ERROR" "An error occured while creating BED file at $source.$assembly.$version.bed.gz";
;;
"Ensembl")
wget -q -O- "$url" | zcat | awk -F"\t" \
'
/^#!/ {print}
/^##/ {next}
$3~/gene/ {
split($9,info,"Name=");
split(info[2],gene,";");
printf "chr%s\t%s\t%s\t%s\n", $1, $4, $5, gene[1]}' |
gzip -c > $source.$assembly.$version.bed.gz &&
log "INFO" "BED file created at $source.$assembly.$version.bed.gz" ||
log "ERROR" "An error occured while creating BED file at $source.$assembly.$version.bed.gz";
;;
esac
done
Steps:
- Validate the gene symbol.
- Check if the given symbol is in the approved symbols.
- Check if the given symbol is an alias symbol.
- Check if the given symbol is a previous symbol.
- Check if the given symbol is withdrawn, split or merged.
- Get the previous, alias or withdrawn symbols
- Check symbol in annotation sources.
- Check if prev, alias withdrawn symbols are in annotation sources.
start=$(date +%s)
set -o errexit
set -o nounset
if [ "${TRACE-0}" = "1" ]; then
set -o xtrace
fi
usage() {
echo "
This script checks given gene symbols against given assemblies
Usage
-----
$0 [OPTIONS] [SYMBOL ...]
Example
-------
$0 -a T2T -s RefSeq -v latest WNT10B WNT10A WNT10C
Options
-------
-a --assembly
Default assemblies are GRCh37, GRCh38 and T2T.
You can use multiple assemblies by quoting them together like -a "GRCh37 GRCh38"
-c --no-cross-check
Don't check annotation sources. Just check alternative gene symbols and exit.
-l --log-level
Set log level. Default is INFO. Possible values are DEBUG, INFO, WARN, ERROR, FATAL
-h --help
Display this help message and exit.
-o --only-target
By default check-geneset.sh will run for every assembly. Use this option
to check only against given target file.
-s --source
Default assemblies are GRCh37, GRCh38 and T2T.
You can use multiple assemblies by quoting them together like -a "GRCh37 GRCh38"
-t --target
Custom bed file. If file name has the format: source.assembly.version.bed,
columns in the output table will be filled accordingly.
Custom file should look like this:
chrom start end symbol
chr1 1266694 1270686 TAS1R3
chr1 1270656 1284730 DVL1
chr1 1288069 1297157 MXRA8
-v --version
There is only one version for all of the assemblies which is latest.
You can install older assemblies and specify them with this parameter.
-V --version
Print current version and exit
"
exit
}
if [ $# -eq 0 ]; then
usage
fi
cd "$(dirname "$0")"
. ./logger.sh
# Check if the data dir cached to /dev/shm
CACHEDIR="/dev/shm/CSC_DATA"
if [ -d $CACHEDIR ]; then
DATADIR="$CACHEDIR"
log "DEBUG" "Using the $CACHEDIR"
else
DATADIR="./data"
log "DEBUG" "Using the default data dir"
fi
DATADIR="./data"
# Cross checking is enabled by default.
XCHECK=1
PARSED_ARGUMENTS=$(getopt -a -o a:chs:v:Vt:o -l assembly:,no-ross-check,help,source:,version:,target:,only-target -- "$@")
VALID_ARGUMENTS=$?
if [ "$VALID_ARGUMENTS" != "0" ]; then
usage
fi
eval set -- "$PARSED_ARGUMENTS"
while :; do
case "$1" in
-a | --assembly)
CSC_ASSEMBLIES="$2"
shift 2
;;
-c | --no-cross-check)
XCHECK=0
shift
;;
-l | --log-level)
CSC_LOGLVL="$2"
export CSC_LOGLVL
shift 2
;;
-o | --only-target)
[ -z ${CSC_TARGETS-} ] && echo "Can't use -o parameter without a target file." && exit
CSC_ASSEMBLIES="Unset"
CSC_SOURCES="Unset"
CSC_VERSIONS="Unset"
shift 1
;;
-s | --source)
CSC_SOURCES="$2"
shift 2
;;
-t | --target)
CSC_TARGETS="$2"
shift 2
# Validate target files
for target in $(echo "$CSC_TARGETS"); do
[ -f "$target" ] && log "DEBUG" "Found target file $target." || (log "ERROR" "Can't find target file $target. exiting..." && exit 1)
target_col_number=$(zcat -f "$target" | grep -v '^#' | awk '{print NF}' | uniq)
[ $(echo "$target_col_number" | wc -l) -gt 1 ] && log "ERROR" "Mismatch in target column numbers for given target file $target. There are lines with $(echo "$target_col_number" | tr "\n" " ") columns. exiting..." && exit 1
[ "$target_col_number" -ne 4 ] && log "ERROR" "Target $target has $target_col_number number of columns. Custom target files should have 4 columns: chrom start end and symbol. exiting..." && exit 1
log "DEBUG" "Custom target $target looks okay."
# We are checking if there are any non valid symbols in the custom target file
# This takes quite some time if the target file is large.
# A multiprocessing approach can be beneficial here
# Another approach might be sorting the data files alphabetically spliting the files by first one or two characters and just searching the splitted file.
# [ $(zcat -f "$target" | grep -v '^#' | awk '{print $4}' | xargs -I% ./cross-symbol-checker.sh -c % | grep -m1 "#WARNING No approved symbol found") ] && log "ERROR" "Found a non valid symbol at target file $target. exiting..." && exit 1 || log "DEBUG" "Custom target file $target symbols looks okay."
done
;;
-v | --version)
CSC_VERSIONS="$2"
shift 2
;;
-V )
echo "Cross-symbol checker v0.0.2"
exit
;;
# -- means the end of the arguments; drop this, and break out of the while loop
--)
shift
break
;;
# If invalid options were passed, then getopt should have reported an error,
# which we checked as VALID_ARGUMENTS when getopt was called...
*)
echo "Unexpected option: $1 - this should not happen."
usage
;;
esac
done
cp -r data/ /dev/shm/CSC_DATA || log "DEBUG" "Can't copy to tmpfs"
matches=""
end=$(date +%s)
runtime=$((end - start))
log "DEBUG" "TIME Startup took $runtime seconds"
get_annotation_sources() {
start=$(date +%s)
if [ -z "${CSC_SOURCES-}" ]; then
sources="
Ensembl
RefSeq
"
else
sources="$CSC_SOURCES"
fi
if [ -z "${CSC_ASSEMBLIES-}" ]; then
assemblies="
GRCh37
GRCh38
T2T
"
else
assemblies="$CSC_ASSEMBLIES"
fi
if [ -z "${CSC_VERSIONS-}" ]; then
versions="latest"
# Greps all of the versions
# versions=$(for source in $(echo "$sources"); do for assembly in $(echo "$assemblies"); do find data/ -name "$source.$assembly.*.bed.gz"; done ;done | cut -d"." -f3 | sort -u)
else
versions="$CSC_VERSIONS"
fi
# targets will look like this:
# source assembly version file_path
targets=""
for source in $(echo "$sources"); do
for assembly in $(echo "$assemblies"); do
for version in $(echo "$versions"); do
target_path="$DATADIR/$source.$assembly.$version.bed.gz"
[ -f "$target_path" ] || continue
targets="$targets\n$source\t$assembly\t$version\t$target_path"
done
done
done
if [ -z "${CSC_TARGETS-}" ]; then
custom_targets=""
else
for custom_target in $(echo "$CSC_TARGETS" | sort -u); do
# Here we check if file name has the format
# source.assembly.version.bed
custom_target_base="${custom_target##*/}"
read source assembly version bed <<-EOF
$(echo ${custom_target_base} | awk -F"." '{print $1" "$2" "$3" "$4}')
EOF
[ -z ${bed-} ] && targets="$targets\n$custom_target_base\tCustom\tCustom\t$custom_target" || targets="$targets\n$source\t$assembly\t$version\t$custom_target"
done
fi
end=$(date +%s)
runtime=$((end - start))
log "DEBUG" "TIME setting up annotations took $runtime seconds"
}
check_capitalization() {
start=$(date +%s)
symbol=$(echo "$1" | tr '[:lower:]' '[:upper:]' | awk '/C([1-9]|1[0-9]|2[0-2]|X|Y)ORF[0-9]+/ {gsub("ORF", "orf", $0)} 1')
if [ "$symbol" != "$1" ]; then
echo "WARNING $1 capitalization changed to $symbol"
fi
end=$(date +%s)
runtime=$((end - start))
log "DEBUG" "TIME Checking capitalization took $runtime seconds"
}
check_approved() {
start=$(date +%s)
approved=$(zcat -f $DATADIR/hgnc.gz | awk -F "\t" -v symbol=$symbol '$2==symbol {print}')
if [ -z "$approved" ]; then
# Symbol is not in approved list or not a valid symbol
log "INFO" "$symbol is not in approved list :("
else
# Symbol is in approved list.
log "INFO" "$symbol is in approved list."
matches="$matches\nApproved\t$(echo "$approved" | cut -f 2)"
fi
end=$(date +%s)
runtime=$((end - start))
log "DEBUG" "TIME Checking approved symbol took $runtime seconds"
}
check_alias() {
start=$(date +%s)
alias=$(zcat -f $DATADIR/alias.gz | awk -F "\t" -v symbol=$symbol '$1==symbol {print}')
if [ -z "$alias" ]; then
# Symbol is not in alias or not a valid symbol
log "INFO" "$symbol is not an alias symbol."
else
# Symbol is in alias symbols list.
log "INFO" "$symbol is an alias symbol."
matches="$matches\nAlias\t$(echo "$alias" | cut -f 2)"
fi
end=$(date +%s)
runtime=$((end - start))
log "DEBUG" "TIME Checking alias symbol took $runtime seconds"
}
check_prev() {
start=$(date +%s)
prev=$(zcat -f $DATADIR/prev.gz | awk -F "\t" -v symbol=$symbol '$1==symbol {print}')
if [ -z "$prev" ]; then
# Symbol is not in previous symbols or not a valid symbol
log "INFO" "$symbol is not a previous symbol."
else
# Symbol is in previous symbols list.
log "INFO" "$symbol is a previous symbol."
matches="$matches\nPrev\t$(echo "$prev" | cut -f 2)"
fi
end=$(date +%s)
runtime=$((end - start))
log "DEBUG" "TIME Checking previous symbol took $runtime seconds"
}
check_withdrawn() {
start=$(date +%s)
withdrawn=$(zcat -f $DATADIR/withdrawn.gz | awk -F "\t" -v symbol=$symbol '$3==symbol {print}')
if [ -z "$withdrawn" ]; then
# Symbol is not withdrawn or not a valid symbol
log "INFO" "$symbol is not in withdrawn list."
else
# Symbol is withdrawn/merged/split
echo "$withdrawn" | read -r ID STATUS SYMBOL REPORTS
case STATUS in
"Entry Withdrawn")
log "INFO" "WITHDRAWN $symbol is gone!"
;;
"Merged/Split")
echo "$REPORTS" |
tr ', ' '\n' |
sed '/^$/d;s/|/ /g' |
while read -r NEWID NEWSYMBOL NEWSTATUS; do
case "$NEWSTATUS" in
"Entry Withdrawn")
log "INFO" "MERGED/SPLIT $symbol has been $STATUS into $NEWSYMBOL which itself also got withdrawn. ;("
# matches="$matches\nWithdrawn but it got withdrawn too."
;;
"Approved")
log "INFO" "MERGED/SPLIT $symbol now lives on with the name $NEWSYMBOL."
matches="$matches\nWithdrawn$NEWSYMBOL"
;;
esac
done
;;
esac
fi
end=$(date +%s)
runtime=$((end - start))
log "DEBUG" "TIME Checking withdrawn symbol took $runtime seconds"
}
get_approved_symbol() {
start=$(date +%s)
# We collect all possible approved_symbol(s) which we expect to be only one.
# However we check in case a symbol maps to multiple symbols.
if [ $(echo "$matches" | sed '/^$/d' | sort -u | wc -l) -eq 1 ]; then # this is what we expect.
case "$matches" in
"Approved*")
log "INFO" "$symbol was already an approved symbol."
;;
"Prev*")
log "INFO" "previous symbol $symbol matched with an approved symbol."
;;
"Alias*")
log "INFO" "alias symbol $symbol matched with an approved symbol."
;;
esac
approved_symbol=$(echo $matches | sed '/^$/d' | cut -f 2)
echo "APPROVED\t$approved_symbol"
elif [ $(echo "$matches" | sed '/^$/d' | sort -u | wc -l) -gt 1 ]; then # this is what we expect.
# Some approved symbols are alias to other symbols
# We are going to handle this case by picking the
# original input.
log "WARN" "$symbol matched with multiple approved symbols! $(echo "$matches" | sed '/^$/d' | cut -f 2 | tr '\n' ' ')"
echo "WARNING $symbol matched with multiple approved symbols! $(echo "$matches" | sed '/^$/d' | cut -f 2 | tr '\n' ' ')"
while read -r found_in appr_sym; do
case $found_in in
"Approved")
log "INFO" "Orginal input $symbol already was an approved symbol. Carrying out with this symbol."
approved_symbol="$appr_sym"
echo "APPROVED\t$approved_symbol"
;;
"Prev")
log "WARN" "$symbol was also $found_in symbol for approved symbol $appr_sym."
echo "WARNING $symbol was also $found_in symbol for approved symbol $appr_sym."
;;
"Alias")
log "INFO" "$symbol was also $found_in symbol for approved symbol $appr_sym."
;;
esac
done <<-EOF
$(echo "$matches")
EOF
fi
end=$(date +%s)
runtime=$((end - start))
log "DEBUG" "TIME Checking if more than one approved symbol found took $runtime seconds"
}
check_date() {
start=$(date +%s)
if [ -z "${approved_symbol-}" ]; then
log "WARN" "No approved symbol found for $symbol"
echo "WARNING No approved symbol found for $symbol"
is_date=$(date -d "$symbol" 2>&1 | grep -v "invalid")
if [ -z "$is_date" ]; then
log "INFO" "doesn't look like a date."
else
log "WARN" "This is a date"
echo "WARNING This is a date"
fi
# TODO warn about this symbol
exit
fi
end=$(date +%s)
runtime=$((end - start))
log "DEBUG" "TIME Checking if any approved symbol found took $runtime seconds"
}
get_alias_prev_withdrawn() {
start=$(date +%s)
unset alias
alias=$(zcat -f $DATADIR/alias.gz | awk -F "\t" -v symbol=$approved_symbol '$2==symbol {print}')
if [ -z "$alias" ]; then
# Symbol is not in alias or not a valid symbol
log "INFO" "$approved_symbol has no alias symbol."
else
# Symbol is in alias symbols list.
alias_symbols="$(echo "$alias" | cut -f 1 | sed 's/^/ALIAS\t/')"
echo "$alias_symbols"
fi
unset prev
prev=$(zcat -f $DATADIR/prev.gz | awk -F "\t" -v symbol=$approved_symbol '$2==symbol {print}')
if [ -z "$prev" ]; then
log "INFO" "$approved_symbol has no prev symbol."
else
prev_symbols="$(echo "$prev" | cut -f 1 | sed 's/^/PREV\t/')"
echo "$prev_symbols"
fi
unset withdrawn
withdrawn=$(zcat -f $DATADIR/withdrawn.gz | (grep "|$approved_symbol|" || true))
if [ -z "$withdrawn" ]; then
log "INFO" "$approved_symbol has no withdrawn symbol."
else
withdrawn_symbols="$(echo "$withdrawn" | cut -f 3 | sed 's/^/WITHDRAWN\t/')"
echo "$withdrawn_symbols"
fi
end=$(date +%s)
runtime=$((end - start))
log "DEBUG" "TIME Checking for other symbols took $runtime seconds"
}
check_annotation_sources() {
start=$(date +%s)
table=""
if [ -z "$approved_symbol" ]; then
log "INFO" "no approved symbol found so not checking annotation sources for approved symbol."
else
while read -r source assembly version target_file; do
# Print out gff meta data
source_info=$(zcat -f "$target_file" | grep -m 3 '^#!' | sed "s/^#!/VERSION $source $assembly /")
echo "$source_info" | while read -r line; do log "INFO" "$line"; done
echo "$source_info"
# Get the non canonical chromosomes
if [ "$assembly" != "T2T" ]; then
case "$source" in
"RefSeq")
noncanonical=$(zcat -f "$target_file" | grep -v "^#" | awk -F"\t" '{print $1}' | sort -u | grep -v '^NC');
;;
"Ensembl")
noncanonical=$(zcat -f "$target_file" | grep -v '^#' | awk -F"\t" '{print $1}' | sort -u | grep -vE 'chr([1-9]|1[0-9]|2[0-2]|X|Y|MT)');
;;
esac
fi
while read -r status new_symbol; do
start_inner=$(date +%s%N)
if [ -n "${status-}" ]; then
match=$(zcat -f "$target_file" | (grep -m1 -w "$new_symbol" || true))
if [ -z "${match:-}" ]; then
log "INFO" "$status SYMBOL $new_symbol found in $source $assembly $version"
table=""$table"Absent\t$symbol\t$approved_symbol\t$new_symbol\t$status\t$source\t$assembly\t$version\n"
else
# check_noncanonical
for contig in $(echo "${noncanonical-}"); do
if echo "$match" | grep -q "$contig"; then
log "WARN" "Symbol $new_symbol not in a canonical chromosome in $source $assembly $version"
echo "WARNING Symbol $new_symbol not in a canonical chromosome in $source $assembly $version"
fi
done
log "INFO" "$status SYMBOL $new_symbol not found in $source $assembly $version"
table=""$table"Present\t$symbol\t$approved_symbol\t$new_symbol\t$status\t$source\t$assembly\t$version\t$match\n"
fi
fi
end_inner=$(date +%s%N)
runtime_inner=$(( (end_inner - start_inner) / 1000000 ))
log "DEBUG" "TIME Checking $target_file for symbol $new_symbol took $runtime_inner milliseconds"
done <<-EOF
$(echo "$approved_symbol"| sed 's/^/APPROVED\t/')
${prev_symbols-}
${alias_symbols-}
${withdrawn_symbols-}
EOF
done <<-EOF
$(echo ${targets-} | sed '/^$/d')
EOF
fi
end=$(date +%s)
runtime=$((end - start))
log "DEBUG" "TIME Checking annotation sources took $runtime seconds"
table=$(echo "$table" | sed '/^$/d;s/^/TABLE\t/')
}
main() {
if [ $XCHECK = 0 ]; then
log "INFO" "--no-cross-check is set. Not getting annotation sources."
else
get_annotation_sources
fi
input_count=$(echo "$@" | wc -w)
versions=""
warnings=""
rows=""
for symbol in $(echo "$@"); do
check_capitalization "$symbol"
check_approved
check_alias
check_prev
check_withdrawn
get_approved_symbol
check_date
if [ $XCHECK = 0 ]; then
log "INFO" "--no-cross-check is set. Exiting without cross checking"
else
check_annotation_sources
fi
while read -r line; do
case "$line" in
"VERSION"* )
versions="$versions#$line\n"
;;
"WARNING"* )
warnings="$warnings#$line\n"
;;
"TABLE Present"* )
rows="$rows$(echo "$line" | cut -f 3-12)\n"
;;
esac
done <<-EOF
$(./cross-symbol-checker.sh "$symbol")
EOF
done
# Final checks about what is found and not.
io_diff_message="#SUMMARY No difference between input and output counts."
unmatched_symbols_message="#SUMMARY There are no unmatched symbols"
unmatched_symbols_count=$(echo "$warnings" | grep "No approved symbol found" | wc -l)
output_count=$(echo "$rows" | sed "/^$/d" | wc -l)
io_diff=$(( output_count - input_count + unmatched_symbols_count ))
if [ $unmatched_symbols_count -eq 0 ]; then
if [ $io_diff -gt 0 ]; then
io_diff_message="#SUMMARY There are $io_diff more outputs then inputs! this might happen if there are more than one symbol (e.g. both approved and an alias) for gene in the annotation"
elif [ $io_diff -lt 0 ]; then # This should not happen? Because there are no unmatched symbols
io_diff_message="#SUMMARY There are $(( io_diff * -1 )) more inputs then outputs. Check warnings for more info. The target doesn't have one or more symbols."
else
io_diff_message="#SUMMARY No difference between input and output counts."
fi
else
unmatched_symbols_message="#SUMMARY There is/are $unmatched_symbols_count unmatched symbol(s)."
if [ $io_diff -gt 0 ]; then
io_diff_message="#SUMMARY There are $io_diff more outputs then inputs! this might happen if there are more than one symbol (e.g. both approved and an alias) for gene in the annotation. Or if you selected multiple sources, assemblies or versions."
elif [ $io_diff -lt 0 ]; then
io_diff_message="#SUMMARY There are $(( io_diff * -1 )) more inputs then outputs. Check warnings for more info."
else
io_diff_message="#SUMMARY There are no duplications for the symbols."
fi
fi
echo "#COLUMN Input_symbol: Initial symbol entered.
#COLUMN Approved_symbol: Current symbol approved by HGNC for input symbol.
#COLUMN Symbol: This is the symbol found in annotation source. Pay extra attention if it's not an approved symbol.
#COLUMN Status: Status of the Symbol column. Either approved, alias, previous, or withdrawn.
#COLUMN Source: Annotation source.
#COLUMN Assembly: Target assembly.
#COLUMN Version: Version of the target source.
#COLUMN Chrom: name of the chromosome.
#COLUMN Start: start position of the gene.
#COLUMN End: end position of the gene.
#SUMMARY Number of input symbols are $input_count
#SUMMARY Number of output symbols are $output_count
$unmatched_symbols_message
$io_diff_message"
echo "$(echo "$warnings" | sed "/^$/d" | grep . || echo "#SUMMARY There were no warnings about input symbols.")"
if [ -z "${rows:-}" ]; then
echo "No output was produced!"
else
echo "$(echo "$versions" | sed "/^$/d" | sort -u | grep . || echo "#There is no version info")"
echo "Input_symbol\tApproved_symbol\tSymbol\tStatus\tSource\tAssembly\tVersion\tChrom\tStart\tEnd"
echo "$(echo "$rows" | sed "/^$/d")"
fi
}
main "$@"
test_nonexistent_target_file() {
nonexisting_file=$(mktemp)
rm non_existing_file
./check_geneset SHFM -t "$nonexistent_target_file"
}
# TODO: write more tests