Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BPC-scripts/gast_ill
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
708 lines (605 sloc)
23.5 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
######################################### | |
# | |
# gast: compares trimmed sequences against a reference database for assigning taxonomy | |
# | |
# Author: Susan Huse, shuse@mbl.edu | |
# | |
# Date: Fri Feb 25 11:41:10 EST 2011 | |
# | |
# Copyright (C) 2011 Marine Biological Laboratory, Woods Hole, MA
# | |
# This program is free software; you can redistribute it and/or | |
# modify it under the terms of the GNU General Public License | |
# as published by the Free Software Foundation; either version 2 | |
# of the License, or (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU General Public License for more details. | |
# | |
# For a copy of the GNU General Public License, write to the Free Software | |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | |
# or visit http://www.gnu.org/copyleft/gpl.html | |
# | |
# Keywords: gast taxonomy refssu | |
# | |
# Assumptions: | |
# | |
# Revisions: ASh, add illumina changes | |
# | |
# Programming Notes: | |
# Create all the necessary file names | |
# Run the uniques using mothur or use illumina uniqued fa | |
# vsearch against the reference database | |
# and parse the output for top hits | |
# Calculate consensus taxonomy | |
# Load the file into the database | |
######################################## | |
use strict; | |
use warnings; | |
use Taxonomy; | |
#######################################
#
# Set up usage statement
#
#######################################
# Short description printed for -h or when no arguments are supplied.
my $script_help = "
gast - reads a fasta file of trimmed 16S sequences, compares each sequence to
a set of similarly trimmed ( or full-length ) 16S reference sequences and assigns taxonomy.
Now uses vsearch, needs fasta files as a reference, not .udb. Tested for Illumina.
\n";
my $usage = "
Usage example: gast_ill -saveuc -nodup -in mydata.unique.nonchimeric.fa -db refv4v5.fa -rtax refv4v5.tax -out mydata.unique.nonchimeric.fa.gast -uc mydata.unique.nonchimeric.fa.uc
ITS1 ex:
(use -ignoregaps)
gast_ill -ignoregaps -saveuc -nodup -in my.fasta -db /xraid2-2/g454/blastdbs/gast_distributions/refits1.fa -rtax /xraid2-2/g454/blastdbs/gast_distributions/refits1.tax -out my.gast
Options (https://github.com/torognes/vsearch/releases/download/v2.6.2/vsearch_manual.pdf):\n";
$usage .= "-----\n";
# Reference for the vsearch/usearch flags assembled later in this script:
# $usearch_cmd .= " -notrunclabels -gapopen 6I/1E -usearch_global $uniques_filename -strand $strand -uc_allhits -uc $uclust_filename -maxaccepts $max_accepts -maxrejects $max_rejects $curr_threads -id $min_pctid";
# notrunclabels: https://www.drive5.com/usearch/manual/opt_notrunclabels.html
# gapopen: https://www.drive5.com/usearch/manual/aln_params.html
# https://www.drive5.com/usearch/manual/cmd_usearch_global.html
# ‑uc_allhits option (No longer supported). https://www.drive5.com/usearch/manual/opt_uc_allhits.html
# Each option line: flag, internal variable it sets, description, default.
$usage .= "Required:\n";
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-in:", "in_filename;", "Input fasta file", "(REQUIRED)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-ref:", "ref_filename;", "reference fasta file containing unique sequences of known taxonomy. The definition line should include the ID used in the reference taxonomy file. Any other information on the definition line should be separated by a space or a '|' symbol.", "(REQUIRED)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-db:", "udb_filename;", "the same as -ref", "(REQUIRED)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-rtax:", "reftax_filename;", "reference taxa file with taxonomy for all copies of the sequences in the reference fasta file. This is a tab-delimited file, three columns, describing the taxonomy of the reference sequences. The ID matching the reference fasta, the taxonomy and the number of reference sequences with this same taxonomy.", "(REQUIRED)");
$usage .= "-----\n";
$usage .= "Either an output file or a mysql database table name is required:\n";
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-out:", "out_filename;", "output filename", "");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-table:", "gast_table;", "mysql database table", "");
$usage .= "-----\n";
$usage .= "Optional:\n";
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-full:", "use_full_length = 1;", "input data will be compared against full length 16S reference sequences [default: not full length]", "(DEFAULT: 0)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-host:", "db_host;", "mysql server host name", "(DEFAULT: 'bpcdb1')");
$usage .= sprintf("%-13s %-17s %-37s %-13s\n", "-ignoregaps:", "ignore_all_gaps = 1;", "", "(DEFAULT: 0)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-maj:", "majority;", "percent majority required for taxonomic consensus", "(DEFAULT: 66)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-maxa:", "max_accepts;", "www.drive5.com/usearch/manual/termination_options.html", "(DEFAULT: 15)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-maxg:", "max_gap;", "if too large a gap, tends to be indicative of chimera or other non-matches", "(DEFAULT: 10)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-maxr:", "max_rejects;", "www.drive5.com/usearch/manual/termination_options.html", "(DEFAULT: 0)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-id:", "min_pctid;", "specifies the minimum sequence identity of a hit", "(DEFAULT: 0.80)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-nodup:", "nodup = 1;", "instead of uniquing sequences with mothur just make a copy of existing file with .unique.fa", "(DEFAULT: 0)");
$usage .= sprintf("%-13s %-17s %-36s %-13s\n", "-saveuc:", "save_uclust_file = 1;", "", "(DEFAULT: 0)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-sql_db:", "db_name;", "mysql database name", "(DEFAULT: 'env454')");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-strand:", "strand;", "www.drive5.com/usearch/manual/cmd_closed_ref.html [plus/both].", "(DEFAULT: 'plus')");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-termg:", "ignore_terminal_gaps = 1;", "only ignore for max gap size, still included in the distance calculations", "(DEFAULT: 0)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-terse:", "terse = 1;", "minimal output, includes only ID, taxonomy, and distance", "(DEFAULT: 0)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-threads:", "threads;", "number of threads to use, zero for all cores", "(DEFAULT: 0)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-uc:", "uclust_filename;", "", "(DEFAULT: 'input fasta file'.uc)");
$usage .= sprintf("%-13s %-17s %-40s %-13s\n", "-v:", "verbose = 1;", "", "(DEFAULT: 0)");
#######################################
#
# Definition statements
#
#######################################
my $verbose = 0;
my $self_cmd = join(" ", $0, @ARGV);    # full command line, echoed into the log
my $log_filename = "gast.log";
my $in_filename;
my $in_prefix;                          # NOTE(review): declared but never used in this script
my $ref_filename = '';
my $udb_filename = '';
my $reftax_filename;
my $out_filename;
my $terse = 0;
my $nodup = 0;
# Load into a database variables
my $db_host = 'bpcdb1';
my $db_name = 'env454';
my $mysqlimport_log = "gast.mysqlimport.log";
# NOTE(review): $db_host/$db_name are interpolated here, BEFORE the command
# line is parsed, so later -host/-sql_db flags do not change this string.
my $mysqlimport_cmd = "mysqlimport -C -v -L -h $db_host $db_name ";
my $gast_table;
# USearch variables
# my $usearch_cmd = "usearch6";
# my $usearch_cmd = "module load vsearch; vsearch";
my $usearch_cmd = "vsearch";
my $min_pctid = 0.80;       # minimum identity for a vsearch hit (-id)
my $max_accepts = 15;       # vsearch -maxaccepts
my $max_rejects = 0;        # vsearch -maxrejects (0 = unlimited)
my $strand = "plus";        # vsearch -strand [plus/both]
my $threads;                # undef => let vsearch use all cores
my $uclust_filename = "";   # defaults to <uniques>.uc when left empty
# Parse USearch output file variables
my $max_gap = 10;
my $ignore_terminal_gaps = 0; # only ignore for max gap size, still included in the distance calculations
my $ignore_all_gaps = 0;
my $save_uclust_file = 0;
my $use_full_length = 0;
# Taxonomy variables
my $majority = 66;          # percent majority required for consensus taxonomy
#######################################
#
# Test for commandline arguments
#
#######################################
# With no arguments at all, print the help/usage text and exit with an error.
if (! $ARGV[0] )
{
    print $script_help;
    print $usage;
    exit -1;
}
# Consume leading "-flag [value]" pairs from @ARGV until the first non-flag.
while ((scalar @ARGV > 0) && ($ARGV[0] =~ /^-/))
{
    # BUGFIX: the original tested $ARGV[0] =~ /-h/, an unanchored substring
    # match that also matched "-host", so "-host myserver" printed the help
    # text and exited instead of setting the mysql host.  Anchor the match
    # so only -h/-help/--h/--help trigger the help output.
    if ($ARGV[0] =~ /^--?h(elp)?$/)
    {
        print $script_help;
        print $usage;
        exit 0;
    } elsif ($ARGV[0] eq "-in") {           # input fasta file
        shift @ARGV;
        $in_filename = shift @ARGV;
    } elsif ($ARGV[0] eq "-out") {          # output gast file
        shift @ARGV;
        $out_filename = shift @ARGV;
    } elsif ($ARGV[0] eq "-ref") {          # reference fasta
        shift @ARGV;
        $ref_filename = shift @ARGV;
    } elsif ($ARGV[0] eq "-db") {           # reference db (same as -ref for vsearch)
        shift @ARGV;
        $udb_filename = shift @ARGV;
    } elsif ($ARGV[0] eq "-rtax") {         # reference taxonomy file
        shift @ARGV;
        $reftax_filename = shift @ARGV;
    } elsif ($ARGV[0] eq "-terse") {        # minimal output columns
        shift @ARGV;
        $terse = 1;
    } elsif ($ARGV[0] eq "-maxg") {         # maximum internal alignment gap
        shift @ARGV;
        $max_gap = shift @ARGV;
    } elsif ($ARGV[0] eq "-maj") {          # consensus majority percentage
        shift @ARGV;
        $majority = shift @ARGV;
    } elsif ($ARGV[0] eq "-uc") {           # explicit uclust output filename
        shift @ARGV;
        $uclust_filename = shift @ARGV;
    } elsif ($ARGV[0] eq "-full") {         # compare against full-length refs
        shift @ARGV;
        $use_full_length = 1;
    } elsif ($ARGV[0] eq "-termg") {        # ignore terminal gaps for max-gap test
        $ignore_terminal_gaps = 1;
        shift @ARGV;
    } elsif ($ARGV[0] eq "-ignoregaps") {   # ignore all gaps for max-gap test
        $ignore_all_gaps = 1;
        shift @ARGV;
    } elsif ($ARGV[0] eq "-sql_db") {       # mysql database name
        shift @ARGV;
        $db_name = shift @ARGV;
    } elsif ($ARGV[0] eq "-host") {         # mysql server host
        shift @ARGV;
        $db_host = shift @ARGV;
    } elsif ($ARGV[0] eq "-table") {        # mysql table to load results into
        shift @ARGV;
        $gast_table = shift @ARGV;
    } elsif ($ARGV[0] eq "-saveuc") {       # keep the .uc file after parsing
        $save_uclust_file = 1;
        shift @ARGV;
    } elsif ($ARGV[0] eq "-threads") {      # vsearch thread count (0 = all cores)
        shift @ARGV;
        $threads = shift @ARGV;
    } elsif ($ARGV[0] eq "-maxa") {         # vsearch -maxaccepts
        shift @ARGV;
        $max_accepts = shift @ARGV;
    } elsif ($ARGV[0] eq "-maxr") {         # vsearch -maxrejects
        shift @ARGV;
        $max_rejects = shift @ARGV;
    } elsif ($ARGV[0] eq "-strand") {       # vsearch -strand
        shift @ARGV;
        $strand = shift @ARGV;
    } elsif ($ARGV[0] eq "-id") {           # vsearch minimum identity
        shift @ARGV;
        $min_pctid = shift @ARGV;
    } elsif ($ARGV[0] eq "-v") {            # verbose output
        $verbose = 1;
        shift @ARGV;
    } elsif ($ARGV[0] eq "-nodup") {        # input already unique (illumina)
        shift @ARGV;
        $nodup = 1;
    } elsif ($ARGV[0] =~ /^-/) { #unknown parameter, just get rid of it
        print "Unknown commandline flag \"$ARGV[0]\".\n";
        print $usage;
        exit -1;
    }
}
#######################################
#
# Parse commandline arguments, ARGV
#
#######################################
# An input fasta plus at least one of -ref/-db are mandatory.
if ( (! $in_filename) || ( (! $ref_filename) && (! $udb_filename) ) )
{
    print "Incorrect number of arguments.\n";
    print "$usage\n";
    # BUGFIX: a bare "exit" returned status 0 (success) on a usage error,
    # so shell pipelines could not detect the failure.  Use the same
    # non-zero status as the other error exits in this script.
    exit -1;
}
# Exactly one destination is allowed: an output file OR a database table.
if ( ( ($out_filename) && ($gast_table) ) || ( (! $out_filename) && (! $gast_table) ) )
{
    print "Please specify either an output file or a database table.\n";
    print "$usage\n";
    exit -1;    # BUGFIX: was a bare "exit" (status 0) on error
}
#Test validity of commandline arguments
if (! -f $in_filename)
{
    print "Unable to locate input fasta file: $in_filename.\n";
    exit -1;
}
if ( ($ref_filename) && (! -f $ref_filename) )
{
    print "Unable to locate reference fasta file: $ref_filename.\n";
    exit -1;
}
if ( ($udb_filename) && (! -f $udb_filename) )
{
    print "Unable to locate reference udb file: $udb_filename.\n";
    exit -1;
}
#######################################
#
# Create all the necessary file names
#
#######################################
# Append to the shared run log; the bareword LOG handle stays open and is
# written to throughout the rest of the script.
open(LOG, ">>$log_filename") || die("Unable to write to output log file: $log_filename. Exiting.\n");
print LOG "$self_cmd (" . (localtime) . ")\n";
# if no outfilename, then writing to database, create tmp file to store the data
# NOTE(review): int(rand(9999)) gives no collision guarantee for the temp
# name -- File::Temp would be safer; confirm before relying on concurrency.
if (! $out_filename)
{
    $out_filename = $gast_table . int(rand(9999)) . ".txt";
}
open(OUT, ">$out_filename") || die("Unable to write to output file: $out_filename. Exiting.\n");
# determine the file prefix used by mothur(unique.seqs)
# e.g. "mydata.fa" -> prefix "mydata", suffix "fa" -> "mydata.unique.fa"
my $file_prefix = $in_filename;
my $file_suffix = $in_filename;
$file_prefix =~ s/(^.*)\..*$/$1/;   # strip the final ".ext"
$file_suffix =~ s/^.*\.//;          # keep only the final extension
my $uniques_filename = $file_prefix . ".unique." . $file_suffix;
my $names_filename = $file_prefix . ".names";
if (! $uclust_filename)
{
    $uclust_filename = $uniques_filename . ".uc";
}
#######################################
#
# Run the uniques using mothur
#
#######################################
my $mothur_cmd = "mothur \"#unique.seqs(fasta=$in_filename);\"";
# for illumina don't use mothur
# -nodup: the input is assumed to be already unique, so just copy it to the
# .unique name and build a trivial .names file mapping each sequence id to
# itself (awk duplicates each ">" header, tr strips the ">").
if ($nodup == 1)
{
    run_command("cp $in_filename $uniques_filename");
    run_command("awk '{if(\$0 ~ /^>/) print \$0\"\t\"\$0}' $in_filename | tr -d \">\" > $names_filename");
}
else
{
    run_command($mothur_cmd);
}
#######################################
#
# USearch against the reference database
# and parse the output for top hits
#
#######################################
# old version
#my $uclust_cmd = "uclust --iddef 3 --input $uniques_filename --lib $ref_filename --uc $uclust_filename --libonly --allhits --maxaccepts $max_accepts --maxrejects $max_rejects --id $min_pctid";
# -ref takes precedence over -db when both are given (both are fasta for vsearch).
if ($ref_filename)
{
    $usearch_cmd .= " --db $ref_filename ";
} else {
    $usearch_cmd .= " --db $udb_filename";
}
#$usearch_cmd .= " --gapopen 6I/1E --iddef 3 --global --query $uniques_filename --uc $uclust_filename --maxaccepts $max_accepts --maxrejects $max_rejects --id $min_pctid";
#$usearch_cmd .= "usearch6 -db /xraid2-2/g454/blastdbs/gast_distributions/refv6.udb -gapopen 6I/1E -usearch_global J2_613_21A_CaymanRock-PERFECT_reads.fa.unique.unique -strand plus -uc_allhits -uc J2_613_21A_CaymanRock-PERFECT_reads.fa.unique.unique.uc -maxaccepts 15 -maxrejects 0 -id 0.8";
# Only pass -threads when the user asked for it; otherwise vsearch decides.
my $curr_threads = "";
if ($threads)
{
    $curr_threads = "-threads $threads"
}
$usearch_cmd .= " -notrunclabels -gapopen 6I/1E -usearch_global $uniques_filename -strand $strand -uc_allhits -uc $uclust_filename -maxaccepts $max_accepts -maxrejects $max_rejects $curr_threads -id $min_pctid";
# $usearch_cmd .= " -notrunclabels -gapopen 6I/1E -usearch_global $uniques_filename -strand $strand -uc_allhits -uc $uclust_filename -maxaccepts $max_accepts -maxrejects $max_rejects -threads $threads -id $min_pctid";
print "$usearch_cmd\n";
run_command($usearch_cmd);
# Parse the .uc file down to the best hits per read.
my $gast_results_ref = parse_uclust($uclust_filename);
#######################################
#
# Calculate consensus taxonomy
#
#######################################
print LOG "Assigning taxonomy\n";
my $ref_taxa_ref = load_reftaxa($reftax_filename);
assign_taxonomy($names_filename, $gast_results_ref, $ref_taxa_ref);
#######################################
#
# Load the file into the database
#
#######################################
if ($gast_table)
{
    # BUGFIX: build the mysqlimport command here, AFTER the command line has
    # been parsed.  The original appended to $mysqlimport_cmd, which had
    # interpolated the default $db_host/$db_name at declaration time, so the
    # -host and -sql_db flags were silently ignored for the import step.
    my $import_cmd = "mysqlimport -C -v -L -h $db_host $db_name $out_filename >> $mysqlimport_log";
    run_command($import_cmd);
    # The temp file's basename must match $gast_table for mysqlimport;
    # remove it once loaded.
    run_command("rm $out_filename");
}
# Unless -saveuc was given, the intermediate .uc file is discarded.
if (! $save_uclust_file)
{
    run_command("rm $uclust_filename");
}
exit;
########################## SUBROUTINES ####################################### | |
############################################################ | |
# | |
# Subroutine: run_command | |
# run system commands | |
# | |
# ######################################################### | |
# Runs a shell command, logging it first (and echoing it when verbose).
# On failure the error is logged, warned to STDERR, and the script aborts
# with a non-zero exit status.
sub run_command
{
    my $command = shift;
    print LOG "$command\n";
    if ($verbose) {print "$command\n";}
    # system() returns 0 on success; anything else is a failure.
    my $command_err = system($command);
    if ($command_err)
    {
        my $err_msg = "Error $command_err encountered while running: \"$command\". Exiting.";
        print LOG "$err_msg\n";
        warn "$err_msg\n";
        # BUGFIX: the original used a bare "exit", which exits with status 0
        # (success) even though a command just failed, hiding the error from
        # any calling pipeline.  Exit non-zero instead.
        exit -1;
    }
}
####################################### | |
# | |
# Subroutine: parse_uclust | |
# Parse the USearch results and grab the top hit | |
# | |
####################################### | |
# Parses a usearch/vsearch .uc results file and, for every query read, keeps
# all reference hits at the single highest percent identity that survive the
# internal-gap filter.  Returns a hashref:
#   read_id => [ [ref_id, distance, original_alignment], ... ]
# Relies on file-scoped option flags: $nodup, $ignore_terminal_gaps,
# $ignore_all_gaps, $use_full_length, $max_gap, $verbose.
sub parse_uclust
{
    my $uc_file = shift;
    my %uc_aligns;      # read_id => { ref_id => CIGAR-like alignment string }
    my %refs_at_pctid;  # read_id => { pctid => [ref_ids] }
    my %results;
    # read in the data
    # BUGFIX: the original used an unchecked two-arg open, so a missing or
    # unreadable .uc file silently produced empty results.  Use a checked
    # three-arg open instead.
    open(UC, "<", $uc_file) || die("Unable to open uclust results file: $uc_file. Exiting.\n");
    while (my $line = <UC>)
    {
        # Only "H" records are valid hits.
        if ($line =~ /^H/)
        {
            chomp $line;
            my @data;
            # 0=Type, 1=ClusterNr, 2=SeqLength, 3=PctId, 4=Strand, 5=QueryStart, 6=SeedStart, 7=Alignment, 8=Sequence ID, 9= Reference ID
            if ($nodup) # for illumina a header contains spaces
            {
                @data = split(/\t+/, $line);
            }
            else
            {
                @data = split(/\s+/, $line);
            }
            # The reference field may be "id|taxonomy"; split it apart.
            my $ref = $data[9];
            my $tax = $data[9];
            $ref =~ s/\|.*$//;
            $tax =~ s/^.*\|//;
            # store the alignment for each read / ref
            $uc_aligns{$data[8]}{$ref} = $data[7];
            # create a look up for refs of a given pctID for each read
            push (@{$refs_at_pctid{$data[8]}{$data[3]}}, $ref);
        }
    }
    close(UC);
    #
    # For each read
    #
    foreach my $read (keys %uc_aligns)
    {
        my $found_hit = 0;
        # for each read, start with the max PctID
        foreach my $pctid ( sort {$b<=>$a} keys %{$refs_at_pctid{$read}} )
        {
            foreach my $ref ( @{$refs_at_pctid{$read}{$pctid}})
            {
                #
                # Check the alignment for large gaps and/or remove terminal gaps
                #
                my $original_align = $uc_aligns{$read}{$ref};
                my $align = $original_align; # Use this to remove terminal gaps
                if ( ($ignore_terminal_gaps) || ($ignore_all_gaps) )
                {
                    # Strip leading/trailing insertions AND deletions.
                    $align =~ s/^[0-9]*[DI]//;
                    $align =~ s/[0-9]*[DI]$//;
                }
                elsif($use_full_length)
                {
                    # Full-length refs: only strip terminal insertions.
                    $align =~ s/^[0-9]*[I]//;
                    $align =~ s/[0-9]*[I]$//;
                }
                $found_hit = 1;
                # has internal gaps
                if ( ($use_full_length) || (! $ignore_all_gaps) )
                {
                    # Walk the alignment, measuring each run of indels;
                    # a run longer than $max_gap disqualifies this ref.
                    while ($align =~ /[DI]/)
                    {
                        $align =~ s/^[0-9]*[M]//; # leading remove matches
                        $align =~ s/^[ID]//; # remove singleton indels
                        if ($align =~ /^[0-9]*[ID]/)
                        {
                            my $gap = $align;
                            $align =~ s/^[0-9]*[ID]//; # remove gap from aligment
                            # NOTE: $align is interpolated unescaped into this
                            # regex; safe only because alignments contain [0-9MDI].
                            $gap =~ s/[ID]$align//; # remove alignment from gap
                            # if too large a gap, tends to be indicative of chimera or other non-matches
                            # then skip to the next ref
                            if ($gap > $max_gap)
                            {
                                if ($verbose) { print "Skip $ref of $read at $pctid pctid for $gap gap. \n";}
                                $found_hit = 0;
                                last;
                            }
                        }
                    }
                    if (! $found_hit) {next;} #don't print this one out.
                }
                #
                # convert from percent identity to distance
                # (rounded to one decimal of percent, expressed as a fraction)
                #
                my $dist = (int((10* (100 -$pctid)) + 0.5)) / 1000;
                #
                # print out the data
                #
                if ($verbose) {print join ("\t", $read, $ref, $pctid, $dist, $original_align) . "\n"; }
                push ( @{$results{$read}}, [ $ref, $dist, $original_align] ) ;
            }
            # Don't go to lower PctIDs if found a hit at this PctID.
            if ($found_hit) {last;}
        }
    }
    return \%results;
}
####################################### | |
# | |
# Subroutine: load_reftaxa | |
# get dupes of the reference sequences and their taxonomy | |
# | |
####################################### | |
# Loads the reference taxonomy file into a hash of
#   ref_id => [ taxonomy, taxonomy, ... ]   (one entry per reference copy)
# so that the later consensus vote is weighted by reference abundance.
# The file is tab-delimited: column 0 = ref_id, column 1 = taxonomy string,
# column 2 = count of reference sequences sharing that taxonomy.
# (The original header comment had columns 1 and 2 swapped; the code and the
# -rtax usage text agree on the order documented here.)
sub load_reftaxa
{
    my $tax_file = shift;
    my %taxa;
    my $no_freq = 0;    # set when any row lacked a usable count (see note below)
    open(TAX, "<", $tax_file) || die ("Unable to open reference taxonomy file: $tax_file. Exiting\n");
    while (my $line = <TAX>)
    {
        chomp $line;
        # 0=ref_id, 1 = taxa, 2 = count
        my @data = split(/\t/, $line);
        # if there is no frequency make it 1 (missing, zero, or non-numeric)
        if ((!($data[2])) || ($data[2] =~ /\D/))
        {
            $data[2] = 1;
            $no_freq = 1;
        }
        # Push one copy of the taxonomy per reference sequence.
        # (The original looped 1..$data[2] with a dead "unless ($i =~ /\D/)"
        # guard -- $i is always numeric -- which the list-repetition operator
        # replaces with identical behavior.)
        push (@{$taxa{$data[0]}}, ($data[1]) x $data[2]);
    }
    close(TAX);
    #if ($no_freq == 1)
    #{
    # print "ERROR: There is no frequency in the .tax file! Use 1.\n";
    #}
    return \%taxa;
}
####################################### | |
# | |
# Subroutine: assign_taxonomy | |
# get dupes from the names file and calculate consensus taxonomy | |
# | |
####################################### | |
# Reads the .names file (read_id<TAB>comma-separated duplicate ids), computes
# a consensus taxonomy for every unique read from its reference hits, and
# writes one output row per duplicate to the OUT handle.
# Args: names filename, hashref from parse_uclust, hashref from load_reftaxa.
# Relies on file-scoped: OUT handle, $gast_table, $terse, $majority, and the
# external Taxonomy module (new/consensus/taxstring/depth).
sub assign_taxonomy
{
    my $names_file = shift;
    my $results_ref = shift;
    my %results = %$results_ref;
    my $ref_taxa_ref = shift;
    my %ref_taxa = %$ref_taxa_ref;
    # print LOG 'EEE %results = ';
    # use Data::Dumper;
    # print LOG Dumper %results; # or \%hash to encapsulate it as a single hashref entity;
    # print the field header lines, but not if loading table to the database
    if (! $gast_table)
    {
        if ($terse)
        {
            print OUT join("\t", "read_id", "taxonomy", "distance", "rank") . "\n";
        } else {
            print OUT join("\t", "read_id", "taxonomy", "distance", "rank", "refssu_count", "vote", "minrank", "taxa_counts", "max_pcts", "na_pcts", "refhvr_ids") . "\n";
        }
    }
    open(NAMES, "<$names_file") || die ("Unable to open names file: $names_file. Exiting\n");
    while (my $line = <NAMES>)
    {
        # Parse the names information
        chomp $line;
        my @data = split(/\t/, $line);
        my @dupes = split(/,/, $data[1]);
        my $read = $data[0];
        my @taxObjects;     # one Taxonomy object per reference copy
        my $distance;
        my %refs_for;       # read_id => [ref ids hit]; reported in the last column
        if (! exists $results{$read})
        {
            # No valid hit in the reference database
            $results{$read} = ["Unknown", 1, "NA", 0, 0, "NA", "0;0;0;0;0;0;0;0", "0;0;0;0;0;0;0;0", "100;100;100;100;100;100;100;100"];
            $refs_for{$read} = [ "NA" ];
        } else {
            # Create an array of taxonomy objects for all the associated refssu_ids.
            for my $i ( 0 .. $#{$results{$read}} )
            {
                # %{$results{$read}} is a hash of an array of arrays, so use index $i to step through each array,
                ### 0= $ref, 1= $dist, 2= $original_align, 3= $taxonomy_of{$ref} ];
                my $ref = $results{$read}[$i][0];
                # grab all the taxa assigned to that ref in the database, and add to the taxObjects to be used for consensus
                foreach my $t (@{$ref_taxa{$ref}})
                {
                    push ( @taxObjects, Taxonomy->new($t) );
                }
                # maintain the list of references hit
                push ( @{$refs_for{$read}}, $results{$read}[$i][0] );
                # should all be the same distance
                # (parse_uclust only keeps hits at one pctid per read)
                $distance = $results{$read}[$i][1];
            }
            # Lookup the consensus taxonomy for the array
            my @taxReturn = Taxonomy->consensus(@taxObjects, $majority);
            # 0=taxObj, 1=winning vote, 2=minrank, 3=rankCounts, 4=maxPcts, 5=naPcts;
            my $taxon = $taxReturn[0]->taxstring;
            my $rank = $taxReturn[0]->depth;
            if (! $taxon) {$taxon = "Unknown";}
            # (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts, max_pcts, na_pcts)
            $results{$read} = [ $taxon, $distance, $rank, scalar @taxObjects, $taxReturn[1], $taxReturn[2], $taxReturn[3], $taxReturn[4], $taxReturn[5] ] ;
        }
        # Replace hash with final taxonomy results, for each copy of the sequence
        for my $d (@dupes)
        {
            # @dupes includes the original as well, not just its copies
            if ($terse)
            {
                print OUT join("\t", $d, $results{$read}[0], $results{$read}[1], $results{$read}[2]) . "\n";
            } else {
                print OUT join("\t", $d, @{$results{$read}}, join(",", sort @{$refs_for{$read}})) . "\n";
                # returns wrong results:
                # my $temp_res = @{$results{$read}};
                # print OUT join("\t", $d, $$temp_res, join(",", sort @{$refs_for{$read}})) . "\n";
            }
        }
    }
    close(NAMES);
    close(OUT);
    return (\%results);
}