## getMito
- GitHub page: https://github.com/shenjean/getMito
- Wiki page: https://github.com/shenjean/getMito/wiki

In [73]:
# From a user-provided list of genera/species/subspecies, this script extracts the corresponding GenBank accession numbers 
# and gene names of their 12S rRNA sequences or mitochondrial sequences, if available.

# This script is interactive and takes 3 inputs, in the following order:
# 1) Input file name with extension (e.g. input.txt)
# 2) Output file prefix for output files: <prefix>_genus.hits.tsv, <prefix>_species.hits.tsv, <prefix>_subspecies.hits.tsv
# 3) Reference database - either 12S.ref.tsv or mitofish.ref.tsv



import sys
import os.path
from os import path

# Get the 3 inputs from user (read from stdin in this order:
# input file name, output prefix, reference database file name)

input_file = input()
# Load every query line up front; the context manager closes the handle
# instead of leaving it open for the rest of the run.
with open(input_file, 'r') as query_handle:
    ref = tuple(query_handle)
output_prefix = input()
reference_file = input()

# Fail before any searching starts if the reference database is missing.
# Otherwise the first search would create an empty output file and then crash,
# and the next run would be rejected by the "output file exists" check below.
if not path.isfile(reference_file):
    sys.exit("Error: Reference file not found! Please check the file name and try again!")

# Throw an error message and exit if output file(s) already exist

full_path = output_prefix + "_subspecies.hits.tsv"
species_path = output_prefix + "_species.hits.tsv"
genus_path = output_prefix + "_genus.hits.tsv"


if any(path.exists(existing) for existing in (full_path, species_path, genus_path)):
    sys.exit("Error: Output file exists! Please rename output file and try again!")
    
# This function performs matching at the specified level and writes results to the corresponding output file
# Output files are tab-separated with the following columns:
# Query, taxonomic level, GenBank accession number, gene description

def matchme(query, level, ref_path=None, out_prefix=None):
    """Search the reference file for `query` and append all hits to the level's output file.

    A reference line is a hit when it contains `query` as a substring.
    Each hit is appended to ``<prefix>_<level>.hits.tsv`` as one tab-separated
    record: the query, the level, then the matching reference line.
    A one-line summary with the hit count is printed to stdout.

    Args:
        query: Text to look for in each reference line.
        level: Matching level label (e.g. "genus", "species", "subspecies");
            used in the output file name and as the second output column.
        ref_path: Reference file to search. Defaults to the module-level
            ``reference_file``.
        out_prefix: Prefix of the output file. Defaults to the module-level
            ``output_prefix``.

    Returns:
        None.
    """
    if ref_path is None:
        ref_path = reference_file
    if out_prefix is None:
        out_prefix = output_prefix

    outpath = out_prefix + "_" + level + ".hits.tsv"
    count = 0

    # The reference is opened first so that a missing reference file does not
    # leave an empty output file behind. The output is opened in append mode
    # even when nothing matches: the end-of-run report uses the file's
    # existence to tell which levels were searched.
    with open(ref_path, 'r') as f, open(outpath, 'a') as output:
        for line in f:
            if query in line:
                count += 1
                # Re-add the newline explicitly: the last reference line may
                # lack one, which would fuse this record with the next hit.
                output.write("%s\t%s\t%s\n" % (query, level, line.rstrip("\n")))

    print("Query:%s\tLevel:%s\t# Hits:%d" % (query, level, count))

# The loop below goes through the input file line by line.
# `seen` holds every query string already searched, so each unique
# genus/species/subspecies string is matched only once per run.
seen = set()


def _search_unique(query, level, seen, notice="Duplicate query"):
    """Run matchme for `query` at `level` unless that query was already searched."""
    if query in seen:
        print("%s: %s %s has already been processed." % (notice, level.capitalize(), query))
        return
    matchme(query=query, level=level)
    seen.add(query)


for qcount, raw_line in enumerate(ref, start=1):
    # Split on any whitespace so that trailing blanks, tabs, repeated spaces
    # or a Windows "\r" cannot change which level a query is classified as.
    taxa = raw_line.split()
    print("=== Searching user query #%d ===" % qcount)

    if not taxa:
        # Blank line in the input file: there is nothing to search for.
        print("Skipping empty query.")
        continue

    # Rebuild the genus, species and full query from the tokens of THIS line,
    # so no value can leak in from a previous iteration.
    gquery = taxa[0]
    squery = " ".join(taxa[:2])
    fullquery = " ".join(taxa)

    # The number of tokens decides the level of matching:
    # 1 token -> genus only; 2 tokens -> species + genus;
    # 3 or more tokens -> subspecies + species + genus.
    if len(taxa) == 1:
        _search_unique(gquery, "genus", seen, notice="Duplicate warning")
        continue

    if len(taxa) > 2:
        _search_unique(fullquery, "subspecies", seen)
    _search_unique(squery, "species", seen)
    _search_unique(gquery, "genus", seen)

print ("==== Run complete! ===")

# Check and report on the types of output files generated.
# Each entry: (output file path, message when the file exists, message when it does not).
# The order below (genus, species, subspecies) is the order the lines are printed in.
for hits_path, saved_msg, missing_msg in (
    (
        genus_path,
        "Accession numbers of genus hits and description saved in %s",
        "No genus detected in input file.",
    ),
    (
        species_path,
        "Accession numbers of species hits and description saved in %s",
        "No species detected in input file.",
    ),
    (
        full_path,
        "Accession numbers of subspecies hits and description saved in %s",
        "No subspecies detected in input file.",
    ),
):
    if not path.exists(hits_path):
        print(missing_msg)
    else:
        print(saved_msg % hits_path)




subspecies
pysub
mitofish.ref.tsv
=== Searching user query #1 ===
Query:Histioteuthis celetaria celetaria	Level:subspecies	# Hits:0
Query:Histioteuthis celetaria	Level:species	# Hits:0
Query:Histioteuthis	Level:genus	# Hits:0
=== Searching user query #2 ===
Query:Histioteuthis corona corona	Level:subspecies	# Hits:0
Query:Histioteuthis corona	Level:species	# Hits:0
Duplicate query: Genus Histioteuthis has already been processed.
=== Searching user query #3 ===
Query:Stomias boa boa	Level:subspecies	# Hits:0
Query:Stomias boa	Level:species	# Hits:28
Query:Stomias	Level:genus	# Hits:86
=== Searching user query #4 ===
Query:Lampadena urophaos atlantica	Level:subspecies	# Hits:0
Query:Lampadena urophaos	Level:species	# Hits:13
Query:Lampadena	Level:genus	# Hits:63
=== Searching user query #5 ===
Query:Notoscopelus elongatus kroyeri	Level:subspecies	# Hits:3
Query:Notoscopelus elongatus	Level:species	# Hits:18
Query:Notoscopelus	Level:genus	# Hits:64
=== Searching user query #6 ===
Query:Sc