## getSeq
- GitHub page: https://github.com/shenjean/getMito
- Wiki page: https://github.com/shenjean/getMito/wiki

In [6]:
# From a specified getMito output file or a user-provided list of fish mitochondrial NCBI accession numbers, this script 
# extracts the corresponding sequences and outputs a fasta file

# This script is interactive and takes 4 inputs, in the following order:
# 1) Input file name with extension (e.g. input.txt)
# 2) Whether the input is a getMito output file (--getmito) or a custom list of accession numbers (--plain)
# 3) Reference database - either 12S.pickle, COI.pickle, or mitofish.pickle
# 4) Output file name



import sys
import os.path
from os import path
import pickle

# Get the 4 inputs from user, in this order: input file name, input type,
# reference database file name, output file name.

input_file = input()

# Read every line of the input file up front. The context manager makes sure
# the file handle is closed as soon as the lines have been read.
with open(input_file, 'r') as acc_handle:
    acc = tuple(acc_handle)

input_type = input()
reference_file = input()
output_file = input()

# Throw an error message and exit if the output file already exists.
# Sequences are appended to the output file, so an existing file would
# otherwise end up mixed with the results of this run.

if path.exists(output_file):
    sys.exit("Error: Output file exists! Please rename output file and try again!")

nohit=0
count=0

# Cache of unpickled reference databases, keyed by file path, so that the
# reference is read from disk once instead of once per query.
_reference_cache = {}


def _load_reference(reference_path):
    """Return the unpickled reference database at reference_path, loading it at most once."""
    if reference_path not in _reference_cache:
        # NOTE(security): pickle.load can execute arbitrary code while
        # unpickling. Only use reference files from a trusted source.
        with open(reference_path, 'rb') as handle:
            _reference_cache[reference_path] = pickle.load(handle)
    return _reference_cache[reference_path]


# This function does the sequence matching

def matchseq(query):
    """Look up one accession in the reference database and append any hit to the output file.

    Reads the module-level names ``reference_file`` (path of the pickled
    reference; the unpickled object is used as a mapping from accession to
    sequence) and ``output_file`` (path of the fasta file, opened in append
    mode).

    Args:
        query: Accession number to search for.

    Returns:
        None. A status line is printed for every query; on a match a
        two-line fasta record (">accession", sequence) is appended to the
        output file.
    """
    seqpickle = _load_reference(reference_file)

    # The output file is opened even when there is no hit, so it is always
    # created, exactly as before.
    with open(output_file, 'a') as output:
        if query in seqpickle:
            print("%s:\tSequence match found!" % query)
            output.write(">%s\n%s\n" % (query, seqpickle[query]))
        else:
            print("%s:\tNo hit found!" % query)

# The input type is stored in input_type; testing it here avoids the
# NameError that the previously referenced, undefined name raised.
if "mito" in input_type:
    print("== Searching for sequences from getMito output ==")
    seen = set()

    for record in acc:

        # Extract accession number from getMito output: it is the third
        # tab-delimited field. The line ending is removed first so that an
        # accession in the last column does not keep a trailing newline.
        fields = record.rstrip("\r\n").split("\t")

        if len(fields) < 3 or not fields[2].strip():
            # Blank lines are skipped silently; other lines without a third
            # field are reported instead of crashing with an IndexError.
            if record.strip():
                print("Warning: Skipping line without an accession number: %s" % record.strip())
            continue

        inquery = fields[2].strip()

        # Match each unique query
        if inquery not in seen:
            matchseq(query=inquery)
            seen.add(inquery)
        else:
            print("Duplicate warning: Accession %s has already been processed." % inquery)

# The input type is stored in input_type; testing it here avoids the
# NameError that the previously referenced, undefined name raised.
if "plain" in input_type:
    print("== Searching for sequences from user's list ==")
    seen = set()

    for record in acc:

        # Extract accession number from the user's list: it is the first
        # whitespace-delimited token on the line.
        fields = record.split()

        if not fields:
            # Blank lines carry no accession; skip them instead of crashing
            # with an IndexError.
            continue

        inquery = fields[0]

        # Match each unique query
        if inquery not in seen:
            matchseq(query=inquery)
            seen.add(inquery)
        else:
            print("Duplicate warning: Accession %s has already been processed." % inquery)


print("== Search complete! ==")




acc
plain
12S.pickle
plaintest.fasta
== Searching for sequences from user's list ==
AY484973:	No hit found!
AY484974:	No hit found!
HM114425:	No hit found!
HM114426:	No hit found!
HM114427:	No hit found!
NP_67891:	No hit found!
NP_12345:	No hit found!
NC_001727:	Sequence match found!
NC_031380:	Sequence match found!
== Search complete! ==
