#### Prepare KpH-2019703 catchsample list of species

Remove non-standard characters `Å`, `Ø`, and `é` using SublimeText:

```
# Work in the cruise's fish-data folder (path is specific to the author's machine)
cd /Users/luke/NOAA/Cruises/KpH-May2019/fishdata
# Keep the original export untouched; all edits are made to the copy
cp catchsample.txt catchsample_stdchars.txt
# remove non-standard characters in catchsample_stdchars.txt
# (manual step: done by hand in a text editor, no command is run here)
```

Parse catchsample to get lists of unique taxa: subspecies, species, genera, families (-ae), orders (-es) from field "scientificname":

```
# Field 11 of the tab-separated catchsample file holds "scientificname".
# TAB is cut's default delimiter, so no -d option is required.

# subspecies (three or more words; only the first three words are kept)
cut -f11 catchsample_stdchars.txt | perl -lne '@F=split / /; if (@F>2) {print "$F[0] $F[1] $F[2]"}' | sort | uniq > catchsample_stdchars_subspecies_uniq.txt

# species (two words)
cut -f11 catchsample_stdchars.txt | perl -lne '@F=split / /; if (@F==2) {print "$F[0] $F[1]"}' | sort | uniq > catchsample_stdchars_species_uniq.txt

# genera (one word NOT ending in -ae or -es; the "scientificname" header row is dropped)
# NOTE: a genus whose name itself ends in -ae or -es is excluded here and lands in
# the families/orders lists instead, so check those lists by hand.
cut -f11 catchsample_stdchars.txt | perl -lne '@F=split / /; if (@F==1) {print $F[0]}' | grep -Ev '^[A-Z][a-z]*(ae|es)$' | grep -v "scientificname" | sort | uniq > catchsample_stdchars_genera_uniq.txt

# families (one word ending in -ae)
cut -f11 catchsample_stdchars.txt | perl -lne '@F=split / /; if (@F==1) {print $F[0]}' | grep -E '^[A-Z][a-z]*ae$' | sort | uniq > catchsample_stdchars_families_uniq.txt

# orders (one word ending in -es)
cut -f11 catchsample_stdchars.txt | perl -lne '@F=split / /; if (@F==1) {print $F[0]}' | grep -E '^[A-Z][a-z]*es$' | sort | uniq > catchsample_stdchars_orders_uniq.txt
```

#### Prepare MitoFish data

MitoFish website: http://mitofish.aori.u-tokyo.ac.jp/download.html

##### Download and process mitoannotator (complete mitogenome) records:

```
# Download and unzip file in a folder named 'mitoannotations'
# (-p: do not fail when the folder already exists from an earlier run)
mkdir -p mitoannotations
cd mitoannotations
wget http://mitofish.aori.u-tokyo.ac.jp/files/mitoannotations.zip
unzip mitoannotations.zip

# get accession numbers (first two "_"-separated fields of each annotation file name)
ls *.txt | cut -d '_' -f1,2 >complete.accession

# get species names (remaining fields of the file name) and swap the ".txt"
# extension for the "#complete mitogenomes" tag. The dot is escaped and the
# match anchored to the end so "txt" elsewhere in a name is never touched.
ls *.txt | cut -d '_' -f3- | sed 's/\.txt$/#complete mitogenomes/' >complete.species

# Make a list of accession number and species, separated by "#"
paste -d "#" complete.accession complete.species >complete.list

```
##### Download and process complete+partial mtDNA sequence file:

```
# Fetch the complete+partial sequence archive into the same "mitoannotations" folder and extract it
wget http://mitofish.aori.u-tokyo.ac.jp/files/complete_partial_mitogenomes.zip
unzip complete_partial_mitogenomes.zip

# Accession numbers: second "|"-separated field of every line containing ">"
awk -F "|" '/>/ {print $2}' complete_partial_mitogenomes.fa >mitofish.accession

# Species names: third "|"-separated field, with a trailing " ([...])" suffix
# removed and spaces converted to underscores
awk -F "|" '/>/ {print $3}' complete_partial_mitogenomes.fa | sed 's/ (\[.*\])$//' | tr " " "_" >mitofish.species

# Pair each accession number with its species name, separated by "#"
paste -d "#" mitofish.accession mitofish.species >mitofish.list

```



#### Prepare NCBI data

NCBI blast databases ftp site: https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/

```
# Download and decompress NCBI sequence file in the "NCBI" subfolder
mkdir NCBI
cd NCBI
wget https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz
wget https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz.md5
# Verify the download against the published checksum; the result is written to
# md5sum.log, so inspect it before continuing
md5sum -c nt.gz.md5 >md5sum.log
# nt.gz is gzip-compressed (not a zip archive), so it is expanded with gunzip,
# which replaces nt.gz with the plain FASTA file "nt"
gunzip nt.gz

# get accession numbers (first space-separated token of each header, without the ">")
grep ">" nt | cut -d ' ' -f1 | tr -d ">"  >nt.accession 

# get gene names (everything after the first space of each header)
grep ">" nt | cut -d ' ' -f2- >nt.genenames 

# Make a list of accession number and gene name, separated by "#"
paste -d "#" nt.accession nt.genenames >nt.list

# Split the list into smaller files of 1 million rows each
split -d -l 1000000 nt.list

# Each split file will have the prefix x followed by a number e.g. x00, x01
```

#### Extract (grep) NCBI gene names for MitoFish accession numbers (excluding complete mitogenomes)
```

# For every MitoFish accession, search each split file (x00, x01, ...) and append
# the matching rows to ../mitofish.genenames.
# xargs starts one grep per split file (-n 1), run in parallel (-P 0).
# -F makes grep treat the accession as a literal string, so a "." in an accession
# is not a regex wildcard; quoting "$id" and the "--" guard keep the shell and
# grep from misreading unusual ids.
for id in $(cat ../mitofish.accession); do ls x* | xargs -n 1 -P 0 grep -F -- "$id" >>../mitofish.genenames ; done

```

#### Extract (grep) 12S genes from NCBI records

```
# Keep every nt.list row whose description mentions "12S ribosomal" or "12S rRNA"
grep -E "12S (ribosomal|rRNA)" nt.list >12S.list
```

In [81]:
import sys
import os.path
from os import path

# Search the 12S record list for each query taxon at three levels of
# specificity (full name, genus + species, genus only) and write the
# de-duplicated matching records to one output file per level.

# Modify input file name here
input_file = 'fishdata/catchsample_stdchars_subspecies_uniq.txt'

# Read every query taxon (one per line). The context manager closes the
# handle instead of leaving it open for the rest of the session.
with open(input_file, 'r') as ref_file:
    ref = tuple(ref_file)

# Modify output file prefix here
# 3 output files will be generated = <prefix>_genus.hits.txt, <prefix>_species.hits.txt and <prefix>_fulltaxonomy.hits.txt
# Output file is separated by "#" in the format: GenBank_Accession#Gene_description

output_prefix = 'newtest'

full_path = output_prefix + "_fulltaxonomy.hits.txt"
species_path = output_prefix + "_species.hits.txt"
genus_path = output_prefix + "_genus.hits.txt"

# Throw an error message and exit if output file(s) already exist

if path.exists(full_path) or path.exists(species_path) or path.exists(genus_path):
    sys.exit("Error: Output file exists! Please rename output file and try again!")

# Load the 12S records once, before any output file is created: the list is
# the same for every query, and a missing 12S.list no longer leaves empty
# output files behind that would block the next run.
with open('12S.list', 'r') as records_file:
    records = records_file.readlines()

# Create sets to hold lines already seen, for line deduplication later
full_seen = set()
species_seen = set()
genus_seen = set()

# Create output files if all is well; `with` closes them even when the
# search is interrupted by an error.
with open(full_path, 'a') as full_hit, \
        open(species_path, 'a') as species_hit, \
        open(genus_path, 'a') as genus_hit:

    for i, entry in enumerate(ref):
        taxa = entry.split()
        if not taxa:
            # Blank line: there is nothing to search for, and an empty
            # query string would match every record.
            continue

        # Full name exactly as written in the input (minus the newline).
        fullquery = entry.rstrip("\n")
        # First two words; for a one-word taxon this is just that word, so
        # genus/family/order lists can be used as input without crashing.
        squery = " ".join(taxa[:2])
        gquery = taxa[0]

        fullcount = 0
        scount = 0
        gcount = 0

        # Query numbers follow input line numbers (1-based).
        qcount = i + 1
        print("=== Searching query #%d ===" % qcount)

        # Plain substring matching: a record is a hit when the query text
        # occurs anywhere in the line.
        for line in records:
            if fullquery in line:
                fullcount += 1
                if line not in full_seen:
                    full_hit.write(line)
                    full_seen.add(line)
            if squery in line:
                scount += 1
                if line not in species_seen:
                    species_hit.write(line)
                    species_seen.add(line)
            if gquery in line:
                gcount += 1
                if line not in genus_seen:
                    genus_hit.write(line)
                    genus_seen.add(line)

        print("%s hits (full taxonomy): %d" % (fullquery, fullcount))
        print("%s hits (species): %d" % (squery, scount))
        print("%s hits (genus): %d" % (gquery, gcount))

print("==== Run complete! ===")
print("Accession number of full taxonomy hits and description saved in <%s_fulltaxonomy.hits.txt>" % output_prefix)
print("Accession number of species hits and description saved in <%s_species.hits.txt>" % output_prefix)
print("Accession number of genus hits and description saved in <%s_genus.hits.txt>" % output_prefix)




=== Searching query #1 ===
Histioteuthis celetaria celetaria hits (full taxonomy): 0
Histioteuthis celetaria hits (species): 0
Histioteuthis hits (genus): 1
=== Searching query #2 ===
Histioteuthis corona corona hits (full taxonomy): 0
Histioteuthis corona hits (species): 0
Histioteuthis hits (genus): 1
=== Searching query #3 ===
Lampadena urophaos atlantica hits (full taxonomy): 0
Lampadena urophaos hits (species): 3
Lampadena hits (genus): 5
=== Searching query #4 ===
Notoscopelus elongatus kroyeri hits (full taxonomy): 0
Notoscopelus elongatus hits (species): 0
Notoscopelus hits (genus): 7
=== Searching query #5 ===
Scopelogadus mizolepis mizolepis hits (full taxonomy): 0
Scopelogadus mizolepis hits (species): 0
Scopelogadus hits (genus): 5
=== Searching query #6 ===
Stomias boa boa hits (full taxonomy): 0
Stomias boa hits (species): 2
Stomias hits (genus): 11
==== Run complete! ===
Accession number of full taxonomy hits and description saved in <newtest_fulltaxonomy.hits.txt>
Acces