This notebook is for troubleshooting plastome annotations using NCBI-provided software pipelines. This approach was suggested by the NCBI team, which rejected the annotation twice after it was uploaded.

In [4]:
import os

# Input files used by the cells below (all paths are relative).
template = 'plastomes/final/template.sbt'  # submission template, later passed to table2asn with -t
gb = 'plastomes/final/Crepis_callicephala.gb'  # GenBank record; source for the FASTA conversion below
fasta = 'plastomes/in/Crepis_callicephala.fasta'  # nucleotide FASTA; created from `gb` if it is missing

# Basic form of the command this notebook works towards:
# table2asn -t template.sbt -i sequence.fsa

In [5]:
# check for fasta

from Bio import SeqIO, SeqRecord

# Make sure the FASTA input exists; if it does not, create it from the first
# record of the GenBank file.
try:
    # Opening the file is only used as an existence probe.
    with open(fasta, "r"):
        print(f"File {fasta} does exist.")
except FileNotFoundError:
    print(f"File '{fasta}' not found.\nConverting genbank file '{gb}' to FASTA...")
    try:
        first_record = next(SeqIO.parse(gb, "genbank"))
    except FileNotFoundError:
        print(f"Error: The source GenBank file '{gb}' was not found.")
    except StopIteration:
        print(f"Error: The GenBank file '{gb}' is empty.")
    else:
        # The write is deliberately outside the try block above: a missing
        # output directory also raises FileNotFoundError and must not be
        # reported as a missing GenBank file.
        try:
            SeqIO.write(first_record, fasta, "fasta")
            print(f"Successfully created '{fasta}'.")
        except FileNotFoundError:
            print(f"Error: Cannot write '{fasta}': the output directory does not exist.")

File plastomes/in/Crepis_callicephala.fasta does exist.


## Run `table2asn` program

There are some recommendations on the NCBI site that are not covered in the program's README file.
`table2asn` will recognize files with **the same basename** as the input sequence file. Sequences that are part of a plasmid, or an organellar chromosome, or specific nuclear chromosomes need to have that information included in the fasta definition line, in these formats:

- [location=mitochondrion]
- [location=chloroplast]

Sequences that are a complete circular chromosome or plasmid need to have the circular topology and the completeness included.

- [topology=circular] [completeness=complete]
- [topology=circular] gap at end, not circularized




In [6]:
# Preparing the table2asn working directory.
# All input files for one species have to be stored in the same directory.
import os

# Root directory that holds one sub-directory per species (see SHORT_NAMES).
project_dir = 'plastomes/table2asn'
# Feature table and Sequin file; judging by the path they come from a GB2Sequin run — TODO confirm.
tbl_file = "plastomes/final/gb2sequin_out/c_callicephala_final/05/Crepis_callicephala___149980_bp____DNA.tbl"
sqn_file = "plastomes/final/gb2sequin_out/c_callicephala_final/05/Crepis_callicephala___149980_bp____DNA(1).sqn"

# Maps the species name found in file names to the short label used as the
# species sub-directory inside the table2asn project directory.
SHORT_NAMES = {
    "Crepis_callicephala": "cc",
    "Crepis_purpurea": "cp",
}

def create_symlinks(project_dir: str, sourcefile: str):
    """
    Create a symlink to `sourcefile` inside the species sub-directory of `project_dir`.

    The species name is everything in the file name before the first "___"
    (or before the first "." when there is no "___"). The link is created as
    "<project_dir>/<short label>/<species>.<ext>", so every linked file shares
    the species basename. The species sub-directory is created if needed.

    Parameters:
        project_dir: root directory that contains one sub-directory per species.
        sourcefile: path of the file to link; it must exist.

    Returns:
        None. The outcome is reported with print().

    Raises:
        KeyError: if the species has no entry in SHORT_NAMES.
    """
    basename = os.path.basename(sourcefile)
    ext = basename.rsplit(".", 1)[-1]

    # Drop the "___<length>_bp____DNA" suffix (tbl/sqn names), then the extension.
    species = basename.split("___")[0].split(".")[0]

    try:
        label = SHORT_NAMES[species]
    except KeyError:
        raise KeyError(
            f"No short name registered in SHORT_NAMES for species '{species}'."
        ) from None

    abs_target_dir = os.path.abspath(os.path.join(project_dir, label))
    # filename should match sequence id in FASTA header
    target_filename = os.path.join(abs_target_dir, f"{species}.{ext}")
    abs_sourcefile = os.path.abspath(sourcefile)

    # os.symlink() happily creates a dangling link, so the source has to be
    # checked explicitly before linking.
    if not os.path.exists(abs_sourcefile):
        print(f"Source file '{sourcefile}' does not exist.")
        return

    os.makedirs(abs_target_dir, exist_ok=True)
    try:
        # The link stores an absolute path so that it stays valid regardless
        # of the current working directory.
        os.symlink(abs_sourcefile, target_filename)
        print(f"Symlink '{target_filename}' for '{abs_sourcefile}' was successfully created.")
    except FileExistsError:
        print(f"Symlink for '{sourcefile}' is already exist.")
    except PermissionError:
        print("You have no permissions for this action.")


print("Creating symbolic links:")
# Not linked: the GenBank file is an output file of the pipeline.
# create_symlinks(project_dir, gb) #  this is output file
# Not linked: the FASTA header line has to be modified, so the file must not be shared via a link.
# create_symlinks(project_dir, fasta) # the header line should be modified
# Only the feature table is linked into the working directory.
create_symlinks(project_dir, tbl_file)
# Not linked: the .sqn file is an output file as well.
#create_symlinks(project_dir, sqn_file) #  this is output file


Creating symbolic links:
Symlink for 'plastomes/final/gb2sequin_out/c_callicephala_final/05/Crepis_callicephala___149980_bp____DNA.tbl' is already exist.


The header line of the FASTA file should follow the definition line conventions. To keep the previous files intact while changing the required data, the file is copied rather than symlinked.

In [7]:
import shutil

# Copy the FASTA file into the species working directory under the `.fsa`
# extension, so that its header can be edited without touching the original.
destination_directory = f"{project_dir}/cc"
# The stem is computed outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
fasta_stem = os.path.basename(fasta).rsplit(".")[0]
dest_file = f"{destination_directory}/{fasta_stem}.fsa"
print("destination:", dest_file)

# An existing copy is never overwritten, because its header may already have
# been edited by hand.
if not os.path.exists(dest_file):
    if not os.path.isfile(fasta):
        print("Source file not found.")
    else:
        try:
            # Without this a missing destination directory would surface as
            # FileNotFoundError and look like a missing source file.
            os.makedirs(destination_directory, exist_ok=True)
            shutil.copy(fasta, dest_file)
            print(f"File '{fasta}' copied to '{destination_directory}' successfully.")
        except Exception as e:
            print(f"An error occurred: {e}")

destination: plastomes/table2asn/cc/Crepis_callicephala.fsa


In [14]:
# testing table2asn
import shlex
import subprocess
import sys


# Configuration
# NOTE(review): CONDA_ENV_NAME is not used below; both commands run in the
# "table2asn-env" environment. Confirm which environment is intended.
CONDA_ENV_NAME = "plastome_post-env"
INPUT_SUBDIR = "cc"

# Source qualifiers handed to table2asn with -j.
location = "[location=chloroplast]"
topology = "[topology=circular]"
completeness = "[completeness=complete]"
organism = "[organism=Crepis callicephala]"
code = "[gcode=11]"
pcode = "[gpcode=11]"  # NOTE(review): defined but not included in src_qualifiers

src_qualifiers = f"{organism} {location} {topology} {completeness} {code}"
print(src_qualifiers)

input_dir = f"{project_dir}/{INPUT_SUBDIR}"
# Check inputs
print(f"Checking input directory...\n  {input_dir} is present: {os.path.isdir(input_dir)}")
print(f"Checking input files...\n  {template} is present: {os.path.isfile(template)}")
present_flag = False
for i in os.listdir(input_dir):
    file = os.path.join(input_dir, i)
    # os.path.exists() follows symlinks, so a broken link is reported as absent.
    present_flag = os.path.exists(file)
    print(f"  {i} is present: {present_flag}")
    if not present_flag:
        print(f"    relative path: {file}")


# table2asn -indir {project_dir}/cc -t {template} -j {comment}

cmd = [
    "conda", "run", "-n", "table2asn-env",
    "table2asn",
    "-indir", str(os.path.abspath(input_dir)),
    "-t", str(os.path.abspath(template)),
    # The command is run without a shell, so every list item reaches the
    # program verbatim. Wrapping the value in quotes here would make the quote
    # characters part of the qualifier string.
    "-j", src_qualifiers,
    "-V", "vb",
    "-verbose",
]

cmd_version = [
    "conda", "run", "-n", "table2asn-env",
    "table2asn",
    "-version",
]

# shlex.join() adds shell quoting for display only, so the printed line can be
# pasted into a terminal as is.
print(f"Running: {shlex.join(cmd)}")
#print(f"Running: {shlex.join(cmd_version)}")

try:
    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    print("STDOUT:", result.stdout, flush=True)
    print("STDERR:", result.stderr, flush=True)
    print("`table2asn` completed successfully!", flush=True)
except subprocess.CalledProcessError as e:
    print(f"`table2asn` failed with exit code {e.returncode}")
    # The output is captured, so it has to be printed explicitly or the reason
    # for the failure is lost.
    print("STDOUT:", e.stdout)
    print("STDERR:", e.stderr)
    sys.exit(e.returncode)

[organism=Crepis callicephala] [location=chloroplast] [topology=circular] [completeness=complete] [gcode=11]
Checking input directory...
  plastomes/table2asn/cc is present: True
Checking input files...
  plastomes/final/template.sbt is present: True
  Crepis_callicephala.tbl is present: True
  Crepis_callicephala.fsa is present: True
Running: conda run -n table2asn-env table2asn -indir /home/asan/BIO_GENOMICS/notebooks/plastome_postprocessing/plastomes/table2asn/cc -t /home/asan/BIO_GENOMICS/notebooks/plastome_postprocessing/plastomes/final/template.sbt -j '[organism=Crepis callicephala] [location=chloroplast] [topology=circular] [completeness=complete] [gcode=11]' -V vb -verbose
STDOUT: 
STDERR: This copy of table2asn is more than 1 year old. Please download the current version if it is newer.
Will be using one threads
Recognized annotation format: five-column feature table
Falling back on built-in data for popular organisms.


`table2asn` completed successfully!


`table2asn` did not work as expected: the working directory does not contain any output files.
The message "This copy of table2asn is more than 1 year old. Please download the current version if it is newer." appeared in STDERR. Launching the binary executable from the latest release might be a solution.

## Solution
The problem was that the FASTA file had been copied with the `.fasta` file extension. After changing the extension to `.fsa`, `table2asn` ran correctly.

In [8]:
# Path to the table2asn release binary, used instead of the conda package.
table2asn = "bin/table2asn.linux64"

import subprocess
import sys

# Smoke test: ask the binary for its version before using it on real data.
cmd_version = [
    table2asn,
    "-version",
]

try:
    result = subprocess.run(cmd_version, check=True, capture_output=True, text=True)
    print("STDOUT:", result.stdout, flush=True)
    print("STDERR:", result.stderr, flush=True)
    print("`table2asn` completed successfully!", flush=True)
except subprocess.CalledProcessError as e:
    print(f"`table2asn` failed with exit code {e.returncode}")
    # The output is captured, so print it here; otherwise the reason for the
    # failure is lost.
    print("STDOUT:", e.stdout)
    print("STDERR:", e.stderr)
    sys.exit(e.returncode)



STDOUT: table2asn: 1.29.324

STDERR: 
`table2asn` completed successfully!


In [10]:
import os

project_dir = 'plastomes/table2asn'
input_dir = f"{project_dir}/cc"

# Run the release binary on the whole species directory. Unlike the earlier
# conda-based attempt, no -j source qualifiers are passed and -Z is added.
cmd = [
    table2asn,
    "-indir", str(os.path.abspath(input_dir)),
    "-t", str(os.path.abspath(template)),
    "-V", "vb",
    "-verbose",
    "-Z",
]

print(f"Running: {' '.join(cmd)}")

try:
    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    print("STDOUT:", result.stdout, flush=True)
    print("STDERR:", result.stderr, flush=True)
    print("`table2asn` completed successfully!", flush=True)
except subprocess.CalledProcessError as e:
    print(f"`table2asn` failed with exit code {e.returncode}")
    # The output is captured, so print it here; otherwise the reason for the
    # failure is lost.
    print("STDOUT:", e.stdout)
    print("STDERR:", e.stderr)
    sys.exit(e.returncode)

Running: bin/table2asn.linux64 -indir /home/asan/BIO_GENOMICS/notebooks/plastome_postprocessing/plastomes/table2asn/cc -t /home/asan/BIO_GENOMICS/notebooks/plastome_postprocessing/plastomes/final/template.sbt -V vb -verbose -Z
STDOUT: 
STDERR: Will be using one threads
Recognized annotation format: five-column feature table

`table2asn` completed successfully!


## Summarizing results
There are 4 errors in Crepis callicephala genome of two types. The reference genome annotations were validated via GB2Sequin and compared to C. callicephala plastid genome.
### Errors

| species | accession | SEQ_INST.BadProteinStart | SEQ_FEAT.StartCodon |
| ------- | --------- | ------------------------ | ------------------- |
| Crepis callicephala | - | psbL, ndhD | "photosystem II subunit L", "NADH dehydrogenase subunit D" |
| Lactuca sativa | NC_007578 | psbL, ndhD | - |
| Lactuca sativa cv. Ramosa | PP999684 | psbL | - |
| Nicotiana tabacum | NC_001879 | psbL | - |
| Arabidopsis thaliana | NC_000932 | ndhD | - |
| Oryza sativa | NC_031333 | rpl2, rpl2 | - |

At a minimum, every fatal error should be checked manually. One possible solution for the 'illegal start codon' type of error is to define an exception in the annotation.

### Warnings
In Crepis callicephala genome validation report

| Warning | Feature | Feature_description | Location |
| - | - | - | - |
| SEQ_FEAT.CDSgeneRange | CDS | ribosomal protein S12 <133> | rps12:[c67482-67369, 136002-136794] |
| SEQ_FEAT.CDSgeneRange | CDS | Ycf15 protein <158> | ycf15:c96959-96768 |
| SEQ_FEAT.CDSgeneRange | CDS | ribosomal protein S7 <159> | rps7:c94947-94480 |
| SEQ_FEAT.CDSgeneRange | CDS | NADH dehydrogenase subunit B <160> | ndhB:c94189-91988 |
| SEQ_FEAT.CDSgeneRange | CDS | Ycf2 protein <161> | ycf2:84073-90927 |
| SEQ_FEAT.CDSgeneRange | CDS | ribosomal protein L23 <162> | rpl23:c83722-83441 |
| SEQ_FEAT.CDSgeneRange | CDS | ribosomal protein L2 <163> | rpl2:c83422-81933 |
| SEQ_FEAT.GeneXrefWithoutGene | tRNA | - | c7316-7244 |
| SEQ_FEAT.MissingTrnaAA | tRNA | - | c7316-7244 |
| SEQ_FEAT.GeneXrefWithoutGene | tRNA | - | c8577-8490 |
| SEQ_FEAT.MissingTrnaAA | tRNA | - | c8577-8490 |
| SEQ_FEAT.GeneXrefWithoutGene | tRNA | - | c11817-11734 |
| SEQ_FEAT.MissingTrnaAA | tRNA | - | c11817-11734 |
| SEQ_FEAT.GeneXrefWithoutGene | tRNA | - | c11992-11921 |
| SEQ_FEAT.MissingTrnaAA | tRNA | - | c11992-11921 |
| SEQ_FEAT.GeneXrefWithoutGene | tRNA | - | c29889-29818 |
| SEQ_FEAT.MissingTrnaAA | tRNA | - | c29889-29818 |
| SEQ_FEAT.MissingTrnaAA | tRNA | - | c35029-34947 |
| SEQ_FEAT.GeneXrefWithoutGene | tRNA | Gly | 35855-35925 |
| SEQ_FEAT.MissingTrnaAA | tRNA | - | 44778-44864 |
| SEQ_FEAT.GeneXrefWithoutGene | tRNA | Met | c83961-83888 |
| SEQ_FEAT.MissingTrnaAA | tRNA | - | 110967-111046 |
| SEQ_FEAT.MissingTrnaAA | tRNA | - | (c132067-132025, c131268-131234) |
| SEQ_FEAT.GeneXrefWithoutGene | tRNA | Met | 147834-147908 |
| SEQ_FEAT.GeneXrefWithoutGene | exon | /number=2 | c1794-1759 |
| SEQ_FEAT.GeneXrefWithoutGene | intron | /number=1 | c4320-1795 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | c4320-1795 |
| SEQ_FEAT.GeneXrefWithoutGene | exon | /number=1 | c4358-4321 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | c6229-5377 |
| SEQ_FEAT.NotSpliceConsensusDonor | intron | /number=1 | 16596-17318 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | 16596-17318 |
| SEQ_FEAT.NotSpliceConsensusDonor | intron | /number=1 | 26949-27665 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | 26949-27665 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | c30852-30143 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=2 | c42897-42165 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | c43820-43128 |
| SEQ_FEAT.NotSpliceConsensusDonor | intron | /number=1 | 46807-47243 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | 46807-47243 |
| SEQ_FEAT.GeneXrefWithoutGene | exon | /number=2 | 47244-47293 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | c51689-51122 |
| SEQ_FEAT.GeneXrefWithoutGene | exon | /number=1 | c51727-51690 |
| SEQ_FEAT.NotSpliceConsensusDonor | intron | /number=2 | c68490-67891 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=2 | c68490-67891 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | c69581-68783 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | 72584-73348 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | c80268-79232 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | c83031-82367 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | c93412-92744 |
| SEQ_FEAT.NotSpliceConsensusDonor | intron | /number=1 | c95562-95028 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | c95562-95028 |
| SEQ_FEAT.GeneXrefWithoutGene | exon | /number=1 | 99728-99770 |
| SEQ_FEAT.GeneXrefWithoutGene | intron | /number=1 | 99771-100526 |
| SEQ_FEAT.NotSpliceConsensusDonor | intron | /number=1 | 99771-100526 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | 99771-100526 |
| SEQ_FEAT.GeneXrefWithoutGene | exon | /number=2 | 100527-100561 |
| SEQ_FEAT.NotSpliceConsensusDonor | intron | /number=1 | 100664-101483 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | 100664-101483 |
| SEQ_FEAT.NotSpliceConsensusDonor | intron | /number=1 | c118053-116999 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | c118053-116999 |
| SEQ_FEAT.NotSpliceConsensusDonor | intron | /number=1 | c131131-130312 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | c131131-130312 |
| SEQ_FEAT.NotSpliceConsensusDonor | intron | /number=1 | c132024-131269 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | c132024-131269 |
| SEQ_FEAT.NotSpliceConsensusDonor | intron | /number=1 | 136233-136767 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | 136233-136767 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | 138383-139051 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | intron | /number=1 | 148764-149428 |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | ribosomal protein S16 <5> | (c6269-6230, c5376-5150) |
| SEQ_FEAT.NotSpliceConsensusDonor | CDS | RNA polymerase subunit beta' <17> | (16165-16595, 17319-18963) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | RNA polymerase subunit beta' <17> | (16165-16595, 17319-18963) |
| SEQ_FEAT.NotSpliceConsensusDonor | CDS | CF0 subunit I <27> | (26805-26948, 27666-28076) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | CF0 subunit I <27> | (26805-26948, 27666-28076) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | photosystem I assembly factor I <43> | (c43944-43821, c43127-42898, c42164-42012) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | photosystem I assembly factor I <43> | (c43944-43821, c43127-42898, c42164-42012) |
| SEQ_FEAT.NotSpliceConsensusDonor | CDS | ATP-dependent Clp protease proteolytic subunit 1 <89> | (c69652-69582, c68782-68491, c67890-67663) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | ATP-dependent Clp protease proteolytic subunit 1 <89> | (c69652-69582, c68782-68491, c67890-67663) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | ATP-dependent Clp protease proteolytic subunit 1 <89> | (c69652-69582, c68782-68491, c67890-67663) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | cytochrome b6 <99> | (72578-72583, 73349-73990) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | ribosomal protein L16 <115> | (c80277-80269, c79231-78833) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | ribosomal protein L2 <123> | (c83422-83032, c82366-81933) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | NADH dehydrogenase subunit B <129> | (c94189-93413, c92743-91988) |
| SEQ_FEAT.GeneXrefStrandProblem | CDS | ribosomal protein S12 <133> | (c67482-67369, c95793-95563, c95027-95001) |
| SEQ_FEAT.NotSpliceConsensusDonor | CDS | ribosomal protein S12 <133> | (c67482-67369, c95793-95563, c95027-95001) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | ribosomal protein S12 <133> | (c67482-67369, c95793-95563, c95027-95001) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | ribosomal protein S12 <133> | (c67482-67369, c95793-95563, c95027-95001) |
| SEQ_FEAT.NotSpliceConsensusDonor | CDS | NADH dehydrogenase subunit A <152> | (c118605-118054, c116998-116459) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | NADH dehydrogenase subunit A <152> | (c118605-118054, c116998-116459) |
| SEQ_FEAT.GeneXrefStrandProblem | CDS | Ycf15 protein <158> | 134836-135027 |
| SEQ_FEAT.GeneXrefStrandProblem | CDS | ribosomal protein S7 <159> | 136848-137315 |
| SEQ_FEAT.GeneXrefStrandProblem | CDS | NADH dehydrogenase subunit B <160> | (137606-138382, 139052-139807) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | NADH dehydrogenase subunit B <160> | (137606-138382, 139052-139807) |
| SEQ_FEAT.GeneXrefStrandProblem | CDS | Ycf2 protein <161> | c147722-140868 |
| SEQ_FEAT.GeneXrefStrandProblem | CDS | ribosomal protein L23 <162> | 148073-148354 |
| SEQ_FEAT.GeneXrefStrandProblem | CDS | ribosomal protein L2 <163> | (148373-148763, 149429-149862) |
| SEQ_FEAT.NotSpliceConsensusAcceptor | CDS | ribosomal protein L2 <163> | (148373-148763, 149429-149862) |


### Notes
For C. callicephala, lines of type 'Note' are absent from the table2asn validation output.

### Error types
#### CDSgeneRange
A CDS is overlapped by a gene feature, but is not completely contained by it.
This may be an annotation error.

#### GeneXrefWithoutGene
This feature has a gene xref, but there is no equivalent gene feature anywhere
on the record.

#### GeneXrefStrandProblem
This feature has a gene xref that points to a gene on the wrong strand.

#### MissingTrnaAA
The tRNA encoded amino acid is not set. *Explanation*: The amino acid that the tRNA carries is not included. *Suggestion*: Include the amino acid as the product of the tRNA. If the amino acid of a tRNA is unknown, use tRNA-Xxx as the product.

#### SEQ_FEAT.StartCodon and SEQ_INST.BadProteinStart

*Explanation*: The StartCodon and BadProteinStart errors are produced when the CDS is not marked as partial at its 5′ end and does not begin with a start codon.

*Suggestion*: Use the correct genetic code to get the correct translations. For example, include \[gcode=11\] for prokaryotic genome submissions. Other possible fixes include: extend the CDS to the start codon, or mark the 5′ end as partial (and extend the CDS to the end of the sequence for prokaryotic sequences), or add the /pseudo qualifier to the gene to indicate that the CDS cannot be translated.

##### StartCodon
An illegal start codon was used. Some possible explanations are: (1) the
wrong genetic code may have been selected; (2) the wrong reading frame may
be in use; or (3) the coding region may be incomplete at the 5' end, in
which case a partial location should be indicated.

##### BadProteinStart
A gap symbol was found at the start of this protein Bioseq.

In [19]:
# parsing text file with validation warnings to print it in structured format.

file_2_parse = 'plastomes/table2asn/cc/Crepis_callicephala.val'

# Coordinates in the report carry an "lcl|<sequence id>:" prefix that is
# removed for readability. The sequence id is derived from the report's file
# name; this assumes the files in the working directory are named after the
# sequence id.
local_id_prefix = "lcl|" + file_2_parse.rsplit("/", 1)[-1].rsplit(".", 1)[0] + ":"


def _clean_field(value, id_prefix):
    """Return `value` stripped of spaces and of `id_prefix`, or "-" if it is blank."""
    value = value.strip(" ")
    if not value or value.isspace():
        return "-"
    if "|" in value:
        return value.replace(id_prefix, "")
    return value


def parse_validation_warning(line, id_prefix):
    """
    Extract (warning, feature, description, location) from one report line.

    Only lines containing both "Warning:" and "SEQ_FEAT" are parsed. Returns
    None for any other line and for lines that lack the expected fields, so
    that a malformed line is skipped instead of raising IndexError.
    """
    if "Warning:" not in line or "SEQ_FEAT" not in line:
        return None
    # parts: severity, second token, "[ERROR_CODE]", rest of the message
    parts = line.split(" ", maxsplit=3)
    if len(parts) < 4 or "[" not in parts[3]:
        return None
    details = parts[3]
    feature_text = details.rsplit('FEATURE: ')[-1]

    warning = parts[2].strip("[]")
    feature = feature_text.split(": ")[0]
    description = feature_text.split("[")[0].split(": ")[-1]
    # First bracketed group of the message. In the recorded output this is the
    # gene range quoted in the message for CDSgeneRange warnings, and the
    # feature's own location for the other warning types.
    feature_location = details.split("[", maxsplit=1)[1].split("] ")[0]

    raw_fields = (warning, feature, description, feature_location)
    return tuple(_clean_field(field, id_prefix) for field in raw_fields)


with open(file_2_parse, "r") as handle:
    for report_line in handle:
        fields = parse_validation_warning(report_line, local_id_prefix)
        if fields is not None:
            # printing in md-compatible format
            print("| " + " | ".join(fields) + " |")

| SEQ_FEAT.CDSgeneRange | CDS | ribosomal protein S12 <133> | rps12:[c67482-67369, 136002-136794] |
| SEQ_FEAT.CDSgeneRange | CDS | Ycf15 protein <158> | ycf15:c96959-96768 |
| SEQ_FEAT.CDSgeneRange | CDS | ribosomal protein S7 <159> | rps7:c94947-94480 |
| SEQ_FEAT.CDSgeneRange | CDS | NADH dehydrogenase subunit B <160> | ndhB:c94189-91988 |
| SEQ_FEAT.CDSgeneRange | CDS | Ycf2 protein <161> | ycf2:84073-90927 |
| SEQ_FEAT.CDSgeneRange | CDS | ribosomal protein L23 <162> | rpl23:c83722-83441 |
| SEQ_FEAT.CDSgeneRange | CDS | ribosomal protein L2 <163> | rpl2:c83422-81933 |
| SEQ_FEAT.GeneXrefWithoutGene | tRNA | - | c7316-7244 |
| SEQ_FEAT.MissingTrnaAA | tRNA | - | c7316-7244 |
| SEQ_FEAT.GeneXrefWithoutGene | tRNA | - | c8577-8490 |
| SEQ_FEAT.MissingTrnaAA | tRNA | - | c8577-8490 |
| SEQ_FEAT.GeneXrefWithoutGene | tRNA | - | c11817-11734 |
| SEQ_FEAT.MissingTrnaAA | tRNA | - | c11817-11734 |
| SEQ_FEAT.GeneXrefWithoutGene | tRNA | - | c11992-11921 |
| SEQ_FEAT.MissingTrnaAA | tRNA 

## Checking tRNA data
### Perform re-assess using Aragorn v1.2.41
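The command to use (note that the cell below passes the GenBank file, not the FASTA file, as the input sequence file):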
```shell
aragorn -v -e -gcbact -c -i -o $output $fasta
```
`-gcbact`   Use Bacterial/Plant chloroplast genetic code.

`-c`        Assume that each sequence has a circular topology. Search wraps around each end.

`-d`        Double. Search both strands of each sequence. Default setting

`-i`        Search for tRNA genes with introns in anticodon loop with maximum length 3000 bases. Minimum intron length is 0 bases.

`-e`        Print out score for each reported gene.

`-v`        Verbose. Prints out information during search to STDERR.

`-o <outfile>`    Print output to <outfile>. If <outfile> already exists, it is overwritten. By default all output goes to stdout.

In [2]:
import subprocess
import sys

fasta = "plastomes/table2asn/cc/Crepis_callicephala.fsa"
gb = "plastomes/table2asn/cc/Crepis_callicephala.gbf"
aragorn_out = "plastomes/table2asn/cc/Crepis_callicephala.aragorn.trnas.txt"

# The GenBank file (not the FASTA file) is passed as the input sequence file.
aragorn_cmd = [
    "conda", "run", "-n", "aragorn-env",
    "aragorn",
    "-v", "-e",
    "-gcbact",
    "-c", "-i",
    "-o", aragorn_out,
    gb,
]

try:
    result = subprocess.run(aragorn_cmd, check=True, capture_output=True, text=True)
    print("STDOUT:", result.stdout, flush=True)
    print("STDERR:", result.stderr, flush=True)
    print("`ARAGORN` run completed successfully!", flush=True)
except subprocess.CalledProcessError as e:
    print(f"`ARAGORN` failed with exit code {e.returncode}")
    # The output is captured, so print it here; otherwise the reason for the
    # failure is lost.
    print("STDOUT:", e.stdout)
    print("STDERR:", e.stderr)
    sys.exit(e.returncode)

STDOUT: 
STDERR: Crepis_callicephala ' '.
149980 nucleotides in sequence
Mean G+C content = 37.7%
Using bacterial/plant chloroplast genetic code
Searching from 3120 to 16881
tRNA-Cys(gca) at [9331,9402] (107.847)
tRNA-Cys(gca) [9331,9402] (116.002) replacing tRNA-Cys(gca) [9331,9402] (107.847)
tRNA-Glu(ttc) at c[11921,11992] (104.658)
tRNA-Glu(ttc) c[11920,11993] (115.411) replacing tRNA-Glu(ttc) c[11921,11992] (104.658)
tRNA-Tyr(gta) at c[11734,11817] (105.503)
tRNA-Tyr(gta) c[11732,11818] (117.555) replacing tRNA-Tyr(gta) c[11734,11817] (105.503)
tRNA-Asp(gtc) at c[11547,11620] (108.811)
tRNA-Asp(gtc) c[11545,11621] (117.885) replacing tRNA-Asp(gtc) c[11547,11620] (108.811)
tRNA-Ser(gct) at c[8491,8577] (104.217)
tRNA-Ser(gct) c[8491,8578] (116.445) replacing tRNA-Ser(gct) c[8491,8577] (104.217)
tRNA-Ser(gct) c[8490,8577] (123.247) replacing tRNA-Ser(gct) c[8491,8578] (116.445)
tRNA-Gln(ttg) at c[7245,7316] (105.114)
tRNA-Gln(ttg) c[7245,7316] (105.602) replacing tRNA-Gln(ttg) c[7245

In [22]:
# parse Aragorn output
import pandas as pd

aragorn_out = "plastomes/table2asn/cc/Crepis_callicephala.aragorn.trnas.txt"
list_parsed = []

with open(aragorn_out, "r") as handle:
    # Index of the two-word "GenBank Aragorn" column header line; the
    # comparison table starts on the line after it.
    start_line = None
    for i, line in enumerate(handle):
        if start_line is None:
            # Still looking for the table header. `is None` is used on purpose:
            # a header on the very first line (index 0) must count as found.
            if "GenBank" in line and "Aragorn" in line:
                header_tokens = line.split()
                print(header_tokens)
                if len(header_tokens) == 2:
                    start_line = i
                    print(f"start line index: {start_line}")
            continue

        line = line.split("\n")[0]
        # The first space-separated token is the consistency flag; it contains
        # "*" for rows flagged as inconsistent and is empty otherwise.
        split_initial = line.split(" ", maxsplit=1)
        if len(split_initial) < 2:
            # A line without any space (e.g. an empty line) ends the table.
            break
        consistency_flag, remain_part = split_initial

        # Up to six columns: GenBank label, GenBank coordinates, Aragorn label,
        # Aragorn coordinates, Aragorn score, free-text note.
        fields = remain_part.strip().split(maxsplit=5)
        if len(fields) < 3:
            # Not a table row; skip it instead of failing with IndexError.
            continue

        line_dict = {
            'inconsistent': "*" in consistency_flag,
            'gb_label': fields[0],
            'gb_coord': fields[1],
            'arag_label': None,
            'arag_coord': None,
            'arag_score': None,
            'arag_note': None,
        }
        # When the third column contains "Not", the Aragorn columns stay None.
        if 'Not' not in fields[2]:
            line_dict['arag_label'] = fields[2]
            if len(fields) > 3:
                line_dict['arag_coord'] = fields[3]
            if len(fields) > 4:
                line_dict['arag_score'] = fields[4]
            if len(fields) > 5:
                line_dict['arag_note'] = fields[5]

        list_parsed.append(line_dict)

df_aragorn = pd.DataFrame(list_parsed)
display(df_aragorn)

['GenBank', 'to', 'Aragorn', 'comparison']
['GenBank', 'Aragorn']
start line index: 1663


Unnamed: 0,inconsistent,gb_label,gb_coord,arag_label,arag_coord,arag_score,arag_note
0,False,tRNA-His,"c(3,77)",tRNA-His(gtg),"c[3,77]",115.233,
1,True,tRNA-Lys,"c(1759,1794)",tRNA-Lys(ttt),"c[1759,4358]",103.307,sequence length mismatch
2,True,tRNA-???,"c(7244,7316)",tRNA-Gln(ttg),"c[7245,7316]",116.786,amino acceptor mismatch
3,True,tRNA-???,"c(8490,8577)",tRNA-Ser(gct),"c[8490,8577]",123.247,amino acceptor mismatch
4,False,tRNA-Cys,"(9331,9402)",tRNA-Cys(gca),"[9331,9402]",116.002,
5,False,tRNA-Asp,"c(11547,11620)",tRNA-Asp(gtc),"c[11545,11621]",117.885,
6,True,tRNA-???,"c(11734,11817)",tRNA-Tyr(gta),"c[11732,11818]",117.555,amino acceptor mismatch
7,True,tRNA-???,"c(11921,11992)",tRNA-Glu(ttc),"c[11920,11993]",115.411,amino acceptor mismatch
8,True,tRNA-???,"c(29818,29889)",tRNA-Arg(tct),"c[29818,29889]",117.452,amino acceptor mismatch
9,True,tRNA-Gly,"c(30095,30142)",tRNA-Ser(cga),"c[30094,30875]",101.783,amino acceptor and sequence length mismatch


In [24]:
# Split the coordinate strings of `df_aragorn` into strand / start / end columns.
# GenBank coordinates look like "c(3,77)" or "(9331,9402)" and are stored in
# the 'gb_coord' column; Aragorn coordinates look like "c[3,77]" and are stored
# in 'arag_coord'. A leading "c" marks the complementary strand.
# The new columns are added to `df_aragorn` itself.
import numpy as np

# decomposing 'gb_coord'
# "-" for coordinates that start with "c", "+" otherwise
df_aragorn['gb_strand'] = np.where(
    df_aragorn['gb_coord'].str.startswith("c"), 
    "-", "+"
    )

# Strip the "c", brackets/parentheses and spaces from both ends, then split "start,end".
coord = df_aragorn['gb_coord'].str.strip("[(c)] ").str.split(",", expand=True)
df_aragorn["gb_start"] = coord[0].astype(int)
df_aragorn["gb_end"] = coord[1].astype(int)

# decomposing 'arag_coord'
# one row contains None for 'arag_coord' field
# NOTE(review): rows without Aragorn coordinates are expected to fall into the
# "+" branch here, which makes them indistinguishable from real forward-strand
# hits by strand alone; confirm this is acceptable downstream.
df_aragorn['arag_strand'] = np.where(
    df_aragorn["arag_coord"].notna() & 
    df_aragorn['arag_coord'].str.startswith("c"), 
    "-", "+"
    )

# Values that cannot be converted to a number (errors="coerce") are replaced
# by 0 via fillna(0), so a start/end of 0 means "no Aragorn coordinates".
coord = df_aragorn['arag_coord'].where(df_aragorn["arag_coord"].notna()).str.lstrip("c").str.strip("[] ").str.split(",", expand=True)
df_aragorn["arag_start"] = pd.to_numeric(coord[0], downcast='signed', errors="coerce").fillna(0).astype(int)
df_aragorn["arag_end"] = pd.to_numeric(coord[1], downcast='signed', errors="coerce").fillna(0).astype(int)

display(df_aragorn.head())

Unnamed: 0,inconsistent,gb_label,gb_coord,arag_label,arag_coord,arag_score,arag_note,gb_strand,gb_start,gb_end,arag_strand,arag_start,arag_end
0,False,tRNA-His,"c(3,77)",tRNA-His(gtg),"c[3,77]",115.233,,-,3,77,-,3,77
1,True,tRNA-Lys,"c(1759,1794)",tRNA-Lys(ttt),"c[1759,4358]",103.307,sequence length mismatch,-,1759,1794,-,1759,4358
2,True,tRNA-???,"c(7244,7316)",tRNA-Gln(ttg),"c[7245,7316]",116.786,amino acceptor mismatch,-,7244,7316,-,7245,7316
3,True,tRNA-???,"c(8490,8577)",tRNA-Ser(gct),"c[8490,8577]",123.247,amino acceptor mismatch,-,8490,8577,-,8490,8577
4,False,tRNA-Cys,"(9331,9402)",tRNA-Cys(gca),"[9331,9402]",116.002,,+,9331,9402,+,9331,9402


### Assessing tRNAs using tRNAscan-SE v2.0.12
The command to run:
```shell
tRNAscan-SE -B -o $output -m $trnascan_stat --detail --log $logfile -v --thread 4 $fasta
```

### Manually adding tRNA labels
Based on the Aragorn output, both the product names were added and the coordinates were adjusted.
The complicated case is when the amino acids reported for the same position do not match. For example, the current annotation has tRNA-Gly c(30095,30142), whereas the Aragorn annotation has tRNA-Ser(cga) c[30094,30875].

In [5]:
import subprocess
import sys

# Input sequence and the output files tRNAscan-SE is asked to write.
fasta = "plastomes/table2asn/cc/Crepis_callicephala.fsa"
gb = "plastomes/table2asn/cc/Crepis_callicephala.gbf"  # not used in this cell
trnascan_out = "plastomes/table2asn/cc/Crepis_callicephala.trnascan.trnas.txt"
logfile = "plastomes/table2asn/cc/Crepis_callicephala.trnascan.log.txt"
trnascan_stat = "plastomes/table2asn/cc/Crepis_callicephala.trnascan.stat.txt"
trnascan_verbose_out = "plastomes/table2asn/cc/Crepis_callicephala.trnascan.verbose.txt"


# tRNAscan-SE is run inside the "trnascan-env" conda environment.
# -B selects the bacterial search mode (reported as "Search Mode: Bacterial"
# in the recorded output); -o, -m, --log and -v each receive an output file path.
# NOTE(review): on a re-run these output files already exist; confirm that
# tRNAscan-SE does not stop to ask before overwriting them, because the output
# is captured and a prompt could not be answered from the notebook.
trnascan_cmd = [
    "conda", "run", "-n", "trnascan-env", 
    "tRNAscan-SE", 
    "-B", 
    "-o", trnascan_out, 
    "-m", trnascan_stat, 
    "--detail", 
    "--log", logfile, 
    "-v", trnascan_verbose_out,
    fasta
]

try:
    # check=True raises CalledProcessError on a non-zero exit code.
    result = subprocess.run(trnascan_cmd, check=True, capture_output=True, text=True)
    print("STDOUT:", result.stdout, flush=True)
    print("STDERR:", result.stderr, flush=True)
    print("`tRNAscan` run completed successfully!", flush=True)
except subprocess.CalledProcessError as e:
    print(f"`tRNAscan` failed with exit code {e.returncode}")
    # The output was captured, so it is printed here to show why the run failed.
    print("STDOUT:", e.stdout)
    print("STDERR:", e.stderr)
    sys.exit(e.returncode)

STDOUT: Status: Phase I: Searching for tRNAs with HMM-enabled Infernal
Status: Phase II: Infernal verification of candidate tRNAs detected with first-pass scan


STDERR: 
tRNAscan-SE v.2.0.12 (Nov 2022) - scan sequences for transfer RNAs
Copyright (C) 2022 Patricia Chan and Todd Lowe
                   University of California Santa Cruz
Freely distributed under the GNU General Public License (GPLv3)

------------------------------------------------------------
Sequence file(s) to search:        plastomes/table2asn/cc/Crepis_callicephala.fsa
Search Mode:                       Bacterial
Results written to:                plastomes/table2asn/cc/Crepis_callicephala.trnascan.trnas.txt
Output format:                     Tabular
Searching with:                    Infernal First Pass->Infernal
Isotype-specific model scan:       Yes
Covariance model:                  /home/asan/miniconda3/envs/trnascan-env/lib/tRNAscan-SE/models/TRNAinf-bact.cm
                                   /home/asan/min

In [31]:
import pandas as pd
import os
import io

# Same path that the tRNAscan-SE command receives through its `-o` option.
trnascan_out = "plastomes/table2asn/cc/Crepis_callicephala.trnascan.trnas.txt"

def parse_trnascan_output(file_path: str|os.PathLike):
    """
    Parse tRNAscan-SE output file and return a pandas DataFrame.
    """
    column_names = [
        "sequence_name", "tRNA_No", "start", "end", "3_letter", 
        "anticodon", "intron_start", "intron_end", "score", 
        "isotype_CM", "isotype_score", "note",
    ]
    
    with open(file_path, 'r') as f:
        lines = f.readlines()
    
    # Find the start of the data (should be 
    # after the dashed separator line)
    data_start = 0
    for i, line in enumerate(lines):
        if line.startswith("--------"):
            data_start = i + 1
            break
    
    data_lines = lines[data_start:]
    data_str = "".join(data_lines)
    
    df = pd.read_csv(
        io.StringIO(data_str),
        delim_whitespace=True,
        header=None,
        names=column_names,
        index_col=False
    )
    
    # Fix column types
    numeric_cols = [
        "tRNA_No", "start", "end", 
        "intron_start", "intron_end", 
        "score", "isotype_score",
        ]
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    return df

# Parse the tRNAscan-SE table and keep only the columns used further on.
filter_cols = [
    "tRNA_No", "start", "end", "3_letter", 
    "anticodon", "intron_start", "intron_end", "score",
]
df_trnascan = parse_trnascan_output(trnascan_out)[filter_cols]

# Normalise coordinates so that start <= end on every row. tRNAscan-SE lists
# reverse-strand hits with begin > end, so this step discards the strand
# information. `.assign` builds a new frame instead of writing into a column
# slice (which can raise SettingWithCopyWarning), and min/max over the two
# columns replaces the row-by-row `sorted` call.
trna_bounds = df_trnascan[["start", "end"]]
df_trnascan = df_trnascan.assign(
    start=trna_bounds.min(axis=1),
    end=trna_bounds.max(axis=1),
)

display(df_trnascan.head())

# Optional: Save to CSV
# df.to_csv("trnascan_results.csv", index=False)

  df = pd.read_csv(


Unnamed: 0,tRNA_No,start,end,3_letter,anticodon,intron_start,intron_end,score
0,1,9331,9401,Cys,GCA,0,0,59.5
1,2,31016,31080,Thr,GGT,0,0,20.0
2,3,35855,35925,Gly,GCC,0,0,57.3
3,4,44778,44864,Ser,GGA,0,0,67.7
4,5,47609,47681,Phe,GAA,0,0,61.7


In [27]:
# checking for compatibility between 
# current and Aragorn-suggested tRNA annotations
# comparing 3-letter and single-letter codes
# The amino acid table was downloaded 
# from https://thecodingbiologist.com/posts/amino-acid-table

import pandas as pd

aa_csv = 'amino_acid_table.csv'
aa_columns = ['name', '3_letter_code', '1_letter_code']

# Build the lookup table in one chain so that a new frame is created at each
# step. Adding a column to a column-sliced frame, as before, can raise
# SettingWithCopyWarning.
df_aa = (
    pd.read_csv(aa_csv)
    .loc[:, aa_columns]
    # '3_letter' is not a valid keyword name, hence the dict unpacking.
    # Title-casing turns e.g. "ALA" into "Ala".
    .assign(**{'3_letter': lambda table: table['3_letter_code'].str.title()})
)
display(df_aa)

Unnamed: 0,name,3_letter_code,1_letter_code,3_letter
0,Alanine,ALA,A,Ala
1,Cysteine,CYS,C,Cys
2,Aspartic Acid,ASP,D,Asp
3,Glutamic Acid,GLU,E,Glu
4,Phenylalanine,PHE,F,Phe
5,Glycine,GLY,G,Gly
6,Histidine,HIS,H,His
7,Isoleucine,ILE,I,Ile
8,Lysine,LYS,K,Lys
9,Leucine,LEU,L,Leu


## Parsing Crepis purpurea plastid genome for tRNA labels
To compare the tRNA data from the different sources for both species and bring them into concordance, the *C. purpurea* tRNA labels and positions are parsed as well.

In [20]:
from Bio import SeqIO, SeqRecord
import pandas as pd

gb_cp = 'plastomes/final/Crepis_purpurea.gb'

# Column layout of the tRNA table. Defining the frame up front keeps
# `df_cp_trna` available (as an empty table) even when the file contains no
# record or no tRNA feature, instead of failing with NameError at `display`.
cp_trna_columns = ['cp_gene', 'cp_product', 'cp_strand', 'cp_start', 'cp_end']
df_cp_trna = pd.DataFrame(columns=cp_trna_columns)

with open(gb_cp, "r") as handle:
    for record in SeqIO.parse(handle, "genbank"):
        organism = record.annotations.get("organism", "")
        print(f"Parsing {organism} plastid genome...")

        trnas = []
        for feature in record.features:
            if feature.type != "tRNA":
                continue
            location = feature.location
            trnas.append({
                # Qualifier values are lists; default to [""] so that a
                # missing qualifier yields "" instead of IndexError on ""[0].
                'cp_gene': feature.qualifiers.get("gene", [""])[0],
                'cp_product': feature.qualifiers.get("product", [""])[0],
                'cp_strand': location.strand,
                # NOTE: Biopython feature starts are 0-based, so they are one
                # less than the coordinate written in the GenBank file.
                'cp_start': int(location.start),
                'cp_end': int(location.end),
            })

        # Build the table once per record rather than once per feature.
        # As before, a record without tRNA features leaves the previous
        # table untouched.
        if trnas:
            df_cp_trna = pd.DataFrame(trnas, columns=cp_trna_columns)

display(df_cp_trna)

Parsing Crepis purpurea plastid genome...


Unnamed: 0,cp_gene,cp_product,cp_strand,cp_start,cp_end
0,trnH-GUG,tRNA-His,-1,5,80
1,trnK-UUU,,-1,1760,4360
2,trnQ-UUG,tRNA-Gln,-1,7244,7316
3,trnS-GCU,,-1,8489,8577
4,trnC-GCA,,1,9348,9420
5,trnD-GUC,tRNA-Asp,-1,11561,11638
6,trnY-GUA,tRNA-Tyr,-1,11748,11835
7,trnE-UUC,tRNA-Glu,-1,11936,12010
8,trnR-UCU,,-1,29853,29925
9,trnS-CGA,,-1,30129,30911


## Combining tRNA data from all sources

In [36]:
import pandas as pd
import numpy as np

def merge_trna_dataframes(df_aragorn, df_trnascan, df_cp_trna, tolerance=15):
    """
    Merge three tRNA dataframes based on start position correspondence (±tolerance bp)

    Each row of ``df_aragorn`` is paired, through its ``gb_start`` value, with
    at most one row of ``df_trnascan`` (compared on ``start``) and at most one
    row of ``df_cp_trna`` (compared on ``cp_start``). Among the rows that are
    still unpaired and lie within ``tolerance``, the *closest* one is taken;
    ties go to the earlier row. tRNAscan rows left over are then paired with
    remaining ``df_cp_trna`` rows in the same way, and whatever is still
    unpaired is appended as a row of its own.

    Parameters
    ----------
    df_aragorn : pandas.DataFrame
        Must contain ``gb_label``, ``arag_label``, ``gb_strand``,
        ``gb_start``, ``gb_end``, ``arag_strand``, ``arag_start`` and
        ``arag_end``; only these columns are carried over.
    df_trnascan : pandas.DataFrame
        Must contain ``start``, ``end``, ``3_letter`` and ``anticodon``; they
        appear in the result with a ``tscan_`` prefix.
    df_cp_trna : pandas.DataFrame
        All columns are kept as they are; ``cp_start`` is used for matching.
    tolerance : int, default 15
        Largest absolute difference between two start positions that still
        counts as a match.

    Returns
    -------
    pandas.DataFrame
        One row per merged tRNA with a fresh 0..n-1 index, ordered by the
        first available value of ``gb_start``, ``tscan_start``, ``cp_start``.
        Fields without a counterpart are NaN. An empty frame is returned when
        all three inputs are empty.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    aragorn_cols = ['gb_label', 'arag_label', 'gb_strand', 'gb_start', 'gb_end',
                    'arag_strand', 'arag_start', 'arag_end']
    trnascan_cols = ['start', 'end', '3_letter', 'anticodon']

    # Work on plain record lists addressed by position, so the result does not
    # depend on the index labels of the inputs (which may repeat after
    # filtering).
    arag_records = df_aragorn[aragorn_cols].to_dict('records')
    tscan_records = (
        df_trnascan[trnascan_cols].add_prefix('tscan_').to_dict('records')
    )
    cp_records = df_cp_trna.to_dict('records')

    tscan_used = set()
    cp_used = set()

    def take_closest(position, records, start_key, used):
        """Claim and return the position of the closest unused record within
        tolerance, or None. NaN positions never match."""
        best_idx = None
        best_dist = None
        for idx, record in enumerate(records):
            if idx in used:
                continue
            dist = abs(position - record[start_key])
            if dist <= tolerance and (best_dist is None or dist < best_dist):
                best_idx, best_dist = idx, dist
        if best_idx is not None:
            used.add(best_idx)
        return best_idx

    merged_rows = []

    # 1) Anchor on the Aragorn/GenBank rows.
    for arag_record in arag_records:
        merged = dict(arag_record)
        anchor = arag_record['gb_start']

        tscan_idx = take_closest(anchor, tscan_records, 'tscan_start', tscan_used)
        if tscan_idx is not None:
            merged.update(tscan_records[tscan_idx])

        cp_idx = take_closest(anchor, cp_records, 'cp_start', cp_used)
        if cp_idx is not None:
            merged.update(cp_records[cp_idx])

        merged_rows.append(merged)

    # 2) tRNAscan rows without an Aragorn partner may still match a cp row.
    for tscan_idx, tscan_record in enumerate(tscan_records):
        if tscan_idx in tscan_used:
            continue
        merged = dict(tscan_record)
        cp_idx = take_closest(
            tscan_record['tscan_start'], cp_records, 'cp_start', cp_used
        )
        if cp_idx is not None:
            merged.update(cp_records[cp_idx])
        merged_rows.append(merged)

    # 3) Whatever is left of the cp table.
    for cp_idx, cp_record in enumerate(cp_records):
        if cp_idx not in cp_used:
            merged_rows.append(dict(cp_record))

    if not merged_rows:
        return pd.DataFrame()

    # Column order follows the first appearance of each key in merged_rows.
    df_merged = pd.DataFrame(merged_rows)

    start_priority = ('gb_start', 'tscan_start', 'cp_start')

    def first_known_start(row):
        """First non-missing start position of a row; +inf if there is none."""
        for col in start_priority:
            value = row.get(col)
            if pd.notna(value):
                return value
        return float('inf')

    # A stable sort keeps the insertion order of rows that share a position.
    sort_key = df_merged.apply(first_known_start, axis=1)
    row_order = sort_key.sort_values(kind='stable').index
    df_merged = df_merged.loc[row_order].reset_index(drop=True)

    return df_merged

# Example usage:
# The call passes tolerance=40 instead of the function's default of 15.
# NOTE(review): presumably chosen because cp_start differs from gb_start by
# up to ~35 bp in the table below -- confirm.
df_comb = merge_trna_dataframes(df_aragorn, df_trnascan, df_cp_trna, tolerance=40)
display(df_comb)

Unnamed: 0,gb_label,arag_label,gb_strand,gb_start,gb_end,arag_strand,arag_start,arag_end,tscan_start,tscan_end,tscan_3_letter,tscan_anticodon,cp_gene,cp_product,cp_strand,cp_start,cp_end
0,tRNA-His,tRNA-His(gtg),-,3,77,-,3,77,4.0,77.0,His,GTG,trnH-GUG,tRNA-His,-1,5,80
1,tRNA-Lys,tRNA-Lys(ttt),-,1759,1794,-,1759,4358,,,,,trnK-UUU,,-1,1760,4360
2,tRNA-???,tRNA-Gln(ttg),-,7244,7316,-,7245,7316,7245.0,7316.0,Gln,TTG,trnQ-UUG,tRNA-Gln,-1,7244,7316
3,tRNA-???,tRNA-Ser(gct),-,8490,8577,-,8490,8577,8490.0,8577.0,Ser,GCT,trnS-GCU,,-1,8489,8577
4,tRNA-Cys,tRNA-Cys(gca),+,9331,9402,+,9331,9402,9331.0,9401.0,Cys,GCA,trnC-GCA,,1,9348,9420
5,tRNA-Asp,tRNA-Asp(gtc),-,11547,11620,-,11545,11621,11547.0,11620.0,Asp,GTC,trnD-GUC,tRNA-Asp,-1,11561,11638
6,tRNA-???,tRNA-Tyr(gta),-,11734,11817,-,11732,11818,11734.0,11817.0,Tyr,GTA,trnY-GUA,tRNA-Tyr,-1,11748,11835
7,tRNA-???,tRNA-Glu(ttc),-,11921,11992,-,11920,11993,11921.0,11992.0,Glu,TTC,trnE-UUC,tRNA-Glu,-1,11936,12010
8,tRNA-???,tRNA-Arg(tct),-,29818,29889,-,29818,29889,29818.0,29889.0,Arg,TCT,trnR-UCU,,-1,29853,29925
9,tRNA-Gly,tRNA-Ser(cga),-,30095,30142,-,30094,30875,,,,,trnS-CGA,,-1,30129,30911


### Combine with codon table data

In [41]:
import pandas as pd
import re

# Make the cell re-runnable: drop the helper columns a previous run of this
# cell added. Otherwise the merge below would collide with them and produce
# suffixed duplicates such as `3_letter_x` / `3_letter_y`.
df_comb = df_comb.drop(
    columns=['arag_aa', '3_letter', '1_letter_code'],
    errors='ignore',
)

# Amino-acid part of the Aragorn label, e.g. "tRNA-His(gtg)" -> "His":
# the text after the first "-" up to the next "-" or "(".
# `str.extract` returns NaN for labels that are missing or do not fit the
# pattern, whereas indexing the result of `split(..., expand=True)` raises
# KeyError when no label contains "-".
arag_aa = df_comb['arag_label'].str.extract(r'^[^-]*-([^-(]*)', expand=False)

# Left merge keeps every tRNA row and adds the one-letter code where the
# three-letter name is found in the amino-acid table.
df_comb = df_comb.assign(arag_aa=arag_aa).merge(
    df_aa[['3_letter', '1_letter_code']],
    how='left',
    left_on='arag_aa',
    right_on='3_letter',
)

display(df_comb)

Unnamed: 0,gb_label,arag_label,gb_strand,gb_start,gb_end,arag_strand,arag_start,arag_end,tscan_start,tscan_end,tscan_3_letter,tscan_anticodon,cp_gene,cp_product,cp_strand,cp_start,cp_end,arag_aa,3_letter,1_letter_code
0,tRNA-His,tRNA-His(gtg),-,3,77,-,3,77,4.0,77.0,His,GTG,trnH-GUG,tRNA-His,-1,5,80,His,His,H
1,tRNA-Lys,tRNA-Lys(ttt),-,1759,1794,-,1759,4358,,,,,trnK-UUU,,-1,1760,4360,Lys,Lys,K
2,tRNA-???,tRNA-Gln(ttg),-,7244,7316,-,7245,7316,7245.0,7316.0,Gln,TTG,trnQ-UUG,tRNA-Gln,-1,7244,7316,Gln,Gln,Q
3,tRNA-???,tRNA-Ser(gct),-,8490,8577,-,8490,8577,8490.0,8577.0,Ser,GCT,trnS-GCU,,-1,8489,8577,Ser,Ser,S
4,tRNA-Cys,tRNA-Cys(gca),+,9331,9402,+,9331,9402,9331.0,9401.0,Cys,GCA,trnC-GCA,,1,9348,9420,Cys,Cys,C
5,tRNA-Asp,tRNA-Asp(gtc),-,11547,11620,-,11545,11621,11547.0,11620.0,Asp,GTC,trnD-GUC,tRNA-Asp,-1,11561,11638,Asp,Asp,D
6,tRNA-???,tRNA-Tyr(gta),-,11734,11817,-,11732,11818,11734.0,11817.0,Tyr,GTA,trnY-GUA,tRNA-Tyr,-1,11748,11835,Tyr,Tyr,Y
7,tRNA-???,tRNA-Glu(ttc),-,11921,11992,-,11920,11993,11921.0,11992.0,Glu,TTC,trnE-UUC,tRNA-Glu,-1,11936,12010,Glu,Glu,E
8,tRNA-???,tRNA-Arg(tct),-,29818,29889,-,29818,29889,29818.0,29889.0,Arg,TCT,trnR-UCU,,-1,29853,29925,Arg,Arg,R
9,tRNA-Gly,tRNA-Ser(cga),-,30095,30142,-,30094,30875,,,,,trnS-CGA,,-1,30129,30911,Ser,Ser,S
