This notebook is for troubleshooting plastomes using NCBI-provided software pipelines. It was a suggestion from NCBI team rejected the annotation twice after uploading.

In [22]:
import os

# Defining variables for files
template = 'plastomes/final/template.sbt'
gb = 'plastomes/final/Crepis_callicephala.gb'
fasta = 'plastomes/in/Crepis_callicephala.fasta'

# command itself:
# table2asn -t template.sbt -i sequence.fsa

In [23]:
# check for fasta

from Bio import SeqIO, SeqRecord

try:
    with open(fasta, "r") as fasta_file:
        print(f"File {fasta} does exist.")
except FileNotFoundError:
    print(f"File '{fasta}' not found.\nConverting genbank file '{gb}' to FASTA...")
    try:
        first_record = next(SeqIO.parse(gb, "genbank"))
        SeqIO.write(first_record, fasta, "fasta")
        print(f"Successfully created '{fasta}'.")
    except FileNotFoundError:
        print(f"Error: The source GenBank file '{gb}' was not found.")
    except StopIteration:
        print(f"Error: The GenBank file '{gb}' is empty.")

File plastomes/in/Crepis_callicephala.fasta does exist.


## Run `table2asn` program

There are some recommendations on NCBI site that were not illuminated at Readme file of the program.
`table2asn` will recognize files with **the same basename** as the input sequence file. Sequences that are part of a plasmid, or an organellar chromosome, or specific nuclear chromosomes need to have that information included in the fasta definition line, in these formats:

- [location=mitochondrion]
- [location=chloroplast]

Sequences that are a complete circular chromosome or plasmid need to have the circular topology and the completeness included.

- [topology=circular] [completeness=complete]
- [topology=circular] gap at end, not circularized




In [None]:
# preparing directory
# all the files should be stored at the same directory
import os

project_dir = 'plastomes/table2asn'
tbl_file = "plastomes/final/gb2sequin_out/c_callicephala_final/05/Crepis_callicephala___149980_bp____DNA.tbl"
sqn_file = "plastomes/final/gb2sequin_out/c_callicephala_final/05/Crepis_callicephala___149980_bp____DNA(1).sqn"

SHORT_NAMES = {
    "Crepis_callicephala": "cc",
    "Crepis_purpurea": "cp",
}

def create_symlinks(project_dir: str, sourcefile: str):
    """
    Create symlinks for files to process.
    """
    basename = os.path.basename(sourcefile)
    ext= basename.rsplit(".")[-1]
    #print("ext", ext)

    # handle sql and tbl names
    basename = str(basename).rsplit("___")[0]

    #print("basename", basename)
    species = basename.rsplit(".")[0]
    #print("species", species)
    
    label = SHORT_NAMES[species]
    #print("label", label)
    target_dir = os.path.join(project_dir, label)
    #print("target_dir", target_dir)
    abs_target_dir = os.path.abspath(target_dir)
    # filename should match sequence id in FASTA header
    target_filename = os.path.join(abs_target_dir, f"{species}.{ext}")
    
    #print("target_filename", target_filename)
    abs_sourcefile = os.path.abspath(sourcefile)

    try:
        os.path.exists(sourcefile)
        try:
            # fixing first relative paths
            os.symlink(abs_sourcefile, target_filename)
            print(f"Symlink '{target_filename}' for '{abs_sourcefile}' was successfully created.")
        except FileExistsError:
            print(f"Symlink for '{sourcefile}' is already exist.")
        except PermissionError:
            print(f"You have no permissions for this action.")
    except FileExistsError:
        print(f"Source file '{sourcefile}' does not exist.")


print("Creating symbolic links:")
# create_symlinks(project_dir, gb) #  this is output file
# create_symlinks(project_dir, fasta) # the header line should be modified
create_symlinks(project_dir, tbl_file)
#create_symlinks(project_dir, sqn_file) #  this is output file


Creating symbolic links:
Symlink for 'plastomes/in/Crepis_callicephala.fasta' is already exist.
Symlink for 'plastomes/final/gb2sequin_out/c_callicephala_final/05/Crepis_callicephala___149980_bp____DNA.tbl' is already exist.


The header line of fasta file should follow definition line conventions. To keep previous files intact but change required data, the file should be just copied instead of linking.

In [31]:
import shutil

destination_directory = f"{project_dir}/cc"

try:
    shutil.copy(fasta, destination_directory)
    print(f"File '{fasta}' copied to '{destination_directory}' successfully.")
except FileNotFoundError:
    print("Source file not found.")
except Exception as e:
    print(f"An error occurred: {e}")

File 'plastomes/in/Crepis_callicephala.fasta' copied to 'plastomes/table2asn/cc' successfully.


In [26]:
# testing table2asn
import subprocess
import sys


# Configuration
CONDA_ENV_NAME = "plastome_post-env"
INPUT_SUBDIR = "cc"

location = "[location=chloroplast]"
topology = "[topology=circular]"
completeness = "[completeness=complete]"
organism = "[organism=Crepis callicephala]"
code = "[gcode=11]"
pcode = "[gpcode=11]"

src_qualifiers = f"{organism} {location} {topology} {completeness} {code}"
print(src_qualifiers)

input_dir = f"{project_dir}/cc"
# Check inputs
print(f"Checking input directory...\n  {input_dir} is present: {os.path.isdir(input_dir)}")
print(f"Checking input files...\n  {template} is present: {os.path.isfile(template)}")
present_flag = False
for i in os.listdir(input_dir):
    file = os.path.join(input_dir, i)
    present_flag = os.path.exists(file)
    print(f"  {i} is present: {present_flag}")
    if present_flag == False:
        print(f"    relative path: {file}")

    


# table2asn -indir {project_dir}/cc -t {template} -j {comment}

cmd = [
    "conda", "run", "-n", "table2asn-env",
    "table2asn",
    "-indir", str(os.path.abspath(input_dir)),
    "-t", str(os.path.abspath(template)),
    "-j", f"'{src_qualifiers}'",
    "-V", "vb",
    "-verbose",
]

cmd_version = [
    "conda", "run", "-n", "table2asn-env",
    "table2asn",
    "-version",
]

print(f"Running: {' '.join(cmd)}")
#print(f"Running: {' '.join(cmd_version)}")

try:
    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    print("STDOUT:", result.stdout, flush=True)
    print("STDERR:", result.stderr, flush=True)
    print("`table2asn` completed successfully!", flush=True)
except subprocess.CalledProcessError as e:
    print(f"`table2asn` failed with exit code {e.returncode}")
    sys.exit(e.returncode)

[organism=Crepis callicephala] [location=chloroplast] [topology=circular] [completeness=complete] [gcode=11]
Checking input directory...
  plastomes/table2asn/cc is present: True
Checking input files...
  plastomes/final/template.sbt is present: True
  Crepis_callicephala.tbl is present: True
  Crepis_callicephala.fasta is present: True
Running: conda run -n table2asn-env table2asn -indir /home/asan/BIO_GENOMICS/notebooks/plastome_postprocessing/plastomes/table2asn/cc -t /home/asan/BIO_GENOMICS/notebooks/plastome_postprocessing/plastomes/final/template.sbt -j '[organism=Crepis callicephala] [location=chloroplast] [topology=circular] [completeness=complete] [gcode=11]' -V vb -verbose
`table2asn` failed with exit code 2


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


The `table2asn` did not performed well. The working directory does not contain any output files.
There the "This copy of table2asn is more than 1 year old. Please download the current version if it is newer." message appeared in STDERR. Trying to launch binary executable file from the fresh release might be a solution.

In [27]:
table2asn = "bin/table2asn.linux64"

import subprocess
import sys

cmd_version = [
    table2asn,
    "-version",
]

try:
    result = subprocess.run(cmd_version, check=True, capture_output=True, text=True)
    print("STDOUT:", result.stdout, flush=True)
    print("STDERR:", result.stderr, flush=True)
    print("`table2asn` completed successfully!", flush=True)
except subprocess.CalledProcessError as e:
    print(f"`table2asn` failed with exit code {e.returncode}")
    sys.exit(e.returncode)



STDOUT: table2asn: 1.29.324

STDERR: 
`table2asn` completed successfully!


In [29]:
import os

cmd = [
    table2asn,
    "-indir", str(os.path.abspath(input_dir)),
    "-t", str(os.path.abspath(template)),
    "-V", "vb",
    "-verbose",
]

print(f"Running: {' '.join(cmd)}")

try:
    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    print("STDOUT:", result.stdout, flush=True)
    print("STDERR:", result.stderr, flush=True)
    print("`table2asn` completed successfully!", flush=True)
except subprocess.CalledProcessError as e:
    print(f"`table2asn` failed with exit code {e.returncode}")
    sys.exit(e.returncode)

Running: bin/table2asn.linux64 -indir /home/asan/BIO_GENOMICS/notebooks/plastome_postprocessing/plastomes/table2asn/cc -t /home/asan/BIO_GENOMICS/notebooks/plastome_postprocessing/plastomes/final/template.sbt -V vb -verbose
STDOUT: 
STDERR: 
`table2asn` completed successfully!
