### Development of code that will become part of the EAGLE.ovORFs package
A check of 4 pairs - candidates for AOGs from the AORFs project

In [20]:
# imports
import os
import re
import shutil
from collections import defaultdict

from Bio.Seq import Seq

In [33]:
# input
# Directory that holds the files of this check, and the directory with the
# genome FASTA files that are copied from (see the "get fam fna" cell).
work_dir = "/media/denis/Data/Data/Bioinf/Projects/Reverse_ORFs/special/4AOGs"
genomes_fna_dir = "/media/denis/Data/Data/Bioinf/Projects/Reverse_ORFs/Base/"

# Sequence id -> path of its FASTA file under <work_dir>/fna for the four
# sequences that appear in orfs_descr below.
# NOTE(review): not referenced by the code visible in this notebook export.
fam_fna_examp = {
    "NC_006833": os.path.join(work_dir, "fna", "NC_006833.fasta"),
    "NC_011898": os.path.join(work_dir, "fna", "NC_011898.fasta"),
    "NC_013156": os.path.join(work_dir, "fna", "NC_013156.fasta"),
    "NC_014810": os.path.join(work_dir, "fna", "NC_014810.fasta"),
}

tblastn_res_tab_path = os.path.join(work_dir, "tblastn_res.bl")  # outfmt 7; tblastn was run beforehand against the genome base used in the AORFs project

# One dict per ORF:
#   "ORF_id"    - "<sequence id>|:<start>-<end>"; a "c" before the start marks
#                 the entries whose "ORF_ori" is "-1"
#   "ORF_start" / "ORF_end" - integer coordinates (start > end for "-1" entries)
#   "ORF_ori"   - orientation kept as a string: "+1" or "-1"
#   "ovORFs"    - frame label ("+2", "+3", "-1", "-2", "-3") -> None, or a dict
#                 with "ovORF_id", "P-value" and the boolean "found_without"
#                 (False here; define_indep_orf may switch it to True)
orfs_descr = [  # this list should be formed automatically on the ovORFs detection step
    {"ORF_id": "NC_011898|:c1102-653", "ORF_start": 1102, "ORF_end": 653, "ORF_ori": "-1", "ovORFs": {"+2": None, "+3": None, "-1": {"ovORF_id": "NC_011898|:27-1349", "P-value": 0.0000014, "found_without": False}, "-2": None, "-3": None}},
    {"ORF_id": "NC_011898|:27-1349", "ORF_start": 27, "ORF_end": 1349, "ORF_ori": "+1", "ovORFs": {"+2": None, "+3": None, "-1": {"ovORF_id": "NC_011898|:c1102-653", "P-value": 0.0000000412, "found_without": False}, "-2": None, "-3": None}},
    {"ORF_id": "NC_013156|:1484335-1484625", "ORF_start": 1484335, "ORF_end": 1484625, "ORF_ori": "+1", "ovORFs": {"+2": None, "+3": None, "-1": {"ovORF_id": "NC_013156|:c1485014-1483527", "P-value": 0.000000104, "found_without": False}, "-2": None, "-3": None}},
    {"ORF_id": "NC_013156|:c1485014-1483527", "ORF_start": 1485014, "ORF_end": 1483527, "ORF_ori": "-1", "ovORFs": {"+2": None, "+3": None, "-1": {"ovORF_id": "NC_013156|:1484335-1484625", "P-value": 0.000000279, "found_without": False}, "-2": None, "-3": None}},    
    {"ORF_id": "NC_006833|:c691-290", "ORF_start": 691, "ORF_end": 290, "ORF_ori": "-1", "ovORFs": {"+2": None, "+3": None, "-1": {"ovORF_id": "NC_006833|:111-1154", "P-value": 0.000000654, "found_without": False}, "-2": None, "-3": None}},
    {"ORF_id": "NC_006833|:111-1154", "ORF_start": 111, "ORF_end": 1154, "ORF_ori": "+1", "ovORFs": {"+2": None, "+3": None, "-1": {"ovORF_id": "NC_006833|:c691-290", "P-value": 0.000000642, "found_without": False}, "-2": None, "-3": None}},    
    {"ORF_id": "NC_014810|:1059006-1059758", "ORF_start": 1059006, "ORF_end": 1059758, "ORF_ori": "+1", "ovORFs": {"+2": None, "+3": None, "-1": None, "-2": {"ovORF_id": "NC_014810|:c1059766-1058375", "P-value": 0.000000107, "found_without": False}, "-3": None}},
    {"ORF_id": "NC_014810|:c1059766-1058375", "ORF_start": 1059766, "ORF_end": 1058375, "ORF_ori": "-1", "ovORFs": {"+2": None, "+3": None, "-1": None, "-2": {"ovORF_id": "NC_014810|:1059006-1059758", "P-value": 0.01194529, "found_without": False}, "-3": None}},  # no SCA
]

In [29]:
# get fam fna
def transport_files(dir_path_from, dir_path_to, f_name_pattern, cotransform=None):
    """Copy (or transform) matching files from one directory into another.

    Only entries located directly in ``dir_path_from`` are considered;
    sub-directories are skipped. A file is processed when ``re.match`` of
    ``f_name_pattern`` against its name succeeds (the match is anchored at
    the beginning of the name only).

    :param dir_path_from: directory the files are taken from
    :param dir_path_to: directory the results are written to; it is created
        when it does not exist yet
    :param f_name_pattern: regular expression applied to every file name
    :param cotransform: optional callable ``(src_path, dir_path_to)`` that
        returns ``(file_id, result_path)``; a result with a falsy ``file_id``
        is not recorded. If it is not callable the file is copied as is.
    :return: dict ``{file_id: result_path}`` (the key is the file name for
        plain copies)
    """
    transport_res_dict = dict()
    files_list = os.listdir(dir_path_from)
    # copyfile (and most transforms) cannot write into a missing directory
    if not os.path.isdir(dir_path_to):
        os.makedirs(dir_path_to)
    for f_name in files_list:
        src_path = os.path.join(dir_path_from, f_name)
        if os.path.isdir(src_path):
            continue
        if re.match(f_name_pattern, f_name) is None:
            continue
        if callable(cotransform):
            f_id, transf_f_path = cotransform(src_path, dir_path_to)
            if f_id:
                transport_res_dict[f_id] = transf_f_path
        else:
            dest_path = os.path.join(dir_path_to, f_name)
            shutil.copyfile(src_path, dest_path)
            transport_res_dict[f_name] = dest_path
    return transport_res_dict


def transform_fna_header(fna_path, dir_path_to):
    """Rewrite a FASTA file so that its header holds only a short sequence id.

    The id is the 4th "|"-separated field of the header line, cut at its
    first "." (e.g. ">gi|123|ref|NC_000001.1| ..." gives "NC_000001"). The
    result is written to ``<dir_path_to>/<id>.fasta`` with every non-empty
    sequence line stripped of surrounding whitespace.

    :param fna_path: path of the source FASTA file
    :param dir_path_to: directory the rewritten file is created in
    :return: ``(fna_id, result_path)`` for the last header met in the file,
        or ``(None, None)`` when the file contains no header line
    :raises IndexError: if a header line has fewer than four "|" fields
    """
    fna_id = None
    trans_fna_f_path = None
    trans_fna_f = None
    try:
        with open(fna_path) as fna_f:
            for line_ in fna_f:
                line = line_.strip()
                if not line:
                    continue
                if line[0] == ">":
                    fna_id = line.split("|")[3].split(".")[0]
                    trans_fna_f_path = os.path.join(dir_path_to, fna_id+".fasta")
                    # a further header starts a new output file: release the previous one
                    if trans_fna_f is not None:
                        trans_fna_f.close()
                    trans_fna_f = open(trans_fna_f_path, "w")
                    trans_fna_f.write(">"+fna_id+"\n")
                    continue
                if trans_fna_f is None:
                    # sequence data before any header has no file to go to
                    continue
                trans_fna_f.write(line+"\n")
    finally:
        if trans_fna_f is not None:
            trans_fna_f.close()
    return fna_id, trans_fna_f_path


# Process every file of genomes_fna_dir whose name starts with "NC_" and
# contains ".fasta": transform_fna_header writes a copy with a shortened
# header into <work_dir>/fna. fam_fna maps each sequence id to that copy.
fam_fna = transport_files(dir_path_from=genomes_fna_dir, 
                          dir_path_to=os.path.join(work_dir, "fna"),
                          f_name_pattern=r"NC_.*\.fasta",  # raw string: "\." is not a valid str escape
                          cotransform=transform_fna_header)

# WARNING: do not run the code in the cell below!!!

In [6]:
# fix fna ids in tblastn_res_tab (this step is not needed in the package)
def fix_fna_ids(blast_res_path, fixed_blast_res_path):
    """Write a copy of a BLAST table with repaired query and subject ids.

    Comment lines (starting with "#") are copied and mark the start of a new
    group of hit rows. The first hit row of a group is copied unchanged and
    its first column is remembered as the query id. Every later row of the
    group whose first column is "AORF:" gets that query id as first column,
    and its second column is replaced by the second-to-last "|"-separated
    field cut at the first ".". All other rows are copied unchanged.
    Empty lines are dropped and every written line is whitespace-stripped.

    :param blast_res_path: path of the table to read
    :param fixed_blast_res_path: path of the table to write
    :raises IndexError: if an "AORF:" row has no second column or that
        column contains no "|"
    """
    # blast results must be in outfmt 7 format
    first_str = False
    orf_id = None
    # context managers close both files even if a malformed row raises
    with open(blast_res_path) as blast_res_f, \
            open(fixed_blast_res_path, 'w') as fixed_blast_res_f:
        for line_ in blast_res_f:
            line = line_.strip()
            if not line:
                continue
            if line[0] == "#":
                fixed_blast_res_f.write(line+"\n")
                first_str = False
                continue
            line_list = line.split("\t")
            if not first_str:
                first_str = True
                orf_id = line_list[0].strip()
                fixed_blast_res_f.write("\t".join(line_list)+"\n")
            elif line_list[0].strip() == "AORF:":
                line_list[0] = orf_id
                line_list[1] = line_list[1].strip().split("|")[-2].split(".")[0].strip()
                fixed_blast_res_f.write("\t".join(line_list)+"\n")
            else:
                fixed_blast_res_f.write(line+"\n")
    

# Write the repaired table next to the original one as "<name>_fixed.bl".
# splitext touches only the file extension, so a ".bl" inside a directory
# name is left alone and the output path can never equal the input path
# (which would truncate the table while it is being read).
fix_fna_ids(blast_res_path=tblastn_res_tab_path,
            fixed_blast_res_path=os.path.splitext(tblastn_res_tab_path)[0] + "_fixed.bl")

In [3]:
#functions from EAGLE.lib to manipulate fasta files
def seq_from_fasta(fasta_path, seq_id, ori=+1, start=0, end=-1):
    """Return a region of one sequence of a FASTA file.

    Coordinates are 1-based and inclusive. They may be given in either
    order: when ``end`` is smaller than ``start`` the two are swapped for
    slicing. The strand is chosen by ``ori`` alone.

    :param fasta_path: path of the FASTA file
    :param seq_id: title of the wanted record (full header line without ">")
    :param ori: a positive number returns the region as stored, any other
        number returns its reverse complement
    :param start: first coordinate; values below 1 mean "from the beginning"
    :param end: second coordinate; -1 means "up to the end of the sequence"
    :return: the region as a ``str``
    :raises KeyError: if the file has no record titled ``seq_id``
    """
    full_seq = load_fasta_to_dict(fasta_path)[seq_id]
    # Resolve the "whole sequence" defaults explicitly: slicing with the raw
    # defaults ([-1:-1]) would give an empty string, and end=-1 used directly
    # as a slice bound would drop the last base.
    if start < 1:
        start = 1
    if end == -1:
        end = len(full_seq)
    if end >= start:
        sub_seq = full_seq[start-1: end]
    else:
        sub_seq = full_seq[end-1: start]
    if ori > 0:
        return sub_seq
    return str(Seq(sub_seq).reverse_complement())
    

def load_fasta_to_dict(fasta_path):
    """Read a FASTA file into a dict ``{title: sequence}``.

    The title is the whole header line without the leading ">". Sequence
    lines are stripped of surrounding whitespace and concatenated; empty
    lines are ignored. Sequence lines that precede the first header, or that
    follow a header with an empty title, belong to no record and are
    dropped. If a title occurs more than once the last record wins.

    :param fasta_path: path of the FASTA file
    :return: dict mapping record titles to their sequences
    """
    fasta_dict = dict()
    seq_list = list()
    title = None
    with open(fasta_path) as fasta_f:
        for line_ in fasta_f:
            line = line_.strip()
            if not line:
                continue
            if line[0] == ">":
                if title:
                    fasta_dict[title] = "".join(seq_list)
                # always start the new record empty so that untitled
                # sequence lines never leak into it
                seq_list = list()
                title = line[1:]
            else:
                seq_list.append(line)
    if title:
        fasta_dict[title] = "".join(seq_list)
    return fasta_dict

In [35]:
def detect_indep_ORFs(tblastn_res_path, orfs_descr, fam_fna):
    """Update the "found_without" flags of every ORF from its tblastn hits.

    For each ORF description, every hit stored under its "ORF_id" is cut out
    of the subject FASTA (path taken from ``fam_fna`` by subject id) in the
    orientation given by the hit coordinates, and passed with the ORF's
    "ovORFs" dict to define_indep_orf.

    :param tblastn_res_path: path of the tblastn table (see read_blast_out)
    :param orfs_descr: list of ORF description dicts; modified in place
    :param fam_fna: dict mapping subject ids to FASTA file paths
    :return: the same ``orfs_descr`` list
    """
    hits_by_orf = read_blast_out(tblastn_res_path)
    for orf_record in orfs_descr:
        for hit in hits_by_orf[orf_record["ORF_id"]]:
            subj_id = hit["subj_id"]
            hit_start = hit["subj_start"]
            hit_end = hit["subj_end"]
            hit_seq = seq_from_fasta(
                fam_fna[subj_id],
                subj_id,
                get_ori(hit_start, hit_end),
                hit_start,
                hit_end,
            )
            orf_record["ovORFs"] = define_indep_orf(hit_seq, orf_record["ovORFs"])
    return orfs_descr


def read_blast_out(blast_out_path, ev_thr=0.000001, aln_l_thr=180, ident_thr=0.35):
    """Read a tabular BLAST output and keep the hits that pass the thresholds.

    Lines that are empty or start with "#" are skipped. From each remaining
    tab-separated row the function uses column 1 (query id), column 2
    (subject id), column 3 (percent identity), columns 9 and 10 (subject
    start and end) and column 11 (e-value).

    :param blast_out_path: path of the table (outfmt 6 or 7)
    :param ev_thr: highest e-value accepted
    :param aln_l_thr: smallest accepted length of the hit on the subject,
        counted from the subject coordinates
    :param ident_thr: smallest accepted identity as a fraction (0..1)
    :return: ``defaultdict(list)`` mapping each query id to a list of dicts
        with keys "subj_id", "identity", "aln_l", "evalue", "subj_start",
        "subj_end"
    :raises IndexError: if a data row has fewer than 11 columns
    :raises ValueError: if a numeric column cannot be parsed
    """
    # reads blast outfmt 6 or 7
    blast_res_dict = defaultdict(list)
    with open(blast_out_path) as blast_out_f:
        for line_ in blast_out_f:
            line = line_.strip()
            if not line or line[0] == "#":
                continue
            line_list = line.split("\t")
            subj_start = int(line_list[8].strip())
            subj_end = int(line_list[9].strip())
            ev = float(line_list[10].strip())
            # inclusive span; abs() goes around the difference only, so hits
            # with start > end get the same length as the mirrored hit
            aln_l = abs(subj_end-subj_start)+1
            ident = float(line_list[2].strip())/100.0
            if ev <= ev_thr and aln_l >= aln_l_thr and ident >= ident_thr:
                blast_res_dict[line_list[0].strip()].append({
                    "subj_id": line_list[1].strip(),
                    "identity": ident,
                    "aln_l": aln_l,
                    "evalue": ev,
                    "subj_start": subj_start,
                    "subj_end": subj_end,
                })
    return blast_res_dict


def get_ori(c1, c2):
    """Tell the orientation implied by a pair of coordinates.

    :param c1: first coordinate (anything ``int()`` accepts)
    :param c2: second coordinate (anything ``int()`` accepts)
    :return: +1 when ``c1`` is not greater than ``c2``, otherwise -1
    :raises ValueError: if a coordinate cannot be converted to an integer
    """
    try:
        first, second = int(c1), int(c2)
    except ValueError:
        raise ValueError("Coordinates are not numbers: (%s, %s)" % (str(c1), str(c2)))
    return -1 if first > second else +1

        
def define_indep_orf(orf_hom_seq, orf_ovorfs, orf_l_thr=180):
    """Mark overlapping ORFs whose frame is not open in a homologous sequence.

    For every frame label that has a description with a false
    "found_without", the matching view of ``orf_hom_seq`` is tested with
    is_orf and "found_without" becomes True when that test fails. Frames
    set to None and frames already marked True are left untouched.

    :param orf_hom_seq: nucleotide sequence of the homologous region
    :param orf_ovorfs: dict with the keys "+2", "+3", "-1", "-2", "-3";
        modified in place
    :param orf_l_thr: length threshold handed over to is_orf
    :return: the same ``orf_ovorfs`` dict
    :raises KeyError: if one of the five frame keys is missing
    """
    # NOTE(review): the slices cut 3 and 2 bases off the end ([1:-3], [2:-2]);
    # confirm that these trims give the intended reverse-strand frames.
    frame_views = (
        ("+2", lambda seq: seq[1:-3]),
        ("+3", lambda seq: seq[2:-2]),
        ("-1", lambda seq: str(Seq(seq).reverse_complement())),
        ("-2", lambda seq: str(Seq(seq[2:-2]).reverse_complement())),
        ("-3", lambda seq: str(Seq(seq[1:-3]).reverse_complement())),
    )
    for frame, view in frame_views:
        ov_descr = orf_ovorfs[frame]
        if not ov_descr or ov_descr["found_without"]:
            continue
        ov_descr["found_without"] = not is_orf(view(orf_hom_seq), orf_l_thr)
    return orf_ovorfs


def is_orf(seq, orf_l_thr):
    """Tell whether a sequence holds a long enough stretch without stop codons.

    The sequence is read in steps of three from its first base, ignoring
    letter case. Each step that is not "tga", "taa" or "tag" adds 3 to the
    current stretch (a shorter last step counts as 3 as well); a stop codon
    resets the stretch to 0.

    :param seq: nucleotide sequence
    :param orf_l_thr: length threshold; a stretch of ``orf_l_thr - 3`` or more
        is accepted
    :return: True as soon as a stretch reaches the threshold, else False
    """
    stop_codons = ("tga", "taa", "tag")
    # lower-case once instead of once per codon
    seq_lc = seq.lower()
    l = 0
    for n in range(0, len(seq_lc), 3):
        if seq_lc[n: n+3] in stop_codons:
            l = 0
            continue
        l += 3
        if l >= orf_l_thr-3:
            return True
    return False
    
    
# Run the check for every ORF of orfs_descr: the "ovORFs" entries of the list
# are updated in place and the same list is returned (it is the cell output).
detect_indep_ORFs(tblastn_res_tab_path, orfs_descr, fam_fna)

[{'ORF_end': 653,
  'ORF_id': 'NC_011898|:c1102-653',
  'ORF_ori': '-1',
  'ORF_start': 1102,
  'ovORFs': {'+2': None,
   '+3': None,
   '-1': {'P-value': 1.4e-06,
    'found_without': True,
    'ovORF_id': 'NC_011898|:27-1349'},
   '-2': None,
   '-3': None}},
 {'ORF_end': 1349,
  'ORF_id': 'NC_011898|:27-1349',
  'ORF_ori': '+1',
  'ORF_start': 27,
  'ovORFs': {'+2': None,
   '+3': None,
   '-1': {'P-value': 4.12e-08,
    'found_without': True,
    'ovORF_id': 'NC_011898|:c1102-653'},
   '-2': None,
   '-3': None}},
 {'ORF_end': 1484625,
  'ORF_id': 'NC_013156|:1484335-1484625',
  'ORF_ori': '+1',
  'ORF_start': 1484335,
  'ovORFs': {'+2': None,
   '+3': None,
   '-1': {'P-value': 1.04e-07,
    'found_without': True,
    'ovORF_id': 'NC_013156|:c1485014-1483527'},
   '-2': None,
   '-3': None}},
 {'ORF_end': 1483527,
  'ORF_id': 'NC_013156|:c1485014-1483527',
  'ORF_ori': '-1',
  'ORF_start': 1485014,
  'ovORFs': {'+2': None,
   '+3': None,
   '-1': {'P-value': 2.79e-07,
    'found_w

In [18]:
# Check that the file-name pattern rejects a name with no "." before "fasta".
f_name = "NC_0001fasta"
f_name_pattern = r"NC_.*\.fasta"  # raw string: "\." is not a valid str escape
print(re.match(f_name_pattern, f_name))

None
