In [101]:
### get results of Phanotate and Prodigal and clean ORFs that are 40AA and less
# should process genome by genome but can load file and split internaly
from Bio import SeqIO
from itertools import chain

In [91]:
phan_filepath       = '/Users/kszczepaniak/Data/Phage/phage-pipeline-input-300/genome-seqs-phan.fa'
phan_verified_fpath = '/Users/kszczepaniak/Data/Phage/phage-pipeline-input-300/genome-seqs-phan-verified.fa'
prod_filepath       = '/Users/kszczepaniak/Data/Phage/phage-pipeline-input-300/genome-seqs-prod.faa'
orf_len_cutoff      = 3*40 # phanotate seqs reported in nt, therefore cutoff is 3 * nb_of_AA
orf_overlap_cutoff  = 0.9

In [102]:
# find all phanotate ORFs shorter than desired orf_len_cutoff and store in dict (genome.id == keys)
phan_orfs_short = {}

phanf = SeqIO.parse(phan_filepath, "fasta")
for i, record in enumerate(phanf):
    if len(record.seq) < orf_len_cutoff:
        genome_id     = '.'.join(record.id.split('.')[:-1])
        start_pos_raw = int(record.description.split('START=')[-1].split(']')[0])
        stop_pos_raw  = int(record.id.split('.')[-1])
        # rearrange so the smaller pos is always first (strandness stored in other value)
        start_pos     = start_pos_raw if start_pos_raw < stop_pos_raw else stop_pos_raw
        stop_pos      = stop_pos_raw if start_pos_raw < stop_pos_raw else start_pos_raw
        orf_len       = len(record.seq)
        strand        = '+' if start_pos_raw < stop_pos_raw else '-'
        if genome_id in phan_orfs_short.keys():
            phan_orfs_short[genome_id].append([start_pos, stop_pos, orf_len, strand]) # orf start, stop, len
        else:
            phan_orfs_short[genome_id] = [[start_pos, stop_pos, orf_len, strand]]

print('Genomes with short ORFs:', len(phan_orfs_short))
print('Total short ORFs:', len(list(chain(*[ v for v in phan_orfs_short.values() ]))))

Genomes with short ORFs: 313
Total short ORFs: 4181


In [103]:
# parse Prodigal results file and store in dict (genome.id == keys)
prod_orfs = {}

prodf = SeqIO.parse(prod_filepath, "fasta")
for record in prodf:
    genome_id   = '_'.join(record.id.split('_')[:-1])
    record_desc = record.description.split('#')
    start_pos = int(record_desc[1])
    stop_pos  = int(record_desc[2])
    orf_len   = len(record.seq)*3 # prodigal returns AA seqs when Phanotate nt, this is to match seq lengths
    strand    = '+' if int(record_desc[3]) == 1 else '-'
        
    if genome_id in prod_orfs.keys():
        prod_orfs[genome_id].append([start_pos, stop_pos, orf_len, strand]) # orf start, stop, len
    else:
        prod_orfs[genome_id] = [[start_pos, stop_pos, orf_len, strand]]

In [105]:
# for every short ORF: go through prodigal data and fing two closest ORFs;
# check if either of them overlaps with short one in more than orf_overlap_cutoff;
# if yes: keep ORF, if not: remove

def find_two_closest_orfs(sorf, prod_orfs_genome):
    sorf_start = sorf[0]
    close_porf = []
    for porf in prod_orfs_genome:
        porf_start = porf[0]
        if sorf_start >= porf_start:
            close_porf = porf
        else:
            if close_porf: # if close_porf non-empty, i.e. sORF not in very beginning of genome:
                try:
                    return [close_porf, prod_orfs_genome[prod_orfs_genome.index(close_porf)+1]]
                except IndexError: # this was the last pORF, only return first one
                    return [close_porf]
            else:
                return [prod_orfs_genome[0]] # sORF is at the beginning of genome, return first pORF from Prodigal
    
    return [prod_orfs_genome[-1]] # if loop is complete without return it means sORF was at the end of genome

def check_close_ORFs_overlap(sorf, closest_prod_orfs, orf_overlap_cutoff):
    
    sorf_span = set([ i for i in range(sorf[0], sorf[1]+1)])
    for corf in closest_prod_orfs:
        corf_span    = set([ i for i in range(corf[0], corf[1]+1)])
        orfs_overlap = len(sorf_span.intersection(corf_span)) / len(sorf_span.union(corf_span))
        if orfs_overlap >= orf_overlap_cutoff:
            return True
    
    return False

#############################

phan_orfs_short_verified = {}

for genome_id, short_orfs in phan_orfs_short.items():
    phan_orfs_short_verified[genome_id] = []
#     print('>>>', genome_id)
    prod_orfs_genome = prod_orfs[genome_id]
#     print(prod_orfs_genome)
    if not prod_orfs_genome: # if no ORFs were detected then ignore genome and return no ORFs
        continue
    for sorf in short_orfs:
        closest_prod_orfs = find_two_closest_orfs(sorf, prod_orfs_genome)
#         print('SORF', sorf)
#         print(closest_prod_orfs)
        
        if check_close_ORFs_overlap(sorf, closest_prod_orfs, orf_overlap_cutoff):
            phan_orfs_short_verified[genome_id].append(sorf[0]) # store only start_pos, it is unique in genome
        
print('Total short ORFs after verification:', len(list(chain(*[ v for v in phan_orfs_short_verified.values() ]))))

Total short ORFs after verification: 308


In [106]:
# save new Phanotate file with only verified ORFs

phanf         = SeqIO.parse(phan_filepath, "fasta")
phan_verified = []

for record in phanf:
    if len(record.seq) < orf_len_cutoff:
        # load data from record
        genome_id     = '.'.join(record.id.split('.')[:-1])
        start_pos_raw = int(record.description.split('START=')[-1].split(']')[0])
        # rearrange so the smaller pos is always first (strandness stored in other value)
        start_pos     = start_pos_raw if start_pos_raw < stop_pos_raw else stop_pos_raw
        verified_orfs_genome = phan_orfs_short_verified[genome_id]
        if start_pos in verified_orfs_genome:
            phan_verified.append(record)
    else:
        phan_verified.append(record)
        
SeqIO.write(phan_verified, phan_verified_fpath, "fasta")

44379