In [1]:
import os
import regex as re
import pandas as pd
import math
from Bio import SeqIO
from Bio import motifs
import warnings
from tqdm import tqdm

def extra_upORF(seq_record, upORFs = '', uplength = 20, downlength = 0): # extract the sequence upstream of each reading frame
    """Return FASTA text holding the window around the start of every CDS in a record.

    For each feature of type 'CDS' the window covers ``uplength`` bases before the
    5' end of the feature and ``downlength`` bases after it, read on the coding
    strand (features whose strand is not 1 are reverse-complemented).  On records
    whose ``topology`` annotation contains 'circular' the window is allowed to wrap
    around the origin; on every other record a window that runs off either end is
    shorter than requested and is therefore skipped.

    Args:
        seq_record: record exposing ``seq``, ``annotations`` and ``features``
            (used here with the objects produced by ``SeqIO.parse``).
        upORFs: FASTA text to which the new entries are appended.
        uplength: number of bases taken upstream of the CDS start.
        downlength: number of bases taken downstream of the CDS start.

    Returns:
        ``upORFs`` followed by one ``>name\\nsequence\\n`` entry per retained CDS.
        The name is the first ``locus_tag`` qualifier, falling back to the first
        ``protein_id`` qualifier.

    Raises:
        ValueError: if ``uplength + downlength`` is not positive.
        KeyError: if a retained CDS has neither ``locus_tag`` nor ``protein_id``.
    """
    window_length = uplength + downlength
    if window_length <= 0:
        raise ValueError("The length of the upORF cannot be zero or negative.")

    genome = seq_record.seq
    genome_length = len(genome)
    # A record without a topology annotation is treated as linear (no wrap-around).
    is_circular = 'circular' in seq_record.annotations.get('topology', '')
    doubled_genome = None  # genome + genome, built only if a window crosses the origin
    fasta_chunks = [upORFs]

    for feature in seq_record.features:
        if feature.type != 'CDS':
            continue
        location = feature.location
        on_forward_strand = location.strand == 1

        if is_circular:
            # A multi-part location whose overall span equals the whole record is
            # taken to cross the origin; its first part then carries the 5' end.
            spans_origin = ((location.end - location.start) == genome_length
                            and len(location.parts) >= 2)
            if on_forward_strand:
                anchor = location.parts[0].start if spans_origin else location.start
                window_start, window_end = anchor - uplength, anchor + downlength
                shift_by_genome = window_start < 0
                crosses_origin = shift_by_genome or window_end >= genome_length
            else:
                anchor = location.parts[0].end if spans_origin else location.end
                window_start, window_end = anchor - downlength, anchor + uplength
                overruns_end = window_end >= genome_length
                # The right-hand overrun is tested first, as in the original code.
                shift_by_genome = (not overruns_end) and window_start < 0
                crosses_origin = overruns_end or shift_by_genome
            if shift_by_genome:
                # Move a window that starts before position 0 into the second copy.
                window_start += genome_length
                window_end += genome_length
            if crosses_origin:
                if doubled_genome is None:
                    doubled_genome = genome + genome
                upORF_seq = doubled_genome[window_start:window_end]
            else:
                upORF_seq = genome[window_start:window_end]
        elif on_forward_strand:
            upORF_seq = genome[max(0, location.start - uplength):max(0, location.start + downlength)]
        else:
            upORF_seq = genome[max(0, location.end - downlength):(location.end + uplength)]

        if not on_forward_strand:
            upORF_seq = upORF_seq.reverse_complement()

        # Truncated windows and windows with anything other than A/T/C/G are dropped.
        if len(upORF_seq) < window_length:
            continue
        if upORF_seq.strip('ATCG') != '':
            continue

        qualifiers = feature.qualifiers
        try:
            name = qualifiers['locus_tag'][0]
        except (KeyError, IndexError):
            name = qualifiers['protein_id'][0]
        fasta_chunks.append('>' + name + '\n' + str(upORF_seq) + '\n')

    return ''.join(fasta_chunks)

# Silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Create the project output directory if needed and make it the working directory.
# exist_ok=True tolerates an existing directory but still reports real failures
# (e.g. missing permissions) instead of hiding them until os.chdir fails.
os.makedirs('/home/zhongshitong/data/up-ORF-motif_simple', exist_ok=True)
os.chdir('/home/zhongshitong/data/up-ORF-motif_simple')

In [2]:
ass_name = 'Deinococcus radiodurans'
assembly = 'GCF_020546685.1'

# Per-assembly working directory: the relative paths used below (upORFseqs.txt and
# whatever the meme command writes) resolve against it.
os.makedirs(f'/home/zhongshitong/data/up-ORF-motif_simple/assembly/{ass_name}', exist_ok=True)
os.chdir(f'/home/zhongshitong/data/up-ORF-motif_simple/assembly/{ass_name}')
seq_data = SeqIO.parse(f'/home/zhongshitong/data/genomes/Deinococcus-Thermus_genomes/data/{assembly}/genomic.gbff', format="gb")

# Collect the default 20-base upstream window of every CDS across all records.
upORFs = ''
for seq_record in seq_data:
    upORFs += extra_upORF(seq_record)

# The context manager closes the file even if the write fails.
with open('upORFseqs.txt', 'w') as upORF_file:
    upORF_file.write(upORFs)

# MEME output is redirected to /dev/null, so the exit status is the only failure signal.
meme_status = os.system('nohup meme upORFseqs.txt -dna -w 6 -mod zoops -nmotifs 2 -brief 50000 -nostatus -p 1 -nostatus > /dev/null 2>&1')
if meme_status != 0:
    raise RuntimeError(f'meme exited with status {meme_status} for {ass_name}')
meme_status

0

In [3]:
# Window used for motif scanning: bases taken before / after each CDS start.
uplen, downlen = 300, 150
os.chdir(f'/home/zhongshitong/data/up-ORF-motif_simple/assembly/{ass_name}')
seq_data = SeqIO.parse(f'/home/zhongshitong/data/genomes/Deinococcus-Thermus_genomes/data/{assembly}/genomic.gbff', format="gb")

# Collect the window of every CDS across all records of the assembly.
upORFs = ''
for seq_record in seq_data:
    upORFs += extra_upORF(seq_record, uplength = uplen, downlength = downlen)

# The context manager closes the file even if the write fails.
with open(f'upORFseqs_{uplen}_{downlen}.txt', 'w') as upORF_file:
    upORF_file.write(upORFs)

# Scan the extracted windows with FIMO and keep, per sequence, one MEME-1 hit that
# passes the stop-codon / start-codon filters below. Results go to full_match_result
# and full_match_result.csv.
seq_res = SeqIO.parse(f'upORFseqs_{uplen}_{downlen}.txt',"fasta")
# The file is parsed a second time only to count the records for the progress bar.
seq_ids = [seq.id for seq in SeqIO.parse(f'upORFseqs_{uplen}_{downlen}.txt',"fasta")]
with tqdm(total = len(seq_ids), desc='Program', leave=True, ncols=100, unit='B', unit_scale=True) as pbar:
    columns = ['motif_id', 'motif_alt_id', 'sequence_name', 'start', 'stop', 'strand', 'score', 'p-value', 'q-value', 'matched_sequence']
    full_match_result = pd.DataFrame(columns = columns)
    # Run FIMO on the windows with the motifs in meme_out/meme.html; its console output
    # is discarded and its exit status is not checked.
    os.system(f'nohup fimo --thresh 0.015 --norc meme_out/meme.html upORFseqs_{uplen}_{downlen}.txt > /dev/null 2>&1')
    # Presumably the file written by the fimo call above - TODO confirm it is not a stale run.
    # Fields are split on tabs or semicolons; lines starting with '#' are skipped.
    match_result = pd.read_csv(f'fimo_out/fimo.tsv', sep = '\t|;', engine = 'python', comment='#')
    for seq in seq_res:
        # Hits of the current sequence that survive both filters.
        temp_data = pd.DataFrame(columns = columns)
    
        temp_upORF = str(seq.seq)
        # Only hits of the motif labelled 'MEME-1' on this sequence are considered.
        temp_match_result = match_result.loc[(match_result['sequence_name']==str(seq.id))&(match_result['motif_alt_id']=='MEME-1')].reset_index(drop=True)
        pbar.update(1)
        if temp_match_result.empty:
            continue
        for j in range(0, len(temp_match_result['motif_id'])):
            if temp_match_result['stop'][j] + 20 < uplen: # exclude hits interrupted by a stop codon
                # Region between the hit and position uplen of the window.
                down_seq = temp_upORF[temp_match_result['stop'][j] + 1: uplen]
                stopcodes = ['TAG', 'TAA', 'TGA']
                # td holds the hit row; it is replaced by an empty frame once a
                # disqualifying stop codon is found.
                td = temp_match_result.loc[[j]]
                for stopcode in stopcodes:
                    if stopcode in down_seq:
                        find_result = re.finditer(stopcode, down_seq, overlapped = True)
                        for find_re in find_result:
                            # The expression is the distance from the codon end to position
                            # len(temp_upORF) - downlen; a multiple of 3 is presumably read as
                            # "in frame with the annotated start" - TODO confirm.
                            # NOTE(review): re.search returns the first occurrence of down_seq,
                            # which need not be the slice's own position if the text repeats.
                            if (len(temp_upORF) - re.search(down_seq, temp_upORF).span()[1] + len(down_seq) - find_re.span()[1] - downlen) % 3 == 0:
                                td = pd.DataFrame(columns = columns)
                                # Leaves only the inner loop; the remaining stop codons are still scanned.
                                break
                if td.empty: continue
            # Up to 19 characters following the hit.
            down_seq = temp_upORF[temp_match_result['stop'][j] + 1: temp_match_result['stop'][j] + 20]
            if 'ATG' in down_seq or 'GTG' in down_seq: # look for a downstream start codon
                find_result = re.finditer('[AG]TG', down_seq, overlapped = True)
                for find_re in find_result:
                    # Same multiple-of-3 test as above, applied to the candidate start codon.
                    if (len(temp_upORF) - re.search(down_seq, temp_upORF).span()[1] + len(down_seq) - find_re.span()[1] - downlen) % 3 == 0:
                        temp_data = pd.concat([temp_data, temp_match_result.loc[[j]]], ignore_index=True)
                        break
        if temp_data.empty:
            continue
        else:
            best_match_index = 0
            startcode_distance = max(uplen, downlen)
            # NOTE(review): the sorted frame is not assigned, so temp_data keeps its
            # original row order - confirm whether sorting was intended.
            temp_data.sort_values(by='start').reset_index(drop=True)
            for k in range(0, len(temp_data['motif_id'])):
                # Rows are visited as k for even k and len - k for odd k.
                # NOTE(review): with an odd row count this repeats some rows and never
                # visits others (5 rows -> 0, 4, 2, 2, 4) - confirm intent.
                if k%2 == 0: j = k
                else: j = len(temp_data['motif_id']) - k
                # Keep the hit whose midpoint lies closest to position uplen.
                if abs((temp_data['start'][j] + temp_data['stop'][j])/2 - uplen) < startcode_distance:
                    best_match_index = j
                    startcode_distance = abs((temp_data['start'][j] + temp_data['stop'][j])/2 - uplen)
                # Stop searching once a hit's midpoint is within 20 positions before uplen,
                # or a hit has p-value < 0.006 and 'TA' in its motif_id.
                if -20 < (temp_data['start'][j] + temp_data['stop'][j])/2 - uplen < 0 or (temp_data['p-value'][j] < 0.006 and 'TA' in temp_data['motif_id'][j]):
                    break
            full_match_result = pd.concat([full_match_result, temp_data.loc[[best_match_index]]], ignore_index=True)
    
    # Re-express hit coordinates relative to position uplen of the window.
    for i in range(0, len(full_match_result['motif_id'])):
        full_match_result.loc[i, 'start'] = full_match_result['start'][i] - uplen
        full_match_result.loc[i, 'stop'] = full_match_result['stop'][i] - uplen
    full_match_result.to_csv('full_match_result.csv')

Program: 100%|███████████████████████████████████████████████████| 3.15k/3.15k [00:30<00:00, 104B/s]


In [4]:
# Export protein translations: every CDS goes to CDS.txt, and the CDSs whose
# locus_tag appears in full_match_result['sequence_name'] also go to TA_CDS.txt.
os.chdir(f'/home/zhongshitong/data/up-ORF-motif_simple/assembly/{ass_name}')
seq_data = SeqIO.parse(f'/home/zhongshitong/data/genomes/Deinococcus-Thermus_genomes/data/{assembly}/genomic.gbff', format="gb")

TA_info = [name for name in full_match_result['sequence_name']]
TA_names = set(TA_info)  # set membership instead of scanning the list for every CDS

CDS_entries = []
TA_CDS_entries = []
for seq_record in seq_data:
    for feature in seq_record.features:
        if feature.type != 'CDS':
            continue
        try:
            locus_tag = feature.qualifiers['locus_tag'][0]
            entry = '>'+locus_tag+'\n'+feature.qualifiers['translation'][0]+'\n'
        except (KeyError, IndexError):
            # CDS features lacking a locus_tag or a translation are skipped.
            continue
        CDS_entries.append(entry)
        if locus_tag in TA_names:
            TA_CDS_entries.append(entry)

CDS = ''.join(CDS_entries)
TA_CDS = ''.join(TA_CDS_entries)

# Files are opened only once the text is complete and are closed by the context manager.
with open('CDS.txt', 'w') as CDS_file:
    CDS_file.write(CDS)
with open('TA_CDS.txt', 'w') as TA_CDS_file:
    TA_CDS_file.write(TA_CDS)

In [5]:
# Species name -> assembly accession of the genomes to process.
ass_info = {'Calidithermus chliarophilus':'GCF_000430045.1', 'Deinococcus geothermalis': 'GCF_000196275.1', 
            'Deinobacterium chartae':'GCF_014202645.1', 'Meiothermus silvanus':'GCF_000092125.1', 
            'Marinithermus hydrothermalis':'GCF_000195335.1', 'Oceanithermus profundus':'GCF_000183745.1', 
            'Thermus thermophilus':'GCF_000091545.1', 'Truepera radiovictrix':'GCF_000092425.1'}

# NOTE: the loop rebinds the notebook-level names ass_name and assembly and changes the
# working directory; cells run afterwards see the values of the last assembly.
failed_assemblies = []
with tqdm(total = len(ass_info), desc='Program', leave=True, ncols=100, unit='B', unit_scale=True) as pbar:
    for ass_name, assembly in ass_info.items():
        os.makedirs(f'/home/zhongshitong/data/up-ORF-motif_simple/assembly/{ass_name}', exist_ok=True)
        os.chdir(f'/home/zhongshitong/data/up-ORF-motif_simple/assembly/{ass_name}')
        seq_data = SeqIO.parse(f'/home/zhongshitong/data/genomes/Deinococcus-Thermus_genomes/data/{assembly}/genomic.gbff', format="gb")

        # Collect the 20-base upstream window of every CDS across all records.
        upORFs = ''
        for seq_record in seq_data:
            upORFs += extra_upORF(seq_record, uplength = 20)

        # The context manager closes the file even if the write fails.
        with open('upORFseqs.txt', 'w') as upORF_file:
            upORF_file.write(upORFs)

        # MEME output is discarded, so the exit status is the only failure signal;
        # failures are collected so the remaining assemblies are still processed.
        meme_status = os.system('nohup meme upORFseqs.txt -dna -w 10 -mod zoops -nmotifs 2 -brief 50000 -nostatus -p 1 -nostatus > /dev/null 2>&1')
        if meme_status != 0:
            failed_assemblies.append(ass_name)
        pbar.update(1)

if failed_assemblies:
    raise RuntimeError('meme failed for: ' + ', '.join(failed_assemblies))

Program: 100%|████████████████████████████████████████████████████| 8.00/8.00 [10:22<00:00, 77.8s/B]
