In [1]:
import os
import pandas as pd
from Bio.Seq import Seq
from tqdm import tqdm
import re
from Bio import Entrez
from Bio import SeqIO
from Bio import motifs
import multiprocessing
import warnings

def RNA_blast(CDS_name, CDS_seq, taxonomy, que):
    """Count, for every base of one query sequence, the perfect blastn hits covering it.

    The query is written to a per-sequence scratch directory, aligned with
    ``blastn`` against the database named by the module-level ``RNAseq_file``,
    and the tabular hits are filtered to those with no mismatches, no gap
    openings and ``sstart < send``.  The per-base hit count is then pushed
    onto ``que``.

    Args:
        CDS_name: Identifier used as the FASTA header and scratch directory name.
        CDS_seq: Query nucleotide sequence as a string.
        taxonomy: Name of the result sub-directory under
            ``/home/zhongshitong/data/RNA_seq_result``.
        que: Queue-like object; receives ``[count_arr, CDS_name, CDS_seq]``.

    Raises:
        subprocess.CalledProcessError: If ``blastn`` exits with a non-zero status.
    """
    import subprocess
    from itertools import accumulate

    # Absolute paths are used throughout so the worker never has to change
    # its working directory.
    work_dir = f'/home/zhongshitong/data/RNA_seq_result/{taxonomy}/temp/{CDS_name}'
    os.makedirs(work_dir, exist_ok=True)
    query_path = os.path.join(work_dir, 'RNAseq_queries.fasta')
    result_path = os.path.join(work_dir, 'RNA_blastn_results.txt')

    with open(query_path, 'w') as query_file:
        query_file.write('>' + CDS_name + '\n' + CDS_seq)

    # Argument list (no shell) so names containing spaces or shell
    # metacharacters are passed through verbatim; check=True surfaces a
    # failed search instead of parsing a missing or stale result file.
    subprocess.run(
        [
            'blastn',
            '-query', query_path,
            '-db', f'/home/zhongshitong/data/RNA_seq_data/{RNAseq_file}',
            '-out', result_path,
            '-evalue', '1e-30',
            '-outfmt', '6',
            '-max_target_seqs', '50000',
            '-num_threads', '1',
        ],
        check=True,
    )

    head = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
    align_result = pd.read_csv(result_path, sep = '\t|;', engine = 'python', header = None, names = head)
    keep = (
        (align_result['mismatch'] == 0)
        & (align_result['gapopen'] == 0)
        & (align_result['sstart'] < align_result['send'])
    )
    align_result = align_result[keep]

    # Difference array: +1 where a hit starts, -1 just past where it ends;
    # the running sum is the coverage.  Linear in hits + sequence length
    # rather than in the total number of aligned bases.
    seq_len = len(CDS_seq.replace('\n', ''))
    delta = [0] * (seq_len + 1)
    for qstart, qend in zip(align_result['qstart'], align_result['qend']):
        qstart, qend = int(qstart), int(qend)
        if qend < qstart:
            continue
        delta[qstart - 1] += 1  # qstart is 1-based and inclusive
        delta[qend] -= 1        # qend is inclusive, so coverage stops after it
    count_arr = list(accumulate(delta[:-1]))

    que.put([count_arr, CDS_name, CDS_seq])

def _circular_window(genome, lo, hi):
    """Return ``genome[lo:hi]`` on a circular molecule, wrapping across the origin."""
    if lo < 0:
        # Shift by one full turn so the slice start is non-negative.
        lo += len(genome)
        hi += len(genome)
    if hi > len(genome):
        # The window runs past the end: read it from two concatenated copies.
        return (genome + genome)[lo:hi]
    return genome[lo:hi]


def _cds_label(feature):
    """Return the feature's ``locus_tag``, falling back to its ``protein_id``."""
    try:
        return feature.qualifiers['locus_tag'][0]
    except (KeyError, IndexError):
        return feature.qualifiers['protein_id'][0]


def extra_CDSseq(seq_record, CDSseq = '', uplength = 400, downlength = 400):
    """Append FASTA entries for every CDS of a record, including flanking bases.

    For each feature of type ``'CDS'`` the sequence from ``uplength`` bases
    before the feature to ``downlength`` bases after it (relative to the
    feature's strand) is extracted; features whose strand is not ``1`` are
    reverse-complemented.  When the record's ``topology`` annotation contains
    ``'circular'`` the window wraps across the origin; otherwise it is
    clamped at position 0 on the left and cut by normal slicing on the right.

    Args:
        seq_record: Object exposing ``seq``, ``features`` and ``annotations``.
        CDSseq: Existing FASTA text to which the new entries are appended.
        uplength: Number of bases to include upstream of each CDS.
        downlength: Number of bases to include downstream of each CDS.

    Returns:
        ``CDSseq`` followed by one ``>label\\nsequence\\n`` entry per CDS.

    Raises:
        KeyError: If a CDS has neither a ``locus_tag`` nor a ``protein_id``.
    """
    genome = seq_record.seq
    genome_len = len(genome)
    # A missing topology annotation is treated as linear.
    is_circular = 'circular' in seq_record.annotations.get('topology', '')

    entries = [CDSseq]
    for feature in seq_record.features:
        if feature.type != 'CDS':
            continue
        location = feature.location
        forward = location.strand == 1
        # On the reverse strand "upstream" lies to the right in record coordinates.
        if forward:
            left_pad, right_pad = uplength, downlength
        else:
            left_pad, right_pad = downlength, uplength

        if is_circular:
            spans_origin = (
                (location.end - location.start) == genome_len
                and len(location.parts) >= 2
            )
            if spans_origin:
                # A multi-part location whose extent equals the whole genome:
                # rebuild contiguous coordinates with a negative start.
                if forward:
                    start_index = location.parts[0].start - genome_len
                    end_index = location.parts[-1].end
                else:
                    start_index = location.parts[-1].start - genome_len
                    end_index = location.parts[0].end
            else:
                start_index, end_index = location.start, location.end
            window = _circular_window(
                genome, start_index - left_pad, end_index + right_pad
            )
        else:
            # Clamp at 0: a negative start would be read by slicing as an
            # offset from the end and yield an empty or wrong window.
            window = genome[max(0, location.start - left_pad):(location.end + right_pad)]

        if not forward:
            window = window.reverse_complement()
        entries.append('>' + _cds_label(feature) + '\n' + str(window) + '\n')
    return ''.join(entries)

# Install an 'ignore' filter with no category or module restriction, so
# warnings raised after this point are suppressed.
warnings.filterwarnings('ignore')

In [2]:
# Pipeline for one genome: extract CDS windows, compute per-base RNA-seq
# coverage for each window in parallel, then average the coverage of the
# windows that appear in the 'TA' and 'AG' MEME motifs.
import queue
import shutil

taxonomy, assembly = 'Deinococcus radiodurans', 'GCF_020546685.1'

tax_n = taxonomy.replace(' ','_')
# RNA_blast reads this module-level name to locate the BLAST database.
RNAseq_file = f'{tax_n}_RNAseq.blastdb'

seq_data = SeqIO.parse(f'/home/zhongshitong/data/genomes/Deinococcus-Thermus_genomes/data/{assembly}/genomic.gbff', format="gb")

os.makedirs(f'/home/zhongshitong/data/RNA_seq_result/{taxonomy}', exist_ok=True)
os.chdir(f'/home/zhongshitong/data/RNA_seq_result/{taxonomy}')

CDSseqs = ''.join(extra_CDSseq(seq_record) for seq_record in seq_data)
with open('CDSseqs_test.txt', 'w') as CDSseq_file:
    CDSseq_file.write(CDSseqs)

# Parse the FASTA once; both the ids and the sequences come from this list.
CDSseq_data = list(SeqIO.parse(f'CDSseqs_test.txt', 'fasta'))
CDSseq_ids = [seq.id for seq in CDSseq_data]

with open(f"/home/zhongshitong/data/up-ORF-motif_simple/assembly/{taxonomy}/meme_out/meme.xml") as handle:
    record = motifs.parse(handle, "meme")
data = {'TA_names':[],'TA':[],'AG_names':[],'AG':[]}

# Classify every motif once and index its member sequence names, instead of
# rescanning all motif instances for each CDS.  Motifs whose consensus
# contains neither 'TA' nor 'AG' are skipped rather than inheriting the
# label of the previous motif.
motif_members = []
for motif in record:
    if 'TA' in motif.degenerate_consensus:
        id_n = 'TA'
    elif 'AG' in motif.degenerate_consensus:
        id_n = 'AG'
    else:
        continue
    motif_members.append((id_n, {instance.sequence_name for instance in motif.instances}))

manager = multiprocessing.Manager()
que = manager.Queue()
par = 32
tot = len(CDSseq_ids)
pool = multiprocessing.Pool(par)

jobs = [
    pool.apply_async(RNA_blast, (seq.id, str(seq.seq), taxonomy, que))
    for seq in CDSseq_data
]

pool.close()

detail_parts = []
count = 0
try:
    with tqdm(total = len(CDSseq_ids), desc='Program', leave=True, ncols=100, unit='B', unit_scale=True) as pbar:
        while count < tot:
            try:
                # Block on the queue instead of spinning on que.empty().
                count_arr, CDS_name, CDS_seq = que.get(timeout=5)
            except queue.Empty:
                # A worker that raised never posts a result; re-raise its
                # exception here instead of waiting forever.
                for job in jobs:
                    if job.ready() and not job.successful():
                        job.get()
                continue

            seq_arr = ''.join(base + ' ' for base in CDS_seq[:len(count_arr)])
            cou_arr = ''.join(str(depth) + ' ' for depth in count_arr)
            detail_parts.append(CDS_name+'\n'+seq_arr+'\n'+cou_arr+'\n')

            # Drop the first and last 400 positions (the default flank
            # lengths of extra_CDSseq) before averaging.
            count_arr = count_arr[400:len(count_arr)-400]
            # A window of 800 bases or fewer leaves nothing to average.
            mean_depth = sum(count_arr)/len(count_arr) if count_arr else float('nan')
            pbar.update(1)
            count += 1
            for id_n, members in motif_members:
                if CDS_name in members:
                    data[id_n].append(mean_depth)
                    data[f'{id_n}_names'].append(CDS_name)
except BaseException:
    # Do not leave workers running if the collection loop is aborted.
    pool.terminate()
    raise
finally:
    pool.join()

shutil.rmtree(f'/home/zhongshitong/data/RNA_seq_result/{taxonomy}/temp', ignore_errors=True)

os.chdir(f'/home/zhongshitong/data/RNA_seq_result/{taxonomy}')
re_arr = ''.join(detail_parts)
with open(f'RNAseq_data_detail.txt', 'w') as re_arr_file:
    re_arr_file.write(re_arr)

TA_data = {'TA_names': data['TA_names'], 'TA_data': data['TA'], }
pd.DataFrame(TA_data).to_csv(f'TA-RNAseq_data.csv')
AG_data = {'AG_names': data['AG_names'], 'AG_data': data['AG'], }
pd.DataFrame(AG_data).to_csv(f'AG-RNAseq_data.csv')

Program: 100%|██████████████████████████████████████████████████| 3.15k/3.15k [10:08<00:00, 5.17B/s]


In [3]:
# Pipeline for one genome: extract CDS windows, compute per-base RNA-seq
# coverage for each window in parallel, then average the coverage of the
# windows that appear in the 'TA' and 'AG' MEME motifs.
import queue
import shutil

taxonomy, assembly = 'Thermus thermophilus', 'GCF_000091545.1'

tax_n = taxonomy.replace(' ','_')
# RNA_blast reads this module-level name to locate the BLAST database.
RNAseq_file = f'{tax_n}_RNAseq_clean.blastdb'

seq_data = SeqIO.parse(f'/home/zhongshitong/data/genomes/Deinococcus-Thermus_genomes/data/{assembly}/genomic.gbff', format="gb")

os.makedirs(f'/home/zhongshitong/data/RNA_seq_result/{taxonomy}', exist_ok=True)
os.chdir(f'/home/zhongshitong/data/RNA_seq_result/{taxonomy}')

CDSseqs = ''.join(extra_CDSseq(seq_record) for seq_record in seq_data)
with open('CDSseqs_test.txt', 'w') as CDSseq_file:
    CDSseq_file.write(CDSseqs)

# Parse the FASTA once; both the ids and the sequences come from this list.
CDSseq_data = list(SeqIO.parse(f'CDSseqs_test.txt', 'fasta'))
CDSseq_ids = [seq.id for seq in CDSseq_data]

with open(f"/home/zhongshitong/data/up-ORF-motif_simple/assembly/{taxonomy}/meme_out/meme.xml") as handle:
    record = motifs.parse(handle, "meme")
data = {'TA_names':[],'TA':[],'AG_names':[],'AG':[]}

# Classify every motif once and index its member sequence names, instead of
# rescanning all motif instances for each CDS.  Motifs whose consensus
# contains neither 'TA' nor 'AG' are skipped rather than inheriting the
# label of the previous motif.
motif_members = []
for motif in record:
    if 'TA' in motif.degenerate_consensus:
        id_n = 'TA'
    elif 'AG' in motif.degenerate_consensus:
        id_n = 'AG'
    else:
        continue
    motif_members.append((id_n, {instance.sequence_name for instance in motif.instances}))

manager = multiprocessing.Manager()
que = manager.Queue()
par = 32
tot = len(CDSseq_ids)
pool = multiprocessing.Pool(par)

jobs = [
    pool.apply_async(RNA_blast, (seq.id, str(seq.seq), taxonomy, que))
    for seq in CDSseq_data
]

pool.close()

detail_parts = []
count = 0
try:
    with tqdm(total = len(CDSseq_ids), desc='Program', leave=True, ncols=100, unit='B', unit_scale=True) as pbar:
        while count < tot:
            try:
                # Block on the queue instead of spinning on que.empty().
                count_arr, CDS_name, CDS_seq = que.get(timeout=5)
            except queue.Empty:
                # A worker that raised never posts a result; re-raise its
                # exception here instead of waiting forever.
                for job in jobs:
                    if job.ready() and not job.successful():
                        job.get()
                continue

            seq_arr = ''.join(base + ' ' for base in CDS_seq[:len(count_arr)])
            cou_arr = ''.join(str(depth) + ' ' for depth in count_arr)
            detail_parts.append(CDS_name+'\n'+seq_arr+'\n'+cou_arr+'\n')

            # Drop the first and last 400 positions (the default flank
            # lengths of extra_CDSseq) before averaging.
            count_arr = count_arr[400:len(count_arr)-400]
            # A window of 800 bases or fewer leaves nothing to average.
            mean_depth = sum(count_arr)/len(count_arr) if count_arr else float('nan')
            pbar.update(1)
            count += 1
            for id_n, members in motif_members:
                if CDS_name in members:
                    data[id_n].append(mean_depth)
                    data[f'{id_n}_names'].append(CDS_name)
except BaseException:
    # Do not leave workers running if the collection loop is aborted.
    pool.terminate()
    raise
finally:
    pool.join()

shutil.rmtree(f'/home/zhongshitong/data/RNA_seq_result/{taxonomy}/temp', ignore_errors=True)

os.chdir(f'/home/zhongshitong/data/RNA_seq_result/{taxonomy}')
re_arr = ''.join(detail_parts)
with open(f'RNAseq_data_detail.txt', 'w') as re_arr_file:
    re_arr_file.write(re_arr)

TA_data = {'TA_names': data['TA_names'], 'TA_data': data['TA'], }
pd.DataFrame(TA_data).to_csv(f'TA-RNAseq_data.csv')
AG_data = {'AG_names': data['AG_names'], 'AG_data': data['AG'], }
pd.DataFrame(AG_data).to_csv(f'AG-RNAseq_data.csv')

Program: 100%|██████████████████████████████████████████████████| 2.23k/2.23k [13:21<00:00, 2.78B/s]
