In [53]:
import re
import os
import pandas as pd

def load_fasta(fasta_path):
    """Load a FASTA file into a dict without using Biopython.

    Input:
    fasta_path: str, path to fasta file

    Returns:
    dict mapping each header (the text after '>' on a header line, stripped)
    to its sequence with line breaks and spaces removed. If a header occurs
    more than once, the last record wins.
    """
    record_dict = dict()
    header = None
    seq_parts = []
    # Parse line by line: only a '>' at the start of a line opens a record,
    # so a '>' inside a header (e.g. "A->B") no longer splits it in two.
    with open(fasta_path, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if line.startswith('>'):
                if header is not None:
                    record_dict[header] = ''.join(seq_parts)
                header = line[1:].strip()
                seq_parts = []
            elif header is not None and line:
                # Text before the first header is ignored rather than
                # being turned into a bogus record.
                seq_parts.append(line.replace(' ', ''))
    if header is not None:
        record_dict[header] = ''.join(seq_parts)
    return record_dict

def check_seq_for_motif(seq, motif='AT', min_repeats=3):
    """
    Check seq for tandem repeats of a motif and print what was found.

    Input:
    seq: str, sequence
    motif: str, motif whose consecutive repeats are searched for (default 'AT')
    min_repeats: int, minimum number of consecutive copies that count as a
        repeat (default 3)

    Returns:
    int, number of non-overlapping repeat runs found in seq

    The search is case-sensitive, so seq and motif must use the same case.
    """
    # re.escape keeps motifs containing regex metacharacters literal; the
    # non-capturing group makes each match a whole repeat run.
    pattern = re.compile(f'(?:{re.escape(motif)}){{{min_repeats},}}')
    # One pass over the sequence yields both the count and the first span.
    matches = list(pattern.finditer(seq))
    at_count = len(matches)
    if matches:
        start, end = matches[0].span()
        print(f'{motif} repeat discovered {at_count} time(s)\n')
        print(f'{motif} repeat first found at positions {start} - {end}')

    else:
        print(f'Unable to find {motif} repeat\n\n')

    return at_count
        
        
def check_all_seqs(fasta_path):
    """
    Report AT repeats for every record of a fasta file.

    Prints each record's header, the per-record result printed by
    check_seq_for_motif, and a final total across all records.

    Input:
    fasta_path: str, path to fasta file

    Raises:
    FileNotFoundError (a subclass of OSError) if fasta_path does not exist.
    """
    if not os.path.exists(fasta_path):
        # Carry the explanation in the exception itself instead of printing
        # it and raising an empty OSError.
        raise FileNotFoundError(f'Fasta path {fasta_path} does not exist.')
    record_dict = load_fasta(fasta_path)
    total_count = 0
    for header, seq in record_dict.items():
        print(f'Fasta header: {header}\n')
        total_count += check_seq_for_motif(seq)
    N_records = len(record_dict)
    print(f'AT repeat discovered {total_count} time(s) across {N_records} records.\n\n\n')

    
def parse_embl(embl_path, write=True):
    """
    Parse an EMBL flat file into a DataFrame with one row per record.

    Input:
    embl_path: str, path to a file of EMBL records, each ended by a '//' line
    write: bool, when True the table is also written to 'embl_info.csv'

    Returns:
    pandas DataFrame with columns 'Seq_ID' (text of the ID line),
    'descrption' (text of the DE line(s), joined with spaces) and 'sequence'
    (upper-case sequence with spaces and position numbers removed).
    """
    with open(embl_path, 'r') as handle:
        embl = handle.read()
    # Split only on terminator lines consisting of '//', so a '//' inside a
    # record (e.g. in a URL) does not cut the record in two.
    embl_records = re.split(r'^//[ \t]*$', embl, flags=re.MULTILINE)
    embl_dict = dict()
    for record in embl_records:
        if not record.strip():
            # Whitespace left over after the final terminator.
            continue
        embl_lines = record.split('\n')
        seq_id = None
        description_parts = []
        seq_start = None
        for ind, line in enumerate(embl_lines):
            if line[:2] == 'ID':
                seq_id = line[2:].strip()
            elif line[:2] == 'DE':
                # A description may span several DE lines; keep all of them.
                description_parts.append(line[2:].strip())
            elif line[:2] == 'SQ':
                seq_start = ind + 1
                break
        description = ' '.join(description_parts) if description_parts else None
        if seq_start is None:
            # No SQ line: report an empty sequence instead of treating the
            # whole record as sequence data.
            sequence = ''
        else:
            sequence = ''.join(embl_lines[seq_start:]).replace(' ', '')
            sequence = re.sub('[0-9]+', '', sequence).upper()
        # NOTE(review): 'descrption' is misspelled, but it is the column name
        # of the returned frame and of the written CSV, so it is kept as is.
        embl_dict[seq_id] = {'descrption': description, 'sequence': sequence}
    embl_df = pd.DataFrame(embl_dict).T.reset_index()
    embl_df = embl_df.rename(columns={'index': 'Seq_ID'})
    if write is True:
        embl_df.to_csv('embl_info.csv')
    return embl_df
        

# Print the AT-repeat report for every record in the FASTA file.
check_all_seqs('Test.txt')
# Parse the EMBL records; with the default write=True this also writes
# 'embl_info.csv' to the working directory.
embl_df = parse_embl('EMBL_records.txt')

# Last expression of the cell: display the parsed table.
embl_df


Fasta header: TestSeq

AT repeat discovered 1 time(s)

AT repeat first found at positions 89 - 105
AT repeat discovered 1 time(s) across 1 records.





Unnamed: 0,Seq_ID,descrption,sequence
0,M91373; SV 1; linear; mRNA; STD; PLN; 1131 BP.,"Cucumis sativus peroxidase mRNA, complete cds.",ACCAGAGAAGACCCCATTTGCAGTATCAAAAATGGGTTTACCTAAA...
1,M57705; SV 1; linear; mRNA; STD; ROD; 237 BP.,"Rat truncated thyroid peroxidase mRNA, 3' end.",CATCGATCATGACATTGCTCTCACACCACAGAGCACCAGCACAGCA...


# Using a custom wrapper through the `sh` library
### Only available on Unix-like systems

In [None]:

from sh import blastn
from Bio.Blast.Applications import NcbiblastnCommandline as bioblastn
from Bio import SearchIO
import pandas as pd


def blastn_wrapper(query, out, db='protein', task='megablast', perc_identity=92,
                   qcov_hsp_perc=100, max_target_seqs=15, threads=3, buffer=False,
                   word_size=15):
    """
    Run a remote blastn search through the `sh` command wrapper.

    Used for specificity analysis of final universal regions.

    Input:
    query: str, path to fasta file of universal regions
    out: str, path to output blast tab file
    db: str, database name passed to -db; a falsy value falls back to 'protein'
    task: str, value passed to -task
    perc_identity: int, percentage identical to query entries
    qcov_hsp_perc: int, percentage of query entry covered by hits
    max_target_seqs: int, number of hits to show in tab file
    threads: accepted but not used by this function
    buffer: accepted but not used by this function
    word_size: int, value passed to -word_size

    Returns None; the hits are written to `out` in tabular format (-outfmt 6).
    """
    # NOTE(review): blastn searches nucleotide databases; confirm that
    # 'protein' is a usable fallback database name.
    database = db or 'protein'

    # Plain "6" selects the default tabular columns; no custom column list
    # is requested.
    tabular_format = "6"

    # Gap costs (5/2) and the e-value cutoff (10) are fixed here.
    cli_args = [
        '-db', database,
        '-perc_identity', perc_identity,
        '-task', task,
        '-remote',
        '-qcov_hsp_perc', qcov_hsp_perc,
        '-dust', 'no',
        '-max_target_seqs', max_target_seqs,
        '-out', out,
        '-query', query,
        '-outfmt', tabular_format,
        '-word_size', word_size,
        '-gapopen', 5,
        '-gapextend', 2,
        '-evalue', 10,
    ]
    blastn(*cli_args)

# Query FASTA file and the output file for the sh-based wrapper's results.
fasta_path = 'mitochondrial.fasta'
blast_output = 'mitochondrial.tab'
# Search against 'nt' instead of the wrapper's default database; the wrapper
# always passes -remote to blastn.
blastn_wrapper(fasta_path, blast_output, db = 'nt')



# Using Biopython's BLAST wrapper

In [None]:
# Output file for the Biopython-driven run. No outfmt is requested here, so
# blastn's default report format is used.
blast_txt = 'mitochondrial.txt'
# Build the blastn command line; `fasta_path` comes from the earlier cell and
# `bioblastn` is Biopython's NcbiblastnCommandline.
blast_cline = bioblastn(query=fasta_path, out=blast_txt, task='megablast', remote=True, db='nt',)
# Calling the command-line object runs it and yields its stdout and stderr.
stdout, stderr = blast_cline()


# Parsing using Biopython

In [40]:
# 'mitochondrial.txt' is the plain-text BLAST report, hence the 'blast-text'
# format. The `comments` option belongs to the 'blast-tab' parser only, so it
# is not passed here.
bltab = SearchIO.parse('mitochondrial.txt', 'blast-text')
# Each item is the result for one query, not a line of text.
for query_result in bltab:
    print(query_result)

# Parsing using Pandas

In [None]:
# Column names for BLAST's default tabular output (-outfmt 6), which is what
# blastn_wrapper requests: 12 columns, with no query-coverage column.
headers = ['query', 'subject', 'pc_identity', 'aln_length', 'mismatches', 'gaps_opened',
           'query_start', 'query_end', 'subject_start', 'subject_end', 'e_value', 'bitscore']
# The file has no header row, so the names are supplied through `names=`
# (`header=` only accepts row numbers). comment='#' skips the comment lines
# of -outfmt 7 files without dropping real hits from -outfmt 6 files.
pd.read_csv('mitochondrial.tab', sep='\t', comment='#', header=None, names=headers)