In [1]:
import pathlib
import collections
import re
import os.path
import pylab as plt
import numpy as np
import pandas as pd
import matplotlib
from io import StringIO
from Bio.Seq import Seq
from Bio import SeqIO
import matplotlib.pyplot as plt
from dna_features_viewer import BiopythonTranslator
from Bio.Blast.Applications import NcbiblastpCommandline
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import subprocess

# Tell BLAST+ where the local database lives. `export VAR=value` is shell syntax
# and is a SyntaxError in a Python cell; assigning to os.environ sets the variable
# for this kernel and for any subprocess it launches.
os.environ["BLASTDB"] = "path_to/blastnr"

In [2]:
# ---- Run configuration ----
PRJ = 'project_name'
# Directory prefix; file names are appended by plain string concatenation,
# so the trailing slash is required.
DATA_PATH = '/path-to-data/'
# FASTA file (inside DATA_PATH) holding the reads to search.
sra = 'sra.fasta'
# BLAST database name.
DB = 'nr'
# Standard table.
TABLE = 1
# Minimum protein length.
MIN_PRO_LEN = 20
# Expect value (E) threshold for saving hits.
E_VAL = 0.001
# Thread count handed to the local BLAST command.
NUM_THREADS = 8

In [4]:
# Local BLAST search
def searchx(idx, query_string, database_path, alfile, hspfile):
    """Run a local blastp search and log the first alignment of each record.

    Args:
        idx: Identifier echoed into every output line.
        query_string: Query text fed to the BLAST command on stdin.
        database_path: Value passed as the BLAST ``db`` option.
        alfile: Writable handle receiving one summary line per record with a hit.
        hspfile: Writable handle receiving one detail line per HSP of that hit.

    Returns:
        None. Results are written to ``alfile``/``hspfile`` and echoed to stdout.
    """
    # Keep a single aligned sequence per query, and a single HSP per
    # query-subject pair (leaving max_hsps unset would return every HSP that
    # meets the expect-value criteria).
    keep_targets = 1
    keep_hsps = 1
    blastp_cmd = NcbiblastpCommandline(
        db=database_path,
        evalue=E_VAL,
        outfmt=5,
        max_target_seqs=keep_targets,
        max_hsps=keep_hsps,
        num_threads=NUM_THREADS,
    )
    xml_text, _stderr_text = blastp_cmd(stdin=query_string)

    # Parse the whole XML report up front, before anything is written out.
    parsed_records = list(NCBIXML.parse(StringIO(xml_text)))
    for blast_record in parsed_records:
        if not blast_record.alignments:
            # No hit for this query: nothing to report.
            continue

        alignment = blast_record.alignments[0]
        title = alignment.title
        summary = f"id: {idx}, title: {title}, accession: {alignment.accession}"
        alfile.write(summary + "\n")
        print(summary)
        for hsp in alignment.hsps:
            hspfile.write(f"id: {idx}, title: {title}, accession: {alignment.accession}, hit_id: {alignment.hit_id}, length: {alignment.length}, query_length {blast_record.query_letters}, score: {hsp.score}, expect: {hsp.expect}, align_length: {hsp.align_length}, bits: {hsp.bits}, query: {hsp.query}, sbjct: {hsp.sbjct}, query_start: {hsp.query_start}, query_end: {hsp.query_end}, sbjct_start: {hsp.sbjct_start}, sbjct_end: {hsp.sbjct_end}\n")
    return None

In [5]:
def remote_blastx(sequence_data):
    """Submit ``sequence_data`` to NCBI's remote blastx against ``DB``.

    Returns whatever the result handle's ``read()`` yields (the raw response).
    """
    handle = NCBIWWW.qblast("blastx", DB, sequence_data)
    return handle.read()

In [6]:
def get_reads(f_path, r_file):
    """Load every record of a FASTA file.

    Args:
        f_path: Path prefix; joined to ``r_file`` by plain concatenation.
        r_file: FASTA file name.

    Returns:
        ``(ids, sequences)`` as two parallel lists, or ``(None, None)`` when
        the file does not exist.
    """
    ids, sequences = [], []
    try:
        for rec in SeqIO.parse(f_path + r_file, "fasta"):
            ids.append(rec.id)
            sequences.append(rec.seq)
    except FileNotFoundError:
        return None, None
    return ids, sequences

In [7]:
def nt_to_prot(seq):
    """Translate a nucleotide sequence on both strands.

    Args:
        seq: Object offering ``reverse_complement()`` and ``translate()``.

    Returns:
        ``(forward_translation, reverse_complement_translation)``.
    """
    revcomp_protein = seq.reverse_complement().translate()
    forward_protein = seq.translate()
    return forward_protein, revcomp_protein

In [8]:
def run_remote_blast(sra, start_key="CoV-bait-4008"):
    """BLAST every read of a FASTA file remotely and append the raw results to a file.

    Reads come from ``DATA_PATH + sra``. For each processed read the output
    file ``<DATA_PATH>blast/<sra stem>_fwdk_<DB>_remote.txt`` receives a
    ``<Key>`` line, an ``<Untranslated_Input>`` line, and the text returned by
    ``remote_blastx``. The file is opened in append mode so an interrupted run
    can be resumed.

    Args:
        sra: FASTA file name inside ``DATA_PATH``.
        start_key: Read id at which processing starts; reads before it are
            skipped. If the id never occurs, nothing is processed. Pass
            ``None`` to process every read. The default keeps the resume point
            that was previously hard-coded.

    Returns:
        None. Does nothing further when the FASTA file is missing.
    """
    print(sra)

    blast_path = DATA_PATH + 'blast/'
    pathlib.Path(blast_path).mkdir(exist_ok=True)

    idlist, seqlist = get_reads(DATA_PATH, sra)
    if idlist is None:
        # get_reads reports a missing FASTA file as (None, None).
        return

    # Keyed by read id: a repeated id keeps its first position but the last sequence.
    forfs = collections.OrderedDict()
    for i, seq in zip(idlist, seqlist):
        forfs[i.replace('>', '')] = seq

    fresult_file = f'{blast_path}{sra.split(".fasta")[0]}_fwdk_{DB}_remote.txt'

    # With no start key every read is processed; otherwise wait until it is seen.
    started = start_key is None
    # The context manager closes the file even if a remote query raises.
    with open(fresult_file, "a") as fresultf:
        for key, orfl in forfs.items():
            if key == start_key:
                started = True
            if not started:
                continue
            print(f'key: {key}, orfl: {orfl}')
            blast_results = remote_blastx(orfl)
            fresultf.write(f'<Key>{key}</Key>\n')
            fresultf.write(f'<Untranslated_Input>{orfl}</Untranslated_Input>\n')
            fresultf.write(blast_results)
            # Flush after each read so completed results survive a later failure.
            fresultf.flush()


In [1]:
# Start the remote BLAST run for the FASTA file named by `sra`.
# The cells defining `sra` and `run_remote_blast` must have been executed in
# this kernel first; otherwise this call raises NameError.
run_remote_blast(sra)


NameError: name 'run_remote_blast' is not defined