## Use BLAST2 to check for sequences



In [1]:
import os
import time
import pandas as pd
import numpy as np
import pathlib
from io import StringIO
from Bio import SeqIO
from Bio.Cluster import distancematrix
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import subprocess
from recan.simgen import Simgen
from matplotlib import pyplot as plt
import seaborn as sns
from os import listdir
from os.path import isfile, join

In [2]:
# Allow pandas to print up to 2000 items of a long sequence before eliding the rest.
pd.set_option('display.max_seq_items', 2000)



```
makeblastdb -in SRR5722761_final.contigs.fa -dbtype nucl -parse_seqids -out /mnt/1TB_ssd/Data/BLAST/asm_db/SRR5722761_megahit_default
```


```
export BLASTDB=/mnt/1TB_ssd/Data/BLAST/asm_db
```

In [3]:
# Project identifier; used as a directory name directly under BASE_PATH.
PROJECT_CODE = 'PRJNA396502'

# Root of the assembly data. Paths are built by string concatenation, so the
# trailing slash is required.
BASE_PATH = '/mnt/1TB_0/Data/Assembly/'

In [4]:
# Run identifiers to search; blast_workflow processes them one after another.
SRA_LIST = [
    'SRR5885860',
]

### BSL-3/4 organism, vector, and bat CoV searches

In [5]:
# The two lists are parallel: ORGANISM_CSVS[i] is the table of query sequences
# and CSV_PATHS[i] is the directory (trailing slash required) holding the FASTA
# files named in that table. Index 0 is used for the BSL search, index 1 for
# the bat CoV search.
ORGANISM_CSVS = [
    '/mnt/1TB_0/Data/fasta/organisms_all.csv',
    '/mnt/1TB_0/Data/fasta/cov_common_name.csv',
]
CSV_PATHS = [
    '/mnt/1TB_0/Data/fasta/complete_nucleotide/',
    '/mnt/1TB_0/Data/fasta/common_name_fasta/',
]

In [6]:
def make_db(asm_path, sra_code):
    """Build a nucleotide BLAST database from one run's assembled contigs.

    Runs ``makeblastdb`` on ``{asm_path}{sra_code}_final.contigs.fa`` and writes
    the database as ``{sra_code}_megahit_default`` under
    ``/mnt/1TB_ssd/Data/BLAST/asm_db/``.

    Args:
        asm_path: Directory prefix of the assembly. It is concatenated with
            ``sra_code`` as-is, so it must end with a path separator.
        sra_code: Run identifier used for both the contig file name and the
            database name.

    Raises:
        subprocess.CalledProcessError: If ``makeblastdb`` exits with a
            non-zero status.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact as single, literal arguments.
    cmd = [
        "makeblastdb",
        "-in", f"{asm_path}{sra_code}_final.contigs.fa",
        "-dbtype", "nucl",
        "-parse_seqids",
        "-out", f"/mnt/1TB_ssd/Data/BLAST/asm_db/{sra_code}_megahit_default",
    ]
    subprocess.check_call(cmd)

In [7]:
# Tell BLAST+ where the databases live for every command started from this
# kernel. Running `export` through subprocess only changes a short-lived child
# shell and is lost when that shell exits, so the variable is set on this
# process's environment instead; subprocesses started afterwards inherit it.
os.environ["BLASTDB"] = "/mnt/1TB_ssd/Data/BLAST/asm_db"

0

In [8]:
def check_path(sequences, fasta_path):
    """Report every expected FASTA file that is missing on disk.

    Each name in ``sequences`` is appended to ``fasta_path`` by plain string
    concatenation; one line is printed for every resulting path that is not an
    existing regular file. Nothing is returned.
    """
    # Lazy generator: each file is checked and reported in turn.
    absent = (name for name in sequences if not os.path.isfile(fasta_path + name))
    for name in absent:
        print(f'File non existant: {fasta_path}{name}')

In [9]:
def get_data(df):
    """Pull the query file names and their accessions out of an organism table.

    Args:
        df: Table with ``File_Name`` and ``Accession`` columns.

    Returns:
        Tuple ``(sequences, accessions)`` holding those two columns, in that
        order.
    """
    file_names, accession_ids = (df[column] for column in ('File_Name', 'Accession'))
    # Sanity check: the two columns must line up one-to-one.
    assert len(file_names)==len(accession_ids)
    return file_names, accession_ids

In [10]:
def run_blast2(sequences, accessions, sra_code, fasta_path, out_file_path):
    """Run ``blastn`` for every query FASTA against one run's contig database.

    For each ``(file name, accession)`` pair the query
    ``{fasta_path}{file name}`` is searched against the database
    ``{sra_code}_megahit_default`` and the report is written to
    ``{out_file_path}{sra_code}_{accession}_BLAST2.txt``.

    Args:
        sequences: Query FASTA file names.
        accessions: Accession identifiers, paired element-by-element with
            ``sequences``; used only to name the report files.
        sra_code: Run identifier selecting the database and prefixing reports.
        fasta_path: Directory prefix of the query files (must end with a path
            separator; it is concatenated as-is).
        out_file_path: Directory prefix for the reports (same convention).

    Raises:
        subprocess.CalledProcessError: If any ``blastn`` call exits non-zero.
    """
    # The database name is the same for every query. It is a bare name, so
    # BLAST+ has to locate it itself (presumably through the BLASTDB
    # environment variable this notebook sets up - confirm for your setup).
    db = f'{sra_code}_megahit_default'
    for g, asc in zip(sequences, accessions):
        out_file = f'{sra_code}_{asc}_BLAST2.txt'
        # Argument list instead of a shell string: file names that contain
        # spaces or shell metacharacters are passed through untouched.
        cmd = [
            "blastn",
            "-query", f"{fasta_path}{g}",
            "-db", db,
            "-num_threads", "6",
            "-out", f"{out_file_path}{out_file}",
            "-evalue", "0.001",
            "-max_target_seqs", "100",
            "-perc_identity", "90",
            "-max_hsps", "10",
        ]
        subprocess.check_call(cmd)

In [11]:
def cleanup(sequences, accessions, sra_code, out_file_path):
    """Delete the BLAST reports that contain no hits.

    The report for each accession is ``{sra_code}_{accession}_BLAST2.txt``
    inside ``out_file_path``. Every report is read first; the ones containing
    the ``***** No hits found *****`` marker are then removed. A report that
    has already disappeared by removal time is reported with a printed
    message instead of raising.

    Args:
        sequences: Query file names; only used to pair with ``accessions``.
        accessions: Accession identifiers naming the reports.
        sra_code: Run identifier prefixing each report name.
        out_file_path: Directory prefix of the reports (concatenated as-is).
    """
    no_hit_marker = '***** No hits found *****'

    def is_empty_report(name):
        with open(out_file_path + name) as handle:
            return no_hit_marker in handle.read()

    # Phase 1: scan every report and remember the empty ones.
    report_names = (
        f'{sra_code}_{accession}_BLAST2.txt'
        for _, accession in zip(sequences, accessions)
    )
    doomed = [name for name in report_names if is_empty_report(name)]

    # Phase 2: remove them.
    for name in doomed:
        target = out_file_path + name
        if not os.path.exists(target):
            print(f"{name} does not exist")
            continue
        os.remove(target)

In [12]:
def blast_workflow(sequences, accessions, csv_path, search_name, rebuild_db):
    """BLAST every query against each run in ``SRA_LIST`` and collect the hits.

    For each run the function creates
    ``{BASE_PATH}{PROJECT_CODE}/{sra_code}/megahit_asm_BLAST2/{search_name}/``,
    builds the contig database when requested or when its ``.ndb`` file is
    absent, runs ``run_blast2`` followed by ``cleanup``, and finally maps every
    report file found in that directory back to its query file name.

    Args:
        sequences: Query FASTA file names, paired element-by-element with
            ``accessions``.
        accessions: Accession identifiers of the queries.
        csv_path: Directory prefix holding the query FASTA files.
        search_name: Name of the sub-directory receiving this search's reports.
        rebuild_db: When true, rebuild the database even if it already exists.

    Returns:
        dict mapping each run identifier to the list of query file names whose
        report is present in the output directory.

    Raises:
        ValueError: If a report file in the output directory carries an
            accession that is not present in ``accessions``.
    """
    # Accession -> file-name lookup, built once and by position. setdefault
    # keeps the first occurrence of a duplicated accession, matching what a
    # list ``.index`` search would have returned.
    file_by_accession = {}
    for accession, file_name in zip(accessions, sequences):
        file_by_accession.setdefault(accession, file_name)

    match_dict = {}
    for sra_code in SRA_LIST:
        start_time = time.time()
        print(f'Running blast2 on {sra_code}')
        data_path = f'{BASE_PATH}{PROJECT_CODE}/{sra_code}/'
        asm_path = data_path + 'megahit_default/'
        b_path = data_path + 'megahit_asm_BLAST2/'
        out_file_path = b_path + f'{search_name}/'
        # Two single-level mkdirs (no parents=True): the run directory itself
        # must already exist, otherwise this fails before any BLAST work starts.
        pathlib.Path(b_path).mkdir(exist_ok=True)
        pathlib.Path(out_file_path).mkdir(exist_ok=True)

        db_index = f'/mnt/1TB_ssd/Data/BLAST/asm_db/{sra_code}_megahit_default.ndb'
        if rebuild_db or not os.path.isfile(db_index):
            make_db(asm_path, sra_code)

        run_blast2(sequences, accessions, sra_code, csv_path, out_file_path)
        cleanup(sequences, accessions, sra_code, out_file_path)

        # Whatever report files remain after cleanup are the hits.
        matches = []
        for f in listdir(out_file_path):
            if not isfile(join(out_file_path, f)):
                continue
            # Report names look like "<sra_code>_<accession>_BLAST2.txt".
            asc = f.split('_', 1)[1].split('_BLAST2.txt')[0].strip()
            if asc not in file_by_accession:
                raise ValueError(
                    f'asc: {asc}, file: {f} error: accession is not among the '
                    f'{len(file_by_accession)} known accessions'
                )
            matches.append(file_by_accession[asc])
        match_dict[sra_code] = matches
        print(f'Elapsed: {time.time()-start_time}')
    return match_dict
                

In [13]:
def main_worflow(organism_csv, csv_path, search_name, rebuild_db):
    """Run one complete search described by an organism CSV.

    Reads ``organism_csv``, extracts the query file names and accessions with
    ``get_data`` and hands them to ``blast_workflow``.

    Args:
        organism_csv: Path of the CSV listing the query sequences.
        csv_path: Directory prefix holding the query FASTA files.
        search_name: Name of the report sub-directory for this search.
        rebuild_db: Forwarded to ``blast_workflow``.

    Returns:
        A one-element list containing the dict returned by ``blast_workflow``.
    """
    organisms = pd.read_csv(organism_csv)
    file_names, accession_ids = get_data(organisms)
    return [blast_workflow(file_names, accession_ids, csv_path, search_name, rebuild_db)]

## BSL organism hits

In [14]:
# BSL organism / vector search: first CSV and FASTA directory. The contig
# database is rebuilt before searching.
bsl_result_list = main_worflow(
    organism_csv=ORGANISM_CSVS[0],
    csv_path=CSV_PATHS[0],
    search_name='bsl',
    rebuild_db=True,
)

Running blast2 on SRR5885860
Elapsed: 326.1996157169342


In [15]:
# Show the query files that produced hits in the BSL search, keyed by run.
bsl_result_list

[{'SRR5885860': ['NC_000007_14_CFTR_gene.fa',
   'JN950623_1_Mus_musculus_targeted_non-conditional_lacZ-tagged_mutant_allele_Herc1_tm1e_EUCOMM_Wtsi_transgenic.fa',
   'KJ473820_1_BtPa-BetaCoV_GD2013_complete_genome.fa',
   'MK787297_1_cloning_vector_pR6K-KCA.fa',
   'MK280359_1_Homo_sapiens_lncAB370_3_lncRNA_gene_complete_sequence.fa',
   'MK279923_1_Homo_sapiens_lncAB599_3_lncRNA_gene_complete_sequence.fa',
   'MK280367_1_Homo_sapiens_lncAB371_6_lncRNA_gene_complete_sequence.fa',
   'MN996867_1_Cloning_vector_pcDNA3_1_complete_sequence.fa',
   'NC_006549_1_Singapore_grouper_iridovirus_complete_genome.fa',
   'MN611520_1_Pipistrellus_abramus_bat_coronavirus_HKU5-related_isolate_BY140568_complete_genome.fa',
   'NC_000004_12_ABCG2_gene.fa',
   'NC_000005_10_TERT_gene.fa',
   'EU410304_1_Synthetic_Vaccinia_virus_clone_GLV-1h68_complete_genome.fa',
   'NC_009020_1_HKU5_complete_genome.fa',
   'MF164268_1_Homo_sapiens_clone_BAC_JH12_genomic_sequence.fa']}]

### BSL list of organisms we are searching for

In [16]:
# Raise the row limit so the query table below is rendered without truncation.
pd.set_option('display.max_rows', 2000)
df = pd.read_csv(ORGANISM_CSVS[0])
df

Unnamed: 0,Name,Accession,File_Name,category
0,Bacteriophage_S13_complete_genome,AF274751_1,AF274751_1_Bacteriophage_S13_complete_genome.fa,BSL_organism
1,Human_endogenous_retrovirus_H_HERV-H_env62_pro...,AJ289709_1,AJ289709_1_Human_endogenous_retrovirus_H_HERV-...,BSL_organism
2,Porcine_hemagglutinating_encephalomyelitis_vir...,DQ011855_1,DQ011855_1_Porcine_hemagglutinating_encephalom...,BSL_organism
3,Bat_SARS_coronavirus HKU3-1_complete_genome,DQ022305_2,DQ022305_2_Bat_SARS_coronavirus HKU3-1_complet...,BSL_organism
4,Coliphage_ID45_complete_genome,DQ079883_1,DQ079883_1_Coliphage_ID45_complete_genome.fa,BSL_organism
5,Coliphage_WA11_complete_genome,DQ079895_1,DQ079895_1_Coliphage_WA11_complete_genome.fa,BSL_organism
6,bat_SARS_coronavirus_HKU3-2_complete_genome,DQ084199_1,DQ084199_1_bat_SARS_coronavirus_HKU3-2_complet...,BSL_organism
7,bat_SARS_coronavirus_HKU3-3_complete_genome,DQ084200_1,DQ084200_1_bat_SARS_coronavirus_HKU3-3_complet...,BSL_organism
8,Synthetic_Vaccinia_virus_clone_GLV-1h68_comple...,EU410304_1,EU410304_1_Synthetic_Vaccinia_virus_clone_GLV-...,BSL_organism
9,Cote_d-Ivoire_ebolavirus_complete_genome,FJ217162_1,FJ217162_1_Cote_d-Ivoire_ebolavirus_complete_g...,BSL_organism


## Bat Spikes

In [17]:
# Bat CoV search: second CSV and FASTA directory. An existing contig database
# is reused rather than rebuilt.
bat_result_list = main_worflow(
    organism_csv=ORGANISM_CSVS[1],
    csv_path=CSV_PATHS[1],
    search_name='batcov',
    rebuild_db=False,
)

Running blast2 on SRR5885860
Elapsed: 43.773359298706055


In [18]:
# Show the query files that produced hits in the bat CoV search, keyed by run.
bat_result_list

[{'SRR5885860': []}]

### Bat spikes and miscellaneous other viruses searched for in the second run

In [19]:
# Load and display the table of query sequences used for the second (bat CoV) search.
df_bat=pd.read_csv(ORGANISM_CSVS[1])
df_bat

Unnamed: 0,Name,Accession,File_Name,category
0,AAP13441_1_SARS-CoV_Urbani_apike,AAP13441_1_SARS-CoV_Urbani_apike,AAP13441_1_SARS-CoV_Urbani_apike.fa,misc
1,AB981587_1_Influenza_virus_type_A_A-NWS-33_H1N...,AB981587_1_Influenza_virus_type_A_A-NWS-33_H1N...,AB981587_1_Influenza_virus_type_A_A-NWS-33_H1N...,misc
2,AcCoV-JC34_spike_YP_009380521,AcCoV-JC34_spike_YP_009380521,AcCoV-JC34_spike_YP_009380521.fa,misc
3,BM48-31_BGR_2008,BM48-31_BGR_2008,BM48-31_BGR_2008.fa,misc
4,BtRl-BetaCoV_SC2018,BtRl-BetaCoV_SC2018,BtRl-BetaCoV_SC2018.fa,misc
5,BtRs-BetaCoV_HuB2013,BtRs-BetaCoV_HuB2013,BtRs-BetaCoV_HuB2013.fa,misc
6,civet014_AAU04661,civet014_AAU04661,civet014_AAU04661.fa,misc
7,CoV_ZC45,CoV_ZC45,CoV_ZC45.fa,misc
8,CoV_ZXC21,CoV_ZXC21,CoV_ZXC21.fa,misc
9,CoV_ZXC21_,CoV_ZXC21_,CoV_ZXC21_.fa,misc
