In [1]:
import os
import time
import pandas as pd
import numpy as np
import pathlib
from io import StringIO
from Bio import SeqIO
from Bio.Cluster import distancematrix
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import subprocess
from matplotlib import pyplot as plt

In [2]:
# Widen the classic-notebook container so long output lines are not wrapped.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

Magic-BLAST:

- raw reads to nt
- contigs to nt
- raw reads to gsa_virus
- contigs to gsa_virus

In [3]:
# Project identifier; interpolated into DATA_PATH below.
PRJ='PRJNA605983'
# Root directory of the project; the report loops expect one sub-directory per
# SRA run underneath it (f'{DATA_PATH}{sra}/magic_blast/').
DATA_PATH=f'/mnt/1TB_0/Data/Assembly/{PRJ}/'
# Database label that appears in the SAM file names read by the report loops.
DB='influenza_ph7n9'

In [4]:
def clean_string(s):
    """Return `s` with every comma removed and surrounding whitespace stripped."""
    return s.replace(',', '').strip()

In [5]:
def get_contigs_ascessions(contigs_file):
    """Read a tab-separated file and collect its first and third columns.

    Blank (whitespace-only) lines are skipped. Every other line is split on
    tabs; field 0 is collected as the contig name and field 2 as the
    accession, each passed through `clean_string`.

    Parameters
    ----------
    contigs_file : path of the text file to read.

    Returns
    -------
    (contigs, accessions) : two lists of equal length, in file order.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three tab-separated fields.
    """
    contig_names, hit_accessions = [], []
    with open(contigs_file, 'r') as handle:
        for record in handle:
            if not record.strip():
                continue
            fields = record.split("\t")
            contig_names.append(clean_string(fields[0]))
            hit_accessions.append(clean_string(fields[2]))
    return contig_names, hit_accessions


In [29]:
def get_raw_sam_ascessions(reads_file, machine_id='v300043428'):
    """Collect the third tab-separated field of every read line in a SAM file.

    A line counts as a read line when it begins with `machine_id`, either
    bare or preceded by '@'. All other lines (including blank ones) are
    ignored. The third field (the reference-name column of a SAM record) is
    cleaned with `clean_string` before being collected.

    Parameters
    ----------
    reads_file : path of the SAM text file to read.
    machine_id : prefix that read names start with.

    Returns
    -------
    list of str : one entry per matching line, in file order.

    Raises
    ------
    IndexError
        If a matching line has fewer than three tab-separated fields.
    """
    prefixes = (machine_id, f'@{machine_id}')
    hits = []
    with open(reads_file, 'r') as handle:
        for record in handle:
            if not record.strip():
                continue
            if not record.startswith(prefixes):
                continue
            hits.append(clean_string(record.split("\t")[2]))
    return hits

In [30]:
def get_val_count(accessions):
    """Tally distinct entries of `accessions`, most frequent first.

    Parameters
    ----------
    accessions : sequence accepted by `numpy.unique`.

    Returns
    -------
    values : array of the distinct entries, ordered by descending count.
    counts : array of occurrence counts, aligned with `values`.
    idx    : the permutation applied to the sorted-unique output of
             `numpy.unique` to obtain that ordering.
    """
    uniq, freq = np.unique(accessions, return_counts=True)
    # argsort is ascending; reversing it yields the descending-count order.
    order = np.argsort(freq)[::-1]
    return np.array(uniq)[order], np.array(freq)[order], order

In [31]:
def get_titles(df, values):
    """Return, for each entry of `values`, the first matching `title` in `df`.

    Parameters
    ----------
    df     : DataFrame with `accession` and `title` columns.
    values : iterable of accessions to look up.

    Returns
    -------
    list : one title per entry of `values`, in the same order.

    Raises
    ------
    IndexError
        If an entry of `values` matches no row of `df`.
    """
    return [
        df.loc[df.accession == wanted, 'title'].tolist()[0]
        for wanted in values
    ]

In [32]:
def get_accession_dat(values, dbname='nt'):
    """Look up accessions with `blastdbcmd` and return the head of each record.

    For every accession the command `blastdbcmd -db <dbname> -entry <acc>` is
    run. If that lookup fails (non-zero exit status or no output) and
    `dbname` is not 'nt', the lookup is retried against 'nt'. The value kept
    for an accession is the first output line, truncated at its first comma.

    The previous implementation relied on an IPython `!` shell capture inside
    a try/except. A failing shell command does not raise, so the fallback to
    'nt' never ran and error text could end up in the result. The command is
    now run through `subprocess` with an argument list, so accessions are
    never interpreted by a shell.

    Parameters
    ----------
    values : iterable of accession identifiers.
    dbname : name of the BLAST database to query first.

    Returns
    -------
    list : one entry per element of `values`, in order. An entry is `None`
           when the accession could not be retrieved from any database, so
           the result stays positionally aligned with `values`.

    Raises
    ------
    FileNotFoundError
        If the `blastdbcmd` executable is not on PATH.
    """
    def _lookup(db, entry):
        # Output lines of one blastdbcmd query, or None when the query failed.
        proc = subprocess.run(
            ['blastdbcmd', '-db', db, '-entry', str(entry)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        if proc.returncode != 0:
            return None
        lines = proc.stdout.splitlines()
        return lines if lines else None

    vdats = []
    for v in values:
        vdat = _lookup(dbname, v)
        if vdat is None and dbname != 'nt':
            # Accession absent from the requested database: retry against nt.
            vdat = _lookup('nt', v)
        vdats.append(vdat[0].split(',')[0] if vdat is not None else None)
    return vdats

In [33]:
def get_accession_title(values, df):
    """Map each accession in `values` to its description in `df`.

    The first row of `df` whose `accession` equals the value supplies the
    description.

    Parameters
    ----------
    values : sequence of accessions to look up.
    df     : DataFrame with `accession` and `description` columns.

    Returns
    -------
    list : descriptions aligned with `values`.

    Raises
    ------
    ValueError
        If an accession in `values` does not occur in `df.accession`.
    """
    known = df.accession.tolist()
    described = df.description.tolist()
    titles = [described[known.index(wanted)] for wanted in values]
    assert len(values)==len(titles)
    return titles
        

## Raw Reads

In [34]:
# Accession -> description lookup table consumed by get_accession_title in the
# per-run report loops. Column labels are supplied via names=.
# NOTE(review): confirm the CSV has no header row of its own, otherwise that
# row is read as data.
df_titles=pd.read_csv('/mnt/1TB_0/Data/fasta/combined/influenza_pH7N9_titles.csv', names=['accession','description'])

In [35]:
# Show the lookup table as a visual sanity check of the parsed columns.
df_titles

Unnamed: 0,accession,description
0,NC_026425.1,Influenza A virus (A/Shanghai/02/2013(H7N9)) ...
1,NC_026422.1,Influenza A virus (A/Shanghai/02/2013(H7N9)) ...
2,NC_026427.1,Influenza A virus (A/Shanghai/02/2013(H7N9)) ...
3,NC_026423.1,Influenza A virus (A/Shanghai/02/2013(H7N9)) ...
4,NC_026428.1,Influenza A virus (A/Shanghai/02/2013(H7N9)) ...
5,NC_026426.1,Influenza A virus (A/Shanghai/02/2013(H7N9)) ...
6,NC_004910.1,Influenza A virus pb2 gene for polymerase Pb2...
7,NC_004909.1,Influenza A virus na gene for neuraminidase ...
8,NC_002728.1,Nipah virus complete genome
9,NC_012532.1,Zika virus complete genome


In [36]:
#sra_list= ['SRR11092059','SRR11092060']

In [37]:
# SRA run identifiers to report on; each is expected to have its own
# sub-directory under DATA_PATH.
sra_list= ['SRR11092059','SRR11092060','SRR11092061','SRR11092062','SRR11092063','SRR11092064','SRR11092056','SRR11092057','SRR11092058']

In [38]:
# For every run: read its SAM file, tally hits per reference accession and
# print "description, accession, count" lines from most to least frequent.
for sra in sra_list:
    out_path=f'{DATA_PATH}{sra}/magic_blast/'
    sam_out=f'{out_path}{sra}_magicBLAST_{DB}.sam'
    # Read lines are selected by read-name prefix; the default prefix is
    # replaced for the four runs listed below.
    machine_id='v300043428'
    if sra in ['SRR11092056','SRR11092057','SRR11092058', 'SRR11092064']:
        machine_id='M04943'
    accessions=get_raw_sam_ascessions(sam_out, machine_id)
    # idx (the sort permutation) is not used in this loop.
    values, counts, idx = get_val_count(accessions)
    # Raises ValueError if an accession is missing from df_titles.
    titles=get_accession_title(values, df_titles)
    print(f'\n{sra}')
    for t, v, c in zip(titles, values, counts):
        print(f'{t}, {v}, {c}')


SRR11092059
 Influenza A virus (A/Shanghai/02/2013(H7N9)) segment 4 hemagglutinin (HA) gene complete cds, NC_026425.1, 16359
 Human herpesvirus 5 strain Merlin complete genome, NC_006273.2, 2582
 Nipah virus complete genome, NC_002728.1, 118
 Influenza A virus (A/Shanghai/02/2013(H7N9)) segment 1 polymerase PB2 (PB2) gene complete cds, NC_026422.1, 96
 Moloney murine leukemia virus complete genome, NC_001501.1, 70
 Simian virus 40 complete genome, NC_001669.1, 62
 Influenza A virus (A/Shanghai/02/2013(H7N9)) segment 7 matrix protein 2 (M2) and matrix protein 1 (M1) genes complete cds, NC_026427.1, 25
 Japanese encephalitis virus isolate JEV1805M complete genome, MN639770.1, 14
 Influenza A virus (A/Shanghai/02/2013(H7N9)) segment 2 polymerase PB1 (PB1) and PB1-F2 protein (PB1-F2) genes complete cds, NC_026423.1, 10
 Woodchuck hepatitis virus (WHV) complete genome  clone WHV 59, M19183.1, 6
 Influenza A virus (A/Shanghai/02/2013(H7N9)) segment 8 nuclear export protein (NEP) and nonstru

### Lane02

In [41]:
# Runs for the lane-02 report below. This rebinds the notebook-global
# sra_list used by the earlier report cell.
sra_list= ['SRR11092062','SRR11092063']

In [42]:
# Same per-run report as the raw-reads section, but reading the lane-02 SAM
# files (note the "_L02_" infix in the file name).
for sra in sra_list:
    out_path=f'{DATA_PATH}{sra}/magic_blast/'
    sam_out=f'{out_path}{sra}_L02_magicBLAST_{DB}.sam'
    # Read lines are selected by read-name prefix; the default prefix is
    # replaced only for the four runs listed below.
    machine_id='v300043428'
    if sra in ['SRR11092056','SRR11092057','SRR11092058', 'SRR11092064']:
        machine_id='M04943'
    accessions=get_raw_sam_ascessions(sam_out, machine_id)
    # idx (the sort permutation) is not used in this loop.
    values, counts, idx = get_val_count(accessions)
    # Raises ValueError if an accession is missing from df_titles.
    titles=get_accession_title(values, df_titles)
    print(f'\n{sra}')
    for t, v, c in zip(titles, values, counts):
        print(f'{t}, {v}, {c}')


SRR11092062
 Moloney murine leukemia virus complete genome, NC_001501.1, 6
 Rous sarcoma virus complete genome, NC_001407.1, 2

SRR11092063
