# Generating a data collection

In this notebook, the main focus is collecting and wrangling all the data needed for our series of processes. The data are gathered from two main databases: [DBatVir](http://www.mgc.ac.cn/DBatVir/) and [NCBI Genome](https://www.ncbi.nlm.nih.gov/genome/viruses/).

- DBatVir -> database that contains the known viral families associated with each bat
- NCBI -> used for obtaining genome and gene information

## Step 1: Collect
The first step is to gather all the data from the databases and save it to the `./db` folder. The `./db` folder serves as our local database whenever we need to access the data, which avoids repeatedly sending requests to the remote databases.


## Step 2: Filter

## Step 3: Profile

In [40]:
import re
import sys
import time
from contextlib import redirect_stdout
from io import StringIO
from pathlib import Path
from typing import Union

import entrezpy.conduit
import numpy as np
import pandas as pd

# vseek imports
sys.path.append("../")
import vseek.common.vseek_paths as vsp
from vseek.apis.dbat_vir_db import collect_dbatvir_data
from vseek.apis.ncbi import get_all_viral_accessions



In [2]:
# NOTE: the data has already been downloaded.
# Collect the bat-associated virus records from DBatVir.
batvir_df = collect_dbatvir_data()
batvir_df

Unnamed: 0,Viruses,Viral family,From Bat,Bat diet type,Bat family,Sample type,Collection year,Sampling country,Sequence coding,References
0,Bat adeno-associated virus 07YN,Parvoviridae,unclassified Chiroptera,,,Feces,2007,China,Cap,Unpublished
1,Bat adeno-associated virus 09YN,Parvoviridae,unclassified Chiroptera,,,Feces,2009,China,Cap,Unpublished
2,Bat adeno-associated virus 1003-HB-Mr,Parvoviridae,Myotis ricketti,Insectivore and piscivore,Vespertilionidae,Feces,2007,China,Cap,"J Gen Virol 2010, 91(Pt 10):2601-9"
3,Bat adeno-associated virus 1008-HB-Mr,Parvoviridae,Myotis ricketti,Insectivore and piscivore,Vespertilionidae,Feces,2007,China,Cap,"J Gen Virol 2010, 91(Pt 10):2601-9"
4,Bat adeno-associated virus 1019-HB-Rs,Parvoviridae,Rhinolophus sinicus,Insectivore,Rhinolophidae,Feces,2007,China,Cap,"J Gen Virol 2010, 91(Pt 10):2601-9"
...,...,...,...,...,...,...,...,...,...,...
13957,Rhinolophus pusillus coronavirus ZSR42,Coronaviridae,Rhinolophus pusillus,Insectivore,Rhinolophidae,Mix,2015,China,RdRp,"Sci Rep 2017, 7(1):10917"
13958,Rhinolophus pusillus norovirus ZSR43,Caliciviridae,Rhinolophus pusillus,Insectivore,Rhinolophidae,Mix,2015,China,RdRp,"Sci Rep 2017, 7(1):10917"
13959,Rhinolophus pusillus norovirus ZSR45,Caliciviridae,Rhinolophus pusillus,Insectivore,Rhinolophidae,Mix,2015,China,RdRp,"Sci Rep 2017, 7(1):10917"
13960,Rhinolophus pusillus coronavirus ZSR6,Coronaviridae,Rhinolophus pusillus,Insectivore,Rhinolophidae,Mix,2015,China,RdRp,"Sci Rep 2017, 7(1):10917"


In [3]:
# NOTE: the data has already been downloaded.
# Download the NCBI viral accession list.
ncbi_acc_df = get_all_viral_accessions()
ncbi_acc_df

Unnamed: 0,Representative,Neighbor,Host,Selected lineage,Taxonomy name,Segment name
0,NC_003663,HQ420896,"human,vertebrates","Poxviridae,Orthopoxvirus,Cowpox virus",Cowpox virus,segment
1,NC_003663,KY463519,"human,vertebrates","Poxviridae,Orthopoxvirus,Cowpox virus",Cowpox virus,segment
2,NC_003663,HQ420897,"human,vertebrates","Poxviridae,Orthopoxvirus,Cowpox virus",Cowpox virus,segment
3,NC_003663,MK035759,"human,vertebrates","Poxviridae,Orthopoxvirus,Cowpox virus",Cowpox virus,segment
4,NC_003663,KY569019,"human,vertebrates","Poxviridae,Orthopoxvirus,Cowpox virus",Cowpox virus,segment
...,...,...,...,...,...,...
259836,NC_062761,MZ334528,,Halorubrum virus HRTV-28,Halorubrum virus HRTV-28,segment
259837,NC_062762,MZ334526,,Halorubrum virus HRTV-29,Halorubrum virus HRTV-29,segment
259838,NC_062763,MZ334501,,"Myoviridae,Haloferacalesvirus,Halorubrum virus...",Halorubrum virus HSTV-4,segment
259839,NC_060136,OK040786,,"Siphoviridae,,Gordonia phage Kudefre",Gordonia phage Kudefre,segment


In [4]:
# Formatting functions
def split_lineages(lineage: str) -> pd.Series:
    """Splits a comma-separated lineage string into its family and genus.

    Parameters
    ----------
    lineage : str
        lineage in the form "family,genus,..."; only the first two fields
        are used.

    Returns
    -------
    pd.Series
        two-element series ``[family, genus]``. A field that is empty or
        absent is returned as ``np.nan``; a non-string lineage (e.g. NaN)
        yields ``[np.nan, np.nan]``.
    """
    # missing lineages (NaN / None) have no family or genus
    if not isinstance(lineage, str):
        return pd.Series([np.nan, np.nan])

    fields = [field.strip() for field in lineage.split(",")]

    # an empty string is falsy, so `or` maps blank fields to NaN
    family = fields[0] or np.nan
    # lineages without a comma only carry a single field: no genus available
    genus = (fields[1] if len(fields) > 1 else "") or np.nan

    return pd.Series([family, genus])


def clean_accession(accession: str) -> str:
    """Returns the first accession from a comma-separated accession string.

    Parameters
    ----------
    accession : str
        one or more accession numbers separated by commas

    Returns
    -------
    str
        the first accession number with surrounding whitespace removed.
        ``np.nan`` is returned when the input is not a string (NaN, None)
        or when the first entry is empty.
    """
    # any non-string value (np.nan, float("nan"), None, pd.NA) is treated as missing
    if not isinstance(accession, str):
        return np.nan

    # str.split always yields at least one element, so indexing [0] is safe
    first_accession = accession.split(",")[0].strip()
    return first_accession if first_accession else np.nan

In [5]:
# viral families that have been reported in bats
viral_fam = batvir_df["Viral family"].dropna().unique().tolist()

# keep the NCBI entries whose lineage mentions one of the bat viral families
sel_dfs = []
for v_fam in viral_fam:
    # literal match (family names are not regular expressions); entries
    # without a lineage never match instead of producing a NaN mask
    fam_mask = ncbi_acc_df["Selected lineage"].str.contains(v_fam, regex=False, na=False)
    sel_df = ncbi_acc_df.loc[fam_mask]
    if len(sel_df) == 0:
        continue
    sel_dfs.append(sel_df)

# selecting viral genomes found in bats that are also found in human
# (isin() is False for missing hosts, so no separate notnull() filter is needed)
sel_dfs = pd.concat(sel_dfs, axis=0)
sel_dfs = sel_dfs.loc[sel_dfs["Host"].isin(["human"])].copy()

# splitting the selected lineage into family and genus
sel_dfs[["family", "genus"]] = sel_dfs["Selected lineage"].apply(split_lineages)
sel_dfs["Representative"] = sel_dfs["Representative"].apply(clean_accession)
sel_dfs = sel_dfs.drop(["Selected lineage"], axis="columns")

# keeping only entries that have both a family and a genus; the result must be
# assigned back, otherwise the filter has no effect
sel_dfs = sel_dfs.loc[sel_dfs["family"].notnull() & sel_dfs["genus"].notnull()]

# one row per representative accession, ordered by accession
# (same result as grouping by "Representative" and keeping each group's first row)
cleaned_dfs = (
    sel_dfs.drop("Neighbor", axis="columns")
    .dropna(subset=["Representative"])
    .drop_duplicates(subset=["Representative"])
    .sort_values("Representative")
)

# generating bat-human viral genome csv
init_db = vsp.init_db_path()
save_path = str((Path(init_db) / "filtered_bat_virus.csv.gz").absolute())
cleaned_dfs.to_csv(save_path, compression="gzip")
cleaned_dfs

Unnamed: 0,Representative,Host,Taxonomy name,Segment name,family,genus
810,NC_001348,human,Human alphaherpesvirus 3,segment,Herpesviridae,Varicellovirus
214039,NC_001352,human,Human papillomavirus type 2,segment,Papillomaviridae,Alphapapillomavirus
208580,NC_001354,human,Human papillomavirus type 41,segment,Papillomaviridae,Nupapillomavirus
208577,NC_001356,human,Human papillomavirus type 1a,segment,Papillomaviridae,Mupapillomavirus
194084,NC_001430,human,Enterovirus D68,segment,Picornaviridae,Enterovirus
...,...,...,...,...,...,...
257436,NC_055340,human,Echarate virus,segment,Phenuiviridae,Phlebovirus
257437,NC_055341,human,Echarate virus,segment,Phenuiviridae,Phlebovirus
257464,NC_055342,human,Maldonado virus,segment,Phenuiviridae,Phlebovirus
257465,NC_055343,human,Maldonado virus,segment,Phenuiviridae,Phlebovirus


In [1]:
# processing
def _call_entrez_viral_genes(email: str, accession: str, buffer=0.5) -> str:
    """Submits request to NCBI's genes database via entrez portal

    NOTE: not implemented yet. The body is a placeholder, so calling this
    function currently returns ``None`` and none of the parameters are used.

    Parameters
    ----------
    email : str
        valid email address
    accession : str
        genome accession number
    buffer : int, float
        Buffer time added when submitting a request in seconds
        Default = 0.5

    Returns
    -------
    str
        TODO: add return type information
    """

    # planned step 1: call genes data 

    
    # planned step 2: internal parser that returns a tuple of ranges 
    pass
    

def _call_entrez_viral_genome(email: str, accession: str, buffer=0.5) -> str:
    """Submits request to NCBI viral genome database via entrez portal.

    A search on the ``nucleotide`` database is chained to a FASTA fetch.
    The fetched text is written to stdout by the pipeline, so stdout is
    captured for the duration of the request and returned as a string.

    Parameters
    ----------
    email : str
        valid email address
    accession : str
        genome accession number
    buffer : int, float
        Buffer time added when submitting a request in seconds
        Default = 0.5

    Returns
    -------
    str
        Genome fasta (everything written to stdout while the request ran)
    """
    # wait before submitting so that consecutive calls are spaced out
    if buffer and buffer > 0:
        time.sleep(buffer)

    # container storing everything printed to stdout during the request
    fasta_result = StringIO()

    # redirect_stdout restores the original stdout even if the request raises
    with redirect_stdout(fasta_result):
        conduit = entrezpy.conduit.Conduit(email)
        pipeline = conduit.new_pipeline()
        # 'term' must be the accession string itself, not a set wrapping it
        sid = pipeline.add_search(
            {"db": "nucleotide", "term": accession, "rettype": "count", "datetype": "pdat"}
        )
        pipeline.add_fetch(
            {"retmax": 10, "retmode": "text", "rettype": "fasta"}, dependency=sid
        )
        conduit.run(pipeline)

    return fasta_result.getvalue()

def generate_viral_genome_profile(genome_db: str) -> dict:
    """Generates a json file that profiles the viral genomes

    NOTE: not implemented yet. The body is a placeholder, so calling this
    function currently returns ``None`` and writes nothing.

    Parameters
    ----------
    genome_db : str
        path to genome database

    Returns
    -------
    dict
        viral genome profiles. Also written in JSON format in the genome database
    """
    # planned: obtain gene positions per accession, e.g.
    # gene_positions = _call_entrez_viral_genes(email=email, accession=acc_id)
    pass


def get_viral_genomes(email: str, accessions: Union[str, list]) -> dict:
    """Downloads viral genomes and writes each one as a FASTA file.

    Every genome is saved as ``<accession>.fasta`` inside the directory
    returned by ``vsp.init_genome_db_path()``.

    Parameters
    ----------
    email : str
        valid email address required to send requests to the NCBI
        database using entrez.
    accessions : Union[str, list]
        string or list of accession numbers. A string is split on
        whitespace, so it may hold one or several accession numbers.

    Return
    ------
    dict
        viral genome profile.
        NOTE: profile generation is not implemented yet, so ``None`` is
        currently returned.
    """
    # a single string may carry several whitespace-separated accessions
    if isinstance(accessions, str):
        accessions = accessions.split()

    genome_db = Path(vsp.init_genome_db_path())

    for acc_id in accessions:
        viral_genome = _call_entrez_viral_genome(email=email, accession=acc_id)

        # writing out fasta file: the extension is part of the file name,
        # not a separate path component
        save_path = genome_db / f"{acc_id}.fasta"
        with open(save_path, "w") as outfile:
            outfile.write(viral_genome)

    # TODO: generate and return the viral genome profile
    return None

        

        



SyntaxError: invalid syntax (3801659977.py, line 1)

In [8]:
# Write the downloaded FASTA text to a file in the current working directory.
# NOTE(review): `result_string` is not assigned in any cell visible in this
# notebook, so this cell presumably relies on leftover kernel state - confirm
# where it comes from before re-running on a fresh kernel.
with open("NC_001348_fasta.fasta", "w") as outfile:
    outfile.write(result_string)


In [None]:
# generate viral genome profile
# creating a json file that contains the necessary information in order to conduct the analysis

In [6]:
# Exploratory query: search the NCBI 'gene' database for accession NC_003045
# and fetch up to 10 of the matching records as text. The records are printed
# to stdout, and the value returned by `c.run` is shown as the cell output.
# NOTE(review): the email address is hard-coded here; consider reading it from
# configuration or an environment variable before sharing the notebook.
c = entrezpy.conduit.Conduit("erikishere3@gmail.com")
fetch_influenza = c.new_pipeline()
# the search step only counts matches; the fetch step depends on it via `sid`
sid = fetch_influenza.add_search({f'db' : 'gene', 'term' : 'NC_003045', 'rettype':'count', 'datetype' : 'pdat'})
fid = fetch_influenza.add_fetch({'retmax' : 10, 'retmode' : 'text', 'rettype': 'fasta'}, dependency=sid)
c.run(fetch_influenza)


1. S
spike structural protein [Bovine coronavirus]
Other Aliases: BCoVgp04
Other Designations: spike structural protein; surface protein
Annotation:  NC_003045.1 (23641..27732)
ID: 921689

2. HE
hemmaglutinin-esterase [Bovine coronavirus]
Other Aliases: BCoVgp03
Other Designations: hemmaglutinin-esterase
Annotation:  NC_003045.1 (22352..23626)
ID: 921684

3. ORF1ab
ORF1a polyprotein;ORF1ab polyprotein [Bovine coronavirus]
Other Aliases: BCoVgp01
Other Designations: ORF1a polyprotein;ORF1ab polyprotein; polyprotein pp1a; polyprotein pp1ab
Annotation:  NC_003045.1 (211..21494)
ID: 921688

4. M
matrix protein [Bovine coronavirus]
Other Aliases: BCoVgp09
Other Designations: matrix protein
Annotation:  NC_003045.1 (28691..29383)
ID: 921686

5. E
small membrane protein [Bovine coronavirus]
Other Aliases: BCoVgp08
Other Designations: small membrane protein
Annotation:  NC_003045.1 (28422..28676)
ID: 921685

6. BCoVgp06
4.8 kDa non-structural protein [Bovine coronavirus]
Other Aliases: BCoVgp

<entrezpy.efetch.efetch_analyzer.EfetchAnalyzer at 0x168396c10>

In [25]:
# creating a reference of the default stdout so it can always be restored
old_stdout = sys.stdout

# creating a container storing stdout
fasta_result = StringIO()
sys.stdout = fasta_result

try:
    # search the 'gene' database for NC_003045 and fetch the matching records;
    # the fetched text is printed, and therefore lands in `fasta_result`
    c = entrezpy.conduit.Conduit("erikishere3@gmail.com")
    fetch_influenza = c.new_pipeline()
    sid = fetch_influenza.add_search({'db' : 'gene', 'term' : 'NC_003045', 'rettype':'count', 'datetype' : 'pdat'})
    fid = fetch_influenza.add_fetch({'retmax' : 10, 'retmode' : 'text', 'rettype': 'fasta'}, dependency=sid)
    c.run(fetch_influenza)
finally:
    # redirect stdout back to the screen even if the request fails; otherwise
    # every later cell in the notebook would print into the buffer
    sys.stdout = old_stdout

# store the captured stdout text into a variable
gene_metadata = fasta_result.getvalue()

In [32]:
# gene parser: break the captured text into lines and discard the empty ones
split_data = gene_metadata.splitlines()
cleaned_metadata = [line_data for line_data in split_data if len(line_data) != 0]

In [39]:
# records are grouped in blocks of `chunk_size` non-empty lines
chunk_size = 6
chunk_data = [cleaned_metadata[i:i+chunk_size] for i in range(0, len(cleaned_metadata), chunk_size)]

# Annotation lines are located by their "Annotation:" prefix instead of a
# fixed position inside each block, so a record with fewer or more lines than
# `chunk_size` neither raises an IndexError nor reads the wrong line.
for line_data in cleaned_metadata:
    if not line_data.startswith("Annotation:"):
        continue

    # e.g. "Annotation:  NC_003045.1 (23641..27732)" -> ('23641', '27732')
    range_match = re.search(r"\((\d+)\.\.(\d+)", line_data)
    if range_match is None:
        # annotation without a "start..end" range: nothing to report
        continue

    annotation_range = range_match.groups()
    print(annotation_range)





('23641', '27732')
('22352', '23626')
('211', '21494')
('28691', '29383')
('28422', '28676')
('27889', '28026')
('27722', '27811')
('28106', '28435')
('21504', '22340')
('29393', '30739')


In [38]:
# NOTE(review): `data` is not assigned in any cell visible in this notebook;
# this cell presumably relies on leftover kernel state - confirm before re-running.
data

[['1. S',
  'spike structural protein [Bovine coronavirus]',
  'Other Aliases: BCoVgp04',
  'Other Designations: spike structural protein; surface protein',
  'Annotation:  NC_003045.1 (23641..27732)',
  'ID: 921689'],
 ['2. HE',
  'hemmaglutinin-esterase [Bovine coronavirus]',
  'Other Aliases: BCoVgp03',
  'Other Designations: hemmaglutinin-esterase',
  'Annotation:  NC_003045.1 (22352..23626)',
  'ID: 921684'],
 ['3. ORF1ab',
  'ORF1a polyprotein;ORF1ab polyprotein [Bovine coronavirus]',
  'Other Aliases: BCoVgp01',
  'Other Designations: ORF1a polyprotein;ORF1ab polyprotein; polyprotein pp1a; polyprotein pp1ab',
  'Annotation:  NC_003045.1 (211..21494)',
  'ID: 921688'],
 ['4. M',
  'matrix protein [Bovine coronavirus]',
  'Other Aliases: BCoVgp09',
  'Other Designations: matrix protein',
  'Annotation:  NC_003045.1 (28691..29383)',
  'ID: 921686'],
 ['5. E',
  'small membrane protein [Bovine coronavirus]',
  'Other Aliases: BCoVgp08',
  'Other Designations: small membrane protein'