In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd


# vseek imports
sys.path.append("../")
import vseek.common.vseek_paths as vsp
from vseek.apis.dbat_vir_db import collect_dbatvir_data
from vseek.apis.ncbi import get_all_viral_accessions

In [2]:
# NOTE: the data has been downloaded already
# Collect the bat-virus table (called "VBatDB" here) through the vseek helper
# imported from vseek.apis.dbat_vir_db, then display it as the cell output.
# NOTE(review): presumably the helper reuses the already-downloaded copy
# instead of fetching again -- confirm against its implementation.
batvir_df = collect_dbatvir_data()
batvir_df

Unnamed: 0,Viruses,Viral family,From Bat,Bat diet type,Bat family,Sample type,Collection year,Sampling country,Sequence coding,References
0,Bat adeno-associated virus 07YN,Parvoviridae,unclassified Chiroptera,,,Feces,2007,China,Cap,Unpublished
1,Bat adeno-associated virus 09YN,Parvoviridae,unclassified Chiroptera,,,Feces,2009,China,Cap,Unpublished
2,Bat adeno-associated virus 1003-HB-Mr,Parvoviridae,Myotis ricketti,Insectivore and piscivore,Vespertilionidae,Feces,2007,China,Cap,"J Gen Virol 2010, 91(Pt 10):2601-9"
3,Bat adeno-associated virus 1008-HB-Mr,Parvoviridae,Myotis ricketti,Insectivore and piscivore,Vespertilionidae,Feces,2007,China,Cap,"J Gen Virol 2010, 91(Pt 10):2601-9"
4,Bat adeno-associated virus 1019-HB-Rs,Parvoviridae,Rhinolophus sinicus,Insectivore,Rhinolophidae,Feces,2007,China,Cap,"J Gen Virol 2010, 91(Pt 10):2601-9"
...,...,...,...,...,...,...,...,...,...,...
13957,Rhinolophus pusillus coronavirus ZSR42,Coronaviridae,Rhinolophus pusillus,Insectivore,Rhinolophidae,Mix,2015,China,RdRp,"Sci Rep 2017, 7(1):10917"
13958,Rhinolophus pusillus norovirus ZSR43,Caliciviridae,Rhinolophus pusillus,Insectivore,Rhinolophidae,Mix,2015,China,RdRp,"Sci Rep 2017, 7(1):10917"
13959,Rhinolophus pusillus norovirus ZSR45,Caliciviridae,Rhinolophus pusillus,Insectivore,Rhinolophidae,Mix,2015,China,RdRp,"Sci Rep 2017, 7(1):10917"
13960,Rhinolophus pusillus coronavirus ZSR6,Coronaviridae,Rhinolophus pusillus,Insectivore,Rhinolophidae,Mix,2015,China,RdRp,"Sci Rep 2017, 7(1):10917"


In [3]:
# NOTE: the data has been downloaded already
# Load the NCBI viral accession list through the vseek helper imported from
# vseek.apis.ncbi, then display it as the cell output.
# NOTE(review): presumably the helper reuses the already-downloaded copy
# instead of fetching again -- confirm against its implementation.
ncbi_acc_df = get_all_viral_accessions()
ncbi_acc_df

Unnamed: 0,Representative,Neighbor,Host,Selected lineage,Taxonomy name,Segment name
0,NC_003663,HQ420896,"human,vertebrates","Poxviridae,Orthopoxvirus,Cowpox virus",Cowpox virus,segment
1,NC_003663,KY463519,"human,vertebrates","Poxviridae,Orthopoxvirus,Cowpox virus",Cowpox virus,segment
2,NC_003663,HQ420897,"human,vertebrates","Poxviridae,Orthopoxvirus,Cowpox virus",Cowpox virus,segment
3,NC_003663,MK035759,"human,vertebrates","Poxviridae,Orthopoxvirus,Cowpox virus",Cowpox virus,segment
4,NC_003663,KY569019,"human,vertebrates","Poxviridae,Orthopoxvirus,Cowpox virus",Cowpox virus,segment
...,...,...,...,...,...,...
259836,NC_062761,MZ334528,,Halorubrum virus HRTV-28,Halorubrum virus HRTV-28,segment
259837,NC_062762,MZ334526,,Halorubrum virus HRTV-29,Halorubrum virus HRTV-29,segment
259838,NC_062763,MZ334501,,"Myoviridae,Haloferacalesvirus,Halorubrum virus...",Halorubrum virus HSTV-4,segment
259839,NC_060136,OK040786,,"Siphoviridae,,Gordonia phage Kudefre",Gordonia phage Kudefre,segment


In [4]:
# Formatting functions
def split_lineages(lineage: str) -> pd.Series:
    """Split a comma-separated lineage string into its family and genus.

    The first comma-separated field is taken as the family and the second as
    the genus; any further fields are ignored. The fields are not validated,
    so a lineage that consists of a single field is returned as the family
    with a missing genus.

    Parameters
    ----------
    lineage : str
        Lineage text such as ``"Poxviridae,Orthopoxvirus,Cowpox virus"``.
        Non-string input (e.g. NaN or None) is treated as a missing lineage.

    Returns
    -------
    pd.Series
        Two-element series ``[family, genus]``. An empty or absent field is
        reported as ``np.nan``.
    """
    # Missing lineage: nothing to split, so both parts are missing.
    if not isinstance(lineage, str):
        return pd.Series([np.nan, np.nan])

    fields = lineage.split(",")
    family = fields[0]
    # A lineage without a comma has no genus field; treat it like an empty one
    # instead of raising IndexError.
    genus = fields[1] if len(fields) > 1 else ""

    # Each field is checked independently so that an empty family AND an empty
    # genus both become NaN (the old if/elif chain left the genus as "").
    return pd.Series(
        [
            family if family != "" else np.nan,
            genus if genus != "" else np.nan,
        ]
    )


def clean_accession(accession: str) -> str:
    """Return the first accession from a comma-separated accession string.

    Parameters
    ----------
    accession : str
        One or more accession identifiers separated by commas. Missing values
        (``np.nan``, ``float("nan")``, ``None``, ``pd.NA``) are accepted.

    Returns
    -------
    str
        The text before the first comma, or ``np.nan`` when the input is
        missing.
    """
    # pd.isna covers every missing-value flavour; the previous identity check
    # (`is np.nan`) only recognised the np.nan singleton and let other NaN
    # objects fall through to `.split`, which raised AttributeError.
    if pd.isna(accession):
        return np.nan

    # str.split always yields at least one element, so indexing [0] is safe.
    return accession.split(",")[0]

In [5]:
# Viral families reported in bats (missing family entries are ignored).
viral_fam = batvir_df["Viral family"].dropna().unique().tolist()

# Keep the NCBI accessions whose lineage mentions one of those families.
sel_dfs = []
for v_fam in viral_fam:
    # regex=False: a family name is literal text, not a pattern.
    # na=False: rows without a lineage never match; a NaN in the boolean mask
    # would make the .loc lookup raise.
    fam_mask = ncbi_acc_df["Selected lineage"].str.contains(
        v_fam, regex=False, na=False
    )
    sel_df = ncbi_acc_df.loc[fam_mask]
    if len(sel_df) == 0:
        continue
    sel_dfs.append(sel_df)

# pd.concat fails with an opaque message on an empty list; fail clearly instead.
if not sel_dfs:
    raise ValueError("no NCBI accession matches a viral family found in bats")

# Selecting viral genomes from bat-associated families that have a human host.
# isin() is already False for missing hosts, so no separate notnull filter is
# needed. .copy() makes the column assignments below operate on an independent
# frame (avoids SettingWithCopyWarning).
# NOTE(review): this keeps only rows whose Host is exactly "human"; multi-host
# values such as "human,vertebrates" are excluded -- confirm this is intended.
sel_dfs = pd.concat(sel_dfs, axis=0)
sel_dfs = sel_dfs.loc[sel_dfs["Host"].isin(["human"])].copy()

# Splitting the selected lineage into family and genus columns, and reducing
# the representative field to its first accession.
sel_dfs[["family", "genus"]] = sel_dfs["Selected lineage"].apply(split_lineages)
sel_dfs["Representative"] = sel_dfs["Representative"].apply(clean_accession)
sel_dfs = sel_dfs.drop(["Selected lineage"], axis="columns")

# Removing entries that lack a family or a genus. The result has to be
# assigned back: previously the filtered frame was computed and discarded.
sel_dfs = sel_dfs.loc[sel_dfs["family"].notnull() & sel_dfs["genus"].notnull()]

# One row per representative accession, ordered by accession. This matches the
# former group-by loop: rows without an accession are dropped, the first row
# of each accession is kept, and the result is sorted by accession.
cleaned_dfs = (
    sel_dfs.drop("Neighbor", axis="columns")
    .dropna(subset=["Representative"])
    .drop_duplicates(subset=["Representative"])
    .sort_values("Representative", kind="stable")
)

# Generating the bat-human viral genome csv (gzip-compressed).
init_db = vsp.init_db_path()
save_path = str((Path(init_db) / "filtered_bat_virus.csv.gz").absolute())
cleaned_dfs.to_csv(save_path, compression="gzip")

In [6]:
# Display the de-duplicated table that the previous cell wrote to
# filtered_bat_virus.csv.gz.
cleaned_dfs

Unnamed: 0,Representative,Host,Taxonomy name,Segment name,family,genus
810,NC_001348,human,Human alphaherpesvirus 3,segment,Herpesviridae,Varicellovirus
214039,NC_001352,human,Human papillomavirus type 2,segment,Papillomaviridae,Alphapapillomavirus
208580,NC_001354,human,Human papillomavirus type 41,segment,Papillomaviridae,Nupapillomavirus
208577,NC_001356,human,Human papillomavirus type 1a,segment,Papillomaviridae,Mupapillomavirus
194084,NC_001430,human,Enterovirus D68,segment,Picornaviridae,Enterovirus
...,...,...,...,...,...,...
257436,NC_055340,human,Echarate virus,segment,Phenuiviridae,Phlebovirus
257437,NC_055341,human,Echarate virus,segment,Phenuiviridae,Phlebovirus
257464,NC_055342,human,Maldonado virus,segment,Phenuiviridae,Phlebovirus
257465,NC_055343,human,Maldonado virus,segment,Phenuiviridae,Phlebovirus
